From bd296fa43c8d7338b65af1c5ca7cfb02fc9c6daf Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 17 Jan 2009 11:18:14 -0500 Subject: [PATCH 001/319] Restore LitReader refactoring (again) --- src/calibre/ebooks/lit/reader.py | 363 +++++++++++++++++-------------- 1 file changed, 201 insertions(+), 162 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 461c067382..0e7f9a1ccf 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -7,20 +7,24 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' \ 'and Marshall T. Vandegrift ' -import sys, struct, cStringIO, os +import sys, struct, os import functools import re from urlparse import urldefrag +from cStringIO import StringIO +from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 -from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.oeb.base import XML_PARSER, urlnormalize from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] msdes, msdeserror = plugins['msdes'] +__all__ = ["LitReader"] + XML_DECL = """ """ OPF_DECL = """ @@ -108,6 +112,9 @@ def consume_sized_utf8_string(bytes, zpad=False): pos += 1 return u''.join(result), bytes[pos:] +def encode(string): + return unicode(string).encode('ascii', 'xmlcharrefreplace') + class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') @@ -118,13 +125,13 @@ class UnBinary(object): def __init__(self, bin, path, manifest={}, map=HTML_MAP): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map - self.opf = map is OPF_MAP - self.bin = bin + self.is_html = map is HTML_MAP self.dir = os.path.dirname(path) - self.buf = cStringIO.StringIO() - self.binary_to_text() - self.raw = 
self.buf.getvalue().lstrip().decode('utf-8') + buf = StringIO() + self.binary_to_text(bin, buf) + self.raw = buf.getvalue().lstrip() self.escape_reserved() + self._tree = None def escape_reserved(self): raw = self.raw @@ -151,18 +158,28 @@ class UnBinary(object): return '/'.join(relpath) def __unicode__(self): + return self.raw.decode('utf-8') + + def __str__(self): return self.raw + + def tree(): + def fget(self): + if not self._tree: + self._tree = etree.fromstring(self.raw, parser=XML_PARSER) + return self._tree + return property(fget=fget) + tree = tree() - def binary_to_text(self, base=0, depth=0): + def binary_to_text(self, bin, buf, index=0, depth=0): tag_name = current_map = None dynamic_tag = errors = 0 in_censorship = is_goingdown = False state = 'text' - index = base flags = 0 - while index < len(self.bin): - c, index = read_utf8_char(self.bin, index) + while index < len(bin): + c, index = read_utf8_char(bin, index) oc = ord(c) if state == 'text': @@ -175,7 +192,7 @@ class UnBinary(object): c = '>>' elif c == '<': c = '<<' - self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) elif state == 'get flags': if oc == 0: @@ -188,7 +205,7 @@ class UnBinary(object): state = 'text' if oc == 0 else 'get attr' if flags & FLAG_OPENING: tag = oc - self.buf.write('<') + buf.write('<') if not (flags & FLAG_CLOSING): is_goingdown = True if tag == 0x8000: @@ -205,7 +222,7 @@ class UnBinary(object): tag_name = '?'+unichr(tag)+'?' 
current_map = self.tag_to_attr_map[tag] print 'WARNING: tag %s unknown' % unichr(tag) - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) elif flags & FLAG_CLOSING: if depth == 0: raise LitError('Extra closing tag') @@ -217,15 +234,14 @@ class UnBinary(object): if not is_goingdown: tag_name = None dynamic_tag = 0 - self.buf.write(' />') + buf.write(' />') else: - self.buf.write('>') - index = self.binary_to_text(base=index, depth=depth+1) + buf.write('>') + index = self.binary_to_text(bin, buf, index, depth+1) is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') - self.buf.write(u''.join( - ('')).encode('utf-8')) + buf.write(encode(u''.join(('')))) dynamic_tag = 0 tag_name = None state = 'text' @@ -245,7 +261,7 @@ class UnBinary(object): in_censorship = True state = 'get value length' continue - self.buf.write(' ' + unicode(attr).encode('utf-8') + '=') + buf.write(' ' + encode(attr) + '=') if attr in ['href', 'src']: state = 'get href length' else: @@ -253,40 +269,39 @@ class UnBinary(object): elif state == 'get value length': if not in_censorship: - self.buf.write('"') + buf.write('"') count = oc - 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' continue state = 'get value' if oc == 0xffff: continue - if count < 0 or count > (len(self.bin) - index): + if count < 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) elif state == 'get value': if count == 0xfffe: if not in_censorship: - self.buf.write('%s"' % (oc - 1)) + buf.write('%s"' % (oc - 1)) in_censorship = False state = 'get attr' elif count > 0: if not in_censorship: - self.buf.write(c.encode( - 'ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) count -= 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' elif state == 'get custom length': count = oc - 1 - if count <= 0 or count 
> len(self.bin)-index: + if count <= 0 or count > len(bin)-index: raise LitError('Invalid character count %d' % count) dynamic_tag += 1 state = 'get custom' @@ -296,26 +311,26 @@ class UnBinary(object): tag_name += c count -= 1 if count == 0: - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) state = 'get attr' elif state == 'get attr length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) - self.buf.write(' ') + buf.write(' ') state = 'get custom attr' elif state == 'get custom attr': - self.buf.write(unicode(c).encode('utf-8')) + buf.write(encode(c)) count -= 1 if count == 0: - self.buf.write('=') + buf.write('=') state = 'get value length' elif state == 'get href length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) href = '' state = 'get href' @@ -329,10 +344,11 @@ class UnBinary(object): if frag: path = '#'.join((path, frag)) path = urlnormalize(path) - self.buf.write((u'"%s"' % path).encode('utf-8')) + buf.write(encode(u'"%s"' % path)) state = 'get attr' return index + class DirectoryEntry(object): def __init__(self, name, section, offset, size): self.name = name @@ -347,6 +363,7 @@ class DirectoryEntry(object): def __str__(self): return repr(self) + class ManifestItem(object): def __init__(self, original, internal, mime_type, offset, root, state): self.original = original @@ -374,65 +391,87 @@ class ManifestItem(object): % (self.internal, self.path, self.mime_type, self.offset, self.root, self.state) + def preserve(function): def wrapper(self, *args, **kwargs): - opos = self._stream.tell() + opos = self.stream.tell() try: return function(self, *args, **kwargs) finally: - self._stream.seek(opos) + self.stream.seek(opos) functools.update_wrapper(wrapper, function) return wrapper -class 
LitReader(object): +class LitFile(object): PIECE_SIZE = 16 - XML_PARSER = etree.XMLParser( - recover=True, resolve_entities=False) + + def __init__(self, filename_or_stream): + if hasattr(filename_or_stream, 'read'): + self.stream = filename_or_stream + else: + self.stream = open(filename_or_stream, 'rb') + try: + self.opf_path = os.path.splitext( + os.path.basename(self.stream.name))[0] + '.opf' + except AttributeError: + self.opf_path = 'content.opf' + if self.magic != 'ITOLITLS': + raise LitError('Not a valid LIT file') + if self.version != 1: + raise LitError('Unknown LIT version %d' % (self.version,)) + self.read_secondary_header() + self.read_header_pieces() + self.read_section_names() + self.read_manifest() + self.read_drm() + + def warn(self, msg): + print "WARNING: %s" % (msg,) def magic(): @preserve def fget(self): - self._stream.seek(0) - return self._stream.read(8) + self.stream.seek(0) + return self.stream.read(8) return property(fget=fget) magic = magic() def version(): def fget(self): - self._stream.seek(8) - return u32(self._stream.read(4)) + self.stream.seek(8) + return u32(self.stream.read(4)) return property(fget=fget) version = version() def hdr_len(): @preserve def fget(self): - self._stream.seek(12) - return int32(self._stream.read(4)) + self.stream.seek(12) + return int32(self.stream.read(4)) return property(fget=fget) hdr_len = hdr_len() def num_pieces(): @preserve def fget(self): - self._stream.seek(16) - return int32(self._stream.read(4)) + self.stream.seek(16) + return int32(self.stream.read(4)) return property(fget=fget) num_pieces = num_pieces() def sec_hdr_len(): @preserve def fget(self): - self._stream.seek(20) - return int32(self._stream.read(4)) + self.stream.seek(20) + return int32(self.stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() def guid(): @preserve def fget(self): - self._stream.seek(24) - return self._stream.read(16) + self.stream.seek(24) + return self.stream.read(16) return property(fget=fget) guid 
= guid() @@ -442,44 +481,27 @@ class LitReader(object): size = self.hdr_len \ + (self.num_pieces * self.PIECE_SIZE) \ + self.sec_hdr_len - self._stream.seek(0) - return self._stream.read(size) + self.stream.seek(0) + return self.stream.read(size) return property(fget=fget) header = header() - def __init__(self, filename_or_stream): - if hasattr(filename_or_stream, 'read'): - self._stream = filename_or_stream - else: - self._stream = open(filename_or_stream, 'rb') - if self.magic != 'ITOLITLS': - raise LitError('Not a valid LIT file') - if self.version != 1: - raise LitError('Unknown LIT version %d' % (self.version,)) - self.entries = {} - self._read_secondary_header() - self._read_header_pieces() - self._read_section_names() - self._read_manifest() - self._read_meta() - self._read_drm() - @preserve def __len__(self): - self._stream.seek(0, 2) - return self._stream.tell() + self.stream.seek(0, 2) + return self.stream.tell() @preserve - def _read_raw(self, offset, size): - self._stream.seek(offset) - return self._stream.read(size) + def read_raw(self, offset, size): + self.stream.seek(offset) + return self.stream.read(size) - def _read_content(self, offset, size): - return self._read_raw(self.content_offset + offset, size) + def read_content(self, offset, size): + return self.read_raw(self.content_offset + offset, size) - def _read_secondary_header(self): + def read_secondary_header(self): offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) - bytes = self._read_raw(offset, self.sec_hdr_len) + bytes = self.read_raw(offset, self.sec_hdr_len) offset = int32(bytes[4:]) while offset < len(bytes): blocktype = bytes[offset:offset+4] @@ -507,21 +529,21 @@ class LitReader(object): if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - def _read_header_pieces(self): + def read_header_pieces(self): src = self.header[self.hdr_len:] for i in xrange(self.num_pieces): piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE] if 
u32(piece[4:]) != 0 or u32(piece[12:]) != 0: raise LitError('Piece %s has 64bit value' % repr(piece)) offset, size = u32(piece), int32(piece[8:]) - piece = self._read_raw(offset, size) + piece = self.read_raw(offset, size) if i == 0: continue # Dont need this piece elif i == 1: if u32(piece[8:]) != self.entry_chunklen or \ u32(piece[12:]) != self.entry_unknown: raise LitError('Secondary header does not match piece') - self._read_directory(piece) + self.read_directory(piece) elif i == 2: if u32(piece[8:]) != self.count_chunklen or \ u32(piece[12:]) != self.count_unknown: @@ -532,12 +554,13 @@ class LitReader(object): elif i == 4: self.piece4_guid = piece - def _read_directory(self, piece): + def read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) if (32 + (num_chunks * chunk_size)) != len(piece): - raise LitError('IFCM HEADER has incorrect length') + raise LitError('IFCM header has incorrect length') + self.entries = {} for i in xrange(num_chunks): offset = 32 + (i * chunk_size) chunk = piece[offset:offset + chunk_size] @@ -571,17 +594,17 @@ class LitReader(object): entry = DirectoryEntry(name, section, offset, size) self.entries[name] = entry - def _read_section_names(self): + def read_section_names(self): if '::DataSpace/NameList' not in self.entries: raise LitError('Lit file does not have a valid NameList') raw = self.get_file('::DataSpace/NameList') if len(raw) < 4: raise LitError('Invalid Namelist section') pos = 4 - self.num_sections = u16(raw[2:pos]) - self.section_names = [""]*self.num_sections - self.section_data = [None]*self.num_sections - for section in xrange(self.num_sections): + num_sections = u16(raw[2:pos]) + self.section_names = [""] * num_sections + self.section_data = [None] * num_sections + for section in xrange(num_sections): size = u16(raw[pos:pos+2]) pos += 2 size = size*2 + 2 @@ -591,11 +614,12 @@ class 
LitReader(object): raw[pos:pos+size].decode('utf-16-le').rstrip('\000') pos += size - def _read_manifest(self): + def read_manifest(self): if '/manifest' not in self.entries: raise LitError('Lit file does not have a valid manifest') raw = self.get_file('/manifest') self.manifest = {} + self.paths = {self.opf_path: None} while raw: slen, raw = ord(raw[0]), raw[1:] if slen == 0: break @@ -634,28 +658,9 @@ class LitReader(object): for item in mlist: if item.path[0] == '/': item.path = os.path.basename(item.path) + self.paths[item.path] = item - def _pretty_print(self, xml): - f = cStringIO.StringIO(xml.encode('utf-8')) - doc = etree.parse(f, parser=self.XML_PARSER) - pretty = etree.tostring(doc, encoding='ascii', pretty_print=True) - return XML_DECL + unicode(pretty) - - def _read_meta(self): - path = 'content.opf' - raw = self.get_file('/meta') - xml = OPF_DECL - try: - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - except LitError: - if 'PENGUIN group' not in raw: raise - print "WARNING: attempting PENGUIN malformed OPF fix" - raw = raw.replace( - 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - self.meta = xml - - def _read_drm(self): + def read_drm(self): self.drmlevel = 0 if '/DRMStorage/Licenses/EUL' in self.entries: self.drmlevel = 5 @@ -666,7 +671,7 @@ class LitReader(object): else: return if self.drmlevel < 5: - msdes.deskey(self._calculate_deskey(), msdes.DE1) + msdes.deskey(self.calculate_deskey(), msdes.DE1) bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed')) if bookkey[0] != '\000': raise LitError('Unable to decrypt title key!') @@ -674,7 +679,7 @@ class LitReader(object): else: raise DRMError("Cannot access DRM-protected book") - def _calculate_deskey(self): + def calculate_deskey(self): hashfiles = ['/meta', '/DRMStorage/DRMSource'] if self.drmlevel == 3: hashfiles.append('/DRMStorage/DRMBookplate') @@ -698,18 +703,18 @@ class LitReader(object): def get_file(self, 
name): entry = self.entries[name] if entry.section == 0: - return self._read_content(entry.offset, entry.size) + return self.read_content(entry.offset, entry.size) section = self.get_section(entry.section) return section[entry.offset:entry.offset+entry.size] def get_section(self, section): data = self.section_data[section] if not data: - data = self._get_section(section) + data = self.get_section_uncached(section) self.section_data[section] = data return data - def _get_section(self, section): + def get_section_uncached(self, section): name = self.section_names[section] path = '::DataSpace/Storage/' + name transform = self.get_file(path + '/Transform/List') @@ -721,29 +726,29 @@ class LitReader(object): raise LitError("ControlData is too short") guid = msguid(transform) if guid == DESENCRYPT_GUID: - content = self._decrypt(content) + content = self.decrypt(content) control = control[csize:] elif guid == LZXCOMPRESS_GUID: reset_table = self.get_file( '/'.join(('::DataSpace/Storage', name, 'Transform', LZXCOMPRESS_GUID, 'InstanceData/ResetTable'))) - content = self._decompress(content, control, reset_table) + content = self.decompress(content, control, reset_table) control = control[csize:] else: raise LitError("Unrecognized transform: %s." 
% repr(guid)) transform = transform[16:] return content - def _decrypt(self, content): + def decrypt(self, content): length = len(content) extra = length & 0x7 if extra > 0: - self._warn("content length not a multiple of block size") + self.warn("content length not a multiple of block size") content += "\0" * (8 - extra) msdes.deskey(self.bookkey, msdes.DE1) return msdes.des(content) - def _decompress(self, content, control, reset_table): + def decompress(self, content, control, reset_table): if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC": raise LitError("Invalid ControlData tag value") if len(reset_table) < (RESET_INTERVAL + 8): @@ -784,7 +789,7 @@ class LitReader(object): result.append( lzx.decompress(content[base:size], window_bytes)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining -= window_bytes base = size accum += int32(reset_table[RESET_INTERVAL:]) @@ -794,55 +799,88 @@ class LitReader(object): try: result.append(lzx.decompress(content[base:], bytes_remaining)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining = 0 if bytes_remaining > 0: raise LitError("Failed to completely decompress section") return ''.join(result) - def get_entry_content(self, entry, pretty_print=False): - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - path = entry.path - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - content = decl + unicode(UnBinary(raw, path, self.manifest, map)) - if pretty_print: - content = self._pretty_print(content) - content = content.encode('utf-8') - else: - name = '/'.join(('/data', entry.internal)) - content = self.get_file(name) - return content - - def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): - output_dir = 
os.path.abspath(output_dir) - try: - opf_path = os.path.splitext( - os.path.basename(self._stream.name))[0] + '.opf' - except AttributeError: - opf_path = 'content.opf' - opf_path = os.path.join(output_dir, opf_path) - self._ensure_dir(opf_path) - with open(opf_path, 'wb') as f: - xml = self.meta - if pretty_print: - xml = self._pretty_print(xml) - f.write(xml.encode('utf-8')) - for entry in self.manifest.values(): - path = os.path.join(output_dir, entry.path) - self._ensure_dir(path) - with open(path, 'wb') as f: - f.write(self.get_entry_content(entry, pretty_print)) +class LitReader(object): + def __init__(self, filename_or_stream): + self._litfile = LitFile(filename_or_stream) + + def namelist(self): + return self._litfile.paths.keys() + + def exists(self, name): + return urlunquote(name) in self._litfile.paths + + def read_xml(self, name): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + content = self._read_meta() + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = unbin.tree + else: + raise LitError('Requested non-XML content as XML') + return content + + def read(self, name, pretty_print=False): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + meta = self._read_meta() + content = OPF_DECL + etree.tostring( + meta, encoding='ascii', pretty_print=pretty_print) + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = HTML_DECL + if pretty_print: + content += etree.tostring(unbin.tree, + encoding='ascii', pretty_print=True) + else: + content += str(unbin) + else: + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) + return content + + def 
meta(): + def fget(self): + return self.read(self._litfile.opf_path) + return property(fget=fget) + meta = meta() + def _ensure_dir(self, path): dir = os.path.dirname(path) if not os.path.isdir(dir): os.makedirs(dir) + + def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): + for name in self.namelist(): + path = os.path.join(output_dir, name) + self._ensure_dir(path) + with open(path, 'wb') as f: + f.write(self.read(name, pretty_print=pretty_print)) + + def _read_meta(self): + path = 'content.opf' + raw = self._litfile.get_file('/meta') + try: + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + except LitError: + if 'PENGUIN group' not in raw: raise + print "WARNING: attempting PENGUIN malformed OPF fix" + raw = raw.replace( + 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + return unbin.tree - def _warn(self, msg): - print "WARNING: %s" % (msg,) def option_parser(): from calibre.utils.config import OptionParser @@ -852,7 +890,8 @@ def option_parser(): help=_('Output directory. Defaults to current directory.')) parser.add_option( '-p', '--pretty-print', default=False, action='store_true', - help=_('Legibly format extracted markup. May modify meaningful whitespace.')) + help=_('Legibly format extracted markup.' \ + ' May modify meaningful whitespace.')) parser.add_option( '--verbose', default=False, action='store_true', help=_('Useful for debugging.')) From cba3bb55e4108842d9e10ff5d9cc75e2f15b0361 Mon Sep 17 00:00:00 2001 From: "Marshall T. 
Vandegrift" Date: Sat, 17 Jan 2009 14:43:16 -0500 Subject: [PATCH 002/319] Minor clean-ups to CSS flattening --- src/calibre/ebooks/oeb/transforms/flatcss.py | 31 ++++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 7110c2db2d..375003c1a5 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -144,7 +144,8 @@ class CSSFlattener(object): value = round(value / slineh) * dlineh cssdict[property] = "%0.5fem" % (value / fsize) - def flatten_node(self, node, stylizer, names, styles, psize, left=0): + def flatten_node(self, node, stylizer, names, styles, psize, left=0, + valigned=False): if not isinstance(node.tag, basestring) \ or namespace(node.tag) != XHTML_NS: return @@ -154,18 +155,6 @@ class CSSFlattener(object): if 'align' in node.attrib: cssdict['text-align'] = node.attrib['align'] del node.attrib['align'] - if node.tag == XHTML('font'): - node.tag = XHTML('span') - if 'size' in node.attrib: - size = node.attrib['size'].strip() - if size: - fnums = self.context.source.fnums - if size[0] in ('+', '-'): - # Oh, the warcrimes - cssdict['font-size'] = fnums[3+int(size)] - else: - cssdict['font-size'] = fnums[int(size)] - del node.attrib['size'] if 'color' in node.attrib: cssdict['color'] = node.attrib['color'] del node.attrib['color'] @@ -173,7 +162,7 @@ class CSSFlattener(object): cssdict['background-color'] = node.attrib['bgcolor'] del node.attrib['bgcolor'] if cssdict: - if 'font-size' in cssdict: + if 'font-size' in cssdict or tag == 'body': fsize = self.fmap[style['font-size']] cssdict['font-size'] = "%0.5fem" % (fsize / psize) psize = fsize @@ -197,10 +186,13 @@ class CSSFlattener(object): cssdict['display'] = 'inline' else: cssdict['display'] = 'block' - if 'vertical-align' in cssdict \ - and cssdict['vertical-align'] == 'sup': - cssdict['vertical-align'] = 'super' - if 
self.lineh and 'line-height' not in cssdict: + if 'vertical-align' in cssdict: + if cssdict['vertical-align'] == 'sup': + cssdict['vertical-align'] = 'text-top' + if style['vertical-align'] != 'baseline': + cssdict['line-height'] = '0' + valigned = True + if self.lineh and 'line-height' not in cssdict and not valigned: lineh = self.lineh / psize cssdict['line-height'] = "%0.5fem" % lineh if cssdict: @@ -220,7 +212,8 @@ class CSSFlattener(object): if 'style' in node.attrib: del node.attrib['style'] for child in node: - self.flatten_node(child, stylizer, names, styles, psize, left) + self.flatten_node(child, stylizer, names, styles, psize, left, + valigned) def flatten_head(self, item, stylizer, href): html = item.data From 76de6aef24f99929957676fde5e98f86f209345b Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 18 Jan 2009 21:44:43 -0500 Subject: [PATCH 003/319] Use etree.html to handle HTML entities and not UTF-8 encodings --- src/calibre/ebooks/oeb/base.py | 20 +++++++------------- src/calibre/ebooks/oeb/transforms/flatcss.py | 8 ++++---- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 4248657e23..a903136610 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -15,10 +15,10 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote import logging import re -import htmlentitydefs import uuid import copy from lxml import etree +from lxml import html from calibre import LoggingInterface XML_PARSER = etree.XMLParser(recover=True) @@ -67,14 +67,6 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) MS_COVER_TYPE = 'other.ms-coverimage-standard' -recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace') -ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items()) -del ENTITYDEFS['lt'] -del ENTITYDEFS['gt'] -del ENTITYDEFS['quot'] -del 
ENTITYDEFS['amp'] -del recode - def element(parent, *args, **kwargs): if parent is not None: @@ -298,7 +290,6 @@ class Metadata(object): class Manifest(object): class Item(object): - ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') def __init__(self, id, href, media_type, @@ -317,9 +308,12 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _force_xhtml(self, data): - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = self.ENTITY_RE.sub(repl, data) - data = etree.fromstring(data, parser=XML_PARSER) + try: + data = etree.fromstring(data, parser=XML_PARSER) + except etree.XMLSyntaxError: + data = html.fromstring(data, parser=XML_PARSER) + data = etree.tostring(data, encoding=unicode) + data = etree.fromstring(data, parser=XML_PARSER) if namespace(data.tag) != XHTML_NS: data.attrib['xmlns'] = XHTML_NS data = etree.tostring(data) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 375003c1a5..4877b28f51 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -161,11 +161,11 @@ class CSSFlattener(object): if 'bgcolor' in node.attrib: cssdict['background-color'] = node.attrib['bgcolor'] del node.attrib['bgcolor'] + if 'font-size' in cssdict or tag == 'body': + fsize = self.fmap[style['font-size']] + cssdict['font-size'] = "%0.5fem" % (fsize / psize) + psize = fsize if cssdict: - if 'font-size' in cssdict or tag == 'body': - fsize = self.fmap[style['font-size']] - cssdict['font-size'] = "%0.5fem" % (fsize / psize) - psize = fsize if self.lineh and self.fbase and tag != 'body': self.clean_edges(cssdict, style, psize) margin = style['margin-left'] From a9f4ab2346c78c63d60478036c4ddec0ececdf46 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 5 Feb 2009 17:00:04 -0800 Subject: [PATCH 004/319] Minor fixes --- src/calibre/ebooks/lrf/meta.py | 7 ++++++- 
src/calibre/ebooks/metadata/__init__.py | 13 ++++++++----- src/calibre/ebooks/metadata/cli.py | 21 +++++++++++++++++---- src/calibre/ebooks/metadata/opf.xml | 6 +++--- src/calibre/ebooks/metadata/opf2.py | 23 ++++++++++++++++++++++- 5 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/lrf/meta.py b/src/calibre/ebooks/lrf/meta.py index 331e101ddd..322835f470 100644 --- a/src/calibre/ebooks/lrf/meta.py +++ b/src/calibre/ebooks/lrf/meta.py @@ -229,6 +229,9 @@ def get_metadata(stream): mi.author = lrf.author.strip() mi.comments = lrf.free_text.strip() mi.category = lrf.category.strip()+', '+lrf.classification.strip() + tags = [x.strip() for x in mi.category.split(',') if x.strip()] + if tags: + mi.tags = tags mi.publisher = lrf.publisher.strip() mi.cover_data = lrf.get_cover() try: @@ -624,7 +627,9 @@ def set_metadata(stream, mi): lrf.title = mi.title if mi.authors: lrf.author = ', '.join(mi.authors) - if mi.category: + if mi.tags: + lrf.category = mi.tags[0] + if getattr(mi, 'category', False): lrf.category = mi.category if mi.comments: lrf.free_text = mi.comments diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 063e56190b..e3c434342a 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -185,7 +185,7 @@ class MetaInformation(object): @staticmethod def copy(mi): ans = MetaInformation(mi.title, mi.authors) - for attr in ('author_sort', 'title_sort', 'comments', + for attr in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', 'series', 'series_index', 'rating', 'isbn', 'tags', 'cover_data', 'application_id', 'guide', 'manifest', 'spine', 'toc', 'cover', 'language', 'book_producer'): @@ -210,7 +210,7 @@ class MetaInformation(object): #: mi.cover_data = (ext, data) self.cover_data = getattr(mi, 'cover_data', (None, None)) - for x in ('author_sort', 'title_sort', 'comments', 'publisher', + for x in ('author_sort', 'title_sort', 
'comments', 'category', 'publisher', 'series', 'series_index', 'rating', 'isbn', 'language', 'application_id', 'manifest', 'toc', 'spine', 'guide', 'cover', 'book_producer', @@ -228,7 +228,7 @@ class MetaInformation(object): if mi.authors and mi.authors[0] != _('Unknown'): self.authors = mi.authors - for attr in ('author_sort', 'title_sort', 'comments', + for attr in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', 'series', 'series_index', 'rating', 'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover', 'language', 'guide', 'book_producer'): @@ -251,10 +251,11 @@ class MetaInformation(object): return '%d'%x if int(x) == x else '%.2f'%x def __unicode__(self): - ans = [ fmt('Title', self.title) ] + ans = [] def fmt(x, y): ans.append(u'%-20s: %s'%(unicode(x), unicode(y))) - + + fmt('Title', self.title) if self.title_sort: fmt('Title sort', self.title_sort) if self.authors: @@ -264,6 +265,8 @@ class MetaInformation(object): fmt('Publisher', self.publisher) if getattr(self, 'book_producer', False): fmt('Book Producer', self.book_producer) + if self.category: + ans += u'Category : ' + unicode(self.category) + u'\n' if self.comments: fmt('Comments', self.comments) if self.isbn: diff --git a/src/calibre/ebooks/metadata/cli.py b/src/calibre/ebooks/metadata/cli.py index 75b541d9c9..4101f34047 100644 --- a/src/calibre/ebooks/metadata/cli.py +++ b/src/calibre/ebooks/metadata/cli.py @@ -26,6 +26,7 @@ from calibre.customize.ui import metadata_readers, metadata_writers from calibre.ebooks.metadata.meta import get_metadata, set_metadata from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string, \ title_sort, MetaInformation +from calibre.ebooks.lrf.meta import LRFMetaFile from calibre import prints def config(): @@ -50,6 +51,8 @@ def config(): help=_('Set the ebook description.')) c.add_opt('publisher', ['-p', '--publisher'], help=_('Set the ebook publisher.')) + c.add_opt('category', ['--category'], + help=_('Set the book 
category.')) c.add_opt('series', ['-s', '--series'], help=_('Set the series this ebook belongs to.')) c.add_opt('series_index', ['-i', '--index'], @@ -75,6 +78,9 @@ def config(): help=_('Read metadata from the specified OPF file and use it to ' 'set metadata in the ebook. Metadata specified on the' 'command line will override metadata read from the OPF file')) + + c.add_opt('lrf_bookid', ['--lrf-bookid'], + help=_('Set the BookID in LRF files')) return c def filetypes(): @@ -102,12 +108,12 @@ def do_set_metadata(opts, mi, stream, stream_type): for pref in config().option_set.preferences: if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort', - 'author_sort', 'get_cover', 'cover', 'tags'): + 'author_sort', 'get_cover', 'cover', 'tags', + 'lrf_bookid'): continue val = getattr(opts, pref.name, None) if val is not None: - setattr(mi, pref.name, getattr()) - + setattr(mi, pref.name, val) if getattr(opts, 'authors', None) is not None: mi.authors = string_to_authors(opts.authors) mi.author_sort = authors_to_sort_string(mi.authors) @@ -158,11 +164,18 @@ def main(args=sys.argv): do_set_metadata(opts, mi, stream, stream_type) stream.seek(0) stream.flush() + lrf = None + if stream_type == 'lrf': + if opts.lrf_bookid is not None: + lrf = LRFMetaFile(stream) + lrf.book_id = opts.lrf_bookid mi = get_metadata(stream, stream_type) - prints(_('Changed metadata')+'::') + prints('\n' + _('Changed metadata') + '::') metadata = unicode(mi) metadata = '\t'+'\n\t'.join(metadata.split('\n')) prints(metadata) + if lrf is not None: + prints('\tBookID:', lrf.book_id) if opts.to_opf is not None: from calibre.ebooks.metadata.opf2 import OPFCreator diff --git a/src/calibre/ebooks/metadata/opf.xml b/src/calibre/ebooks/metadata/opf.xml index d95268f306..703e82b5c1 100644 --- a/src/calibre/ebooks/metadata/opf.xml +++ b/src/calibre/ebooks/metadata/opf.xml @@ -6,13 +6,13 @@ > - ${mi.title} + ${mi.title} ${author} - ${'%s (%s)'%(__appname__, __version__)} 
[http://${__appname__}.kovidgoyal.net] + ${'%s (%s)'%(__appname__, __version__)} [http://${__appname__}.kovidgoyal.net] ${mi.application_id} ${mi.language if mi.language else 'UND'} - ${mi.category} + ${mi.category} ${mi.comments} ${mi.publisher} ${mi.isbn} diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 718d615e71..f051ad8568 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -414,6 +414,7 @@ class OPF(object): metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]') metadata_elem_path = XPath('descendant::*[re:match(name(), concat($name, "$"), "i") or (re:match(name(), "meta$", "i") and re:match(@name, concat("^calibre:", $name, "$"), "i"))]') + title_path = XPath('descendant::*[re:match(name(), "title", "i")]') authors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut" or (not(@role) and not(@opf:role)))]') bkp_path = XPath('descendant::*[re:match(name(), "contributor", "i") and (@role="bkp" or @opf:role="bkp")]') tags_path = XPath('descendant::*[re:match(name(), "subject", "i")]') @@ -503,7 +504,7 @@ class OPF(object): def set_text(self, elem, content): if elem.tag == self.META: - elem.attib['content'] = content + elem.attrib['content'] = content else: elem.text = content @@ -645,6 +646,26 @@ class OPF(object): return property(fget=fget, fset=fset) + @apply + def title_sort(): + + def fget(self): + matches = self.title_path(self.metadata) + if matches: + for match in matches: + ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None) + if not ans: + ans = match.get('file-as', None) + if ans: + return ans + + def fset(self, val): + matches = self.title_path(self.metadata) + if matches: + matches[0].set('file-as', unicode(val)) + + return property(fget=fget, fset=fset) + @apply def tags(): From 972e2161c71e3308ed0b2dd1d277c21ab0a2af12 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 6 Feb 2009 23:35:51 
-0800 Subject: [PATCH 005/319] Remove use of the apply builtin as it is deprecated --- src/calibre/__init__.py | 4 +- src/calibre/devices/libusb.py | 10 ++--- src/calibre/devices/prs500/books.py | 18 ++++---- src/calibre/devices/prs500/cli/main.py | 26 +++++------ src/calibre/devices/prs500/prstypes.py | 57 ++++++++++++------------- src/calibre/devices/prs505/books.py | 18 ++++---- src/calibre/devices/usbms/books.py | 9 ++-- src/calibre/ebooks/html.py | 10 ++--- src/calibre/ebooks/lrf/tags.py | 16 +++---- src/calibre/ebooks/metadata/opf.py | 14 +++--- src/calibre/ebooks/metadata/opf2.py | 42 +++++++++--------- src/calibre/ebooks/metadata/toc.py | 6 +-- src/calibre/ebooks/oeb/base.py | 26 +++++------ src/calibre/gui2/viewer/documentview.py | 52 +++++++++++----------- src/calibre/library/database.py | 6 +-- src/calibre/library/database2.py | 6 +-- 16 files changed, 160 insertions(+), 160 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index fe140df032..cb3c05c7b9 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -2,7 +2,9 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, re, logging, time, subprocess, atexit, mimetypes +import sys, os, re, logging, time, subprocess, atexit, mimetypes, \ + __builtin__ +__builtin__.__dict__['dynamic_property'] = lambda(func): func(None) from htmlentitydefs import name2codepoint from math import floor from logging import Formatter diff --git a/src/calibre/devices/libusb.py b/src/calibre/devices/libusb.py index 226a99f239..09261e10c5 100644 --- a/src/calibre/devices/libusb.py +++ b/src/calibre/devices/libusb.py @@ -116,8 +116,8 @@ class Device(Structure): raise Error("Cannot open device") return handle.contents - @apply - def configurations(): + @dynamic_property + def configurations(self): doc = """ List of device configurations. 
See L{ConfigDescriptor} """ def fget(self): ans = [] @@ -127,8 +127,8 @@ class Device(Structure): return property(doc=doc, fget=fget) class Bus(Structure): - @apply - def device_list(): + @dynamic_property + def device_list(self): doc = \ """ Flat list of devices on this bus. @@ -360,4 +360,4 @@ def get_devices(): for dev in devices: device = (dev.device_descriptor.idVendor, dev.device_descriptor.idProduct, dev.device_descriptor.bcdDevice) ans.append(device) - return ans + return ans \ No newline at end of file diff --git a/src/calibre/devices/prs500/books.py b/src/calibre/devices/prs500/books.py index 6c57920487..d567511ec6 100644 --- a/src/calibre/devices/prs500/books.py +++ b/src/calibre/devices/prs500/books.py @@ -55,8 +55,8 @@ class Book(object): size = book_metadata_field("size", formatter=int) # When setting this attribute you must use an epoch datetime = book_metadata_field("date", formatter=strptime, setter=strftime) - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. If absent, title is returned''' def fget(self): src = self.elem.getAttribute('titleSorter').strip() @@ -67,8 +67,8 @@ class Book(object): self.elem.setAttribute('titleSorter', sortable_title(unicode(val))) return property(doc=doc, fget=fget, fset=fset) - @apply - def thumbnail(): + @dynamic_property + def thumbnail(self): doc = \ """ The thumbnail. Should be a height 68 image. @@ -88,15 +88,15 @@ class Book(object): return decode(rc) return property(fget=fget, doc=doc) - @apply - def path(): + @dynamic_property + def path(self): doc = """ Absolute path to book on device. Setting not supported. 
""" def fget(self): return self.root + self.rpath return property(fget=fget, doc=doc) - @apply - def db_id(): + @dynamic_property + def db_id(self): doc = '''The database id in the application database that this file corresponds to''' def fget(self): match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0]) @@ -378,4 +378,4 @@ class BookList(_BookList): def write(self, stream): """ Write XML representation of DOM tree to C{stream} """ - stream.write(self.document.toxml('utf-8')) + stream.write(self.document.toxml('utf-8')) \ No newline at end of file diff --git a/src/calibre/devices/prs500/cli/main.py b/src/calibre/devices/prs500/cli/main.py index dfd3eb1ed6..4a94bf41af 100755 --- a/src/calibre/devices/prs500/cli/main.py +++ b/src/calibre/devices/prs500/cli/main.py @@ -39,8 +39,8 @@ class FileFormatter(object): self.name = file.name self.path = file.path - @apply - def mode_string(): + @dynamic_property + def mode_string(self): doc=""" The mode string for this file. There are only two modes read-only and read-write """ def fget(self): mode, x = "-", "-" @@ -50,8 +50,8 @@ class FileFormatter(object): return mode return property(doc=doc, fget=fget) - @apply - def isdir_name(): + @dynamic_property + def isdir_name(self): doc='''Return self.name + '/' if self is a directory''' def fget(self): name = self.name @@ -61,8 +61,8 @@ class FileFormatter(object): return property(doc=doc, fget=fget) - @apply - def name_in_color(): + @dynamic_property + def name_in_color(self): doc=""" The name in ANSI text. 
Directories are blue, ebooks are green """ def fget(self): cname = self.name @@ -75,22 +75,22 @@ class FileFormatter(object): return cname return property(doc=doc, fget=fget) - @apply - def human_readable_size(): + @dynamic_property + def human_readable_size(self): doc=""" File size in human readable form """ def fget(self): return human_readable(self.size) return property(doc=doc, fget=fget) - @apply - def modification_time(): + @dynamic_property + def modification_time(self): doc=""" Last modified time in the Linux ls -l format """ def fget(self): return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.wtime)) return property(doc=doc, fget=fget) - @apply - def creation_time(): + @dynamic_property + def creation_time(self): doc=""" Last modified time in the Linux ls -l format """ def fget(self): return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.ctime)) @@ -334,4 +334,4 @@ def main(): return 0 if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/src/calibre/devices/prs500/prstypes.py b/src/calibre/devices/prs500/prstypes.py index 4e1294fc1c..3efbfcab31 100755 --- a/src/calibre/devices/prs500/prstypes.py +++ b/src/calibre/devices/prs500/prstypes.py @@ -284,8 +284,8 @@ class Command(TransferBuffer): # Length of the data part of this packet length = field(start=12, fmt=DWORD) - @apply - def data(): + @dynamic_property + def data(self): doc = \ """ The data part of this command. Returned/set as/by a TransferBuffer. @@ -447,8 +447,8 @@ class LongCommand(Command): self.length = 16 self.command = command - @apply - def command(): + @dynamic_property + def command(self): doc = \ """ Usually carries extra information needed for the command @@ -568,8 +568,8 @@ class FileOpen(PathCommand): PathCommand.__init__(self, path, FileOpen.NUMBER, path_len_at_byte=20) self.mode = mode - @apply - def mode(): + @dynamic_property + def mode(self): doc = \ """ The file open mode. 
Is either L{FileOpen.READ} @@ -651,8 +651,8 @@ class Response(Command): raise PacketError("Response packets must have their number set to " \ + hex(0x00001000)) - @apply - def data(): + @dynamic_property + def data(self): doc = \ """ The last 3 DWORDs (12 bytes) of data in this @@ -681,43 +681,43 @@ class ListResponse(Response): PATH_NOT_FOUND = 0xffffffd7 #: Queried path is not found PERMISSION_DENIED = 0xffffffd6 #: Permission denied - @apply - def is_file(): + @dynamic_property + def is_file(self): doc = """ True iff queried path is a file """ def fget(self): return self.code == ListResponse.IS_FILE return property(doc=doc, fget=fget) - @apply - def is_invalid(): + @dynamic_property + def is_invalid(self): doc = """ True iff queried path is invalid """ def fget(self): return self.code == ListResponse.IS_INVALID return property(doc=doc, fget=fget) - @apply - def path_not_found(): + @dynamic_property + def path_not_found(self): doc = """ True iff queried path is not found """ def fget(self): return self.code == ListResponse.PATH_NOT_FOUND return property(doc=doc, fget=fget) - @apply - def permission_denied(): + @dynamic_property + def permission_denied(self): doc = """ True iff permission is denied for path operations """ def fget(self): return self.code == ListResponse.PERMISSION_DENIED return property(doc=doc, fget=fget) - @apply - def is_unmounted(): + @dynamic_property + def is_unmounted(self): doc = """ True iff queried path is unmounted (i.e. 
removed storage card) """ def fget(self): return self.code == ListResponse.IS_UNMOUNTED return property(doc=doc, fget=fget) - @apply - def is_eol(): + @dynamic_property + def is_eol(self): doc = """ True iff there are no more items in the list """ def fget(self): return self.code == ListResponse.IS_EOL @@ -759,8 +759,8 @@ class FileProperties(Answer): # 0 = default permissions, 4 = read only permissions = field(start=36, fmt=DWORD) - @apply - def is_dir(): + @dynamic_property + def is_dir(self): doc = """True if path points to a directory, False if it points to a file.""" def fget(self): @@ -776,8 +776,8 @@ class FileProperties(Answer): return property(doc=doc, fget=fget, fset=fset) - @apply - def is_readonly(): + @dynamic_property + def is_readonly(self): doc = """ Whether this file is readonly.""" def fget(self): @@ -801,8 +801,8 @@ class IdAnswer(Answer): """ Defines the structure of packets that contain identifiers for queries. """ - @apply - def id(): + @dynamic_property + def id(self): doc = \ """ The identifier. C{unsigned int} stored in 4 bytes @@ -841,8 +841,8 @@ class ListAnswer(Answer): name_length = field(start=20, fmt=DWORD) name = stringfield(name_length, start=24) - @apply - def is_dir(): + @dynamic_property + def is_dir(self): doc = \ """ True if list item points to a directory, False if it points to a file. @@ -859,4 +859,3 @@ class ListAnswer(Answer): return property(doc=doc, fget=fget, fset=fset) - diff --git a/src/calibre/devices/prs505/books.py b/src/calibre/devices/prs505/books.py index b63b089fdd..53ab374613 100644 --- a/src/calibre/devices/prs505/books.py +++ b/src/calibre/devices/prs505/books.py @@ -64,8 +64,8 @@ class Book(object): # When setting this attribute you must use an epoch datetime = book_metadata_field("date", formatter=strptime, setter=strftime) - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. 
If absent, title is returned''' def fget(self): src = self.elem.getAttribute('titleSorter').strip() @@ -76,8 +76,8 @@ class Book(object): self.elem.setAttribute('titleSorter', sortable_title(unicode(val))) return property(doc=doc, fget=fget, fset=fset) - @apply - def thumbnail(): + @dynamic_property + def thumbnail(self): doc = \ """ The thumbnail. Should be a height 68 image. @@ -99,15 +99,15 @@ class Book(object): return decode(rc) return property(fget=fget, doc=doc) - @apply - def path(): + @dynamic_property + def path(self): doc = """ Absolute path to book on device. Setting not supported. """ def fget(self): return self.mountpath + self.rpath return property(fget=fget, doc=doc) - @apply - def db_id(): + @dynamic_property + def db_id(self): doc = '''The database id in the application database that this file corresponds to''' def fget(self): match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0]) @@ -412,4 +412,4 @@ def fix_ids(main, card): regen_ids(main) regen_ids(card) - main.set_next_id(str(main.max_id()+1)) + main.set_next_id(str(main.max_id()+1)) \ No newline at end of file diff --git a/src/calibre/devices/usbms/books.py b/src/calibre/devices/usbms/books.py index fffed41549..2875c04b88 100644 --- a/src/calibre/devices/usbms/books.py +++ b/src/calibre/devices/usbms/books.py @@ -21,15 +21,15 @@ class Book(object): def __eq__(self, other): return self.path == other.path - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. 
If absent, title is returned''' def fget(self): return re.sub('^\s*A\s+|^\s*The\s+|^\s*An\s+', '', self.title).rstrip() return property(doc=doc, fget=fget) - @apply - def thumbnail(): + @dynamic_property + def thumbnail(self): return None def __str__(self): @@ -44,4 +44,3 @@ class BookList(_BookList): def set_tags(self, book, tags): pass - diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 9a273c42ce..5e87351375 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -31,8 +31,8 @@ from cssutils import CSSParser class HTMLElement(HtmlElement): - @apply - def specified_font_size(): + @dynamic_property + def specified_font_size(self): def fget(self): ans = self.get('specified_font_size', '') @@ -47,8 +47,8 @@ class HTMLElement(HtmlElement): return property(fget=fget, fset=fset) - @apply - def computed_font_size(): + @dynamic_property + def computed_font_size(self): def fget(self): ans = self.get('computed_font_size', '') if ans == '': @@ -1148,4 +1148,4 @@ output = %s if __name__ == '__main__': - sys.exit(main()) + sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/tags.py b/src/calibre/ebooks/lrf/tags.py index c8ef312ae3..17db193e1a 100644 --- a/src/calibre/ebooks/lrf/tags.py +++ b/src/calibre/ebooks/lrf/tags.py @@ -207,32 +207,32 @@ class Tag(object): s += " at %08X, contents: %s" % (self.offset, repr(self.contents)) return s - @apply - def byte(): + @dynamic_property + def byte(self): def fget(self): if len(self.contents) != 1: raise LRFParseError("Bad parameter for tag ID: %04X" % self.id) return struct.unpack(" Date: Sat, 7 Feb 2009 10:03:00 -0500 Subject: [PATCH 006/319] Refactor OPF de-serialization into OEBReader. 
--- src/calibre/ebooks/oeb/base.py | 644 ++++-------------------------- src/calibre/ebooks/oeb/factory.py | 20 + src/calibre/ebooks/oeb/reader.py | 535 +++++++++++++++++++++++++ src/calibre/ebooks/oeb/writer.py | 107 +++++ 4 files changed, 742 insertions(+), 564 deletions(-) create mode 100644 src/calibre/ebooks/oeb/factory.py create mode 100644 src/calibre/ebooks/oeb/reader.py create mode 100644 src/calibre/ebooks/oeb/writer.py diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 80d4797905..8eb73935a5 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -6,22 +6,18 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import os, sys, re, uuid, copy -from mimetypes import types_map, guess_type +import os, sys, re, uuid +from mimetypes import types_map from collections import defaultdict -from types import StringTypes -from itertools import izip, count, chain +from itertools import count from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote from lxml import etree, html import calibre from calibre import LoggingInterface from calibre.translations.dynamic import translate -from calibre.startup import get_lang from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS -from calibre.ebooks.metadata.epub import CoverRenderer -from calibre.ptempfile import TemporaryDirectory XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' @@ -39,14 +35,13 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' -XPNSMAP = { - 'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, - 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, - 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, - 'svg': SVG_NS, 'xl' : XLINK_NS - } 
-DC_PREFIXES = ('d11', 'd10', 'd09') - +XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, + 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, + 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, + 'svg': SVG_NS, 'xl' : XLINK_NS} +OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} +OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, + 'xsi': XSI_NS, 'calibre': CALIBRE_NS} def XML(name): return '{%s}%s' % (XML_NS, name) @@ -105,7 +100,8 @@ SVG_MIME = types_map['.svg'] BINARY_MIME = 'application/octet-stream' OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) -OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) +OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, + 'text/x-oeb-document']) OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME]) OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) @@ -167,8 +163,9 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def xml2str(root): - return etree.tostring(root, encoding='utf-8', xml_declaration=True) +def xml2str(root, pretty_print=False): + return etree.tostring(root, encoding='utf-8', xml_declaration=True, + pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) @@ -213,16 +210,38 @@ class Logger(LoggingInterface, object): return object.__getattribute__(self, 'log_' + name) -class AbstractContainer(object): - def read_xml(self, path): - return etree.fromstring( - self.read(path), base_url=os.path.dirname(path)) +class NullContainer(object): + def read(self, path): + raise OEBError('Attempt to read from NullContainer') -class DirContainer(AbstractContainer): - def __init__(self, rootdir): - self.rootdir = unicode(rootdir) + def write(self, path): + raise OEBError('Attempt to write to NullContainer') + + def exists(self, path): + return False + + def namelist(self): + return [] + +class DirContainer(object): + def __init__(self, path): + path = 
unicode(path) + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = os.path.basename(path) + self.rootdir = os.path.dirname(path) + return + self.rootdir = path + for path in self.namelist(): + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = fname + return + self.opfname = None def read(self, path): + if path is None: + path = self.opfname path = os.path.join(self.rootdir, path) with open(urlunquote(path), 'rb') as f: return f.read() @@ -239,33 +258,14 @@ class DirContainer(AbstractContainer): path = os.path.join(self.rootdir, path) return os.path.isfile(urlunquote(path)) -class DirWriter(object): - def __init__(self, version='2.0', page_map=False): - self.version = version - self.page_map = page_map - - def dump(self, oeb, path): - version = int(self.version[0]) - opfname = None - if os.path.splitext(path)[1].lower() == '.opf': - opfname = os.path.basename(path) - path = os.path.dirname(path) - if not os.path.isdir(path): - os.mkdir(path) - output = DirContainer(path) - for item in oeb.manifest.values(): - output.write(item.href, str(item)) - if version == 1: - metadata = oeb.to_opf1() - elif version == 2: - metadata = oeb.to_opf2(page_map=self.page_map) - else: - raise OEBError("Unrecognized OPF version %r" % self.version) - for mime, (href, data) in metadata.items(): - if opfname and mime == OPF_MIME: - href = opfname - output.write(href, xml2str(data)) - return + def namelist(self): + names = [] + for root, dirs, files in os.walk(self.rootdir): + for fname in files: + fname = os.path.join(root, fname) + fname = fname.replace('\\', '/') + names.append(fname) + return names class Metadata(object): @@ -279,9 +279,6 @@ class Metadata(object): OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 'scheme': OPF('scheme'), 'event': OPF('event'), 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} - OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} - OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': 
DCTERMS_NS, - 'xsi': XSI_NS, 'calibre': CALIBRE_NS} class Item(object): @@ -337,18 +334,20 @@ class Metadata(object): if attr != nsattr: attrib[nsattr] = attrib.pop(attr) - scheme = Attribute(lambda term : 'scheme' if term == OPF('meta') else OPF('scheme'), + scheme = Attribute(lambda term: 'scheme' if \ + term == OPF('meta') else OPF('scheme'), [DC('identifier'), OPF('meta')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) id = Attribute('id') - type = Attribute(XSI('type'), [DC('date'), DC('format'), DC('type')]) + type = Attribute(XSI('type'), [DC('date'), DC('format'), + DC('type')]) lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'), - DC('creator'), DC('publisher'), - DC('relation'), DC('rights'), - DC('source'), DC('subject'), - OPF('meta')]) + DC('creator'), DC('publisher'), + DC('relation'), DC('rights'), + DC('source'), DC('subject'), + OPF('meta')]) def __getitem__(self, key): return self.attrib[key] @@ -445,21 +444,19 @@ class Metadata(object): return nsmap return property(fget=fget) - @apply def _opf2_nsmap(): def fget(self): nsmap = self._nsmap - nsmap.update(self.OPF2_NSMAP) + nsmap.update(OPF2_NSMAP) return nsmap return property(fget=fget) - def to_opf1(self, parent=None): nsmap = self._opf1_nsmap nsrmap = dict((value, key) for key, value in nsmap.items()) elem = element(parent, 'metadata', nsmap=nsmap) - dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP) + dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP) xmeta = element(elem, 'x-metadata') for term in self.items: for item in self.items[term]: @@ -608,7 +605,7 @@ class Manifest(object): def __str__(self): data = self.data if isinstance(data, etree._Element): - return xml2str(data) + return xml2str(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data.encode('utf-8') return str(data) @@ -756,7 +753,7 @@ 
class Spine(object): self.items = [] def _linear(self, linear): - if isinstance(linear, StringTypes): + if isinstance(linear, basestring): linear = linear.lower() if linear is None or linear in ('yes', 'true'): linear = True @@ -838,7 +835,7 @@ class Guide(object): ('text', __('Main Text'))] TYPES = set(t for t, _ in _TYPES_TITLES) TITLES = dict(_TYPES_TITLES) - ORDER = dict((t, i) for (t, _), i in izip(_TYPES_TITLES, count(0))) + ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES)) def __init__(self, oeb, type, title, href): self.oeb = oeb @@ -1044,493 +1041,25 @@ class PageList(object): class OEBBook(object): - COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') - COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') - - def __init__(self, opfpath=None, container=None, encoding=None, - logger=FauxLogger()): - if opfpath and not container: - container = DirContainer(os.path.dirname(opfpath)) - opfpath = os.path.basename(opfpath) - self.container = container + def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): self.encoding = encoding + self.pretty_print = pretty_print self.logger = logger - if opfpath or container: - opf = self._read_opf(opfpath) - self._all_from_opf(opf) - - def _clean_opf(self, opf): - nsmap = {} - for elem in opf.iter(tag=etree.Element): - nsmap.update(elem.nsmap) - for elem in opf.iter(tag=etree.Element): - if namespace(elem.tag) in ('', OPF1_NS): - elem.tag = OPF(barename(elem.tag)) - nsmap.update(Metadata.OPF2_NSMAP) - attrib = dict(opf.attrib) - nroot = etree.Element(OPF('package'), - nsmap={None: OPF2_NS}, attrib=attrib) - metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) - ignored = (OPF('dc-metadata'), OPF('x-metadata')) - for elem in xpath(opf, 'o2:metadata//*'): - if elem.tag in ignored: - continue - if namespace(elem.tag) in DC_NSES: - tag = barename(elem.tag).lower() - elem.tag = '{%s}%s' % (DC11_NS, tag) - metadata.append(elem) - for element in xpath(opf, 
'o2:metadata//o2:meta'): - metadata.append(element) - for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): - for element in xpath(opf, tag): - nroot.append(element) - return nroot - - def _read_opf(self, opfpath): - data = self.container.read(opfpath) - data = self.decode(data) - data = XMLDECL_RE.sub('', data) - try: - opf = etree.fromstring(data) - except etree.XMLSyntaxError: - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = ENTITY_RE.sub(repl, data) - opf = etree.fromstring(data) - self.logger.warn('OPF contains invalid HTML named entities') - ns = namespace(opf.tag) - if ns not in ('', OPF1_NS, OPF2_NS): - raise OEBError('Invalid namespace %r for OPF document' % ns) - opf = self._clean_opf(opf) - return opf - - def _metadata_from_opf(self, opf): - uid = opf.get('unique-identifier', None) + self.version = '2.0' + self.container = NullContainer() + self.metadata = Metadata(self) self.uid = None - self.metadata = metadata = Metadata(self) - for elem in xpath(opf, '/o2:package/o2:metadata//*'): - term = elem.tag - value = elem.text - attrib = dict(elem.attrib) - nsmap = elem.nsmap - if term == OPF('meta'): - term = qname(attrib.pop('name', None), nsmap) - value = attrib.pop('content', None) - if value: - value = COLLAPSE_RE.sub(' ', value.strip()) - if term and (value or attrib): - metadata.add(term, value, attrib, nsmap=nsmap) - haveuuid = haveid = False - for ident in metadata.identifier: - if unicode(ident).startswith('urn:uuid:'): - haveuuid = True - if 'id' in ident.attrib: - haveid = True - if not (haveuuid and haveid): - bookid = "urn:uuid:%s" % str(uuid.uuid4()) - metadata.add('identifier', bookid, id='calibre-uuid') - if uid is None: - self.logger.warn(u'Unique-identifier not specified') - for item in metadata.identifier: - if not item.id: - continue - if uid is None or item.id == uid: - self.uid = item - break - else: - self.logger.warn(u'Unique-identifier %r not found' % uid) - for ident in metadata.identifier: - if 'id' in 
ident.attrib: - self.uid = metadata.identifier[0] - break - if not metadata.language: - self.logger.warn(u'Language not specified') - metadata.add('language', get_lang()) - if not metadata.creator: - self.logger.warn('Creator not specified') - metadata.add('creator', self.translate(__('Unknown'))) - if not metadata.title: - self.logger.warn('Title not specified') - metadata.add('title', self.translate(__('Unknown'))) - - def _manifest_add_missing(self): - manifest = self.manifest - known = set(manifest.hrefs) - unchecked = set(manifest.values()) - while unchecked: - new = set() - for item in unchecked: - if (item.media_type in OEB_DOCS or - item.media_type[-4:] in ('/xml', '+xml')) and \ - item.data is not None: - hrefs = [sel(item.data) for sel in LINK_SELECTORS] - for href in chain(*hrefs): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data): - href, _ = urldefrag(match.group('url')) - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - unchecked.clear() - for href in new: - known.add(href) - if not self.container.exists(href): - self.logger.warn('Referenced file %r not found' % href) - continue - self.logger.warn('Referenced file %r not in manifest' % href) - id, _ = manifest.generate(id='added') - guessed = guess_type(href)[0] - media_type = guessed or BINARY_MIME - added = manifest.add(id, href, media_type) - unchecked.add(added) - - def _manifest_from_opf(self, opf): - self.manifest = manifest = Manifest(self) - for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): - id = elem.get('id') - href = elem.get('href') - media_type = elem.get('media-type', None) - if media_type is None: - media_type = elem.get('mediatype', None) - if media_type is None or media_type == 
'text/xml': - guessed = guess_type(href)[0] - media_type = guessed or media_type or BINARY_MIME - fallback = elem.get('fallback') - if href in manifest.hrefs: - self.logger.warn(u'Duplicate manifest entry for %r' % href) - continue - if not self.container.exists(href): - self.logger.warn(u'Manifest item %r not found' % href) - continue - if id in manifest.ids: - self.logger.warn(u'Duplicate manifest id %r' % id) - id, href = manifest.generate(id, href) - manifest.add(id, href, media_type, fallback) - self._manifest_add_missing() - - def _spine_add_extra(self): - manifest = self.manifest - spine = self.spine - unchecked = set(spine) - selector = XPath('h:body//h:a/@href') - extras = set() - while unchecked: - new = set() - for item in unchecked: - if item.media_type not in OEB_DOCS: - # TODO: handle fallback chains - continue - for href in selector(item.data): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - if href not in manifest.hrefs: - continue - found = manifest.hrefs[href] - if found.media_type not in OEB_DOCS or \ - found in spine or found in extras: - continue - new.add(found) - extras.update(new) - unchecked = new - version = int(self.version[0]) - for item in sorted(extras): - if version >= 2: - self.logger.warn( - 'Spine-referenced file %r not in spine' % item.href) - spine.add(item, linear=False) - - def _spine_from_opf(self, opf): - self.spine = spine = Spine(self) - for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): - idref = elem.get('idref') - if idref not in self.manifest: - self.logger.warn(u'Spine item %r not found' % idref) - continue - item = self.manifest[idref] - spine.add(item, elem.get('linear')) - if len(spine) == 0: - raise OEBError("Spine is empty") - self._spine_add_extra() - - def _guide_from_opf(self, opf): - self.guide = guide = Guide(self) - for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): - href = elem.get('href') - path = urldefrag(href)[0] - if path not in 
self.manifest.hrefs: - self.logger.warn(u'Guide reference %r not found' % href) - continue - guide.add(elem.get('type'), elem.get('title'), href) - - def _find_ncx(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@toc') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if item.media_type == NCX_MIME: - self.manifest.remove(item) - return item - return None - - def _toc_from_navpoint(self, item, toc, navpoint): - children = xpath(navpoint, 'ncx:navPoint') - for child in children: - title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - href = xpath(child, 'ncx:content/@src') - if not title or not href: - continue - href = item.abshref(urlnormalize(href[0])) - path, _ = urldefrag(href) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) - continue - id = child.get('id') - klass = child.get('class') - node = toc.add(title, href, id=id, klass=klass) - self._toc_from_navpoint(item, node, child) - - def _toc_from_ncx(self, item): - if item is None: - return False - ncx = item.data - title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - title = title or unicode(self.metadata.title[0]) - self.toc = toc = TOC(title) - navmaps = xpath(ncx, 'ncx:navMap') - for navmap in navmaps: - self._toc_from_navpoint(item, toc, navmap) - return True - - def _toc_from_tour(self, opf): - result = xpath(opf, 'o2:tours/o2:tour') - if not result: - return False - tour = result[0] - self.toc = toc = TOC(tour.get('title')) - sites = xpath(tour, 'o2:site') - for site in sites: - title = site.get('title') - href = site.get('href') - if not title or not href: - continue - path, _ = urldefrag(urlnormalize(href)) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) 
- continue - id = site.get('id') - toc.add(title, href, id=id) - return True - - def _toc_from_html(self, opf): - if 'toc' not in self.guide: - return False - self.toc = toc = TOC() - itempath, frag = urldefrag(self.guide['toc'].href) - item = self.manifest.hrefs[itempath] - html = item.data - if frag: - elems = xpath(html, './/*[@id="%s"]' % frag) - if not elems: - elems = xpath(html, './/*[@name="%s"]' % frag) - elem = elems[0] if elems else html - while elem != html and not xpath(elem, './/h:a[@href]'): - elem = elem.getparent() - html = elem - titles = defaultdict(list) - order = [] - for anchor in xpath(html, './/h:a[@href]'): - href = anchor.attrib['href'] - href = item.abshref(urlnormalize(href)) - path, frag = urldefrag(href) - if path not in self.manifest.hrefs: - continue - title = ' '.join(xpath(anchor, './/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if href not in titles: - order.append(href) - titles[href].append(title) - for href in order: - toc.add(' '.join(titles[href]), href) - return True - - def _toc_from_spine(self, opf): - self.toc = toc = TOC() - titles = [] - headers = [] - for item in self.spine: - if not item.linear: continue - html = item.data - title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if title: - titles.append(title) - headers.append('(unlabled)') - for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): - expr = '/h:html/h:body//h:%s[position()=1]/text()' - header = ''.join(xpath(html, expr % tag)) - header = COLLAPSE_RE.sub(' ', header.strip()) - if header: - headers[-1] = header - break - use = titles - if len(titles) > len(set(titles)): - use = headers - for title, item in izip(use, self.spine): - if not item.linear: continue - toc.add(title, item.href) - return True - - def _toc_from_opf(self, opf, item): - if self._toc_from_ncx(item): return - if self._toc_from_tour(opf): return - self.logger.warn('No metadata table of contents found') - if 
self._toc_from_html(opf): return - self._toc_from_spine(opf) - - def _pages_from_ncx(self, opf, item): - if item is None: - return False - ncx = item.data - ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') - if not ptargets: - return False - pages = self.pages = PageList() - for ptarget in ptargets: - name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) - name = COLLAPSE_RE.sub(' ', name.strip()) - href = xpath(ptarget, 'ncx:content/@src') - if not href: - continue - href = item.abshref(urlnormalize(href[0])) - id = ptarget.get('id') - type = ptarget.get('type', 'normal') - klass = ptarget.get('class') - pages.add(name, href, type=type, id=id, klass=klass) - return True - - def _find_page_map(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@page-map') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if item.media_type == PAGE_MAP_MIME: - self.manifest.remove(item) - return item - return None - - def _pages_from_page_map(self, opf): - item = self._find_page_map(opf) - if item is None: - return False - pmap = item.data - pages = self.pages = PageList() - for page in xpath(pmap, 'o2:page'): - name = page.get('name', '') - href = page.get('href') - if not href: - continue - name = COLLAPSE_RE.sub(' ', name.strip()) - href = item.abshref(urlnormalize(href)) - type = 'normal' - if not name: - type = 'special' - elif name.lower().strip('ivxlcdm') == '': - type = 'front' - pages.add(name, href, type=type) - return True - - def _pages_from_opf(self, opf, item): - if self._pages_from_ncx(opf, item): return - if self._pages_from_page_map(opf): return + self.manifest = Manifest(self) + self.spine = Spine(self) + self.guide = Guide(self) + self.toc = TOC() self.pages = PageList() - return - - def _cover_from_html(self, hcover): - with TemporaryDirectory('_html_cover') as tdir: - writer = DirWriter() - 
writer.dump(self, tdir) - path = os.path.join(tdir, urlunquote(hcover.href)) - renderer = CoverRenderer(path) - data = renderer.image_data - id, href = self.manifest.generate('cover', 'cover.jpeg') - item = self.manifest.add(id, href, JPEG_MIME, data=data) - return item - - def _locate_cover_image(self): - if self.metadata.cover: - id = str(self.metadata.cover[0]) - item = self.manifest.ids.get(id, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - else: - self.logger.warn('Invalid cover image @id %r' % id) - hcover = self.spine[0] - if 'cover' in self.guide: - href = self.guide['cover'].href - item = self.manifest.hrefs[href] - media_type = item.media_type - if media_type in OEB_IMAGES: - return item - elif media_type in OEB_DOCS: - hcover = item - html = hcover.data - if MS_COVER_TYPE in self.guide: - href = self.guide[MS_COVER_TYPE].href - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - if self.COVER_SVG_XP(html): - svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) - href = os.path.splitext(hcover.href)[0] + '.svg' - id, href = self.manifest.generate(hcover.id, href) - item = self.manifest.add(id, href, SVG_MIME, data=svg) - return item - if self.COVER_OBJECT_XP(html): - object = self.COVER_OBJECT_XP(html)[0] - href = hcover.abshref(object.get('data')) - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - return self._cover_from_html(hcover) - - def _ensure_cover_image(self): - cover = self._locate_cover_image() - if self.metadata.cover: - self.metadata.cover[0].value = cover.id - return - self.metadata.add('cover', cover.id) - - def _all_from_opf(self, opf): - self.version = opf.get('version', '1.2') - self._metadata_from_opf(opf) - self._manifest_from_opf(opf) - self._spine_from_opf(opf) - self._guide_from_opf(opf) - item = self._find_ncx(opf) - self._toc_from_opf(opf, item) - self._pages_from_opf(opf, 
item) - self._ensure_cover_image() + + @classmethod + def generate(cls, opts): + encoding = opts.encoding + pretty_print = opts.pretty_print + return cls(encoding=encoding, pretty_print=pretty_print) def translate(self, text): lang = str(self.metadata.language[0]) @@ -1652,16 +1181,3 @@ class OEBBook(object): spine.attrib['page-map'] = id results[PAGE_MAP_MIME] = (href, self.pages.to_page_map()) return results - - -def main(argv=sys.argv): - for arg in argv[1:]: - oeb = OEBBook(arg) - for name, doc in oeb.to_opf1().values(): - print etree.tostring(doc, pretty_print=True) - for name, doc in oeb.to_opf2(page_map=True).values(): - print etree.tostring(doc, pretty_print=True) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py new file mode 100644 index 0000000000..dcb0942e85 --- /dev/null +++ b/src/calibre/ebooks/oeb/factory.py @@ -0,0 +1,20 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import os +from calibre.ebooks.oeb.base import OEBError +from calibre.ebooks.oeb.reader import OEBReader + +__all__ = ['get_reader'] + +READER_REGISTRY = { + '.opf': OEBReader, + } + +def ReaderFactory(path): + ext = os.path.splitext(path)[1].lower() + if not ext: + return OEBReader + return READER_REGISTRY[ext]() diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py new file mode 100644 index 0000000000..2d22ff0cd2 --- /dev/null +++ b/src/calibre/ebooks/oeb/reader.py @@ -0,0 +1,535 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. 
Vandegrift ' + +import sys, os, uuid, copy +from itertools import izip, chain +from urlparse import urldefrag, urlparse +from urllib import unquote as urlunquote +from mimetypes import guess_type +from collections import defaultdict +from lxml import etree +from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ + DC_NSES, OPF +from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ + PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME +from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ + ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE +from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath +from calibre.ebooks.oeb.base import urlnormalize, xml2str +from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer +from calibre.ebooks.oeb.writer import OEBWriter +from calibre.ebooks.oeb.entitydefs import ENTITYDEFS +from calibre.ebooks.metadata.epub import CoverRenderer +from calibre.startup import get_lang +from calibre.ptempfile import TemporaryDirectory + +__all__ = ['OEBReader'] + +class OEBReader(object): + + COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') + COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') + + Container = DirContainer + + DEFAULT_PROFILE = 'PRS505' + + def __call__(self, oeb, path): + self.oeb = oeb + self.logger = oeb.logger + oeb.container = self.Container(path) + opf = self._read_opf() + self._all_from_opf(opf) + return oeb + + def _clean_opf(self, opf): + nsmap = {} + for elem in opf.iter(tag=etree.Element): + nsmap.update(elem.nsmap) + for elem in opf.iter(tag=etree.Element): + if namespace(elem.tag) in ('', OPF1_NS): + elem.tag = OPF(barename(elem.tag)) + nsmap.update(OPF2_NSMAP) + attrib = dict(opf.attrib) + nroot = etree.Element(OPF('package'), + nsmap={None: OPF2_NS}, attrib=attrib) + metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) + ignored = (OPF('dc-metadata'), OPF('x-metadata')) + for elem in xpath(opf, 
'o2:metadata//*'): + if elem.tag in ignored: + continue + if namespace(elem.tag) in DC_NSES: + tag = barename(elem.tag).lower() + elem.tag = '{%s}%s' % (DC11_NS, tag) + metadata.append(elem) + for element in xpath(opf, 'o2:metadata//o2:meta'): + metadata.append(element) + for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): + for element in xpath(opf, tag): + nroot.append(element) + return nroot + + def _read_opf(self): + data = self.oeb.container.read(None) + data = self.oeb.decode(data) + data = XMLDECL_RE.sub('', data) + try: + opf = etree.fromstring(data) + except etree.XMLSyntaxError: + repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) + data = ENTITY_RE.sub(repl, data) + opf = etree.fromstring(data) + self.logger.warn('OPF contains invalid HTML named entities') + ns = namespace(opf.tag) + if ns not in ('', OPF1_NS, OPF2_NS): + raise OEBError('Invalid namespace %r for OPF document' % ns) + opf = self._clean_opf(opf) + return opf + + def _metadata_from_opf(self, opf): + uid = opf.get('unique-identifier', None) + self.oeb.uid = None + metadata = self.oeb.metadata + for elem in xpath(opf, '/o2:package/o2:metadata//*'): + term = elem.tag + value = elem.text + attrib = dict(elem.attrib) + nsmap = elem.nsmap + if term == OPF('meta'): + term = qname(attrib.pop('name', None), nsmap) + value = attrib.pop('content', None) + if value: + value = COLLAPSE_RE.sub(' ', value.strip()) + if term and (value or attrib): + metadata.add(term, value, attrib, nsmap=nsmap) + haveuuid = haveid = False + for ident in metadata.identifier: + if unicode(ident).startswith('urn:uuid:'): + haveuuid = True + if 'id' in ident.attrib: + haveid = True + if not (haveuuid and haveid): + bookid = "urn:uuid:%s" % str(uuid.uuid4()) + metadata.add('identifier', bookid, id='calibre-uuid') + if uid is None: + self.logger.warn(u'Unique-identifier not specified') + for item in metadata.identifier: + if not item.id: + continue + if uid is None or item.id == uid: + self.oeb.uid = item + 
break + else: + self.logger.warn(u'Unique-identifier %r not found' % uid) + for ident in metadata.identifier: + if 'id' in ident.attrib: + self.oeb.uid = metadata.identifier[0] + break + if not metadata.language: + self.logger.warn(u'Language not specified') + metadata.add('language', get_lang()) + if not metadata.creator: + self.logger.warn('Creator not specified') + metadata.add('creator', self.oeb.translate(__('Unknown'))) + if not metadata.title: + self.logger.warn('Title not specified') + metadata.add('title', self.oeb.translate(__('Unknown'))) + + def _manifest_add_missing(self): + manifest = self.oeb.manifest + known = set(manifest.hrefs) + unchecked = set(manifest.values()) + while unchecked: + new = set() + for item in unchecked: + if (item.media_type in OEB_DOCS or + item.media_type[-4:] in ('/xml', '+xml')) and \ + item.data is not None: + hrefs = [sel(item.data) for sel in LINK_SELECTORS] + for href in chain(*hrefs): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + if not scheme and href not in known: + new.add(href) + elif item.media_type in OEB_STYLES: + for match in CSSURL_RE.finditer(item.data): + href, _ = urldefrag(match.group('url')) + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + if not scheme and href not in known: + new.add(href) + unchecked.clear() + for href in new: + known.add(href) + if not self.oeb.container.exists(href): + self.logger.warn('Referenced file %r not found' % href) + continue + self.logger.warn('Referenced file %r not in manifest' % href) + id, _ = manifest.generate(id='added') + guessed = guess_type(href)[0] + media_type = guessed or BINARY_MIME + added = manifest.add(id, href, media_type) + unchecked.add(added) + + def _manifest_from_opf(self, opf): + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): + id = elem.get('id') + href = elem.get('href') + media_type = 
elem.get('media-type', None) + if media_type is None: + media_type = elem.get('mediatype', None) + if media_type is None or media_type == 'text/xml': + guessed = guess_type(href)[0] + media_type = guessed or media_type or BINARY_MIME + fallback = elem.get('fallback') + if href in manifest.hrefs: + self.logger.warn(u'Duplicate manifest entry for %r' % href) + continue + if not self.oeb.container.exists(href): + self.logger.warn(u'Manifest item %r not found' % href) + continue + if id in manifest.ids: + self.logger.warn(u'Duplicate manifest id %r' % id) + id, href = manifest.generate(id, href) + manifest.add(id, href, media_type, fallback) + self._manifest_add_missing() + + def _spine_add_extra(self): + manifest = self.oeb.manifest + spine = self.oeb.spine + unchecked = set(spine) + selector = XPath('h:body//h:a/@href') + extras = set() + while unchecked: + new = set() + for item in unchecked: + if item.media_type not in OEB_DOCS: + # TODO: handle fallback chains + continue + for href in selector(item.data): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + if href not in manifest.hrefs: + continue + found = manifest.hrefs[href] + if found.media_type not in OEB_DOCS or \ + found in spine or found in extras: + continue + new.add(found) + extras.update(new) + unchecked = new + version = int(self.oeb.version[0]) + for item in sorted(extras): + if version >= 2: + self.logger.warn( + 'Spine-referenced file %r not in spine' % item.href) + spine.add(item, linear=False) + + def _spine_from_opf(self, opf): + spine = self.oeb.spine + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): + idref = elem.get('idref') + if idref not in manifest.ids: + self.logger.warn(u'Spine item %r not found' % idref) + continue + item = manifest.ids[idref] + spine.add(item, elem.get('linear')) + if len(spine) == 0: + raise OEBError("Spine is empty") + self._spine_add_extra() + + def _guide_from_opf(self, opf): + 
guide = self.oeb.guide + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): + href = elem.get('href') + path = urldefrag(href)[0] + if path not in manifest.hrefs: + self.logger.warn(u'Guide reference %r not found' % href) + continue + guide.add(elem.get('type'), elem.get('title'), href) + + def _find_ncx(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@toc') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == NCX_MIME: + self.oeb.manifest.remove(item) + return item + return None + + def _toc_from_navpoint(self, item, toc, navpoint): + children = xpath(navpoint, 'ncx:navPoint') + for child in children: + title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + href = xpath(child, 'ncx:content/@src') + if not title or not href: + continue + href = item.abshref(urlnormalize(href[0])) + path, _ = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = child.get('id') + klass = child.get('class') + node = toc.add(title, href, id=id, klass=klass) + self._toc_from_navpoint(item, node, child) + + def _toc_from_ncx(self, item): + if item is None: + return False + ncx = item.data + title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + title = title or unicode(self.oeb.metadata.title[0]) + toc = self.oeb.toc + toc.title = title + navmaps = xpath(ncx, 'ncx:navMap') + for navmap in navmaps: + self._toc_from_navpoint(item, toc, navmap) + return True + + def _toc_from_tour(self, opf): + result = xpath(opf, 'o2:tours/o2:tour') + if not result: + return False + tour = result[0] + toc = self.oeb.toc + toc.title = tour.get('title') + sites = xpath(tour, 'o2:site') + for site in 
sites: + title = site.get('title') + href = site.get('href') + if not title or not href: + continue + path, _ = urldefrag(urlnormalize(href)) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = site.get('id') + toc.add(title, href, id=id) + return True + + def _toc_from_html(self, opf): + if 'toc' not in self.oeb.guide: + return False + itempath, frag = urldefrag(self.oeb.guide['toc'].href) + item = self.oeb.manifest.hrefs[itempath] + html = item.data + if frag: + elems = xpath(html, './/*[@id="%s"]' % frag) + if not elems: + elems = xpath(html, './/*[@name="%s"]' % frag) + elem = elems[0] if elems else html + while elem != html and not xpath(elem, './/h:a[@href]'): + elem = elem.getparent() + html = elem + titles = defaultdict(list) + order = [] + for anchor in xpath(html, './/h:a[@href]'): + href = anchor.attrib['href'] + href = item.abshref(urlnormalize(href)) + path, frag = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + continue + title = ' '.join(xpath(anchor, './/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if href not in titles: + order.append(href) + titles[href].append(title) + toc = self.oeb.toc + for href in order: + toc.add(' '.join(titles[href]), href) + return True + + def _toc_from_spine(self, opf): + toc = self.oeb.toc + titles = [] + headers = [] + for item in self.oeb.spine: + if not item.linear: continue + html = item.data + title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if title: + titles.append(title) + headers.append('(unlabled)') + for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): + expr = '/h:html/h:body//h:%s[position()=1]/text()' + header = ''.join(xpath(html, expr % tag)) + header = COLLAPSE_RE.sub(' ', header.strip()) + if header: + headers[-1] = header + break + use = titles + if len(titles) > len(set(titles)): + use = headers + for title, item in izip(use, self.oeb.spine): + if not 
item.linear: continue + toc.add(title, item.href) + return True + + def _toc_from_opf(self, opf, item): + if self._toc_from_ncx(item): return + if self._toc_from_tour(opf): return + self.logger.warn('No metadata table of contents found') + if self._toc_from_html(opf): return + self._toc_from_spine(opf) + + def _pages_from_ncx(self, opf, item): + if item is None: + return False + ncx = item.data + ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') + if not ptargets: + return False + pages = self.oeb.pages + for ptarget in ptargets: + name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) + name = COLLAPSE_RE.sub(' ', name.strip()) + href = xpath(ptarget, 'ncx:content/@src') + if not href: + continue + href = item.abshref(urlnormalize(href[0])) + id = ptarget.get('id') + type = ptarget.get('type', 'normal') + klass = ptarget.get('class') + pages.add(name, href, type=type, id=id, klass=klass) + return True + + def _find_page_map(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@page-map') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == PAGE_MAP_MIME: + self.oeb.manifest.remove(item) + return item + return None + + def _pages_from_page_map(self, opf): + item = self._find_page_map(opf) + if item is None: + return False + pmap = item.data + pages = self.oeb.pages + for page in xpath(pmap, 'o2:page'): + name = page.get('name', '') + href = page.get('href') + if not href: + continue + name = COLLAPSE_RE.sub(' ', name.strip()) + href = item.abshref(urlnormalize(href)) + type = 'normal' + if not name: + type = 'special' + elif name.lower().strip('ivxlcdm') == '': + type = 'front' + pages.add(name, href, type=type) + return True + + def _pages_from_opf(self, opf, item): + if self._pages_from_ncx(opf, item): return + if self._pages_from_page_map(opf): return + return + + def 
_cover_from_html(self, hcover): + with TemporaryDirectory('_html_cover') as tdir: + writer = OEBWriter() + writer(self.oeb, tdir) + path = os.path.join(tdir, urlunquote(hcover.href)) + renderer = CoverRenderer(path) + data = renderer.image_data + id, href = self.oeb.manifest.generate('cover', 'cover.jpeg') + item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) + return item + + def _locate_cover_image(self): + if self.oeb.metadata.cover: + id = str(self.oeb.metadata.cover[0]) + item = self.oeb.manifest.ids.get(id, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + else: + self.logger.warn('Invalid cover image @id %r' % id) + hcover = self.oeb.spine[0] + if 'cover' in self.oeb.guide: + href = self.oeb.guide['cover'].href + item = self.oeb.manifest.hrefs[href] + media_type = item.media_type + if media_type in OEB_IMAGES: + return item + elif media_type in OEB_DOCS: + hcover = item + html = hcover.data + if MS_COVER_TYPE in self.oeb.guide: + href = self.oeb.guide[MS_COVER_TYPE].href + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + if self.COVER_SVG_XP(html): + svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) + href = os.path.splitext(hcover.href)[0] + '.svg' + id, href = self.oeb.manifest.generate(hcover.id, href) + item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg) + return item + if self.COVER_OBJECT_XP(html): + object = self.COVER_OBJECT_XP(html)[0] + href = hcover.abshref(object.get('data')) + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + return self._cover_from_html(hcover) + + def _ensure_cover_image(self): + cover = self._locate_cover_image() + if self.oeb.metadata.cover: + self.oeb.metadata.cover[0].value = cover.id + return + self.oeb.metadata.add('cover', cover.id) + + def _all_from_opf(self, opf): + self.oeb.version = opf.get('version', '1.2') + 
self._metadata_from_opf(opf) + self._manifest_from_opf(opf) + self._spine_from_opf(opf) + self._guide_from_opf(opf) + item = self._find_ncx(opf) + self._toc_from_opf(opf, item) + self._pages_from_opf(opf, item) + self._ensure_cover_image() + + +def main(argv=sys.argv): + reader = OEBReader() + for arg in argv[1:]: + oeb = reader(OEBBook(), arg) + for name, doc in oeb.to_opf1().values(): + print etree.tostring(doc, pretty_print=True) + for name, doc in oeb.to_opf2(page_map=True).values(): + print etree.tostring(doc, pretty_print=True) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py new file mode 100644 index 0000000000..e55db670d6 --- /dev/null +++ b/src/calibre/ebooks/oeb/writer.py @@ -0,0 +1,107 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import sys, os, logging +from calibre.ebooks.oeb.base import OPF_MIME, xml2str +from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook +from calibre.utils.config import Config + +__all__ = ['OEBWriter'] + +class OEBWriter(object): + DEFAULT_PROFILE = 'PRS505' + + def __init__(self, version='2.0', page_map=False, pretty_print=False): + self.version = version + self.page_map = page_map + self.pretty_print = pretty_print + + @classmethod + def config(cls, cfg): + oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.')) + versions = ['1.2', '2.0'] + oeb('opf_version', ['--opf-version'], default='2.0', choices=versions, + help=_('OPF version to generate. 
Default is %default.')) + oeb('adobe_page_map', ['--adobe-page-map'], default=False, + help=_('Generate an Adobe "page-map" file if pagination ' + 'information is avaliable.')) + return cfg + + @classmethod + def generate(cls, opts): + version = opts.opf_version + page_map = opts.adobe_page_map + pretty_print = opts.pretty_print + return cls(version=version, page_map=page_map, + pretty_print=pretty_print) + + def __call__(self, oeb, path): + version = int(self.version[0]) + opfname = None + if os.path.splitext(path)[1].lower() == '.opf': + opfname = os.path.basename(path) + path = os.path.dirname(path) + if not os.path.isdir(path): + os.mkdir(path) + output = DirContainer(path) + for item in oeb.manifest.values(): + output.write(item.href, str(item)) + if version == 1: + metadata = oeb.to_opf1() + elif version == 2: + metadata = oeb.to_opf2(page_map=self.page_map) + else: + raise OEBError("Unrecognized OPF version %r" % self.version) + pretty_print = self.pretty_print + for mime, (href, data) in metadata.items(): + if opfname and mime == OPF_MIME: + href = opfname + output.write(href, xml2str(data, pretty_print=pretty_print)) + return + + +def option_parser(): + cfg = Config('oeb', _('Options to control OEB conversion.')) + OEBWriter.config(cfg) + parser = cfg.option_parser() + parser.add_option('--encoding', default=None, + help=_('Character encoding for files. Default is to auto detect.')) + parser.add_option('-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option('-p', '--pretty-print', action='store_true', + default=False, help=_('Produce more human-readable XML output.')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def any2oeb(opts, inpath): + from calibre.ebooks.oeb.factory import ReaderFactory + logger = Logger(logging.getLogger('any2oeb')) + logger.setup_cli_handler(opts.verbose) + outpath = opts.output + if outpath is None: + outpath = os.path.basename(inpath) + outpath = os.path.splitext(outpath)[0] + encoding = opts.encoding + pretty_print = opts.pretty_print + oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) + reader = ReaderFactory(inpath) + reader(oeb, inpath) + writer = OEBWriter.generate(opts) + writer(oeb, outpath) + return 0 + +def main(argv=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(argv[1:]) + if len(args) != 1: + parser.print_help() + return 1 + inpath = args[0] + retval = any2oeb(opts, inpath) + return retval + +if __name__ == '__main__': + sys.exit(main()) From 9c2a4e36eccbe57528a5167717812be4986e78dc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 8 Feb 2009 20:26:01 -0800 Subject: [PATCH 007/319] IGN:... --- src/calibre/ebooks/metadata/cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/cli.py b/src/calibre/ebooks/metadata/cli.py index 4101f34047..8053b82e90 100644 --- a/src/calibre/ebooks/metadata/cli.py +++ b/src/calibre/ebooks/metadata/cli.py @@ -31,21 +31,21 @@ from calibre import prints def config(): c = StringConfig('') - c.add_opt('title', ['-t', '--title'], + c.add_opt('title', ['-t', '--title'], help=_('Set the title.')) c.add_opt('authors', ['-a', '--authors'], help=_('Set the authors. Multiple authors should be separated ' 'by the & character. 
Author names should be in the order ' 'Firstname Lastname.')) - c.add_opt('title_sort', ['--title-sort'], + c.add_opt('title_sort', ['--title-sort'], help=_('The version of the title to be used for sorting. ' 'If unspecified, and the title is specified, it will ' 'be auto-generated from the title.')) - c.add_opt('author_sort', ['--author-sort'], + c.add_opt('author_sort', ['--author-sort'], help=_('String to be used when sorting by author. ' 'If unspecified, and the author(s) are specified, it will ' 'be auto-generated from the author(s).')) - c.add_opt('cover', ['--cover'], + c.add_opt('cover', ['--cover'], help=_('Set the cover to the specified file.')) c.add_opt('comments', ['-c', '--comments'], help=_('Set the ebook description.')) @@ -195,4 +195,4 @@ def main(args=sys.argv): return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) From e5984c02c7bc7ded3b2afd7aa4ff5e85a167dd03 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Tue, 10 Feb 2009 23:50:35 -0500 Subject: [PATCH 008/319] Document OEBBook. 
--- src/calibre/ebooks/lit/writer.py | 2 +- src/calibre/ebooks/oeb/base.py | 308 +++++++++++++++++++++++++++---- 2 files changed, 269 insertions(+), 41 deletions(-) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 4a059b6433..bebba8938b 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -312,7 +312,7 @@ class LitWriter(object): cover = None if oeb.metadata.cover: id = str(oeb.metadata.cover[0]) - cover = oeb.manifest[id] + cover = oeb.manifest.ids[id] for type, title in ALL_MS_COVER_TYPES: if type not in oeb.guide: oeb.guide.add(type, title, cover.href) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 771a27a81a..ce16fa76e5 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -5,6 +5,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' +__docformat__ = 'restructuredtext en' import os, sys, re, uuid from mimetypes import types_map @@ -175,6 +176,7 @@ URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE] def urlquote(href): + """Quote URL-unsafe characters, allowing IRI-safe characters.""" result = [] unsafe = 0 if isinstance(href, unicode) else 1 unsafe = URL_UNSAFE[unsafe] @@ -185,6 +187,9 @@ def urlquote(href): return ''.join(result) def urlnormalize(href): + """Convert a URL into normalized form, with all and only URL-unsafe + characters URL quoted. 
+ """ parts = urlparse(href) if not parts.scheme: path, frag = urldefrag(href) @@ -196,21 +201,30 @@ def urlnormalize(href): class OEBError(Exception): + """Generic OEB-processing error.""" pass class FauxLogger(object): + """Fake logging interface.""" def __getattr__(self, name): return self def __call__(self, message): print message class Logger(LoggingInterface, object): + """A logging object which provides both the standard `logging.Logger` and + calibre-specific interfaces. + """ def __getattr__(self, name): return object.__getattribute__(self, 'log_' + name) class NullContainer(object): + """An empty container. + + For use with book formats which do not support container-like access. + """ def read(self, path): raise OEBError('Attempt to read from NullContainer') @@ -224,6 +238,8 @@ class NullContainer(object): return [] class DirContainer(object): + """Filesystem directory container.""" + def __init__(self, path): path = unicode(path) ext = os.path.splitext(path)[1].lower() @@ -269,20 +285,38 @@ class DirContainer(object): class Metadata(object): - DC_TERMS = set([ - 'contributor', 'coverage', 'creator', 'date', - 'description', 'format', 'identifier', 'language', - 'publisher', 'relation', 'rights', 'source', 'subject', - 'title', 'type' - ]) + """A collection of OEB data model metadata. + + Provides access to the list of items associated with a particular metadata + term via the term's local name using either Python container or attribute + syntax. Return an empty list for any terms with no currently associated + metadata items. 
+ """ + + DC_TERMS = set(['contributor', 'coverage', 'creator', 'date', + 'description', 'format', 'identifier', 'language', + 'publisher', 'relation', 'rights', 'source', + 'subject', 'title', 'type']) CALIBRE_TERMS = set(['series', 'series_index', 'rating']) OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 'scheme': OPF('scheme'), 'event': OPF('event'), 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} class Item(object): - + """An item of OEB data model metadata. + + The metadata term or name may be accessed via the :attr:`term` or + :attr:`name` attributes. The metadata value or content may be accessed + via the :attr:`value` or :attr:`content` attributes, or via Unicode or + string representations of the object. + + OEB data model metadata attributes may be accessed either via their + fully-qualified names using the Python container access syntax, or via + their local names using Python attribute syntax. Only attributes + allowed by the OPF 2.0 specification are supported. 
+ """ class Attribute(object): + """Smart accessor for allowed OEB metadata item attributes.""" def __init__(self, attr, allowed=None): if not callable(attr): @@ -333,10 +367,24 @@ class Metadata(object): nsattr = 'scheme' if attr != nsattr: attrib[nsattr] = attrib.pop(attr) - + + @dynamic_property + def name(self): + def fget(self): + return self.term + return property(fget=fget) + + @dynamic_property + def content(self): + def fget(self): + return self.value + def fset(self, value): + self.value = value + return property(fget=fget, fset=fset) + scheme = Attribute(lambda term: 'scheme' if \ term == OPF('meta') else OPF('scheme'), - [DC('identifier'), OPF('meta')]) + [DC('identifier'), OPF('meta')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) @@ -405,6 +453,7 @@ class Metadata(object): self.items = defaultdict(list) def add(self, term, value, attrib={}, nsmap={}, **kwargs): + """Add a new metadata item.""" item = self.Item(term, value, attrib, nsmap, **kwargs) items = self.items[barename(item.term)] items.append(item) @@ -477,8 +526,40 @@ class Metadata(object): class Manifest(object): + """Collection of files composing an OEB data model book. + + Provides access to the content of the files composing the book and + attributes associated with those files, including their internal paths, + unique identifiers, and MIME types. + + Itself acts as a :class:`set` of manifest items, and provides the following + instance data member for dictionary-like access: + + :attr:`ids`: A dictionary in which the keys are the unique identifiers of + the manifest items and the values are the items themselves. + :attr:`hrefs`: A dictionary in which the keys are the internal paths of the + manifest items and the values are the items themselves. + """ class Item(object): + """An OEB data model book content file. 
+ + Provides the following data members for accessing the file content and + metadata associated with this particular file. + + :attr:`id`: Unique identifier. + :attr:`href`: Book-internal path. + :attr:`media_type`: MIME type of the file content. + :attr:`fallback`: Unique id of any fallback manifest item associated + with this manifest item. + :attr:`spine_position`: Display/reading order index for book textual + content. `None` for manifest items which are not part of the + book's textual content. + :attr:`linear`: `True` for textual content items which are part of the + primary linear reading order and `False` for textual content items + which are not (such as footnotes). Meaningless for items which + have a :attr:`spine_position` of `None`. + """ NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') @@ -584,6 +665,18 @@ class Manifest(object): @dynamic_property def data(self): + doc = """Provides MIME type sensitive access to the manifest + entry's associated content. + + - XHTML, HTML, and variant content is parsed as necessary to + convert and and return as an lxml.etree element in the XHTML + namespace. + - XML content is parsed and returned as an lxml.etree element. + - CSS and CSS-variant content is parsed and returned as a cssutils + CSS DOM stylesheet. + - All other content is returned as a :class:`str` object with no + special parsing. + """ def fget(self): if self._data is not None: return self._data @@ -600,7 +693,7 @@ class Manifest(object): self._data = value def fdel(self): self._data = None - return property(fget, fset, fdel) + return property(fget, fset, fdel, doc=doc) def __str__(self): data = self.data @@ -631,6 +724,9 @@ class Manifest(object): return cmp(skey, okey) def relhref(self, href): + """Convert the URL provided in :param:`href` from a book-absolute + reference to a reference relative to this manifest item. 
+ """ if urlparse(href).scheme: return href if '/' not in self.href: @@ -649,6 +745,9 @@ class Manifest(object): return relhref def abshref(self, href): + """Convert the URL provided in :param:`href` from a reference + relative to this manifest item to a book-absolute reference. + """ if urlparse(href).scheme: return href path, frag = urldefrag(href) @@ -663,25 +762,46 @@ class Manifest(object): def __init__(self, oeb): self.oeb = oeb + self.items = set() self.ids = {} self.hrefs = {} def add(self, id, href, media_type, fallback=None, loader=None, data=None): + """Add a new item to the book manifest. + + The item's :param:`id`, :param:`href`, and :param:`media_type` are all + required. A :param:`fallback` item-id is required for any items with a + MIME type which is not one of the OPS core media types. Either the + item's data itself may be provided with :param:`data`, or a loader + function for the data may be provided with :param:`loader`, or the + item's data may latter be set manually via the :attr:`data` attribute. + """ item = self.Item( self.oeb, id, href, media_type, fallback, loader, data) + self.items.add(item) self.ids[item.id] = item self.hrefs[item.href] = item return item def remove(self, item): + """Removes :param:`item` from the manifest.""" if item in self.ids: item = self.ids[item] del self.ids[item.id] del self.hrefs[item.href] + self.items.remove(item) if item in self.oeb.spine: self.oeb.spine.remove(item) def generate(self, id=None, href=None): + """Generate a new unique identifier and/or internal path for use in + creating a new manifest item, using the provided :param:`id` and/or + :param:`href` as bases. + + Returns an two-tuple of the new id and path. If either :param:`id` or + :param:`href` are `None` then the corresponding item in the return + tuple will also be `None`. 
+ """ if id is not None: base = id index = 1 @@ -698,26 +818,16 @@ class Manifest(object): return id, href def __iter__(self): - for id in self.ids: - yield id - - def __getitem__(self, id): - return self.ids[id] - - def values(self): - for item in self.ids.values(): + for item in self.items: yield item + values = __iter__ - def items(self): - for id, item in self.ids.items(): - yield id, item - - def __contains__(self, key): - return key in self.ids + def __contains__(self, item): + return item in self.items def to_opf1(self, parent=None): elem = element(parent, 'manifest') - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = OEB_DOC_MIME @@ -732,7 +842,7 @@ class Manifest(object): def to_opf2(self, parent=None): elem = element(parent, OPF('manifest')) - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = XHTML_MIME @@ -747,7 +857,13 @@ class Manifest(object): class Spine(object): - + """Collection of manifest items composing an OEB data model book's main + textual content. + + The spine manages which manifest items compose the book's main textual + content and the sequence in which they appear. Provides Python container + access as a list-like object. 
+ """ def __init__(self, oeb): self.oeb = oeb self.items = [] @@ -762,12 +878,14 @@ class Spine(object): return linear def add(self, item, linear=None): + """Append :param:`item` to the end of the `Spine`.""" item.linear = self._linear(linear) item.spine_position = len(self.items) self.items.append(item) return item def insert(self, index, item, linear): + """Insert :param:`item` at position :param:`index` in the `Spine`.""" item.linear = self._linear(linear) item.spine_position = index self.items.insert(index, item) @@ -776,6 +894,7 @@ class Spine(object): return item def remove(self, item): + """Remove :param:`item` from the `Spine`.""" index = item.spine_position self.items.pop(index) for i in xrange(index, len(self.items)): @@ -813,9 +932,24 @@ class Spine(object): class Guide(object): + """Collection of references to standard frequently-occurring sections + within an OEB data model book. + + Provides dictionary-like access, in which the keys are the OEB reference + type identifiers and the values are `Reference` objects. + """ class Reference(object): - + """Reference to a standard book section. + + Provides the following instance data members: + + :attr:`type`: Reference type identifier, as chosen from the list + allowed in the OPF 2.0 specification. + :attr:`title`: Human-readable section title. + :attr:`href`: Book-internal URL of the referenced section. May include + a fragment identifier. 
+ """ _TYPES_TITLES = [('cover', __('Cover')), ('title-page', __('Title Page')), ('toc', __('Table of Contents')), @@ -867,17 +1001,19 @@ class Guide(object): @dynamic_property def item(self): + doc = """The manifest item associated with this reference.""" def fget(self): path = urldefrag(self.href)[0] hrefs = self.oeb.manifest.hrefs return hrefs.get(path, None) - return property(fget=fget) + return property(fget=fget, doc=doc) def __init__(self, oeb): self.oeb = oeb self.refs = {} def add(self, type, title, href): + """Add a new reference to the `Guide`.""" ref = self.Reference(self.oeb, type, title, href) self.refs[type] = ref return ref @@ -925,8 +1061,19 @@ class Guide(object): return elem +# TODO: This needs beefing up to support the interface of toc.TOC class TOC(object): - # This needs beefing up to support the interface of toc.TOC + """Represents a hierarchical table of contents or navigation tree for + accessing arbitrary semantic sections within an OEB data model book. + + Acts as a node within the navigation tree. Provides list-like access to + sub-nodes. Provides the follow node instance data attributes: + + :attr:`title`: The title of this navigation node. + :attr:`href`: Book-internal URL referenced by this node. + :attr:`klass`: Optional semantic class referenced by this node. + :attr:`id`: Option unique identifier for this node. 
+ """ def __init__(self, title=None, href=None, klass=None, id=None): self.title = title self.href = urlnormalize(href) if href else href @@ -935,17 +1082,26 @@ class TOC(object): self.nodes = [] def add(self, title, href, klass=None, id=None): + """Create and return a new sub-node of this node.""" node = TOC(title, href, klass, id) self.nodes.append(node) return node + def iter(self): + """Iterate over this node and all descendants in depth-first order.""" + yield self + for child in self.nodes: + for node in child.iter(): + yield node + def iterdescendants(self): - for node in self.nodes: - yield node - for child in node.iterdescendants(): - yield child + """Iterate over all descendant nodes in depth-first order.""" + for child in self.nodes: + for node in child.iter(): + yield node def __iter__(self): + """Iterate over all immediate child nodes.""" for node in self.nodes: yield node @@ -953,6 +1109,9 @@ class TOC(object): return self.nodes[index] def autolayer(self): + """Make sequences of children pointing to the same content file into + children of the first node referencing that file. + """ prev = None for node in list(self.nodes): if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]: @@ -961,10 +1120,12 @@ class TOC(object): else: prev = node - def depth(self, level=0): - if self.nodes: - return self.nodes[0].depth(level+1) - return level + def depth(self): + """The maximum depth of the navigation tree rooted at this node.""" + try: + return max(node.depth() for node in self.nodes) + 1 + except ValueError: + return 1 def to_opf1(self, tour): for node in self.nodes: @@ -989,12 +1150,34 @@ class TOC(object): class PageList(object): + """Collection of named "pages" to mapped positions within an OEB data model + book's textual content. + + Provides list-like access to the pages. + """ class Page(object): + """Represents a mapping between a page name and a position within + the book content. 
+ + Provides the following instance data attributes: + + :attr:`name`: The name of this page. Generally a number. + :attr:`href`: Book-internal URL at which point this page begins. + :attr:`type`: Must be one of 'front' (for prefatory pages, as commonly + labeled in print with small-case Roman numerals), 'normal' (for + standard pages, as commonly labeled in print with Arabic numerals), + or 'special' (for other pages, as commonly not labeled in any + fashion in print, such as the cover and title pages). + :attr:`klass`: Optional semantic class of this page. + :attr:`id`: Optional unique identifier for this page. + """ + TYPES = set(['front', 'normal', 'special']) + def __init__(self, name, href, type='normal', klass=None, id=None): - self.name = name + self.name = unicode(name) self.href = urlnormalize(href) - self.type = type + self.type = type if type in self.TYPES else 'normal' self.id = id self.klass = klass @@ -1002,6 +1185,7 @@ class PageList(object): self.pages = [] def add(self, name, href, type='normal', klass=None, id=None): + """Create a new page and add it to the `PageList`.""" page = self.Page(name, href, type, klass, id) self.pages.append(page) return page @@ -1015,6 +1199,12 @@ class PageList(object): def __getitem__(self, index): return self.pages[index] + + def pop(self, index=-1): + return self.pages.pop(index) + + def remove(self, page): + return self.pages.remove(page) def to_ncx(self, parent=None): plist = element(parent, NCX('pageList'), id=str(uuid.uuid4())) @@ -1040,8 +1230,33 @@ class PageList(object): class OEBBook(object): + """Representation of a book in the IDPF OEB data model.""" def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): + """Create empty book. Optional arguments: + + :param:`encoding`: Default encoding for textual content read + from an external container. + :param:`pretty_print`: Whether or not the canonical string form + of XML markup is pretty-printed. 
+ :prama:`logger`: A Logger object to use for logging all messages + related to the processing of this book. It is accessible + via the instance data member :attr:`logger`. + + It provides the following public instance data members for + accessing various parts of the OEB data model: + + :attr:`metadata`: Metadata such as title, author name(s), etc. + :attr:`manifest`: Manifest of all files included in the book, + including MIME types and fallback information. + :attr:`spine`: In-order list of manifest items which compose + the textual content of the book. + :attr:`guide`: Collection of references to standard positions + within the text, such as the cover, preface, etc. + :attr:`toc`: Hierarchical table of contents. + :attr:`pages`: List of "pages," such as indexed to a print edition of + the same text. + """ self.encoding = encoding self.pretty_print = pretty_print self.logger = logger @@ -1057,16 +1272,19 @@ class OEBBook(object): @classmethod def generate(cls, opts): + """Generate an OEBBook instance from command-line options.""" encoding = opts.encoding pretty_print = opts.pretty_print return cls(encoding=encoding, pretty_print=pretty_print) def translate(self, text): + """Translate :param:`text` into the book's primary language.""" lang = str(self.metadata.language[0]) lang = lang.split('-', 1)[0].lower() return translate(lang, text) def decode(self, data): + """Automatically decode :param:`data` into a `unicode` object.""" if isinstance(data, unicode): return data if data[:2] in ('\xff\xfe', '\xfe\xff'): @@ -1089,6 +1307,11 @@ class OEBBook(object): return data def to_opf1(self): + """Produce OPF 1.2 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. 
+ """ package = etree.Element('package', attrib={'unique-identifier': self.uid.id}) self.metadata.to_opf1(package) @@ -1160,6 +1383,11 @@ class OEBBook(object): return ncx def to_opf2(self, page_map=False): + """Produce OPF 2.0 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. + """ results = {} package = etree.Element(OPF('package'), attrib={'version': '2.0', 'unique-identifier': self.uid.id}, From 5dca63111427af5a8caddbff0d96a63b1bc9f5fe Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 11 Feb 2009 10:00:54 -0500 Subject: [PATCH 009/319] Demonstrable modularization of e-book conversion. --- src/calibre/ebooks/lit/reader.py | 1 + src/calibre/ebooks/mobi/mobiml.py | 10 ++- src/calibre/ebooks/mobi/writer.py | 40 +++++++-- src/calibre/ebooks/oeb/base.py | 11 +-- src/calibre/ebooks/oeb/factory.py | 87 +++++++++++++++++-- src/calibre/ebooks/oeb/reader.py | 24 +++++ src/calibre/ebooks/oeb/transforms/flatcss.py | 10 ++- src/calibre/ebooks/oeb/transforms/htmltoc.py | 13 ++- .../ebooks/oeb/transforms/manglecase.py | 10 ++- .../ebooks/oeb/transforms/rasterize.py | 10 ++- .../ebooks/oeb/transforms/trimmanifest.py | 10 ++- src/calibre/ebooks/oeb/writer.py | 57 +++--------- 12 files changed, 210 insertions(+), 73 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index dd42434101..8cbb9514a8 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -802,6 +802,7 @@ class LitFile(object): class LitContainer(object): + """Simple Container-interface, read-only accessor for LIT files.""" def __init__(self, filename_or_stream): self._litfile = LitFile(filename_or_stream) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 7ecd127452..b7418a5d19 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ 
b/src/calibre/ebooks/mobi/mobiml.py @@ -82,7 +82,15 @@ class MobiMLizer(object): def __init__(self, ignore_tables=False): self.ignore_tables = ignore_tables - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb self.profile = profile = context.dest diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 380bdbf518..1b5d3ae652 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -295,6 +295,11 @@ class Serializer(object): class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') + + DEFAULT_PROFILE = 'CybookG3' + + TRANSFORMS = [HTMLTOCAdder, CaseMangler, CSSFlattener, SVGRasterizer, + ManifestTrimmer, MobiMLizer] def __init__(self, compression=None, imagemax=None, prefer_author_sort=False): @@ -302,7 +307,32 @@ class MobiWriter(object): self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort - def dump(self, oeb, path): + @classmethod + def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. + """ + mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.')) + mobi('compress', ['--compress'], default=False, + help=_('Compress file text using PalmDOC compression. 
' + 'Results in smaller files, but takes a long time to run.')) + mobi('rescale_images', ['--rescale-images'], default=False, + help=_('Modify images to meet Palm device size limitations.')) + mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, + help=_('When present, use the author sorting information for ' + 'generating the Mobipocket author metadata.')) + return cfg + + @classmethod + def generate(cls, opts): + """Generate a Writer instance from command-line options.""" + compression = PALMDOC if opts.compress else UNCOMPRESSED + imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None + prefer_author_sort = opts.prefer_author_sort + return cls(compression=compression, imagemax=imagemax, + prefer_author_sort=prefer_author_sort) + + def __call__(self, oeb, path): if hasattr(path, 'write'): return self._dump_stream(oeb, path) with open(path, 'w+b') as stream: @@ -533,20 +563,12 @@ def config(defaults=None): c = StringConfig(defaults, desc) mobi = c.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('compress', ['--compress'], default=False, - help=_('Compress file text using PalmDOC compression. ' - 'Results in smaller files, but takes a long time to run.')) - mobi('rescale_images', ['--rescale-images'], default=False, - help=_('Modify images to meet Palm device size limitations.')) mobi('toc_title', ['--toc-title'], default=None, help=_('Title for any generated in-line table of contents.')) mobi('ignore_tables', ['--ignore-tables'], default=False, help=_('Render HTML tables as blocks of text instead of actual ' 'tables. This is neccessary if the HTML contains very large ' 'or complex tables.')) - mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, - help=_('When present, use the author sorting information for ' - 'generating the Mobipocket author metadata.')) profiles = c.add_group('profiles', _('Device renderer profiles. ' 'Affects conversion of font sizes, image rescaling and rasterization ' 'of tables. 
Valid profiles are: %s.') % ', '.join(_profiles)) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index ce16fa76e5..c9d01b03fe 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -820,8 +820,10 @@ class Manifest(object): def __iter__(self): for item in self.items: yield item - values = __iter__ + def values(self): + return list(self.items) + def __contains__(self, item): return item in self.items @@ -1134,7 +1136,7 @@ class TOC(object): node.to_opf1(tour) return tour - def to_ncx(self, parent, depth=1): + def to_ncx(self, parent): for node in self.nodes: id = node.id or unicode(uuid.uuid4()) attrib = {'id': id, 'playOrder': '0'} @@ -1143,9 +1145,8 @@ class TOC(object): point = element(parent, NCX('navPoint'), attrib=attrib) label = etree.SubElement(point, NCX('navLabel')) element(label, NCX('text')).text = node.title - href = node.href if depth > 1 else urldefrag(node.href)[0] - element(point, NCX('content'), src=href) - node.to_ncx(point, depth+1) + element(point, NCX('content'), src=node.href) + node.to_ncx(point) return parent diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py index 1ce33a4f00..684451044b 100644 --- a/src/calibre/ebooks/oeb/factory.py +++ b/src/calibre/ebooks/oeb/factory.py @@ -6,20 +6,93 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift ' -import os +import sys, os, logging +from itertools import chain from calibre.ebooks.oeb.base import OEBError from calibre.ebooks.oeb.reader import OEBReader +from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.lit.reader import LitReader +from calibre.ebooks.lit.writer import LitWriter +from calibre.ebooks.mobi.reader import MobiReader +from calibre.ebooks.mobi.writer import MobiWriter +from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.profile import Context +from calibre.utils.config import Config __all__ = ['get_reader'] -READER_REGISTRY = { - '.opf': OEBReader, - '.lit': LitReader, +REGISTRY = { + '.opf': (OEBReader, None), + '.lit': (LitReader, LitWriter), + '.mobi': (MobiReader, MobiWriter), } def ReaderFactory(path): - ext = os.path.splitext(path)[1].lower() - if not ext: + if os.path.isdir(path): return OEBReader - return READER_REGISTRY[ext]() + ext = os.path.splitext(path)[1].lower() + Reader = REGISTRY.get(ext, (None, None))[0] + if Reader is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Reader + +def WriterFactory(path): + if os.path.isdir(path): + return OEBWriter + ext = os.path.splitext(path)[1].lower() + if not os.path.exists(path) and not ext: + return OEBWriter + Writer = REGISTRY.get(ext, (None, None))[1] + if Writer is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Writer + + +def option_parser(Reader, Writer): + cfg = Config('ebook-convert', _('Options to control e-book conversion.')) + Reader.config(cfg) + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + Transform.config(cfg) + Writer.config(cfg) + parser = cfg.option_parser() + parser.add_option('--encoding', default=None, + help=_('Character encoding for input. Default is to auto detect.')) + parser.add_option('-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option('-p', '--pretty-print', action='store_true', + default=False, help=_('Produce more human-readable XML output.')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def main(argv=sys.argv): + if len(argv) < 3: + print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]") + return 1 + inpath, outpath = argv[1], argv[2] + Reader = ReaderFactory(inpath) + Writer = WriterFactory(outpath) + parser = option_parser(Reader, Writer) + opts, args = parser.parse_args(argv[3:]) + if len(args) != 0: + parser.print_help() + return 1 + logger = Logger(logging.getLogger('ebook-convert')) + logger.setup_cli_handler(opts.verbose) + encoding = opts.encoding + pretty_print = opts.pretty_print + oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) + context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE) + reader = Reader.generate(opts) + writer = Writer.generate(opts) + transforms = [] + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + transforms.append(Transform.generate(opts)) + reader(oeb, inpath) + for transform in transforms: + transform(oeb, context) + writer(oeb, outpath) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index aa23ce1e96..0fce1c2b0d 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -31,15 +31,39 @@ from calibre.ptempfile import TemporaryDirectory __all__ = ['OEBReader'] class OEBReader(object): + """Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') Container = DirContainer + """Container type used to access book files. 
Override in sub-classes.""" DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content read with this Reader.""" + + TRANSFORMS = [] + """List of transforms to apply to content read with this Reader.""" + + def __init__(self): + return + @classmethod + def config(cls, cfg): + """Add any book-reading options to the :class:`Config` object + :param:`cfg`. + """ + return + + @classmethod + def generate(cls, opts): + """Generate a Reader instance from command-line options.""" + return cls() + def __call__(self, oeb, path): + """Read the book at :param:`path` into the :class:`OEBBook` object + :param:`oeb`. + """ self.oeb = oeb self.logger = oeb.logger oeb.container = self.Container(path) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 01afcb08e2..ac9684a624 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -94,7 +94,15 @@ class CSSFlattener(object): self.unfloat = unfloat self.untable = untable - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Flattening CSS and remapping font sizes...') self.oeb = oeb self.context = context diff --git a/src/calibre/ebooks/oeb/transforms/htmltoc.py b/src/calibre/ebooks/oeb/transforms/htmltoc.py index 5508b58ec3..0040f39c14 100644 --- a/src/calibre/ebooks/oeb/transforms/htmltoc.py +++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py @@ -52,7 +52,18 @@ class HTMLTOCAdder(object): self.title = title self.style = style - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + group = cfg.add_group('htmltoc', _('HTML TOC generation options.')) + group('toc_title', ['--toc-title'], default=None, + help=_('Title for any generated in-line table of contents.')) + return cfg + + @classmethod + def generate(cls, opts): + return 
cls(title=opts.toc_title) + + def __call__(self, oeb, context): if 'toc' in oeb.guide: return oeb.logger.info('Generating in-line TOC...') diff --git a/src/calibre/ebooks/oeb/transforms/manglecase.py b/src/calibre/ebooks/oeb/transforms/manglecase.py index 3a3d91364f..c819475a4d 100644 --- a/src/calibre/ebooks/oeb/transforms/manglecase.py +++ b/src/calibre/ebooks/oeb/transforms/manglecase.py @@ -29,7 +29,15 @@ CASE_MANGLER_CSS = """ TEXT_TRANSFORMS = set(['capitalize', 'uppercase', 'lowercase']) class CaseMangler(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Applying case-transforming CSS...') self.oeb = oeb self.profile = context.source diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 12a2812898..aef5c2c98b 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -34,7 +34,15 @@ class SVGRasterizer(object): if QApplication.instance() is None: QApplication([]) - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Rasterizing SVG images...') self.oeb = oeb self.profile = context.dest diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index a1d28e5a99..a5e7042617 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -17,7 +17,15 @@ from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE from calibre.ebooks.oeb.base import urlnormalize class ManifestTrimmer(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + 
return cls() + + def __call__(self, oeb, context): oeb.logger.info('Trimming unused files from manifest...') used = set() hrefs = oeb.manifest.hrefs diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index c84db30c98..235965b50f 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -9,13 +9,16 @@ __copyright__ = '2008, Marshall T. Vandegrift ' import sys, os, logging from calibre.ebooks.oeb.base import OPF_MIME, xml2str from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook -from calibre.utils.config import Config __all__ = ['OEBWriter'] class OEBWriter(object): DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content written with this Writer.""" + TRANSFORMS = [] + """List of transforms to apply to content written with this Writer.""" + def __init__(self, version='2.0', page_map=False, pretty_print=False): self.version = version self.page_map = page_map @@ -23,6 +26,9 @@ class OEBWriter(object): @classmethod def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. + """ oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.')) versions = ['1.2', '2.0'] oeb('opf_version', ['--opf-version'], default='2.0', choices=versions, @@ -34,6 +40,7 @@ class OEBWriter(object): @classmethod def generate(cls, opts): + """Generate a Writer instance from command-line options.""" version = opts.opf_version page_map = opts.adobe_page_map pretty_print = opts.pretty_print @@ -41,6 +48,9 @@ class OEBWriter(object): pretty_print=pretty_print) def __call__(self, oeb, path): + """Read the book in the :class:`OEBBook` object :param:`oeb` to a file + at :param:`path`. 
+ """ version = int(self.version[0]) opfname = None if os.path.splitext(path)[1].lower() == '.opf': @@ -63,48 +73,3 @@ class OEBWriter(object): href = opfname output.write(href, xml2str(data, pretty_print=pretty_print)) return - - -def option_parser(): - cfg = Config('oeb', _('Options to control OEB conversion.')) - OEBWriter.config(cfg) - parser = cfg.option_parser() - parser.add_option('--encoding', default=None, - help=_('Character encoding for files. Default is to auto detect.')) - parser.add_option('-o', '--output', default=None, - help=_('Output file. Default is derived from input filename.')) - parser.add_option('-p', '--pretty-print', action='store_true', - default=False, help=_('Produce more human-readable XML output.')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def any2oeb(opts, inpath): - from calibre.ebooks.oeb.factory import ReaderFactory - logger = Logger(logging.getLogger('any2oeb')) - logger.setup_cli_handler(opts.verbose) - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] - encoding = opts.encoding - pretty_print = opts.pretty_print - oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) - reader = ReaderFactory(inpath) - reader(oeb, inpath) - writer = OEBWriter.generate(opts) - writer(oeb, outpath) - return 0 - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = any2oeb(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) From 459d350af3634a8ca1fbf1498f985c5a96ec325a Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 11 Feb 2009 10:32:08 -0500 Subject: [PATCH 010/319] Pretty much full utility for LIT->MOBI direct conversion pipeline. 
--- src/calibre/ebooks/mobi/mobiml.py | 7 ++++++- src/calibre/ebooks/mobi/writer.py | 26 +++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index b7418a5d19..534366da7d 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -84,11 +84,16 @@ class MobiMLizer(object): @classmethod def config(cls, cfg): + group = cfg.add_group('mobiml', _('Mobipocket markup options.')) + group('ignore_tables', ['--ignore-tables'], default=False, + help=_('Render HTML tables as blocks of text instead of actual ' + 'tables. This is neccessary if the HTML contains very ' + 'large or complex tables.')) return cfg @classmethod def generate(cls, opts): - return cls() + return cls(ignore_tables=opts.ignore_tables) def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 1b5d3ae652..86ac6f6dc9 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -292,13 +292,28 @@ class Serializer(object): buffer.seek(hoff) buffer.write('%010d' % ioff) - + +class MobiFlattener(object): + def config(self, cfg): + return cfg + + def generate(self, opts): + return self + + def __call__(self, oeb, context): + fbase = context.dest.fbase + fkey = context.dest.fnums.values() + flattener = CSSFlattener( + fbase=fbase, fkey=fkey, unfloat=True, untable=True) + return flattener(oeb, context) + + class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') DEFAULT_PROFILE = 'CybookG3' - TRANSFORMS = [HTMLTOCAdder, CaseMangler, CSSFlattener, SVGRasterizer, + TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer, ManifestTrimmer, MobiMLizer] def __init__(self, compression=None, imagemax=None, @@ -562,13 +577,6 @@ def config(defaults=None): else: c = StringConfig(defaults, desc) - mobi = 
c.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('toc_title', ['--toc-title'], default=None, - help=_('Title for any generated in-line table of contents.')) - mobi('ignore_tables', ['--ignore-tables'], default=False, - help=_('Render HTML tables as blocks of text instead of actual ' - 'tables. This is neccessary if the HTML contains very large ' - 'or complex tables.')) profiles = c.add_group('profiles', _('Device renderer profiles. ' 'Affects conversion of font sizes, image rescaling and rasterization ' 'of tables. Valid profiles are: %s.') % ', '.join(_profiles)) From 9ff64bd715de0294f2d10d8f39ef282452a212e5 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 11 Feb 2009 20:19:11 -0500 Subject: [PATCH 011/319] Beginning of PDF conversion --- src/calibre/ebooks/pdf/from_any.py | 69 +++++++++++++ src/calibre/ebooks/pdf/writer.py | 153 +++++++++++++++++++++++++++++ src/calibre/linux.py | 1 + src/calibre/parallel.py | 3 + 4 files changed, 226 insertions(+) create mode 100644 src/calibre/ebooks/pdf/from_any.py create mode 100644 src/calibre/ebooks/pdf/writer.py diff --git a/src/calibre/ebooks/pdf/from_any.py b/src/calibre/ebooks/pdf/from_any.py new file mode 100644 index 0000000000..e4fb937cdb --- /dev/null +++ b/src/calibre/ebooks/pdf/from_any.py @@ -0,0 +1,69 @@ +''' +Convert any ebook format to PDF. +''' + +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \ + 'and Marshall T. 
Vandegrift ' \ + 'and John Schember ' +__docformat__ = 'restructuredtext en' + +import sys, os, glob, logging + +from calibre.ebooks.epub.from_any import any2epub, formats, USAGE +from calibre.ebooks.epub import config as common_config +from calibre.ptempfile import TemporaryDirectory +from calibre.ebooks.pdf.writer import oeb2pdf, config as pdf_config + +def config(defaults=None): + c = common_config(defaults=defaults, name='pdf') + c.remove_opt('profile') + pdfc = pdf_config(defaults=defaults) + c.update(pdfc) + return c + +def option_parser(usage=USAGE): + usage = usage % ('PDF', formats()) + parser = config().option_parser(usage=usage) + return parser + +def any2pdf(opts, path, notification=None): + ext = os.path.splitext(path)[1] + if not ext: + raise ValueError('Unknown file type: '+path) + ext = ext.lower()[1:] + + if opts.output is None: + opts.output = os.path.splitext(os.path.basename(path))[0]+'.pdf' + + opts.output = os.path.abspath(opts.output) + orig_output = opts.output + + with TemporaryDirectory('_any2pdf') as tdir: + oebdir = os.path.join(tdir, 'oeb') + os.mkdir(oebdir) + opts.output = os.path.join(tdir, 'dummy.epub') + opts.profile = 'None' + opts.dont_split_on_page_breaks = True + orig_bfs = opts.base_font_size2 + opts.base_font_size2 = 0 + any2epub(opts, path, create_epub=False, oeb_cover=True, extract_to=oebdir) + opts.base_font_size2 = orig_bfs + opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] + opts.output = orig_output + logging.getLogger('html2epub').info(_('Creating PDF file from EPUB...')) + oeb2pdf(opts, opf) + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + if len(args) < 2: + parser.print_help() + print 'No input file specified.' 
+ return 1 + any2pdf(opts, args[1]) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py new file mode 100644 index 0000000000..d46d1fc0ed --- /dev/null +++ b/src/calibre/ebooks/pdf/writer.py @@ -0,0 +1,153 @@ +''' +Write content to PDF. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' + +import os, logging, shutil, sys +from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.profile import Context +from calibre.ebooks.epub.iterator import SpineItem +from calibre.ebooks.metadata.opf2 import OPF +from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.customize.ui import run_plugins_on_postprocess +from calibre.utils.config import Config, StringConfig + +from PyQt4 import QtCore +from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ + QMetaObject +from PyQt4.Qt import * +from PyQt4.QtWebKit import QWebView + +from pyPdf import PdfFileWriter, PdfFileReader + +class PDFWriter(QObject): + def __init__(self): + if QApplication.instance() is None: + QApplication([]) + QObject.__init__(self) + + self.loop = QEventLoop() + self.view = QWebView() + self.connect(self.view, SIGNAL('loadFinished(bool)'), self._render_html) + self.render_queue = [] + self.combine_queue = [] + self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') + + def dump(self, oeb, oebpath, path): + self._reset() + + opf = OPF(oebpath, os.path.dirname(oebpath)) + self.render_queue = [SpineItem(i.path) for i in opf.spine] + + self.path = path + + QMetaObject.invokeMethod(self, "_render_book", Qt.QueuedConnection) + self.loop.exec_() + + @QtCore.pyqtSignature('_render_book()') + def _render_book(self): + if len(self.render_queue) == 0: + self._write() + else: + self._render_next() + + def _render_next(self): + item = str(self.render_queue.pop(0)) + 
self.combine_queue.append(os.path.join(self.tmp_path, '%s.pdf' % os.path.basename(item))) + + self.view.load(QUrl(item)) + + def _render_html(self, ok): + if ok: + printer = QPrinter(QPrinter.HighResolution) + printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch) + printer.setOutputFormat(QPrinter.PdfFormat) + printer.setOutputFileName(os.path.join(self.tmp_path, '%s.pdf' % os.path.basename(str(self.view.url().toLocalFile())))) + self.view.print_(printer) + self._render_book() + + def _reset(self): + self.render_queue = [] + self.combine_queue = [] + self.path = '' + if os.path.exists(self.tmp_path): + shutil.rmtree(self.tmp_path, True) + self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') + + def _write(self): + print self.path + try: + outPDF = PdfFileWriter() + for item in self.combine_queue: + inputPDF = PdfFileReader(file(item, 'rb')) + for page in inputPDF.pages: + outPDF.addPage(page) + outputStream = file(self.path, 'wb') + outPDF.write(outputStream) + outputStream.close() + finally: + self._reset() + self.loop.exit(0) + + +def config(defaults=None): + desc = _('Options to control the conversion to PDF') + if defaults is None: + c = Config('pdf', desc) + else: + c = StringConfig(defaults, desc) + + pdf = c.add_group('PDF', _('PDF options.')) + + return c + + +def option_parser(): + c = config() + parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') + parser.add_option( + '-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option( + '-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def oeb2pdf(opts, inpath): + logger = Logger(logging.getLogger('oeb2pdf')) + logger.setup_cli_handler(opts.verbose) + outpath = opts.output + if outpath is None: + outpath = os.path.basename(inpath) + outpath = os.path.splitext(outpath)[0] + '.pdf' +# source = opts.source_profile +# if source not in Context.PROFILES: +# logger.error(_('Unknown source profile %r') % source) +# return 1 +# dest = opts.dest_profile +# if dest not in Context.PROFILES: +# logger.error(_('Unknown destination profile %r') % dest) +# return 1 + + oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding) + writer = PDFWriter() + writer.dump(oeb, inpath, outpath) + run_plugins_on_postprocess(outpath, 'pdf') + logger.info(_('Output written to ') + outpath) + +def main(argv=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(argv[1:]) + if len(args) != 1: + parser.print_help() + return 1 + inpath = args[0] + retval = oeb2pdf(opts, inpath) + return retval + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index edcfa99342..81b8424199 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -51,6 +51,7 @@ entry_points = { 'any2epub = calibre.ebooks.epub.from_any:main', 'any2lit = calibre.ebooks.lit.from_any:main', 'any2mobi = calibre.ebooks.mobi.from_any:main', + 'any2pdf = calibre.ebooks.pdf.from_any:main', 'lrf2lrs = calibre.ebooks.lrf.lrfparser:main', 'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main', 'pdfreflow = calibre.ebooks.lrf.pdf.reflow:main', diff --git a/src/calibre/parallel.py b/src/calibre/parallel.py index f9b4513c78..7bdb606642 100644 --- a/src/calibre/parallel.py +++ b/src/calibre/parallel.py @@ -71,6 +71,9 @@ PARALLEL_FUNCS = { 'any2mobi' : ('calibre.ebooks.mobi.from_any', 'any2mobi', {}, None), + 'any2pdf' : + 
('calibre.ebooks.pdf.from_any', 'any2pdf', {}, None), + 'feeds2mobi' : ('calibre.ebooks.mobi.from_feeds', 'main', {}, 'notification'), From 2f3562ca20095b01dd74e9dd6cc139327382b995 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 12 Feb 2009 17:19:38 -0500 Subject: [PATCH 012/319] PDF writer cleanups --- src/calibre/ebooks/pdf/writer.py | 49 +++++++++++++++----------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index d46d1fc0ed..079b972990 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -7,8 +7,8 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' import os, logging, shutil, sys -from calibre.ebooks.oeb.base import Logger, OEBBook -from calibre.ebooks.oeb.profile import Context + +from calibre import LoggingInterface from calibre.ebooks.epub.iterator import SpineItem from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import PersistentTemporaryDirectory @@ -17,8 +17,7 @@ from calibre.utils.config import Config, StringConfig from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ - QMetaObject -from PyQt4.Qt import * + QMetaObject, Qt from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader @@ -29,6 +28,8 @@ class PDFWriter(QObject): QApplication([]) QObject.__init__(self) + self.logger = logging.getLogger('oeb2pdf') + self.loop = QEventLoop() self.view = QWebView() self.connect(self.view, SIGNAL('loadFinished(bool)'), self._render_html) @@ -36,12 +37,12 @@ class PDFWriter(QObject): self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') - def dump(self, oeb, oebpath, path): - self._reset() + def dump(self, oebpath, path): + self._delete_tmpdir() opf = OPF(oebpath, os.path.dirname(oebpath)) self.render_queue = [SpineItem(i.path) for i in opf.spine] - + self.combine_queue = [] self.path = path 
QMetaObject.invokeMethod(self, "_render_book", Qt.QueuedConnection) @@ -57,28 +58,32 @@ class PDFWriter(QObject): def _render_next(self): item = str(self.render_queue.pop(0)) self.combine_queue.append(os.path.join(self.tmp_path, '%s.pdf' % os.path.basename(item))) + + self.logger.info('Processing %s...' % item) self.view.load(QUrl(item)) def _render_html(self, ok): if ok: + item_path = os.path.join(self.tmp_path, '%s.pdf' % os.path.basename(str(self.view.url().toLocalFile()))) + + self.logger.debug('\tRendering item as %s' % item_path) + printer = QPrinter(QPrinter.HighResolution) printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch) printer.setOutputFormat(QPrinter.PdfFormat) - printer.setOutputFileName(os.path.join(self.tmp_path, '%s.pdf' % os.path.basename(str(self.view.url().toLocalFile())))) + printer.setOutputFileName(item_path) self.view.print_(printer) self._render_book() - def _reset(self): - self.render_queue = [] - self.combine_queue = [] - self.path = '' + def _delete_tmpdir(self): if os.path.exists(self.tmp_path): shutil.rmtree(self.tmp_path, True) self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') def _write(self): - print self.path + self.logger.info('Combining individual PDF parts...') + try: outPDF = PdfFileWriter() for item in self.combine_queue: @@ -89,7 +94,7 @@ class PDFWriter(QObject): outPDF.write(outputStream) outputStream.close() finally: - self._reset() + self._delete_tmpdir() self.loop.exit(0) @@ -117,26 +122,18 @@ def option_parser(): return parser def oeb2pdf(opts, inpath): - logger = Logger(logging.getLogger('oeb2pdf')) + logger = LoggingInterface(logging.getLogger('oeb2pdf')) logger.setup_cli_handler(opts.verbose) + outpath = opts.output if outpath is None: outpath = os.path.basename(inpath) outpath = os.path.splitext(outpath)[0] + '.pdf' -# source = opts.source_profile -# if source not in Context.PROFILES: -# logger.error(_('Unknown source profile %r') % source) -# return 1 -# dest = opts.dest_profile -# if dest not in 
Context.PROFILES: -# logger.error(_('Unknown destination profile %r') % dest) -# return 1 - oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding) writer = PDFWriter() - writer.dump(oeb, inpath, outpath) + writer.dump(inpath, outpath) run_plugins_on_postprocess(outpath, 'pdf') - logger.info(_('Output written to ') + outpath) + logger.log_info(_('Output written to ') + outpath) def main(argv=sys.argv): parser = option_parser() From 86ad16de680d524dc966ee6b781055a33238b5c6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 13 Feb 2009 07:38:32 -0500 Subject: [PATCH 013/319] Fix bug 1059: First page of pdf used as cover image --- src/calibre/ebooks/metadata/pdf.py | 100 +++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index ad59351248..8ff652c01b 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -1,16 +1,37 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' '''Read meta information from PDF files''' -import sys, os, re +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal ' + +import sys, os, re, StringIO from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser -from pyPdf import PdfFileReader +from calibre.ptempfile import TemporaryDirectory +from pyPdf import PdfFileReader, PdfFileWriter +import Image +try: + from calibre.utils.PythonMagickWand import \ + NewMagickWand, MagickReadImage, MagickSetImageFormat, MagickWriteImage + _imagemagick_loaded = True +except: + _imagemagick_loaded = False -def get_metadata(stream): +def get_metadata(stream, extract_cover=True): """ Return metadata as a L{MetaInfo} object """ mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) + + if extract_cover and _imagemagick_loaded: + try: + cdata = get_cover(stream) + if cdata is not None: + mi.cover_data = ('jpg', cdata) + 
except: + import traceback + traceback.print_exc() + try: info = PdfFileReader(stream).getDocumentInfo() if info.title: @@ -45,27 +66,68 @@ def set_metadata(stream, mi): stream.write(raw) stream.seek(0) +def get_cover(stream): + try: + pdf = PdfFileReader(stream) + output = PdfFileWriter() + + if len(pdf.pages) >= 1: + output.addPage(pdf.getPage(0)) + + with TemporaryDirectory('_pdfmeta') as tdir: + cover_path = os.path.join(tdir, 'cover.pdf') + + outputStream = file(cover_path, "wb") + output.write(outputStream) + outputStream.close() + + wand = NewMagickWand() + MagickReadImage(wand, cover_path) + MagickSetImageFormat(wand, 'JPEG') + MagickWriteImage(wand, '%s.jpg' % cover_path) + + img = Image.open('%s.jpg' % cover_path) + + data = StringIO.StringIO() + img.save(data, 'JPEG') + return data.getvalue() + except: + import traceback + traceback.print_exc() + def option_parser(): p = get_parser('pdf') p.remove_option('--category') p.remove_option('--comment') + p.add_option('--get-cover', default=False, action='store_true', + help=_('Extract the cover')) return p def main(args=sys.argv): - #p = option_parser() - #opts, args = p.parse_args(args) - if len(args) != 2: - print >>sys.stderr, _('Usage: pdf-meta file.pdf') - print >>sys.stderr, _('No filename specified.') - return 1 - - stream = open(os.path.abspath(os.path.expanduser(args[1])), 'r+b') - #mi = MetaInformation(opts.title, opts.authors) - #if mi.title or mi.authors: - # set_metadata(stream, mi) - print unicode(get_metadata(stream)).encode('utf-8') - + p = option_parser() + opts, args = p.parse_args(args) + + with open(os.path.abspath(os.path.expanduser(args[1])), 'r+b') as stream: + mi = get_metadata(stream, extract_cover=opts.get_cover) + changed = False + if opts.title: + mi.title = opts.title + changed = True + if opts.authors: + mi.authors = opts.authors.split(',') + changed = True + + if changed: + set_metadata(stream, mi) + print unicode(get_metadata(stream, extract_cover=False)).encode('utf-8') + + if 
mi.cover_data[1] is not None: + cpath = os.path.splitext(os.path.basename(args[1]))[0] + '_cover.jpg' + with open(cpath, 'wb') as f: + f.write(mi.cover_data[1]) + print 'Cover saved to', f.name + return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) From cef40cd73ce243a47a0f2f4611e0e415e4ff00de Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 14 Feb 2009 07:44:07 -0500 Subject: [PATCH 014/319] Make PDF conversion work with epubs which have content in multiple directories. --- src/calibre/ebooks/pdf/writer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 079b972990..cfdd3bb336 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -56,8 +56,8 @@ class PDFWriter(QObject): self._render_next() def _render_next(self): - item = str(self.render_queue.pop(0)) - self.combine_queue.append(os.path.join(self.tmp_path, '%s.pdf' % os.path.basename(item))) + item = str(self.render_queue.pop(0)) + self.combine_queue.append(os.path.join(self.tmp_path, '%s_%i.pdf' % (os.path.basename(item), len(self.combine_queue)))) self.logger.info('Processing %s...' 
% item) @@ -65,7 +65,7 @@ class PDFWriter(QObject): def _render_html(self, ok): if ok: - item_path = os.path.join(self.tmp_path, '%s.pdf' % os.path.basename(str(self.view.url().toLocalFile()))) + item_path = os.path.join(self.tmp_path, '%s_%i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue) - 1)) self.logger.debug('\tRendering item as %s' % item_path) From 21619e1b4fd187f19f171f5523d2cbd92fc7a3b4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 19 Feb 2009 21:06:52 -0500 Subject: [PATCH 015/319] implement bug #739 --- src/calibre/gui2/images/print-preview.svg | 14298 ++++++++++++++++++++ src/calibre/gui2/images/print.svg | 14229 +++++++++++++++++++ src/calibre/gui2/viewer/main.py | 11 + src/calibre/gui2/viewer/main.ui | 15 +- src/calibre/gui2/viewer/printing.py | 125 + 5 files changed, 28676 insertions(+), 2 deletions(-) create mode 100644 src/calibre/gui2/images/print-preview.svg create mode 100644 src/calibre/gui2/images/print.svg create mode 100644 src/calibre/gui2/viewer/printing.py diff --git a/src/calibre/gui2/images/print-preview.svg b/src/calibre/gui2/images/print-preview.svg new file mode 100644 index 0000000000..6ffe4fafa8 --- /dev/null +++ b/src/calibre/gui2/images/print-preview.svg @@ -0,0 +1,14298 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/calibre/gui2/images/print.svg b/src/calibre/gui2/images/print.svg new file mode 100644 index 0000000000..dffa8b94ba --- /dev/null +++ b/src/calibre/gui2/images/print.svg @@ -0,0 +1,14229 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index fedebc66d7..6be7d9a9ae 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -12,6 +12,7 @@ from PyQt4.Qt import QMovie, QApplication, Qt, QIcon, QTimer, QWidget, SIGNAL, \ QToolButton, QMenu, QInputDialog from calibre.gui2.viewer.main_ui import Ui_EbookViewer +from calibre.gui2.viewer.printing import Printing from calibre.gui2.main_window import MainWindow from calibre.gui2 import Application, ORG_NAME, APP_UID, choose_files, \ info_dialog, error_dialog @@ -267,6 +268,16 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.tool_bar2.setContextMenuPolicy(Qt.PreventContextMenu) self.tool_bar.widgetForAction(self.action_bookmark).setPopupMode(QToolButton.MenuButtonPopup) self.action_full_screen.setCheckable(True) + + self.print_menu = QMenu() + self.print_menu.addAction(QIcon(':/images/print-preview.svg'), _('Print Preview')) + self.action_print.setMenu(self.print_menu) + 
self.tool_bar.widgetForAction(self.action_print).setPopupMode(QToolButton.MenuButtonPopup) + self.connect(self.action_print, SIGNAL("triggered(bool)"), partial(self.print_book, preview=False)) + self.connect(self.print_menu.actions()[0], SIGNAL("triggered(bool)"), partial(self.print_book, preview=True)) + + def print_book(self, preview): + Printing(self.iterator.spine, preview) def toggle_fullscreen(self, x): if self.isFullScreen(): diff --git a/src/calibre/gui2/viewer/main.ui b/src/calibre/gui2/viewer/main.ui index 59f813b2bd..c4a571be00 100644 --- a/src/calibre/gui2/viewer/main.ui +++ b/src/calibre/gui2/viewer/main.ui @@ -27,8 +27,8 @@ - - + + about:blank @@ -89,6 +89,8 @@ + + @@ -234,6 +236,15 @@ Toggle full screen + + + + :/images/print.svg:/images/print.svg + + + Print + + diff --git a/src/calibre/gui2/viewer/printing.py b/src/calibre/gui2/viewer/printing.py new file mode 100644 index 0000000000..8644cd311b --- /dev/null +++ b/src/calibre/gui2/viewer/printing.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python + +import os, sys, traceback, urlparse + +from BeautifulSoup import BeautifulSoup, Tag + +from calibre.ebooks.epub.iterator import EbookIterator +from calibre.ptempfile import TemporaryDirectory + +from PyQt4 import QtCore +from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, Qt, \ + QPrinter, QPrintPreviewDialog, QPrintDialog, QDialog, QMetaObject, Q_ARG +from PyQt4 import QtCore +from PyQt4.QtWebKit import QWebView + +PRINTCSS = 
'body{width:100%;margin:0;padding:0;font-family:Arial;color:#000;background:none;font-size:12pt;text-align:left;}h1,h2,h3,h4,h5,h6{font-family:Helvetica;}h1{font-size:19pt;}h2{font-size:17pt;}h3{font-size:15pt;}h4,h5,h6{font-size:12pt;}pre,code,samp{font:10ptCourier,monospace;white-space:pre-wrap;page-break-inside:avoid;}blockquote{margin:1.3em;padding:1em;font-size:10pt;}hr{background-color:#ccc;}aimg{border:none;}a:link,a:visited{background:transparent;font-weight:700;text-decoration:underline;color:#333;}a:link:after,a{color:#000;}table{margin:1px;text-align:left;}th{border-bottom:1pxsolid#333;font-weight:bold;}td{border-bottom:1pxsolid#333;}th,td{padding:4px10px4px0;}tfoot{font-style:italic;}caption{background:#fff;margin-bottom:2em;text-align:left;}thead{display:table-header-group;}tr{page-break-inside:avoid;}#header,.header,#footer,.footer,#navbar,.navbar,#navigation,.navigation,#rightSideBar,.rightSideBar,#leftSideBar,.leftSideBar{display:none;}' + +class Printing(QObject): + def __init__(self, spine, preview): + if QApplication.instance() is None: + QApplication([]) + QObject.__init__(self) + self.loop = QEventLoop() + + self.view = QWebView() + if preview: + self.connect(self.view, SIGNAL('loadFinished(bool)'), self.print_preview) + else: + self.connect(self.view, SIGNAL('loadFinished(bool)'), self.print_book) + + self.process_content(spine) + + def process_content(self, spine): + content = '' + + for path in spine: + raw = self.raw_content(path) + content += self.parsed_content(raw, path) + + refined_content = self.refine_content(content) + + base = os.path.splitdrive(spine[0])[0] + base = base if base != '' else '/' + + QMetaObject.invokeMethod(self, "load_content", Qt.QueuedConnection, Q_ARG('QString', refined_content), Q_ARG('QString', base)) + self.loop.exec_() + + @QtCore.pyqtSignature('load_content(QString, QString)') + def load_content(self, content, base): + self.view.setHtml(content, QUrl(base)) + + def raw_content(self, path): + return 
open(path, 'rb').read().decode(path.encoding) + + def parsed_content(self, raw_content, path): + dom_tree = BeautifulSoup(raw_content).body + + # Remove sytle information that is applied to the entire document. + # This does not remove styles applied within a tag. + styles = dom_tree.findAll('style') + for s in styles: + s.extract() + + scripts = dom_tree.findAll('script') + for s in scripts: + s.extract() + + # Convert all relative links to absolute paths. + links = dom_tree.findAll(src=True) + for s in links: + if QUrl(s['src']).isRelative(): + s['src'] = urlparse.urljoin(path, s['src']) + links = dom_tree.findAll(href=True) + for s in links: + if QUrl(s['href']).isRelative(): + s['href'] = urlparse.urljoin(path, s['href']) + + return unicode(dom_tree) + + # Adds the begenning and endings tags to the document. + # Adds the print css. + def refine_content(self, content): + dom_tree = BeautifulSoup('%s' % content) + + css = dom_tree.findAll('link') + for c in css: + c.extract() + + print_css = Tag(BeautifulSoup(), 'style', [('type', 'text/css'), ('title', 'override_css')]) + print_css.insert(0, PRINTCSS) + dom_tree.findAll('head')[0].insert(0, print_css) + + return unicode(dom_tree) + + def print_preview(self, ok): + printer = QPrinter(QPrinter.HighResolution) + printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch) + + previewDialog = QPrintPreviewDialog(printer) + + self.connect(previewDialog, SIGNAL('paintRequested(QPrinter *)'), self.view.print_) + previewDialog.exec_() + self.disconnect(previewDialog, SIGNAL('paintRequested(QPrinter *)'), self.view.print_) + + self.loop.quit() + + def print_book(self, ok): + printer = QPrinter(QPrinter.HighResolution) + printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch) + + printDialog = QPrintDialog(printer) + printDialog.setWindowTitle(_("Print eBook")) + + printDialog.exec_() + if printDialog.result() == QDialog.Accepted: + self.view.print_(printer) + + self.loop.quit() + +def main(): + return 0 + +if __name__ == '__main__': + 
sys.exit(main()) + From cb9f9c9ff1be450663d10822c89cbd3a71deb344 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 19 Feb 2009 19:22:17 -0800 Subject: [PATCH 016/319] IGN:... --- src/calibre/customize/__init__.py | 4 +- src/calibre/customize/conversion.py | 72 +++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 src/calibre/customize/conversion.py diff --git a/src/calibre/customize/__init__.py b/src/calibre/customize/__init__.py index 3d48f42535..b43b242fd8 100644 --- a/src/calibre/customize/__init__.py +++ b/src/calibre/customize/__init__.py @@ -220,4 +220,6 @@ class MetadataWriterPlugin(Plugin): ''' pass - + + + \ No newline at end of file diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py new file mode 100644 index 0000000000..36b2781c9d --- /dev/null +++ b/src/calibre/customize/conversion.py @@ -0,0 +1,72 @@ +''' +Defines the plugin sytem for conversions. +''' +import re + +from calibre.customize import Plugin + + +class ConversionOption(object): + + ''' + Class representing conversion options + ''' + + def __init__(self, name=None, default=None, help=None, long_switch=None, + short_switch=None, choices=None, gui_label=None, + category=None): + self.name = name + self.default = default + self.help = help + self.long_switch = long_switch + self.short_switch = short_switch + self.choices = choices + self.gui_label = gui_label + self.category = category + + self.validate_parameters() + + def validate_parameters(self): + ''' + Validate the parameters passed to :method:`__init__`. 
+ ''' + if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None: + raise ValueError(self.name + ' is not a valid Python identifier') + if not (isinstance(self.default, (int, float, str, unicode)) or \ + self.default is None): + raise ValueError(unicode(self.default) + + ' is not a string or a number') + if not self.help: + raise ValueError('You must set the help text') + +class ConversionPlugin(Plugin): + + ''' + The base class for all conversion related plugins. + ''' + #: List of options + #: Each option must be a dictionary. The dictionary can contain several + #: keys defining the option. The ones marked by a * are required, the rest + #: are optional. The keys are:: + #: + #: *'name' : A valid python identifier. + #: *'default' : The default value for this option. + #: *'help' : + #: 'short_switch' : A suggestion for a short form of the command line + #: switch (for example if name is 'title', this + #: could be 't'). It is only used if no prior + #: conversion plugin has claimed it. + options = [] + + type = _('Conversion') + can_be_disabled = False + supported_platforms = ['windows', 'osx', 'linux'] + + +class InputFormatPlugin(ConversionPlugin): + + #: Set of file types for which this plugin should be run + #: For example: ``set(['lit', 'mobi', 'prc'])`` + file_types = set([]) + + From 480a3cd3bbfb68f55d2b47d0d36d3e67d8e6834a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 21 Feb 2009 13:41:22 -0500 Subject: [PATCH 017/319] Fix Cybook cover size being a postage stamp withing the thumbnail generated. 
--- src/calibre/devices/cybookg3/driver.py | 2 ++ src/calibre/devices/cybookg3/t2b.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index f092473675..16f27e03e8 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -35,6 +35,8 @@ class CYBOOKG3(USBMS): EBOOK_DIR_CARD = "eBooks" SUPPORTS_SUB_DIRS = True + THUMBNAIL_HEIGHT = 144 + def upload_books(self, files, names, on_card=False, end_session=True, metadata=None): if on_card and not self._card_prefix: diff --git a/src/calibre/devices/cybookg3/t2b.py b/src/calibre/devices/cybookg3/t2b.py index 5bf512f22d..7aaeeb63d7 100644 --- a/src/calibre/devices/cybookg3/t2b.py +++ b/src/calibre/devices/cybookg3/t2b.py @@ -30,7 +30,7 @@ def write_t2b(t2bfile, coverdata=None): if coverdata != None: coverdata = StringIO.StringIO(coverdata) cover = Image.open(coverdata).convert("L") - cover.thumbnail((96, 144)) + cover.thumbnail((96, 144), Image.ANTIALIAS) t2bcover = Image.new('L', (96, 144), 'white') x, y = cover.size From e496f88ed0c4d007425e0a6ae52631359d0aca5b Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 25 Feb 2009 09:50:42 -0500 Subject: [PATCH 018/319] eBook-viewer: Basic bookmark manager --- src/calibre/ebooks/epub/iterator.py | 5 +- src/calibre/gui2/viewer/bookmarkmanager.py | 93 ++++++++++++++++++++++ src/calibre/gui2/viewer/bookmarkmanager.ui | 57 +++++++++++++ src/calibre/gui2/viewer/main.py | 24 +++++- src/calibre/gui2/viewer/printing.py | 4 + 5 files changed, 178 insertions(+), 5 deletions(-) create mode 100644 src/calibre/gui2/viewer/bookmarkmanager.py create mode 100644 src/calibre/gui2/viewer/bookmarkmanager.ui diff --git a/src/calibre/ebooks/epub/iterator.py b/src/calibre/ebooks/epub/iterator.py index e953cbda51..82b9aa09ef 100644 --- a/src/calibre/ebooks/epub/iterator.py +++ b/src/calibre/ebooks/epub/iterator.py @@ -195,5 +195,8 @@ class 
EbookIterator(object): self.bookmarks.append(bm) self.save_bookmarks() + def set_bookmarks(self, bookmarks): + self.bookmarks = bookmarks + def __exit__(self, *args): - self._tdir.__exit__(*args) \ No newline at end of file + self._tdir.__exit__(*args) diff --git a/src/calibre/gui2/viewer/bookmarkmanager.py b/src/calibre/gui2/viewer/bookmarkmanager.py new file mode 100644 index 0000000000..6dcd662754 --- /dev/null +++ b/src/calibre/gui2/viewer/bookmarkmanager.py @@ -0,0 +1,93 @@ +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' + + +from PyQt4.Qt import Qt, QDialog, QAbstractTableModel, QVariant, SIGNAL, \ + QModelIndex, QInputDialog, QLineEdit + +from calibre.gui2.viewer.bookmarkmanager_ui import Ui_BookmarkManager +from calibre.gui2 import NONE, qstring_to_unicode + +class BookmarkManager(QDialog, Ui_BookmarkManager): + def __init__(self, parent, bookmarks): + QDialog.__init__(self, parent) + + self.setupUi(self) + + self.bookmarks = bookmarks[:] + self.set_bookmarks() + + self.connect(self.button_revert, SIGNAL('clicked()'), self.set_bookmarks) + self.connect(self.button_delete, SIGNAL('clicked()'), self.delete_bookmark) + self.connect(self.button_edit, SIGNAL('clicked()'), self.edit_bookmark) + + def set_bookmarks(self): + self._model = BookmarkTableModel(self, self.bookmarks) + self.bookmarks_table.setModel(self._model) + + def delete_bookmark(self): + indexes = self.bookmarks_table.selectionModel().selectedIndexes() + if indexes != []: + self._model.remove_row(indexes[0].row()) + + def edit_bookmark(self): + indexes = self.bookmarks_table.selectionModel().selectedIndexes() + if indexes != []: + title, ok = QInputDialog.getText(self, _('Edit bookmark'), _('New title for bookmark:'), QLineEdit.Normal, self._model.data(indexes[0], Qt.DisplayRole).toString()) + title = QVariant(unicode(title).strip()) + if ok and title: + self._model.setData(indexes[0], title, Qt.EditRole) + + def get_bookmarks(self): + return self._model.bookmarks + + +class 
BookmarkTableModel(QAbstractTableModel): + headers = [_("Name")] + + def __init__(self, parent, bookmarks): + QAbstractTableModel.__init__(self, parent) + + self.bookmarks = bookmarks[:] + + def rowCount(self, parent): + if parent and parent.isValid(): + return 0 + return len(self.bookmarks) + + def columnCount(self, parent): + if parent and parent.isValid(): + return 0 + return len(self.headers) + + def data(self, index, role): + if role in (Qt.DisplayRole, Qt.EditRole): + ans = self.bookmarks[index.row()][0] + return NONE if ans is None else QVariant(ans) + return NONE + + def setData(self, index, value, role): + if role == Qt.EditRole: + self.bookmarks[index.row()] = (qstring_to_unicode(value.toString()).strip(), self.bookmarks[index.row()][1]) + self.emit(SIGNAL("dataChanged(QModelIndex, QModelIndex)"), index, index) + return True + return False + + def flags(self, index): + flags = QAbstractTableModel.flags(self, index) + flags |= Qt.ItemIsEditable + return flags + + def headerData(self, section, orientation, role): + if role != Qt.DisplayRole: + return NONE + if orientation == Qt.Horizontal: + return QVariant(self.headers[section]) + else: + return QVariant(section+1) + + def remove_row(self, row): + self.beginRemoveRows(QModelIndex(), row, row) + del self.bookmarks[row] + self.endRemoveRows() + diff --git a/src/calibre/gui2/viewer/bookmarkmanager.ui b/src/calibre/gui2/viewer/bookmarkmanager.ui new file mode 100644 index 0000000000..44e044b52a --- /dev/null +++ b/src/calibre/gui2/viewer/bookmarkmanager.ui @@ -0,0 +1,57 @@ + + BookmarkManager + + + + 0 + 0 + 400 + 300 + + + + Dialog + + + + + + false + + + true + + + QAbstractItemView::SingleSelection + + + false + + + + + + + Revert + + + + + + + Delete + + + + + + + Edit + + + + + + + + diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index ec0d878ae8..990281a471 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -13,6 +13,7 @@ from PyQt4.Qt 
import QMovie, QApplication, Qt, QIcon, QTimer, QWidget, SIGNAL, \ from calibre.gui2.viewer.main_ui import Ui_EbookViewer from calibre.gui2.viewer.printing import Printing +from calibre.gui2.viewer.bookmarkmanager import BookmarkManager from calibre.gui2.main_window import MainWindow from calibre.gui2 import Application, ORG_NAME, APP_UID, choose_files, \ info_dialog, error_dialog @@ -263,7 +264,11 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.connect(self.toc, SIGNAL('clicked(QModelIndex)'), self.toc_clicked) self.connect(self.reference, SIGNAL('goto(PyQt_PyObject)'), self.goto) + + self.bookmarks_menu = QMenu() + self.action_bookmark.setMenu(self.bookmarks_menu) self.set_bookmarks([]) + if pathtoebook is not None: f = functools.partial(self.load_ebook, pathtoebook) QTimer.singleShot(50, f) @@ -488,17 +493,28 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.setCursor(Qt.BusyCursor) def set_bookmarks(self, bookmarks): - menu = QMenu() + self.bookmarks_menu.clear() + self.bookmarks_menu.addAction(_("Manage Bookmarks"), self.manage_bookmarks) + self.bookmarks_menu.addSeparator() current_page = None for bm in bookmarks: if bm[0] == 'calibre_current_page_bookmark': current_page = bm else: - menu.addAction(bm[0], partial(self.goto_bookmark, bm)) - self.action_bookmark.setMenu(menu) - self._menu = menu + self.bookmarks_menu.addAction(bm[0], partial(self.goto_bookmark, bm)) return current_page + def manage_bookmarks(self): + bmm = BookmarkManager(self, self.iterator.bookmarks) + bmm.exec_() + + bookmarks = bmm.get_bookmarks() + + if bookmarks != self.iterator.bookmarks: + self.iterator.set_bookmarks(bookmarks) + self.iterator.save_bookmarks() + self.set_bookmarks(bookmarks) + def save_current_position(self): try: pos = self.view.bookmark() diff --git a/src/calibre/gui2/viewer/printing.py b/src/calibre/gui2/viewer/printing.py index 8644cd311b..e948360338 100644 --- a/src/calibre/gui2/viewer/printing.py +++ b/src/calibre/gui2/viewer/printing.py @@ -1,5 +1,9 
@@ #!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' + + import os, sys, traceback, urlparse from BeautifulSoup import BeautifulSoup, Tag From 32968d2332be57cd2cc653acc9c3f06c42ae9aee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 25 Feb 2009 10:01:31 -0800 Subject: [PATCH 019/319] IGN:... --- .pydevproject | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pydevproject b/.pydevproject index 1d5708bb48..509137a36a 100644 --- a/.pydevproject +++ b/.pydevproject @@ -2,7 +2,7 @@ -python 2.5 +python 2.6 /calibre-pluginize/src From a1c6108b7228f1d82b0df946ec616a9e9d19f60d Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 25 Feb 2009 15:01:42 -0500 Subject: [PATCH 020/319] ebook-viewer: Bookmark Manager: Import/Export, new UI layout. --- src/calibre/gui2/viewer/bookmarkmanager.py | 47 ++++++++- src/calibre/gui2/viewer/bookmarkmanager.ui | 109 ++++++++++++++++----- src/calibre/gui2/viewer/main.py | 3 +- 3 files changed, 132 insertions(+), 27 deletions(-) diff --git a/src/calibre/gui2/viewer/bookmarkmanager.py b/src/calibre/gui2/viewer/bookmarkmanager.py index 6dcd662754..60709a2ac1 100644 --- a/src/calibre/gui2/viewer/bookmarkmanager.py +++ b/src/calibre/gui2/viewer/bookmarkmanager.py @@ -1,9 +1,12 @@ +from __future__ import with_statement + __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' +import cPickle, os from PyQt4.Qt import Qt, QDialog, QAbstractTableModel, QVariant, SIGNAL, \ - QModelIndex, QInputDialog, QLineEdit + QModelIndex, QInputDialog, QLineEdit, QFileDialog from calibre.gui2.viewer.bookmarkmanager_ui import Ui_BookmarkManager from calibre.gui2 import NONE, qstring_to_unicode @@ -20,9 +23,13 @@ class BookmarkManager(QDialog, Ui_BookmarkManager): self.connect(self.button_revert, SIGNAL('clicked()'), self.set_bookmarks) self.connect(self.button_delete, SIGNAL('clicked()'), self.delete_bookmark) self.connect(self.button_edit, SIGNAL('clicked()'), self.edit_bookmark) + 
self.connect(self.button_export, SIGNAL('clicked()'), self.export_bookmarks) + self.connect(self.button_import, SIGNAL('clicked()'), self.import_bookmarks) - def set_bookmarks(self): - self._model = BookmarkTableModel(self, self.bookmarks) + def set_bookmarks(self, bookmarks=None): + if bookmarks == None: + bookmarks = self.bookmarks[:] + self._model = BookmarkTableModel(self, bookmarks) self.bookmarks_table.setModel(self._model) def delete_bookmark(self): @@ -41,6 +48,40 @@ class BookmarkManager(QDialog, Ui_BookmarkManager): def get_bookmarks(self): return self._model.bookmarks + def export_bookmarks(self): + filename = QFileDialog.getSaveFileName(self, _("Export Bookmarks"), '%s%suntitled.pickle' % (os.getcwdu(), os.sep), _("Pickled Bookmarks (*.pickle)")) + if filename == '': + return + + with open(filename, 'w') as fileobj: + cPickle.dump(self._model.bookmarks, fileobj) + + def import_bookmarks(self): + filename = QFileDialog.getOpenFileName(self, _("Import Bookmarks"), '%s' % os.getcwdu(), _("Pickled Bookmarks (*.pickle)")) + if filename == '': + return + + imported = None + with open(filename, 'r') as fileobj: + imported = cPickle.load(fileobj) + + if imported != None: + bad = False + try: + for bm in imported: + if len(bm) != 2: + bad = True + break + except: + pass + + if not bad: + bookmarks = self._model.bookmarks[:] + for bm in imported: + if bm not in bookmarks and bm[0] != 'calibre_current_page_bookmark': + bookmarks.append(bm) + self.set_bookmarks(bookmarks) + class BookmarkTableModel(QAbstractTableModel): headers = [_("Name")] diff --git a/src/calibre/gui2/viewer/bookmarkmanager.ui b/src/calibre/gui2/viewer/bookmarkmanager.ui index 44e044b52a..110b5db841 100644 --- a/src/calibre/gui2/viewer/bookmarkmanager.ui +++ b/src/calibre/gui2/viewer/bookmarkmanager.ui @@ -5,15 +5,59 @@ 0 0 - 400 - 300 + 451 + 363 - Dialog + Bookmark Manager - + + + + Actions + + + + + + Edit + + + + + + + Delete + + + + + + + Reset + + + + + + + Export + + + + + + + Import + + 
+ + + + + false @@ -29,29 +73,48 @@ - - - - Revert - - - - - - - Delete - - - - - - - Edit + + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok - + + + buttonBox + accepted() + BookmarkManager + accept() + + + 225 + 337 + + + 225 + 181 + + + + + buttonBox + rejected() + BookmarkManager + reject() + + + 225 + 337 + + + 225 + 181 + + + + diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index 990281a471..79c42c2a81 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -506,7 +506,8 @@ class EbookViewer(MainWindow, Ui_EbookViewer): def manage_bookmarks(self): bmm = BookmarkManager(self, self.iterator.bookmarks) - bmm.exec_() + if bmm.exec_() != BookmarkManager.Accepted: + return bookmarks = bmm.get_bookmarks() From 3dc42b5302f6fec969f4996115f3dda071013903 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 1 Mar 2009 10:23:25 -0500 Subject: [PATCH 021/319] Add close at end of get_file --- src/calibre/devices/usbms/driver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index cadc61e584..3b9e8c0715 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -167,6 +167,7 @@ class USBMS(Device): path = self.munge_path(path) src = open(path, 'rb') shutil.copyfileobj(src, outfile, 10*1024*1024) + src.close() def put_file(self, infile, path, replace_file=False, end_session=True): path = self.munge_path(path) From 8119b9e86870b4e2206aa7339db8f6e485ce552d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 1 Mar 2009 18:10:14 -0500 Subject: [PATCH 022/319] Fix open files with PDF metadata reader --- src/calibre/ebooks/metadata/meta.py | 14 +++++++------- src/calibre/ebooks/metadata/pdf.py | 6 ++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 1241238f26..43053a43b9 100644 --- 
a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -39,13 +39,13 @@ def metadata_from_formats(formats): return mi2 for path, ext in zip(formats, extensions): - stream = open(path, 'rb') - try: - mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True)) - except: - continue - if getattr(mi, 'application_id', None) is not None: - return mi + with open(path, 'rb') as stream: + try: + mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True)) + except: + continue + if getattr(mi, 'application_id', None) is not None: + return mi if not mi.title: mi.title = _('Unknown') diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 8ff652c01b..8f73e04050 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -67,6 +67,8 @@ def set_metadata(stream, mi): stream.seek(0) def get_cover(stream): + data = StringIO.StringIO() + try: pdf = PdfFileReader(stream) output = PdfFileWriter() @@ -88,12 +90,12 @@ def get_cover(stream): img = Image.open('%s.jpg' % cover_path) - data = StringIO.StringIO() img.save(data, 'JPEG') - return data.getvalue() except: import traceback traceback.print_exc() + + return data.getvalue() def option_parser(): p = get_parser('pdf') From e7270d2db88f36e4e921316e7f50b9e1bcef49b9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 3 Mar 2009 20:00:11 -0500 Subject: [PATCH 023/319] USBMS: Devices that support subdirs use a similar layout as Calibre on the computer --- src/calibre/devices/cybookg3/driver.py | 13 +++++++++++-- src/calibre/devices/usbms/driver.py | 18 +++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index eef32594eb..6a77c09479 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -69,14 +69,23 @@ class CYBOOKG3(USBMS): for infile in files: 
newpath = path mdata = metadata.next() - + if self.SUPPORTS_SUB_DIRS: if 'tags' in mdata.keys(): for tag in mdata['tags']: - if tag.startswith('/'): + if tag.startswith('News'): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + elif tag.startswith('/'): + newpath = path newpath += tag newpath = os.path.normpath(newpath) break + + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', '')) + newpath = os.path.join(newpath, mdata.get('title', '')) if not os.path.exists(newpath): os.makedirs(newpath) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index abdf2f0d81..a088bf1a32 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -103,10 +103,20 @@ class USBMS(Device): if 'tags' in mdata.keys(): for tag in mdata['tags']: - if tag.startswith('/'): + if tag.startswith('News'): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + break + elif tag.startswith('/'): + newpath = path newpath += tag newpath = os.path.normpath(newpath) break + + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', '')) + newpath = os.path.join(newpath, mdata.get('title', '')) if not os.path.exists(newpath): os.makedirs(newpath) @@ -166,14 +176,8 @@ class USBMS(Device): def get_file(self, path, outfile, end_session=True): path = self.munge_path(path) -<<<<<<< TREE - src = open(path, 'rb') - shutil.copyfileobj(src, outfile, 10*1024*1024) - src.close() -======= with open(path, 'rb') as src: shutil.copyfileobj(src, outfile, 10*1024*1024) ->>>>>>> MERGE-SOURCE def put_file(self, infile, path, replace_file=False, end_session=True): path = self.munge_path(path) From 925a86fb0c991c51a4665cac1ff7a7f191ec39a7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 6 
Mar 2009 21:38:35 -0800 Subject: [PATCH 024/319] Beginnings of the new conversion framework. Input plugins for MOBI and EPUB. --- src/calibre/__init__.py | 92 ++--------- src/calibre/customize/builtins.py | 7 +- src/calibre/customize/conversion.py | 183 ++++++++++++++++++---- src/calibre/customize/profiles.py | 27 ++++ src/calibre/customize/ui.py | 19 ++- src/calibre/ebooks/conversion/__init__.py | 4 + src/calibre/ebooks/conversion/plumber.py | 30 ++++ src/calibre/ebooks/epub/__init__.py | 32 ---- src/calibre/ebooks/epub/input.py | 76 +++++++++ src/calibre/ebooks/mobi/input.py | 29 ++++ src/calibre/ebooks/mobi/reader.py | 167 +++++++++----------- src/calibre/utils/logging.py | 92 +++++++++++ src/calibre/utils/terminfo.py | 2 +- 13 files changed, 525 insertions(+), 235 deletions(-) create mode 100644 src/calibre/customize/profiles.py create mode 100644 src/calibre/ebooks/conversion/__init__.py create mode 100644 src/calibre/ebooks/conversion/plumber.py create mode 100644 src/calibre/ebooks/epub/input.py create mode 100644 src/calibre/ebooks/mobi/input.py create mode 100644 src/calibre/utils/logging.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index e69d42c90a..de133ddb57 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -90,28 +90,11 @@ def prints(*args, **kwargs): if i != len(args)-1: file.write(sep) file.write(end) - file.flush() class CommandLineError(Exception): pass -class ColoredFormatter(Formatter): - def format(self, record): - ln = record.__dict__['levelname'] - col = '' - if ln == 'CRITICAL': - col = terminal_controller.YELLOW - elif ln == 'ERROR': - col = terminal_controller.RED - elif ln in ['WARN', 'WARNING']: - col = terminal_controller.BLUE - elif ln == 'INFO': - col = terminal_controller.GREEN - elif ln == 'DEBUG': - col = terminal_controller.CYAN - record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL - return Formatter.format(self, record) def 
setup_cli_handlers(logger, level): @@ -335,66 +318,23 @@ def english_sort(x, y): ''' return cmp(_spat.sub('', x), _spat.sub('', y)) -class LoggingInterface: +class ColoredFormatter(Formatter): - def __init__(self, logger): - self.__logger = self.logger = logger - - def setup_cli_handler(self, verbosity): - for handler in self.__logger.handlers: - if isinstance(handler, logging.StreamHandler): - return - if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers: - return - stream = sys.stdout - formatter = logging.Formatter() - level = logging.INFO - if verbosity > 0: - formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \ - ColoredFormatter('%(levelname)s: %(message)s') - level = logging.DEBUG - if verbosity > 1: - stream = sys.stderr - - handler = logging.StreamHandler(stream) - handler.setFormatter(formatter) - handler.setLevel(level) - self.__logger.addHandler(handler) - self.__logger.setLevel(level) - - - def ___log(self, func, msg, args, kwargs): - args = [msg] + list(args) - for i in range(len(args)): - if not isinstance(args[i], basestring): - continue - if sys.version_info[:2] > (2, 5): - if not isinstance(args[i], unicode): - args[i] = args[i].decode(preferred_encoding, 'replace') - elif isinstance(args[i], unicode): - args[i] = args[i].encode(preferred_encoding, 'replace') - func(*args, **kwargs) - - def log_debug(self, msg, *args, **kwargs): - self.___log(self.__logger.debug, msg, args, kwargs) - - def log_info(self, msg, *args, **kwargs): - self.___log(self.__logger.info, msg, args, kwargs) - - def log_warning(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_warn(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_error(self, msg, *args, **kwargs): - self.___log(self.__logger.error, msg, args, kwargs) - - def log_critical(self, msg, *args, **kwargs): - self.___log(self.__logger.critical, msg, args, kwargs) - - 
def log_exception(self, msg, *args): - self.___log(self.__logger.exception, msg, args, {}) + def format(self, record): + ln = record.__dict__['levelname'] + col = '' + if ln == 'CRITICAL': + col = terminal_controller.YELLOW + elif ln == 'ERROR': + col = terminal_controller.RED + elif ln in ['WARN', 'WARNING']: + col = terminal_controller.BLUE + elif ln == 'INFO': + col = terminal_controller.GREEN + elif ln == 'DEBUG': + col = terminal_controller.CYAN + record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL + return Formatter.format(self, record) def walk(dir): ''' A nice interface to os.walk ''' diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 14d3c79062..fafe8e5afa 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -242,8 +242,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin): set_metadata(stream, mi) -plugins = [HTML2ZIP] +from calibre.ebooks.epub.input import EPUBInput +from calibre.ebooks.mobi.input import MOBIInput +from calibre.customize.profiles import input_profiles + +plugins = [HTML2ZIP, EPUBInput, MOBIInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataWriter')] +plugins += input_profiles \ No newline at end of file diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 36b2781c9d..aa7b0c1dea 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -1,28 +1,30 @@ +from __future__ import with_statement ''' Defines the plugin sytem for conversions. 
''' -import re +import re, os, shutil +from lxml import html + +from calibre import CurrentDir from calibre.customize import Plugin - class ConversionOption(object): ''' Class representing conversion options ''' - def __init__(self, name=None, default=None, help=None, long_switch=None, - short_switch=None, choices=None, gui_label=None, - category=None): + def __init__(self, name=None, help=None, long_switch=None, + short_switch=None, choices=None): self.name = name - self.default = default self.help = help self.long_switch = long_switch self.short_switch = short_switch self.choices = choices - self.gui_label = gui_label - self.category = category + + if self.long_switch is None: + self.long_switch = '--'+self.name.replace('_', '-') self.validate_parameters() @@ -32,41 +34,156 @@ class ConversionOption(object): ''' if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None: raise ValueError(self.name + ' is not a valid Python identifier') - if not (isinstance(self.default, (int, float, str, unicode)) or \ - self.default is None): + if not self.help: + raise ValueError('You must set the help text') + + +class OptionRecommendation(object): + LOW = 1 + MED = 2 + HIGH = 3 + + def __init__(self, recommeded_value, level=LOW, **kwargs): + ''' + An option recommendation. That is, an option as well as its recommended + value and the level of the recommendation. 
+ ''' + self.level = level + self.recommended_value = recommeded_value + self.option = kwargs.pop('option', None) + if self.option is None: + self.option = ConversionOption(**kwargs) + + self.validate_parameters() + + def validate_parameters(self): + if self.option.choices and self.recommended_value not in \ + self.option.choices: + raise ValueError('Recommended value not in choices') + if not (isinstance(self.recommended_value, (int, float, str, unicode))\ + or self.default is None): raise ValueError(unicode(self.default) + ' is not a string or a number') - if not self.help: - raise ValueError('You must set the help text') + -class ConversionPlugin(Plugin): - +class InputFormatPlugin(Plugin): ''' - The base class for all conversion related plugins. + InputFormatPlugins are responsible for converting a document into + HTML+OPF+CSS+etc. + The results of the conversion *must* be encoded in UTF-8. + The main action happens in :method:`convert`. ''' - #: List of options - #: Each option must be a dictionary. The dictionary can contain several - #: keys defining the option. The ones marked by a * are required, the rest - #: are optional. The keys are:: - #: - #: *'name' : A valid python identifier. - #: *'default' : The default value for this option. - #: *'help' : - #: 'short_switch' : A suggestion for a short form of the command line - #: switch (for example if name is 'title', this - #: could be 't'). It is only used if no prior - #: conversion plugin has claimed it. - options = [] - type = _('Conversion') + type = _('Conversion Input') can_be_disabled = False supported_platforms = ['windows', 'osx', 'linux'] - -class InputFormatPlugin(ConversionPlugin): - #: Set of file types for which this plugin should be run - #: For example: ``set(['lit', 'mobi', 'prc'])`` + #: For example: ``set(['azw', 'mobi', 'prc'])`` file_types = set([]) + #: Options shared by all Input format plugins. Do not override + #: in sub-classes. Use :member:`options` instead. 
Every option must be an + #: instance of :class:`OptionRecommendation`. + common_options = set([ + OptionRecommendation(name='debug_input', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Save the output from the input plugin to the specified ' + 'directory. Useful if you are unsure at which stage ' + 'of the conversion process a bug is occurring. ' + 'WARNING: This completely deletes the contents of ' + 'the specified directory.') + ), + + OptionRecommendation(name='input_encoding', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Specify the character encoding of the input document. If ' + 'set this option will override any encoding declared by the ' + 'document itself. Particularly useful for documents that ' + 'do not declare an encoding or that have erroneous ' + 'encoding declarations.') + ), + + ]) + + #: Options to customize the behavior of this plugin. Every option must be an + #: instance of :class:`OptionRecommendation`. + options = set([]) + + def convert(self, stream, options, file_ext, parse_cache, log): + ''' + This method must be implemented in sub-classes. It must return + the path to the created OPF file. All output should be contained in + the current directory. If this plugin creates files outside the current + directory they must be deleted/marked for deletion before this method + returns. + + :param stream: A file like object that contains the input file. + + :param options: Options to customize the conversion process. + Guaranteed to have attributes corresponding + to all the options declared by this plugin. In + addition, it will have a verbose attribute that + takes integral values from zero upwards. Higher numbers + mean be more verbose. Another useful attribute is + ``input_profile`` that is an instance of + :class:`calibre.customize.profiles.InputProfile`. + + :param file_ext: The extension (without the .) of the input file. 
It + is guaranteed to be one of the `file_types` supported + by this plugin. + + :param parse_cache: A dictionary that maps absolute file paths to + parsed representations of their contents. For + HTML the representation is an lxml element of + the root of the tree. For CSS it is a cssutils + stylesheet. If this plugin parses any of the + output files, it should add them to the cache + so that later stages of the conversion wont + have to re-parse them. If a parsed representation + is in the cache, there is no need to actually + write the file to disk. + + :param log: A :class:`calibre.utils.logging.Log` object. All output + should use this object. + ''' + raise NotImplementedError + + def __call__(self, stream, options, file_ext, parse_cache, log, output_dir): + log('InputFormatPlugin: %s running'%self.name, end=' ') + if hasattr(stream, 'name'): + log('on', stream.name) + + with CurrentDir(output_dir): + for x in os.listdir('.'): + shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) + + + ret = self.convert(stream, options, file_ext, parse_cache, log) + for key in list(parse_cache.keys()): + if os.path.abspath(key) != key: + log.warn(('InputFormatPlugin: %s returned a ' + 'relative path: %s')%(self.name, key) + ) + parse_cache[os.path.abspath(key)] = parse_cache.pop(key) + + if options.debug_input is not None: + options.debug_input = os.path.abspath(options.debug_input) + if not os.path.exists(options.debug_input): + os.makedirs(options.debug_input) + shutil.rmtree(options.debug_input) + for f, obj in parse_cache.items(): + if hasattr(obj, 'cssText'): + raw = obj.cssText + else: + raw = html.tostring(obj, encoding='utf-8', method='xml', + include_meta_content_type=True, pretty_print=True) + if isinstance(raw, unicode): + raw = raw.encode('utf-8') + open(f, 'wb').write(raw) + shutil.copytree('.', options.debug_input) + + + + return ret diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py new file mode 100644 index 
0000000000..002f56879f --- /dev/null +++ b/src/calibre/customize/profiles.py @@ -0,0 +1,27 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.customize import Plugin + +class InputProfile(Plugin): + + author = 'Kovid Goyal' + supported_platforms = set(['windows', 'osx', 'linux']) + can_be_disabled = False + type = _('Input profile') + +# TODO: Add some real information to this profile. All other profiles must +# inherit from this profile and override as needed + + name = 'Default Input Profile' + short_name = 'default' # Used in the CLI so dont spaces etc. in it + description = _('This profile tries to provide sane defaults and is useful ' + 'if you know nothing about the input document.') + +input_profiles = [InputProfile] + + + + diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 95bf01ff6d..1cdafae4f0 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -6,13 +6,14 @@ import os, shutil, traceback, functools, sys from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \ MetadataWriterPlugin +from calibre.customize.conversion import InputFormatPlugin +from calibre.customize.profiles import InputProfile from calibre.customize.builtins import plugins as builtin_plugins from calibre.constants import __version__, iswindows, isosx from calibre.ebooks.metadata import MetaInformation from calibre.utils.config import make_config_dir, Config, ConfigProxy, \ plugin_dir, OptionParser - version = tuple([int(x) for x in __version__.split('.')]) platform = 'linux' @@ -70,7 +71,10 @@ _on_import = {} _on_preprocess = {} _on_postprocess = {} - +def input_profiles(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputProfile): + yield plugin def reread_filetype_plugins(): global _on_import @@ -234,6 +238,17 @@ def find_plugin(name): if plugin.name == name: return plugin +def 
input_format_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputFormatPlugin): + yield plugin + +def plugin_for_input_format(fmt): + for plugin in input_format_plugins(): + if fmt in plugin.file_types: + return plugin + + def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) plugin = find_plugin(x) diff --git a/src/calibre/ebooks/conversion/__init__.py b/src/calibre/ebooks/conversion/__init__.py new file mode 100644 index 0000000000..384ccfb79c --- /dev/null +++ b/src/calibre/ebooks/conversion/__init__.py @@ -0,0 +1,4 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py new file mode 100644 index 0000000000..ac7490bd39 --- /dev/null +++ b/src/calibre/ebooks/conversion/plumber.py @@ -0,0 +1,30 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.customize.conversion import OptionRecommendation +from calibre.customize.ui import input_profiles + +pipeline_options = [ + +OptionRecommendation(name='verbose', + recommended_value=0, level=OptionRecommendation.LOW, + short_switch='v', + help=_('Level of verbosity. Specify multiple times for greater ' + 'verbosity.') + ), + + +OptionRecommendation(name='input_profile', + recommended_value='default', level=OptionRecommendation.LOW, + choices=[x.short_name for x in input_profiles()], + help=_('Specify the input profile. The input profile gives the ' + 'conversion system information on how to interpret ' + 'various information in the input document. For ' + 'example resolution dependent lengths (i.e. 
lengths in ' + 'pixels).') + ), + +] \ No newline at end of file diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index aa17024d50..989391902b 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -40,38 +40,6 @@ def rules(stylesheets): if r.type == r.STYLE_RULE: yield r -def decrypt_font(key, path): - raw = open(path, 'rb').read() - crypt = raw[:1024] - key = cycle(iter(key)) - decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) - with open(path, 'wb') as f: - f.write(decrypt) - f.write(raw[1024:]) - -def process_encryption(encfile, opf): - key = None - m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) - if m: - key = m.group(1) - key = list(map(ord, uuid.UUID(key).bytes)) - try: - root = etree.parse(encfile) - for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): - algorithm = em.get('Algorithm', '') - if algorithm != 'http://ns.adobe.com/pdf/enc#RC': - return False - cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] - uri = cr.get('URI') - path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) - if os.path.exists(path): - decrypt_font(key, path) - return True - except: - import traceback - traceback.print_exc() - return False - def initialize_container(path_to_container, opf_name='metadata.opf'): ''' Create an empty EPUB document, with a default skeleton. 
diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py new file mode 100644 index 0000000000..1b69424a9e --- /dev/null +++ b/src/calibre/ebooks/epub/input.py @@ -0,0 +1,76 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, re, uuid +from itertools import cycle + +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin + +class EPUBInput(InputFormatPlugin): + + name = 'EPUB Input' + author = 'Kovid Goyal' + description = 'Convert EPUB files (.epub) to HTML' + file_types = set(['epub']) + + @classmethod + def decrypt_font(cls, key, path): + raw = open(path, 'rb').read() + crypt = raw[:1024] + key = cycle(iter(key)) + decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) + with open(path, 'wb') as f: + f.write(decrypt) + f.write(raw[1024:]) + + @classmethod + def process_ecryption(cls, encfile, opf, log): + key = None + m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) + if m: + key = m.group(1) + key = list(map(ord, uuid.UUID(key).bytes)) + try: + root = etree.parse(encfile) + for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): + algorithm = em.get('Algorithm', '') + if algorithm != 'http://ns.adobe.com/pdf/enc#RC': + return False + cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] + uri = cr.get('URI') + path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) + if os.path.exists(path): + cls.decrypt_font(key, path) + return True + except: + import traceback + traceback.print_exc() + return False + + def convert(self, stream, options, file_ext, parse_cache, log): + from calibre.utils.zipfile import ZipFile + from calibre import walk + from calibre.ebooks import DRMError + zf = ZipFile(stream) + zf.extractall(os.getcwd()) + encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) + opf = 
None + for f in walk('.'): + if f.lower().endswith('.opf'): + opf = f + break + path = getattr(stream, 'name', 'stream') + + if opf is None: + raise ValueError('%s is not a valid EPUB file'%path) + + if os.path.exists(encfile): + if not self.process_encryption(encfile, opf, log): + raise DRMError(os.path.basename(path)) + + return opf + diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py new file mode 100644 index 0000000000..1ce9950677 --- /dev/null +++ b/src/calibre/ebooks/mobi/input.py @@ -0,0 +1,29 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin + +class MOBIInput(InputFormatPlugin): + + name = 'MOBI Input' + author = 'Kovid Goyal' + description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML' + file_types = set(['mobi', 'prc', 'azw']) + + def convert(self, stream, options, file_ext, parse_cache, log): + from calibre.ebooks.mobi.reader import MobiReader + mr = MobiReader(stream, log, options.input_encoding, + options.debug_input) + mr.extract_content(output_dir=os.getcwdu(), parse_cache) + raw = parse_cache.get('calibre_raw_mobi_markup', False) + if raw: + if isinstance(raw, unicode): + raw = raw.encode('utf-8') + open('debug-raw.html', 'wb').write(raw) + + return mr.created_opf_path + diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 2c80cc1c8c..18663660b4 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal ' Read data from .mobi files ''' -import sys, struct, os, cStringIO, re, functools +import struct, os, cStringIO, re, functools try: from PIL import Image as PILImage @@ -35,8 +35,10 @@ class EXTHHeader(object): pos = 0 self.mi = MetaInformation(_('Unknown'), [_('Unknown')]) self.has_fake_cover = True + left = self.num_items - for i in 
range(self.num_items): + while left > 0: + left -= 1 id, size = struct.unpack('>LL', raw[pos:pos+8]) content = raw[pos+8:pos+size] pos += size @@ -76,7 +78,8 @@ class EXTHHeader(object): class BookHeader(object): - def __init__(self, raw, ident): + def __init__(self, raw, ident, user_encoding, log): + self.log = log self.compression_type = raw[:2] self.records, self.records_size = struct.unpack('>HH', raw[8:12]) self.encryption_type, = struct.unpack('>H', raw[12:14]) @@ -92,8 +95,8 @@ class BookHeader(object): else: self.ancient = False self.doctype = raw[16:20] - self.length, self.type, self.codepage, self.unique_id, self.version = \ - struct.unpack('>LLLLL', raw[20:40]) + self.length, self.type, self.codepage, self.unique_id, \ + self.version = struct.unpack('>LLLLL', raw[20:40]) try: @@ -102,8 +105,9 @@ class BookHeader(object): 65001 : 'utf-8', }[self.codepage] except (IndexError, KeyError): - print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage - self.codec = 'cp1252' + self.codec = 'cp1252' if user_encoding is None else user_encoding + log.warn('Unknown codepage %d. 
Assuming %s'%(self.codepage, + self.codec)) if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: self.extra_flags = 0 @@ -138,9 +142,24 @@ class MobiReader(object): PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') - def __init__(self, filename_or_stream, verbose=False): - self.verbose = verbose + def __init__(self, filename_or_stream, log, user_encoding=None, debug=None): + self.log = log + self.debug = debug self.embedded_mi = None + self.base_css_rules = ''' + blockquote { margin: 0em 0em 0em 1.25em; text-align: justify } + + p { margin: 0em; text-align: justify } + + .bold { font-weight: bold } + + .italic { font-style: italic } + + .mbp_pagebreak { + page-break-after: always; margin: 0; display: block + } + ''' + self.tag_css_rules = [] if hasattr(filename_or_stream, 'read'): stream = filename_or_stream @@ -177,17 +196,21 @@ class MobiReader(object): self.sections.append((section(i), self.section_headers[i])) - self.book_header = BookHeader(self.sections[0][0], self.ident) + self.book_header = BookHeader(self.sections[0][0], self.ident, + user_encoding, self.log) self.name = self.name.decode(self.book_header.codec, 'replace') - def extract_content(self, output_dir=os.getcwdu()): + def extract_content(self, output_dir, parse_cache): output_dir = os.path.abspath(output_dir) if self.book_header.encryption_type != 0: raise DRMError(self.name) processed_records = self.extract_text() + if self.debug is not None: + self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html self.add_anchors() - self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') + self.processed_html = self.processed_html.decode(self.book_header.codec, + 'ignore') for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) e2u = functools.partial(entity_to_unicode, @@ -203,16 +226,10 @@ class MobiReader(object): self.processed_html = \ 
re.compile('', re.IGNORECASE).sub( '\n\n' - '\n', + '\t\n', self.processed_html) - if self.verbose: - print 'Parsing HTML...' + self.log.debug('Parsing HTML...') root = html.fromstring(self.processed_html) self.upshift_markup(root) guides = root.xpath('//guide') @@ -230,25 +247,24 @@ class MobiReader(object): ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href'] except AttributeError: pass - if self.verbose: - print 'Serializing...' - with open(htmlfile, 'wb') as f: - raw = html.tostring(root, encoding='utf-8', method='xml', - include_meta_content_type=True, pretty_print=True) - raw = raw.replace('', - '\n\n') - f.write(raw) + parse_cache[htmlfile] = root self.htmlfile = htmlfile - if self.book_header.exth is not None or self.embedded_mi is not None: - if self.verbose: - print 'Creating OPF...' - ncx = cStringIO.StringIO() - opf = self.create_opf(htmlfile, guide, root) - opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx) - ncx = ncx.getvalue() - if ncx: - open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + self.log.debug('Creating OPF...') + ncx = cStringIO.StringIO() + opf = self.create_opf(htmlfile, guide, root) + self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' + opf.render(open(self.created_opf_path, 'wb'), ncx) + ncx = ncx.getvalue() + if ncx: + open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + + with open('styles.css', 'wb') as s: + s.write(self.base_css_rules+'\n\n') + for rule in self.tag_css_rules: + if isinstance(rule, unicode): + rule = rule.encode('utf-8') + s.write(rule+'\n\n') def read_embedded_metadata(self, root, elem, guide): raw = ''+html.tostring(elem, encoding='utf-8')+'' @@ -277,8 +293,7 @@ class MobiReader(object): def cleanup_html(self): - if self.verbose: - print 'Cleaning up HTML...' + self.log.debug('Cleaning up HTML...') self.processed_html = re.sub(r'
', '', self.processed_html) if self.book_header.ancient and '')+'' @@ -286,8 +301,7 @@ class MobiReader(object): self.processed_html = self.processed_html.replace('> <', '>\n<') def upshift_markup(self, root): - if self.verbose: - print 'Converting style information to CSS...' + self.log.debug('Converting style information to CSS...') size_map = { 'xx-small' : '0.5', 'x-small' : '1', @@ -298,7 +312,7 @@ class MobiReader(object): 'xx-large' : '6', } mobi_version = self.book_header.mobi_version - for tag in root.iter(etree.Element): + for i, tag in enumerate(root.iter(etree.Element)): if tag.tag in ('country-region', 'place', 'placetype', 'placename', 'state', 'city'): tag.tag = 'span' @@ -352,8 +366,7 @@ class MobiReader(object): elif tag.tag == 'pre': if not tag.text: tag.tag = 'div' - if styles: - attrib['style'] = '; '.join(styles) + if 'filepos-id' in attrib: attrib['id'] = attrib.pop('filepos-id') if 'filepos' in attrib: @@ -362,15 +375,24 @@ class MobiReader(object): attrib['href'] = "#filepos%d" % int(filepos) except ValueError: pass + + if styles: + attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i) + self.tag_css_rules.append('#%s {%s}'%(attrib['id'], + '; '.join(styles))) + def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) + if mi is None: + mi = MetaInformation(self.title, [_('Unknown')]) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) elif mi.cover is not None: opf.cover = mi.cover - manifest = [(htmlfile, 'text/x-oeb1-document')] + manifest = [(htmlfile, 'text/x-oeb1-document'), + (os.path.abspath('styles.css'), 'text/css')] bp = os.path.dirname(htmlfile) for i in getattr(self, 'image_names', []): manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) @@ -441,8 +463,7 @@ class MobiReader(object): return data[:len(data)-trail_size] def extract_text(self): - if 
self.verbose: - print 'Extracting text...' + self.log.debug('Extracting text...') text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] processed_records = list(range(0, self.book_header.records+1)) @@ -472,12 +493,11 @@ class MobiReader(object): def replace_page_breaks(self): self.processed_html = self.PAGE_BREAK_PAT.sub( - '
', + '
', self.processed_html) def add_anchors(self): - if self.verbose: - print 'Adding anchors...' + self.log.debug('Adding anchors...') positions = set([]) link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) @@ -507,8 +527,7 @@ class MobiReader(object): def extract_images(self, processed_records, output_dir): - if self.verbose: - print 'Extracting images...' + self.log.debug('Extracting images...') output_dir = os.path.abspath(os.path.join(output_dir, 'images')) if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -535,14 +554,17 @@ class MobiReader(object): im.convert('RGB').save(open(path, 'wb'), format='JPEG') def get_metadata(stream): - mr = MobiReader(stream) + from calibre.utils.logging import Log + log = Log() + mr = MobiReader(stream, log) if mr.book_header.exth is None: mi = MetaInformation(mr.name, [_('Unknown')]) else: mi = mr.create_opf('dummy.html') try: if hasattr(mr.book_header.exth, 'cover_offset'): - cover_index = mr.book_header.first_image_index + mr.book_header.exth.cover_offset + cover_index = mr.book_header.first_image_index + \ + mr.book_header.exth.cover_offset data = mr.sections[int(cover_index)][0] else: data = mr.sections[mr.book_header.first_image_index][0] @@ -552,42 +574,7 @@ def get_metadata(stream): im.convert('RGBA').save(obuf, format='JPEG') mi.cover_data = ('jpg', obuf.getvalue()) except: - import traceback - traceback.print_exc() + log.exception() return mi -def option_parser(): - from calibre.utils.config import OptionParser - parser = OptionParser(usage=_('%prog [options] myebook.mobi')) - parser.add_option('-o', '--output-dir', default='.', - help=_('Output directory. 
Defaults to current directory.')) - parser.add_option('-v', '--verbose', default=False, action='store_true', - help='Useful for debugging.') - return parser - - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - return 1 - - mr = MobiReader(args[1], verbose=opts.verbose) - opts.output_dir = os.path.abspath(opts.output_dir) - mr.extract_content(opts.output_dir) - if opts.verbose: - oname = os.path.join(opts.output_dir, 'debug-raw.html') - dat = mr.mobi_html - if isinstance(dat, unicode): - dat = dat.encode('utf-8') - open(oname, 'wb').write(dat) - print _('Raw MOBI HTML saved in'), oname - - print _('OEB ebook created in'), opts.output_dir - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py new file mode 100644 index 0000000000..ae2e1a792b --- /dev/null +++ b/src/calibre/utils/logging.py @@ -0,0 +1,92 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +'A simplified logging system' + +DEBUG = 0 +INFO = 1 +WARN = 2 +ERROR = 3 + +import sys, traceback +from functools import partial + +from calibre import prints +from calibre.utils.terminfo import TerminalController + +class ANSIStream: + + def __init__(self, stream=sys.stdout): + self.stream = stream + tc = TerminalController(stream) + self.color = { + DEBUG: tc.GREEN, + INFO:'', + WARN: tc.YELLOW, + ERROR: tc.RED + } + self.normal = tc.NORMAL + + def prints(self, level, *args, **kwargs): + self.stream.write(self.color[level]) + kwargs['file'] = self.stream + prints(*args, **kwargs) + self.stream.write(self.normal) + + def flush(self): + self.stream.flush() + +class HTMLStream: + + def __init__(self, stream=sys.stdout): + self.stream = stream + self.color = { + DEBUG: '', + INFO:'', + WARN: '', + ERROR: '' + } + self.normal = '' + + def prints(self, level, 
*args, **kwargs): + self.stream.write(self.color[level]) + kwargs['file'] = self.stream + prints(*args, **kwargs) + self.stream.write(self.normal) + + def flush(self): + self.stream.flush() + +class Log(object): + + DEBUG = DEBUG + INFO = INFO + WARN = WARN + ERROR = ERROR + + def __init__(self, level=INFO): + self.filter_level = level + default_output = ANSIStream() + self.outputs = [default_output] + + self.debug = partial(self.prints, DEBUG) + self.info = partial(self.prints, INFO) + self.warn = self.warning = partial(self.prints, WARN) + self.error = partial(self.prints, ERROR) + + + def prints(self, level, *args, **kwargs): + if level < self.filter_level: + return + for output in self.outputs: + output.prints(level, *args, **kwargs) + + def exception(self, *args, **kwargs): + limit = kwargs.pop('limit', None) + self.prints(ERROR, *args, **kwargs) + self.prints(DEBUG, traceback.format_exc(limit)) + + def __call__(self, *args, **kwargs): + self.prints(INFO, *args, **kwargs) \ No newline at end of file diff --git a/src/calibre/utils/terminfo.py b/src/calibre/utils/terminfo.py index 075c0e694d..fd394cbfe9 100644 --- a/src/calibre/utils/terminfo.py +++ b/src/calibre/utils/terminfo.py @@ -33,7 +33,7 @@ class TerminalController: >>> term = TerminalController() >>> if term.CLEAR_SCREEN: - ... print 'This terminal supports clearning the screen.' + ... print 'This terminal supports clearing the screen.' Finally, if the width and height of the terminal are known, then they will be stored in the `COLS` and `LINES` attributes. 
From 04ea5c974af58e4703b651878c289d74718b2697 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 7 Mar 2009 08:44:20 -0500 Subject: [PATCH 025/319] Configurable margins on PDF output --- src/calibre/ebooks/pdf/writer.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index cfdd3bb336..c189407dac 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -22,8 +22,15 @@ from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader +class PDFMargins: + def __init__(self, margin=1): + self.top = margin + self.bottom = margin + self.left = margin + self.right = margin + class PDFWriter(QObject): - def __init__(self): + def __init__(self, margins=PDFMargins()): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) @@ -36,6 +43,7 @@ class PDFWriter(QObject): self.render_queue = [] self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') + self.margins = margins def dump(self, oebpath, path): self._delete_tmpdir() @@ -70,7 +78,7 @@ class PDFWriter(QObject): self.logger.debug('\tRendering item as %s' % item_path) printer = QPrinter(QPrinter.HighResolution) - printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch) + printer.setPageMargins(self.margins.left, self.margins.top, self.margins.right, self.margins.bottom, QPrinter.Inch) printer.setOutputFormat(QPrinter.PdfFormat) printer.setOutputFileName(item_path) self.view.print_(printer) @@ -106,9 +114,17 @@ def config(defaults=None): c = StringConfig(defaults, desc) pdf = c.add_group('PDF', _('PDF options.')) + + pdf('margin_top', ['--margin_top'], default=1, + help=_('The top margin around the document in inches.')) + pdf('margin_bottom', ['--margin_bottom'], default=1, + help=_('The bottom margin around the document in inches.')) + pdf('margin_left', ['--margin_left'], default=1, + help=_('The left margin 
around the document in inches.')) + pdf('margin_right', ['--margin_right'], default=1, + help=_('The right margin around the document in inches.')) return c - def option_parser(): c = config() @@ -118,7 +134,7 @@ def option_parser(): help=_('Output file. Default is derived from input filename.')) parser.add_option( '-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) + help=_('Useful for debugging.')) return parser def oeb2pdf(opts, inpath): @@ -130,7 +146,13 @@ def oeb2pdf(opts, inpath): outpath = os.path.basename(inpath) outpath = os.path.splitext(outpath)[0] + '.pdf' - writer = PDFWriter() + margins = PDFMargins() + margins.top = opts.margin_top + margins.bottom = opts.margin_bottom + margins.left = opts.margin_left + margins.right = opts.margin_right + + writer = PDFWriter(margins) writer.dump(inpath, outpath) run_plugins_on_postprocess(outpath, 'pdf') logger.log_info(_('Output written to ') + outpath) From e12384139a326d5faa51cbe3adff9f124bcd1fcd Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 7 Mar 2009 13:26:14 -0500 Subject: [PATCH 026/319] USBMS: Convert authors to string when settings path --- src/calibre/devices/cybookg3/driver.py | 3 ++- src/calibre/devices/usbms/driver.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 6a77c09479..0a9d69f7a9 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -7,6 +7,7 @@ Device driver for Bookeen's Cybook Gen 3 import os, shutil from itertools import cycle +from calibre.ebooks.metadata import authors_to_string from calibre.devices.errors import FreeSpaceError from calibre.devices.usbms.driver import USBMS import calibre.devices.cybookg3.t2b as t2b @@ -84,7 +85,7 @@ class CYBOOKG3(USBMS): break if newpath == path: - newpath = os.path.join(newpath, mdata.get('authors', '')) + newpath = os.path.join(newpath, 
authors_to_string(mdata.get('authors', ''))) newpath = os.path.join(newpath, mdata.get('title', '')) if not os.path.exists(newpath): diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index a088bf1a32..06bfa243f1 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -115,7 +115,7 @@ class USBMS(Device): break if newpath == path: - newpath = os.path.join(newpath, mdata.get('authors', '')) + newpath = os.path.join(newpath, authors_to_string(mdata.get('authors', ''))) newpath = os.path.join(newpath, mdata.get('title', '')) if not os.path.exists(newpath): From 0a1c9f9919b7e0642913166f6d4918a2a4e302aa Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 8 Mar 2009 14:03:23 -0400 Subject: [PATCH 027/319] Clean up merge artifacts. --- src/calibre/ebooks/lit/reader.py | 36 +++++++++++--------------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 1ac68f3866..f32a65e010 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -129,8 +129,6 @@ class UnBinary(object): self.tag_map, self.attr_map, self.tag_to_attr_map = map self.is_html = map is HTML_MAP self.tag_atoms, self.attr_atoms = atoms - self.opf = map is OPF_MAP - self.bin = bin self.dir = os.path.dirname(path) buf = StringIO() self.binary_to_text(bin, buf) @@ -210,7 +208,8 @@ class UnBinary(object): continue if flags & FLAG_ATOM: if not self.tag_atoms or tag not in self.tag_atoms: - raise LitError("atom tag %d not in atom tag list" % tag) + raise LitError( + "atom tag %d not in atom tag list" % tag) tag_name = self.tag_atoms[tag] current_map = self.attr_atoms elif tag < len(self.tag_map): @@ -295,7 +294,7 @@ class UnBinary(object): c = '"' elif c == '<': c = '<' - self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(c.encode('ascii', 'xmlcharrefreplace')) count -= 1 if count == 0: if not 
in_censorship: @@ -841,24 +840,7 @@ class LitFile(object): if len(attrs) != nentries: self._warn("damaged or invalid atoms attributes table") return (tags, attrs) - - def get_entry_content(self, entry, pretty_print=False): - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - path = entry.path - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - atoms = self.get_atoms(entry) - content = decl + unicode(UnBinary(raw, path, self.manifest, map, atoms)) - if pretty_print: - content = self._pretty_print(content) - content = content.encode('utf-8') - else: - internal = '/'.join(('/data', entry.internal)) - content = self._litfile.get_file(internal) - return content - + class LitContainer(object): """Simple Container-interface, read-only accessor for LIT files.""" @@ -879,9 +861,15 @@ class LitContainer(object): elif 'spine' in entry.state: internal = '/'.join(('/data', entry.internal, 'content')) raw = self._litfile.get_file(internal) - unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + manifest = self._litfile.manifest + atoms = self._litfile.get_atoms(entry) + unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms) content = HTML_DECL + str(unbin) - + else: + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) + return content + def _read_meta(self): path = 'content.opf' raw = self._litfile.get_file('/meta') From 4e128c10736b2695976812b6d4ca893f152a995e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 8 Mar 2009 13:45:54 -0700 Subject: [PATCH 028/319] Commit so I can pull from trunk --- src/calibre/__init__.py | 19 --- src/calibre/customize/builtins.py | 3 +- src/calibre/customize/conversion.py | 47 +++++++- src/calibre/customize/profiles.py | 36 +++++- src/calibre/customize/ui.py | 24 +++- src/calibre/ebooks/conversion/cli.py | 146 +++++++++++++++++++++++ src/calibre/ebooks/conversion/plumber.py | 73 +++++++++++- 
src/calibre/ebooks/html.py | 7 +- src/calibre/ebooks/mobi/input.py | 7 +- src/calibre/ebooks/oeb/output.py | 17 +++ src/calibre/linux.py | 1 + src/calibre/utils/logging.py | 28 +++-- 12 files changed, 356 insertions(+), 52 deletions(-) create mode 100644 src/calibre/ebooks/conversion/cli.py create mode 100644 src/calibre/ebooks/oeb/output.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index de133ddb57..030aab8317 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -7,7 +7,6 @@ import sys, os, re, logging, time, subprocess, atexit, mimetypes, \ __builtin__.__dict__['dynamic_property'] = lambda(func): func(None) from htmlentitydefs import name2codepoint from math import floor -from logging import Formatter from PyQt4.QtCore import QUrl from PyQt4.QtGui import QDesktopServices @@ -318,24 +317,6 @@ def english_sort(x, y): ''' return cmp(_spat.sub('', x), _spat.sub('', y)) -class ColoredFormatter(Formatter): - - def format(self, record): - ln = record.__dict__['levelname'] - col = '' - if ln == 'CRITICAL': - col = terminal_controller.YELLOW - elif ln == 'ERROR': - col = terminal_controller.RED - elif ln in ['WARN', 'WARNING']: - col = terminal_controller.BLUE - elif ln == 'INFO': - col = terminal_controller.GREEN - elif ln == 'DEBUG': - col = terminal_controller.CYAN - record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL - return Formatter.format(self, record) - def walk(dir): ''' A nice interface to os.walk ''' for record in os.walk(dir): diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index fafe8e5afa..ca21bbb215 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -244,9 +244,10 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.oeb.output import OEBOutput from calibre.customize.profiles import 
input_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index aa7b0c1dea..f20cc4ae85 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -37,19 +37,24 @@ class ConversionOption(object): if not self.help: raise ValueError('You must set the help text') + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return hash(self) == hash(other) class OptionRecommendation(object): LOW = 1 MED = 2 HIGH = 3 - def __init__(self, recommeded_value, level=LOW, **kwargs): + def __init__(self, recommended_value=None, level=LOW, **kwargs): ''' An option recommendation. That is, an option as well as its recommended value and the level of the recommendation. 
''' self.level = level - self.recommended_value = recommeded_value + self.recommended_value = recommended_value self.option = kwargs.pop('option', None) if self.option is None: self.option = ConversionOption(**kwargs) @@ -59,10 +64,12 @@ class OptionRecommendation(object): def validate_parameters(self): if self.option.choices and self.recommended_value not in \ self.option.choices: - raise ValueError('Recommended value not in choices') + raise ValueError('OpRec: %s: Recommended value not in choices'% + self.option.name) if not (isinstance(self.recommended_value, (int, float, str, unicode))\ - or self.default is None): - raise ValueError(unicode(self.default) + + or self.recommended_value is None): + raise ValueError('OpRec: %s:'%self.option.name + + repr(self.recommended_value) + ' is not a string or a number') @@ -186,4 +193,34 @@ class InputFormatPlugin(Plugin): return ret + + +class OutputFormatPlugin(Plugin): + ''' + OutputFormatPlugins are responsible for converting an OEB document + (OPF+HTML) into an output ebook. + + The OEB document can be assumed to be encoded in UTF-8. + The main action happens in :method:`convert`. + ''' + + type = _('Conversion Output') + can_be_disabled = False + supported_platforms = ['windows', 'osx', 'linux'] + + #: The file type (extension without leading period) that this + #: plugin outputs + file_type = None + + #: Options shared by all Input format plugins. Do not override + #: in sub-classes. Use :member:`options` instead. Every option must be an + #: instance of :class:`OptionRecommendation`. + common_options = set([]) + + #: Options to customize the behavior of this plugin. Every option must be an + #: instance of :class:`OptionRecommendation`. 
+ options = set([]) + + def convert(self, oeb_book, input_plugin, options, parse_cache, log): + raise NotImplementedError diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 002f56879f..a3a7e22298 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import sys, re from calibre.customize import Plugin class InputProfile(Plugin): @@ -16,12 +17,43 @@ class InputProfile(Plugin): # inherit from this profile and override as needed name = 'Default Input Profile' - short_name = 'default' # Used in the CLI so dont spaces etc. in it + short_name = 'default' # Used in the CLI so dont use spaces etc. in it description = _('This profile tries to provide sane defaults and is useful ' 'if you know nothing about the input document.') input_profiles = [InputProfile] - +class OutputProfile(Plugin): + author = 'Kovid Goyal' + supported_platforms = set(['windows', 'osx', 'linux']) + can_be_disabled = False + type = _('Output profile') + + name = 'Default Output Profile' + short_name = 'default' # Used in the CLI so dont use spaces etc. in it + description = _('This profile tries to provide sane defaults and is useful ' + 'if you want to produce a document intended to be read at a ' + 'computer or on a range of devices.') + + epub_flow_size = sys.maxint + screen_size = None + remove_special_chars = False + remove_object_tags = False + +class SonyReader(OutputProfile): + + name = 'Sony Reader' + short_name = 'sony' + description = _('This profile is intended for the SONY PRS line. 
' + 'The 500/505/700 etc.') + + epub_flow_size = 270000 + screen_size = (590, 765) + remove_special_chars = re.compile(u'[\u200b\u00ad]') + remove_object_tags = True + + + +output_profiles = [OutputProfile, SonyReader] \ No newline at end of file diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 1cdafae4f0..d8b7ebf6d8 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -6,8 +6,8 @@ import os, shutil, traceback, functools, sys from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \ MetadataWriterPlugin -from calibre.customize.conversion import InputFormatPlugin -from calibre.customize.profiles import InputProfile +from calibre.customize.conversion import InputFormatPlugin, OutputFormatPlugin +from calibre.customize.profiles import InputProfile, OutputProfile from calibre.customize.builtins import plugins as builtin_plugins from calibre.constants import __version__, iswindows, isosx from calibre.ebooks.metadata import MetaInformation @@ -76,6 +76,12 @@ def input_profiles(): if isinstance(plugin, InputProfile): yield plugin +def output_profiles(): + for plugin in _initialized_plugins: + if isinstance(plugin, OutputProfile): + yield plugin + + def reread_filetype_plugins(): global _on_import global _on_preprocess @@ -245,9 +251,19 @@ def input_format_plugins(): def plugin_for_input_format(fmt): for plugin in input_format_plugins(): - if fmt in plugin.file_types: + if fmt.lower() in plugin.file_types: return plugin - + +def output_format_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, OutputFormatPlugin): + yield plugin + +def plugin_for_output_format(fmt): + for plugin in output_format_plugins(): + if fmt.lower() == plugin.file_type: + return plugin + def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py new file mode 100644 index 
0000000000..174fa87a5d --- /dev/null +++ b/src/calibre/ebooks/conversion/cli.py @@ -0,0 +1,146 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +''' +Command line interface to conversion sub-system +''' + +USAGE = '%prog ' + _('''\ +input_file output_file [options] + +Convert an ebook from one format to another. + +input_file is the input and output_file is the output. Both must be +specified as the first two arguments to the command. + +The output ebook format is guessed from the file extension of +output_file. output_file can also be of the special format .EXT where +EXT is the output file extension. In this case, the name of the output +file is derived the name of the input file. Note that the filenames must +not start with a hyphen. Finally, if output_file has no extension, then +it is treated as a directory and an "open ebook" (OEB) consisting of HTML files +is written to that directory. These files are the files that would normally +have been passed to the output plugin. + + +After specifying the input +and output file you can customize the conversion by specifying various +options, listed below. 
+ +For full documentation of the conversion system see + +''') + 'http://calibre.kovidgoyal.net/user_manual/conversion.html' + +import sys, os + +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def check_command_line_options(parser, args, log): + if len(args) < 3 or args[1].startswith('-') or args[2].startswith('-'): + print_help(parser) + log.error('\n\nYou must specify the input AND output files') + raise SystemExit(1) + + input = os.path.abspath(args[1]) + if not os.access(input, os.R_OK): + log.error('Cannot read from', input) + raise SystemExit(1) + + output = args[2] + if output.startswith('.'): + output = os.path.splitext(os.path.basename(input))[0]+output + output = os.path.abspath(output) + + if '.' in output: + if os.path.exists(output): + log.warn('WARNING:', output, 'exists. Deleting.') + os.remove(output) + + return input, output + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = [opt.short_switch] if opt.short_switch else [] + switches.append(opt.long_switch) + add_option(opt.name, switches=switches, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + +def add_input_output_options(parser, plumber): + input_options, output_options = \ + plumber.input_options, plumber.output_options + + def add_options(group, options): + for opt in options: + option_recommendation_to_cli_option(group, opt) + + if input_options: + io = parser.add_group(plumber.input_fmt.upper() + ' ' + _('OPTIONS')) + add_options(io, input_options) + + if output_options: + oo = parser.add_group(plumber.output_fmt.upper() + ' ' + _('OPTIONS')) + add_options(oo, output_options) + +def add_pipeline_options(parser, plumber): + groups = { + '' : ('', + [ + 'input_profile', + 'output_profile', + ] + ), + + 'DEBUG': (_('Options to help 
with debugging the conversion'), + [ + 'verbose', + ]), + + + } + + + for group, spec in groups.items(): + desc, options = spec + if group: + group = parser.add_option_group(group, desc) + add_option = group if group != '' else parser.add_option + + for name in options: + rec = plumber.get_option_by_name(name) + if rec.level < rec.HIGH: + option_recommendation_to_cli_option(add_option, rec) + + + + +def main(args=sys.argv): + log = Log() + parser = OptionParser(usage=USAGE) + fargs = parser.parse_args(args)[1] + + input, output = check_command_line_options(parser, fargs, log) + + from calibre.ebooks.conversion.plumber import Plumber + + plumber = Plumber(input, output, log) + add_input_output_options(parser, plumber) + add_pipeline_options(parser, plumber) + + opts = parser.parse_args(args)[0] + recommendations = [(n.dest, getattr(opts, n.dest)) \ + for n in parser.options_iter()] + + plumber.merge_ui_recommendations(recommendations) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index ac7490bd39..742653251d 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -3,11 +3,15 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import os from calibre.customize.conversion import OptionRecommendation -from calibre.customize.ui import input_profiles +from calibre.customize.ui import input_profiles, output_profiles, \ + plugin_for_input_format, plugin_for_output_format -pipeline_options = [ +class Plumber(object): + + pipeline_options = [ OptionRecommendation(name='verbose', recommended_value=0, level=OptionRecommendation.LOW, @@ -16,7 +20,6 @@ OptionRecommendation(name='verbose', 'verbosity.') ), - OptionRecommendation(name='input_profile', recommended_value='default', level=OptionRecommendation.LOW, choices=[x.short_name for x in 
input_profiles()], @@ -27,4 +30,66 @@ OptionRecommendation(name='input_profile', 'pixels).') ), -] \ No newline at end of file +OptionRecommendation(name='output_profile', + recommended_value='default', level=OptionRecommendation.LOW, + choices=[x.short_name for x in output_profiles()], + help=_('Specify the output profile. The output profile ' + 'tells the conversion system how to optimize the ' + 'created document for the specified device. In some cases, ' + 'an output profile is required to produce documents that ' + 'will work on a device. For example EPUB on the SONY reader.' + ) + ), + +] + + def __init__(self, input, output, log): + self.input = input + self.output = output + self.log = log + + input_fmt = os.path.splitext(input)[1] + if not input_fmt: + raise ValueError('Input file must have and extension') + input_fmt = input_fmt[1:].lower() + + output_fmt = os.path.splitext(input)[1] + if not output_fmt: + output_fmt = '.oeb' + output_fmt = output_fmt[1:].lower() + + self.input_plugin = plugin_for_input_format(input_fmt) + self.output_plugin = plugin_for_output_format(output_fmt) + + if self.input_plugin is None: + raise ValueError('No plugin to handle input format: '+input_fmt) + + if self.output_plugin is None: + raise ValueError('No plugin to handle output format: '+output_fmt) + + self.input_fmt = input_fmt + self.output_fmt = output_fmt + + self.input_options = self.input_plugin.options.union( + self.input_plugin.common_options) + self.output_options = self.output_plugin.options.union( + self.output_plugin.common_options) + + self.merge_plugin_recommendations() + + def get_option_by_name(self, name): + for group in (self.input_options, self.pipeline_options, + self.output_options): + for rec in group: + if rec.option == name: + return rec + + def merge_plugin_recommendations(self): + pass + + def merge_ui_recommendations(self, recommendations): + pass + + + + \ No newline at end of file diff --git a/src/calibre/ebooks/html.py 
b/src/calibre/ebooks/html.py index 710b544007..191d552709 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -19,11 +19,10 @@ from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \ from lxml.etree import XPath get_text = XPath("//text()") -from calibre import LoggingInterface, unicode_path, entity_to_unicode +from calibre import unicode_path, entity_to_unicode from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS from calibre.utils.config import Config, StringConfig from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.opf2 import OPF, OPFCreator from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.zipfile import ZipFile @@ -401,7 +400,7 @@ class PreProcessor(object): html = rule[0].sub(rule[1], html) return html -class Parser(PreProcessor, LoggingInterface): +class Parser(PreProcessor): # SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont' # SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in # [ @@ -412,7 +411,6 @@ class Parser(PreProcessor, LoggingInterface): # ] def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): - LoggingInterface.__init__(self, logging.getLogger(name)) self.setup_cli_handler(opts.verbose) self.htmlfile = htmlfile self.opts = opts @@ -1038,6 +1036,7 @@ def merge_metadata(htmlfile, opf, opts): if opf: mi = MetaInformation(opf) elif htmlfile: + from calibre.ebooks.metadata.meta import get_metadata try: mi = get_metadata(open(htmlfile, 'rb'), 'html') except: diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 1ce9950677..fa56b5c6b4 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -3,8 +3,6 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os - from 
calibre.customize.conversion import InputFormatPlugin class MOBIInput(InputFormatPlugin): @@ -18,12 +16,11 @@ class MOBIInput(InputFormatPlugin): from calibre.ebooks.mobi.reader import MobiReader mr = MobiReader(stream, log, options.input_encoding, options.debug_input) - mr.extract_content(output_dir=os.getcwdu(), parse_cache) + mr.extract_content('.', parse_cache) raw = parse_cache.get('calibre_raw_mobi_markup', False) if raw: if isinstance(raw, unicode): raw = raw.encode('utf-8') open('debug-raw.html', 'wb').write(raw) - return mr.created_opf_path - + return mr.created_opf_path \ No newline at end of file diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py new file mode 100644 index 0000000000..0a74f488cf --- /dev/null +++ b/src/calibre/ebooks/oeb/output.py @@ -0,0 +1,17 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.customize.conversion import OutputFormatPlugin + +class OEBOutput(OutputFormatPlugin): + + name = 'OEB Output' + author = 'Kovid Goyal' + file_type = 'oeb' + + + def convert(self, oeb_book, input_plugin, options, parse_cache, log): + pass + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 427b41ca5f..ae6cb10818 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -18,6 +18,7 @@ entry_points = { 'console_scripts': [ \ 'ebook-device = calibre.devices.prs500.cli.main:main', 'ebook-meta = calibre.ebooks.metadata.cli:main', + 'ebook-convert = calibre.ebooks.convert.cli:main', 'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main', 'html2lrf = calibre.ebooks.lrf.html.convert_from:main', 'html2oeb = calibre.ebooks.html:main', diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py index ae2e1a792b..d5a55ac48b 100644 --- a/src/calibre/utils/logging.py +++ b/src/calibre/utils/logging.py @@ -13,13 +13,25 @@ ERROR = 3 import sys, traceback from functools import partial -from calibre 
import prints -from calibre.utils.terminfo import TerminalController -class ANSIStream: + + +class Stream(object): + + def __init__(self, stream): + from calibre import prints + self._prints = prints + self.stream = stream + + def flush(self): + self.stream.flush() + + +class ANSIStream(Stream): def __init__(self, stream=sys.stdout): - self.stream = stream + Stream.__init__(self, stream) + from calibre.utils.terminfo import TerminalController tc = TerminalController(stream) self.color = { DEBUG: tc.GREEN, @@ -32,16 +44,16 @@ class ANSIStream: def prints(self, level, *args, **kwargs): self.stream.write(self.color[level]) kwargs['file'] = self.stream - prints(*args, **kwargs) + self._prints(*args, **kwargs) self.stream.write(self.normal) def flush(self): self.stream.flush() -class HTMLStream: +class HTMLStream(Stream): def __init__(self, stream=sys.stdout): - self.stream = stream + Stream.__init__(self, stream) self.color = { DEBUG: '', INFO:'', @@ -53,7 +65,7 @@ class HTMLStream: def prints(self, level, *args, **kwargs): self.stream.write(self.color[level]) kwargs['file'] = self.stream - prints(*args, **kwargs) + self._prints(*args, **kwargs) self.stream.write(self.normal) def flush(self): From 3ea639199653b6552bd019f0b15aa64686088d15 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 8 Mar 2009 14:49:43 -0700 Subject: [PATCH 029/319] pluginize installs again. 
Also working framework for ebook-convert --- src/calibre/customize/builtins.py | 4 +- src/calibre/customize/conversion.py | 2 +- src/calibre/ebooks/conversion/cli.py | 69 ++++++++++++--------- src/calibre/ebooks/conversion/plumber.py | 4 +- src/calibre/ebooks/epub/from_any.py | 2 +- src/calibre/ebooks/epub/pages.py | 2 +- src/calibre/ebooks/epub/split.py | 5 +- src/calibre/ebooks/lrf/html/convert_from.py | 5 +- src/calibre/ebooks/mobi/writer.py | 2 +- src/calibre/ebooks/oeb/base.py | 9 --- src/calibre/library/database2.py | 2 +- src/calibre/linux.py | 48 ++------------ src/calibre/web/feeds/news.py | 5 +- src/calibre/web/fetch/simple.py | 5 +- 14 files changed, 60 insertions(+), 104 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ca21bbb215..b6a6141612 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -245,11 +245,11 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.oeb.output import OEBOutput -from calibre.customize.profiles import input_profiles +from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataWriter')] -plugins += input_profiles \ No newline at end of file +plugins += input_profiles + output_profiles \ No newline at end of file diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index f20cc4ae85..10e5a44ddd 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -24,7 +24,7 @@ class ConversionOption(object): self.choices = choices if self.long_switch is None: - self.long_switch = 
'--'+self.name.replace('_', '-') + self.long_switch = self.name.replace('_', '-') self.validate_parameters() diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 174fa87a5d..83bcb453e9 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -12,28 +12,29 @@ input_file output_file [options] Convert an ebook from one format to another. -input_file is the input and output_file is the output. Both must be +input_file is the input and output_file is the output. Both must be \ specified as the first two arguments to the command. -The output ebook format is guessed from the file extension of -output_file. output_file can also be of the special format .EXT where -EXT is the output file extension. In this case, the name of the output -file is derived the name of the input file. Note that the filenames must -not start with a hyphen. Finally, if output_file has no extension, then -it is treated as a directory and an "open ebook" (OEB) consisting of HTML files -is written to that directory. These files are the files that would normally -have been passed to the output plugin. +The output ebook format is guessed from the file extension of \ +output_file. output_file can also be of the special format .EXT where \ +EXT is the output file extension. In this case, the name of the output \ +file is derived the name of the input file. Note that the filenames must \ +not start with a hyphen. Finally, if output_file has no extension, then \ +it is treated as a directory and an "open ebook" (OEB) consisting of HTML \ +files is written to that directory. These files are the files that would \ +normally have been passed to the output plugin. - -After specifying the input -and output file you can customize the conversion by specifying various -options, listed below. +After specifying the input \ +and output file you can customize the conversion by specifying various \ +options. 
the available options depend on the input and output file types. \ +To get help on them specify the input and output file and then use the -h \ +option. For full documentation of the conversion system see - ''') + 'http://calibre.kovidgoyal.net/user_manual/conversion.html' import sys, os +from optparse import OptionGroup, Option from calibre.utils.config import OptionParser from calibre.utils.logging import Log @@ -68,10 +69,11 @@ def check_command_line_options(parser, args, log): def option_recommendation_to_cli_option(add_option, rec): opt = rec.option - switches = [opt.short_switch] if opt.short_switch else [] - switches.append(opt.long_switch) - add_option(opt.name, switches=switches, help=opt.help, + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) def add_input_output_options(parser, plumber): input_options, output_options = \ @@ -82,12 +84,18 @@ def add_input_output_options(parser, plumber): option_recommendation_to_cli_option(group, opt) if input_options: - io = parser.add_group(plumber.input_fmt.upper() + ' ' + _('OPTIONS')) - add_options(io, input_options) + title = plumber.input_fmt.upper() + ' ' + _('OPTIONS') + io = OptionGroup(parser, title, _('Options to control the processing' + ' of the input file')) + add_options(io.add_option, input_options) + parser.add_option_group(io) if output_options: - oo = parser.add_group(plumber.output_fmt.upper() + ' ' + _('OPTIONS')) - add_options(oo, output_options) + title = plumber.output_fmt.upper() + ' ' + _('OPTIONS') + oo = OptionGroup(parser, title, _('Options to control the processing' + ' of the output file')) + add_options(oo.add_option, output_options) + parser.add_option_group(oo) def add_pipeline_options(parser, plumber): groups = { @@ -106,27 +114,28 @@ def add_pipeline_options(parser, plumber): } + group_order = 
['', 'DEBUG'] - for group, spec in groups.items(): - desc, options = spec + for group in group_order: + desc, options = groups[group] if group: - group = parser.add_option_group(group, desc) - add_option = group if group != '' else parser.add_option + group = OptionGroup(parser, group, desc) + parser.add_option_group(group) + add_option = group.add_option if group != '' else parser.add_option for name in options: rec = plumber.get_option_by_name(name) if rec.level < rec.HIGH: option_recommendation_to_cli_option(add_option, rec) - - - def main(args=sys.argv): log = Log() parser = OptionParser(usage=USAGE) - fargs = parser.parse_args(args)[1] + if len(args) < 3: + print_help(parser, log) + return 1 - input, output = check_command_line_options(parser, fargs, log) + input, output = check_command_line_options(parser, args, log) from calibre.ebooks.conversion.plumber import Plumber diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 742653251d..bd4d365af8 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -50,10 +50,10 @@ OptionRecommendation(name='output_profile', input_fmt = os.path.splitext(input)[1] if not input_fmt: - raise ValueError('Input file must have and extension') + raise ValueError('Input file must have an extension') input_fmt = input_fmt[1:].lower() - output_fmt = os.path.splitext(input)[1] + output_fmt = os.path.splitext(output)[1] if not output_fmt: output_fmt = '.oeb' output_fmt = output_fmt[1:].lower() diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index 9a8e251108..b3e5281525 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -12,7 +12,7 @@ from contextlib import nested from calibre import extract, walk from calibre.ebooks import DRMError -from calibre.ebooks.epub import config as common_config, process_encryption +from calibre.ebooks.epub import config as 
common_config from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation diff --git a/src/calibre/ebooks/epub/pages.py b/src/calibre/ebooks/epub/pages.py index 1ab5edde86..4737107a6c 100644 --- a/src/calibre/ebooks/epub/pages.py +++ b/src/calibre/ebooks/epub/pages.py @@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en' import os, re from itertools import count, chain from calibre.ebooks.oeb.base import XHTML, XHTML_NS -from calibre.ebooks.oeb.base import OEBBook, DirWriter +from calibre.ebooks.oeb.base import OEBBook from lxml import etree, html from lxml.etree import XPath diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index 9814c40df5..c3099c1682 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -15,7 +15,7 @@ from lxml.cssselect import CSSSelector from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.epub import tostring, rules -from calibre import CurrentDir, LoggingInterface +from calibre import CurrentDir XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'}) content = functools.partial(os.path.join, 'content') @@ -32,10 +32,9 @@ class SplitError(ValueError): -class Splitter(LoggingInterface): +class Splitter(object): def __init__(self, path, opts, stylesheet_map, opf): - LoggingInterface.__init__(self, logging.getLogger('htmlsplit')) self.setup_cli_handler(opts.verbose) self.path = path self.always_remove = not opts.preserve_tag_structure or \ diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index 2bd63d1d8f..056666b301 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -31,7 +31,7 @@ from calibre.ebooks.lrf import option_parser as lrf_option_parser from calibre.ebooks import ConversionError from 
calibre.ebooks.lrf.html.table import Table from calibre import filename_to_utf8, setup_cli_handlers, __appname__, \ - fit_image, LoggingInterface, preferred_encoding + fit_image, preferred_encoding from calibre.ptempfile import PersistentTemporaryFile from calibre.devices.interface import Device from calibre.ebooks.lrf.html.color_map import lrs_color @@ -78,7 +78,7 @@ def tag_regex(tagname): return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname), \ close=r''%dict(t=tagname)) -class HTMLConverter(object, LoggingInterface): +class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) @@ -209,7 +209,6 @@ class HTMLConverter(object, LoggingInterface): ''' # Defaults for various formatting tags object.__setattr__(self, 'options', options) - LoggingInterface.__init__(self, logger) self.fonts = fonts #: dict specifying font families to use # Memory self.scaled_images = {} #: Temporary files with scaled version of images diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index fdabfaa618..86224488c0 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -24,7 +24,7 @@ from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ OEB_RASTER_IMAGES from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname from calibre.ebooks.oeb.base import urlnormalize -from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 2e160d1571..f7c472320e 100644 --- 
a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -15,7 +15,6 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote from lxml import etree, html import calibre -from calibre import LoggingInterface from calibre.translations.dynamic import translate from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS @@ -212,14 +211,6 @@ class FauxLogger(object): def __call__(self, message): print message -class Logger(LoggingInterface, object): - """A logging object which provides both the standard `logging.Logger` and - calibre-specific interfaces. - """ - def __getattr__(self, name): - return object.__getattribute__(self, 'log_' + name) - - class NullContainer(object): """An empty container. diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index f8b63f1124..cb823e6c73 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -15,7 +15,7 @@ from PyQt4.QtCore import QCoreApplication, QThread, QReadWriteLock from PyQt4.QtGui import QApplication, QImage __app = None -from calibre.library import title_sort +from calibre.ebooks.metadata import title_sort from calibre.library.database import LibraryDatabase from calibre.library.sqlite import connect, IntegrityError from calibre.utils.search_query_parser import SearchQueryParser diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 369dfa3d2c..e08222ed3a 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -1,9 +1,8 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' Post installation script for linux ''' -import sys, os, re, shutil +import sys, os, shutil from subprocess import check_call, call -from tempfile import NamedTemporaryFile from calibre import __version__, __appname__ from calibre.devices import devices @@ -18,16 +17,8 @@ entry_points = { 'console_scripts': [ \ 'ebook-device = calibre.devices.prs500.cli.main:main', 
'ebook-meta = calibre.ebooks.metadata.cli:main', - 'ebook-convert = calibre.ebooks.convert.cli:main', - 'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main', - 'html2lrf = calibre.ebooks.lrf.html.convert_from:main', - 'html2oeb = calibre.ebooks.html:main', - 'html2epub = calibre.ebooks.epub.from_html:main', - 'odt2oeb = calibre.ebooks.odt.to_oeb:main', + 'ebook-convert = calibre.ebooks.conversion.cli:main', 'markdown-calibre = calibre.ebooks.markdown.markdown:main', - 'lit2lrf = calibre.ebooks.lrf.lit.convert_from:main', - 'epub2lrf = calibre.ebooks.lrf.epub.convert_from:main', - 'rtf2lrf = calibre.ebooks.lrf.rtf.convert_from:main', 'web2disk = calibre.web.fetch.simple:main', 'feeds2disk = calibre.web.feeds.main:main', 'calibre-server = calibre.library.server:main', @@ -35,22 +26,10 @@ entry_points = { 'feeds2epub = calibre.ebooks.epub.from_feeds:main', 'feeds2mobi = calibre.ebooks.mobi.from_feeds:main', 'web2lrf = calibre.ebooks.lrf.web.convert_from:main', - 'pdf2lrf = calibre.ebooks.lrf.pdf.convert_from:main', - 'mobi2lrf = calibre.ebooks.lrf.mobi.convert_from:main', - 'fb22lrf = calibre.ebooks.lrf.fb2.convert_from:main', - 'any2lrf = calibre.ebooks.lrf.any.convert_from:main', - 'any2epub = calibre.ebooks.epub.from_any:main', - 'any2lit = calibre.ebooks.lit.from_any:main', - 'any2mobi = calibre.ebooks.mobi.from_any:main', 'lrf2lrs = calibre.ebooks.lrf.lrfparser:main', 'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main', - 'pdfreflow = calibre.ebooks.lrf.pdf.reflow:main', 'isbndb = calibre.ebooks.metadata.isbndb:main', 'librarything = calibre.ebooks.metadata.library_thing:main', - 'mobi2oeb = calibre.ebooks.mobi.reader:main', - 'oeb2mobi = calibre.ebooks.mobi.writer:main', - 'lit2oeb = calibre.ebooks.lit.reader:main', - 'oeb2lit = calibre.ebooks.lit.writer:main', 'comic2lrf = calibre.ebooks.lrf.comic.convert_from:main', 'comic2epub = calibre.ebooks.epub.from_comic:main', 'comic2mobi = calibre.ebooks.mobi.from_comic:main', @@ -61,7 +40,6 @@ entry_points = { 
'calibre-parallel = calibre.parallel:main', 'calibre-customize = calibre.customize.ui:main', 'pdftrim = calibre.ebooks.pdf.pdftrim:main' , - 'any2pdf = calibre.ebooks.pdf.from_any:main', ], 'gui_scripts' : [ __appname__+' = calibre.gui2.main:main', @@ -172,25 +150,16 @@ def setup_completion(fatal_errors): from calibre.ebooks.lrf.lrfparser import option_parser as lrf2lrsop from calibre.gui2.lrf_renderer.main import option_parser as lrfviewerop from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop - from calibre.ebooks.mobi.reader import option_parser as mobioeb - from calibre.ebooks.lit.reader import option_parser as lit2oeb from calibre.web.feeds.main import option_parser as feeds2disk from calibre.web.feeds.recipes import titles as feed_titles from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop - from calibre.ebooks.epub.from_html import option_parser as html2epub - from calibre.ebooks.html import option_parser as html2oeb - from calibre.ebooks.odt.to_oeb import option_parser as odt2oeb from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub from calibre.ebooks.mobi.from_feeds import option_parser as feeds2mobi - from calibre.ebooks.epub.from_any import option_parser as any2epub - from calibre.ebooks.lit.from_any import option_parser as any2lit from calibre.ebooks.epub.from_comic import option_parser as comic2epub - from calibre.ebooks.mobi.from_any import option_parser as any2mobi - from calibre.ebooks.mobi.writer import option_parser as oeb2mobi - from calibre.gui2.main import option_parser as guiop + from calibre.gui2.main import option_parser as guiop any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', - 'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt'] + 'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt'] f = open_file('/etc/bash_completion.d/libprs500') f.close() os.remove(f.name) @@ -210,16 +179,10 @@ 
def setup_completion(fatal_errors): f.write(opts_and_exts('pdf2lrf', htmlop, ['pdf'])) f.write(opts_and_exts('any2lrf', htmlop, any_formats)) f.write(opts_and_exts('calibre', guiop, any_formats)) - f.write(opts_and_exts('any2epub', any2epub, any_formats)) - f.write(opts_and_exts('any2lit', any2lit, any_formats)) - f.write(opts_and_exts('any2mobi', any2mobi, any_formats)) - f.write(opts_and_exts('oeb2mobi', oeb2mobi, ['opf'])) f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf'])) f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes()))) f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf'])) f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf'])) - f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc'])) - f.write(opts_and_exts('lit2oeb', lit2oeb, ['lit'])) f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr'])) f.write(opts_and_exts('comic2epub', comic2epub, ['cbz', 'cbr'])) f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr'])) @@ -228,9 +191,6 @@ def setup_completion(fatal_errors): f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles)) f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles)) f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles)) - f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml', 'opf'])) - f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml'])) - f.write(opts_and_exts('odt2oeb', odt2oeb, ['odt'])) f.write(''' _prs500_ls() { diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 4773d551c3..7d61cead5b 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -17,7 +17,7 @@ from PyQt4.Qt import QApplication, QFile, Qt, QPalette, QSize, QImage, QPainter, from PyQt4.QtWebKit import QWebPage -from calibre import browser, __appname__, iswindows, LoggingInterface, \ +from calibre import browser, __appname__, iswindows, \ strftime, __version__, preferred_encoding from calibre.ebooks.BeautifulSoup import 
BeautifulSoup, NavigableString, CData, Tag from calibre.ebooks.metadata.opf2 import OPFCreator @@ -32,7 +32,7 @@ from calibre.ptempfile import PersistentTemporaryFile from calibre.gui2 import images_rc # Needed for default cover -class BasicNewsRecipe(object, LoggingInterface): +class BasicNewsRecipe(object): ''' Abstract base class that contains logic needed in all feed fetchers. ''' @@ -444,7 +444,6 @@ class BasicNewsRecipe(object, LoggingInterface): :param parser: Command line option parser. Used to intelligently merge options. :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. ''' - LoggingInterface.__init__(self, logging.getLogger('feeds2disk')) if not isinstance(self.title, unicode): self.title = unicode(self.title, 'utf-8', 'replace') diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 4da3f4019c..51a4554a50 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -15,7 +15,7 @@ from PIL import Image from cStringIO import StringIO from calibre import setup_cli_handlers, browser, sanitize_file_name, \ - relpath, LoggingInterface, unicode_path + relpath, unicode_path from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.config import OptionParser @@ -80,7 +80,7 @@ class DummyLock(object): def __enter__(self, *args): return self def __exit__(self, *args): pass -class RecursiveFetcher(object, LoggingInterface): +class RecursiveFetcher(object): LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$')) #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in @@ -93,7 +93,6 @@ class RecursiveFetcher(object, LoggingInterface): DUMMY_LOCK = DummyLock() def __init__(self, options, logger, image_map={}, css_map={}, job_info=None): - LoggingInterface.__init__(self, 
logger) self.base_dir = os.path.abspath(os.path.expanduser(options.dir)) if not os.path.exists(self.base_dir): os.makedirs(self.base_dir) From 6bb46288283b6ec265a09fd474f09684a821a318 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 9 Mar 2009 13:13:51 -0700 Subject: [PATCH 030/319] Commit so I can pull from trunk --- src/calibre/ebooks/conversion/cli.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 83bcb453e9..9a320bc40f 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -127,10 +127,13 @@ def add_pipeline_options(parser, plumber): rec = plumber.get_option_by_name(name) if rec.level < rec.HIGH: option_recommendation_to_cli_option(add_option, rec) - + +def option_parser(): + return OptionParser(usage=USAGE) + def main(args=sys.argv): log = Log() - parser = OptionParser(usage=USAGE) + parser = option_parser() if len(args) < 3: print_help(parser, log) return 1 @@ -147,9 +150,9 @@ def main(args=sys.argv): recommendations = [(n.dest, getattr(opts, n.dest)) \ for n in parser.options_iter()] - plumber.merge_ui_recommendations(recommendations) + plumber.merge_ui_recommendations(recommendations) return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) From 5067a62e71161482ad755c3bd6007cf1a454be6a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 10 Mar 2009 19:30:05 -0700 Subject: [PATCH 031/319] IGN:... 
--- src/calibre/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 4751e82a20..942df667e9 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -88,6 +88,8 @@ def prints(*args, **kwargs): for i, arg in enumerate(args): if isinstance(arg, unicode): arg = arg.encode(preferred_encoding) + if not isinstance(arg, str): + arg = str(arg) file.write(arg) if i != len(args)-1: file.write(sep) From 741d6384097f8a2d6d47d53393110738db1ac963 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 10 Mar 2009 22:57:12 -0700 Subject: [PATCH 032/319] Conversion pipeline framework is finally taking shape --- src/calibre/customize/conversion.py | 21 ++- src/calibre/ebooks/conversion/cli.py | 23 ++- src/calibre/ebooks/conversion/plumber.py | 155 +++++++++++++++++- src/calibre/ebooks/epub/input.py | 3 +- src/calibre/ebooks/mobi/input.py | 8 +- src/calibre/ebooks/mobi/writer.py | 4 +- src/calibre/ebooks/oeb/base.py | 22 +-- src/calibre/ebooks/oeb/reader.py | 11 +- src/calibre/ebooks/oeb/transforms/flatcss.py | 4 - src/calibre/ebooks/oeb/transforms/htmltoc.py | 3 - .../ebooks/oeb/transforms/manglecase.py | 7 - .../ebooks/oeb/transforms/rasterize.py | 9 +- 12 files changed, 208 insertions(+), 62 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 10e5a44ddd..a77e32beee 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -117,7 +117,11 @@ class InputFormatPlugin(Plugin): #: instance of :class:`OptionRecommendation`. options = set([]) - def convert(self, stream, options, file_ext, parse_cache, log): + #: A set of 3-tuples of the form + #: (option_name, recommended_value, recommendation_level) + recommendations = set([]) + + def convert(self, stream, options, file_ext, parse_cache, log, accelerators): ''' This method must be implemented in sub-classes. It must return the path to the created OPF file. 
All output should be contained in @@ -153,10 +157,16 @@ class InputFormatPlugin(Plugin): :param log: A :class:`calibre.utils.logging.Log` object. All output should use this object. + + :param accelarators: A dictionary of various information that the input + plugin can get easily that would speed up the + subsequent stages of the conversion. + ''' raise NotImplementedError - def __call__(self, stream, options, file_ext, parse_cache, log, output_dir): + def __call__(self, stream, options, file_ext, parse_cache, log, + accelerators, output_dir): log('InputFormatPlugin: %s running'%self.name, end=' ') if hasattr(stream, 'name'): log('on', stream.name) @@ -166,7 +176,8 @@ class InputFormatPlugin(Plugin): shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) - ret = self.convert(stream, options, file_ext, parse_cache, log) + ret = self.convert(stream, options, file_ext, parse_cache, + log, accelerators) for key in list(parse_cache.keys()): if os.path.abspath(key) != key: log.warn(('InputFormatPlugin: %s returned a ' @@ -221,6 +232,10 @@ class OutputFormatPlugin(Plugin): #: instance of :class:`OptionRecommendation`. 
options = set([]) + #: A set of 3-tuples of the form + #: (option_name, recommended_value, recommendation_level) + recommendations = set([]) + def convert(self, oeb_book, input_plugin, options, parse_cache, log): raise NotImplementedError diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 9a320bc40f..f52264f8d0 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -39,6 +39,7 @@ from optparse import OptionGroup, Option from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation def print_help(parser, log): help = parser.format_help().encode(preferred_encoding, 'replace') @@ -84,16 +85,16 @@ def add_input_output_options(parser, plumber): option_recommendation_to_cli_option(group, opt) if input_options: - title = plumber.input_fmt.upper() + ' ' + _('OPTIONS') + title = _('INPUT OPTIONS') io = OptionGroup(parser, title, _('Options to control the processing' - ' of the input file')) + ' of the input %s file')%plumber.input_fmt) add_options(io.add_option, input_options) parser.add_option_group(io) if output_options: title = plumber.output_fmt.upper() + ' ' + _('OPTIONS') oo = OptionGroup(parser, title, _('Options to control the processing' - ' of the output file')) + ' of the output %s file')%plumber.input_fmt) add_options(oo.add_option, output_options) parser.add_option_group(oo) @@ -106,6 +107,9 @@ def add_pipeline_options(parser, plumber): ] ), + 'METADATA' : (_('Options to set metadata in the output'), + plumber.metadata_option_names, + ), 'DEBUG': (_('Options to help with debugging the conversion'), [ 'verbose', @@ -114,7 +118,7 @@ def add_pipeline_options(parser, plumber): } - group_order = ['', 'DEBUG'] + group_order = ['', 'METADATA', 'DEBUG'] for group in group_order: desc, options = groups[group] @@ -147,11 +151,16 @@ def main(args=sys.argv): 
add_pipeline_options(parser, plumber) opts = parser.parse_args(args)[0] - recommendations = [(n.dest, getattr(opts, n.dest)) \ - for n in parser.options_iter()] - + recommendations = [(n.dest, getattr(opts, n.dest), + OptionRecommendation.HIGH) \ + for n in parser.options_iter() + if n.dest] plumber.merge_ui_recommendations(recommendations) + plumber.run() + + log(_('Output saved to'), ' ', plumber.output) + return 0 if __name__ == '__main__': diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index bd4d365af8..75a6687c4e 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -9,9 +9,23 @@ from calibre.customize.conversion import OptionRecommendation from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format +class OptionValues(object): + pass + class Plumber(object): - pipeline_options = [ + metadata_option_names = [ + 'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments', + 'publisher', 'series', 'series_index', 'rating', 'isbn', + 'tags', 'book_producer', 'language' + ] + + def __init__(self, input, output, log): + self.input = input + self.output = output + self.log = log + + self.pipeline_options = [ OptionRecommendation(name='verbose', recommended_value=0, level=OptionRecommendation.LOW, @@ -40,13 +54,72 @@ OptionRecommendation(name='output_profile', 'will work on a device. For example EPUB on the SONY reader.' ) ), + +OptionRecommendation(name='read_metadata_from_opf', + recommended_value=None, level=OptionRecommendation.LOW, + short_switch='m', + help=_('Read metadata from the specified OPF file. 
Metadata read ' + 'from this file will override any metadata in the source ' + 'file.') + ), + +OptionRecommendation(name='title', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the title.')), +OptionRecommendation(name='authors', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the authors. Multiple authors should be separated ')), + +OptionRecommendation(name='title_sort', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('The version of the title to be used for sorting. ')), + +OptionRecommendation(name='author_sort', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('String to be used when sorting by author. ')), + +OptionRecommendation(name='cover', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the cover to the specified file.')), + +OptionRecommendation(name='comments', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the ebook description.')), + +OptionRecommendation(name='publisher', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the ebook publisher.')), + +OptionRecommendation(name='series', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the series this ebook belongs to.')), + +OptionRecommendation(name='series_index', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the index of the book in this series.')), + +OptionRecommendation(name='rating', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the rating. Should be a number between 1 and 5.')), + +OptionRecommendation(name='isbn', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the ISBN of the book.')), + +OptionRecommendation(name='tags', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the tags for the book. 
Should be a comma separated list.')), + +OptionRecommendation(name='book_producer', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the book producer.')), + +OptionRecommendation(name='language', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Set the language.')), ] - def __init__(self, input, output, log): - self.input = input - self.output = output - self.log = log input_fmt = os.path.splitext(input)[1] if not input_fmt: @@ -85,11 +158,79 @@ OptionRecommendation(name='output_profile', return rec def merge_plugin_recommendations(self): - pass + for source in (self.input_plugin, self.output_plugin): + for name, val, level in source.recommendations: + rec = self.get_option_by_name(name) + if rec is not None and rec.level <= level: + rec.recommended_value = val def merge_ui_recommendations(self, recommendations): - pass + for name, val, level in recommendations: + rec = self.get_option_by_name(name) + if rec is not None and rec.level <= level and rec.level < rec.HIGH: + rec.recommended_value = val + def read_user_metadata(self): + from calibre.ebooks.metadata import MetaInformation, string_to_authors + from calibre.ebooks.metadata.opf2 import OPF + mi = MetaInformation(None, []) + if self.opts.read_metadata_from_opf is not None: + self.opts.read_metadata_from_opf = os.path.abspath( + self.opts.read_metadata_from_opf) + opf = OPF(open(self.opts.read_metadata_from_opf, 'rb'), + os.path.dirname(self.opts.read_metadata_from_opf)) + mi = MetaInformation(opf) + for x in self.metadata_option_names: + val = getattr(self.opts, x, None) + if val is not None: + if x == 'authors': + val = string_to_authors(val) + elif x == 'tags': + val = [i.strip() for i in val.split(',')] + elif x in ('rating', 'series_index'): + val = float(val) + setattr(mi, x, val) + if mi.cover: + mi.cover_data = ('', open(mi.cover, 'rb').read()) + mi.cover = None + self.user_metadata = mi + + def setup_options(self): + self.opts = OptionValues() + for group in 
(self.input_options, self.pipeline_options, + self.output_options): + for rec in group: + setattr(self.opts, rec.option.name, rec.recommended_value) + + for x in input_profiles(): + if x.short_name == self.opts.input_profile: + self.opts.input_profile = x + break + + for x in output_profiles(): + if x.short_name == self.opts.output_profile: + self.opts.output_profile = x + break + + self.read_user_metadata() + + def run(self): + self.setup_options() + from calibre.customize.ui import run_plugins_on_preprocess + self.input = run_plugins_on_preprocess(self.input) + + from calibre.ebooks.oeb.reader import OEBReader + from calibre.ebooks.oeb.base import OEBBook + parse_cache, accelerators = {}, {} + + opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, + self.input_fmt, parse_cache, self.log, + accelerators) + + self.reader = OEBReader() + self.oeb = OEBBook(self.log, parse_cache=parse_cache) + self.reader(self.oeb, opfpath) + \ No newline at end of file diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 1b69424a9e..4c1cdbfcf5 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -51,7 +51,8 @@ class EPUBInput(InputFormatPlugin): traceback.print_exc() return False - def convert(self, stream, options, file_ext, parse_cache, log): + def convert(self, stream, options, file_ext, parse_cache, log, + accelerators): from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index fa56b5c6b4..b3400c54e1 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -12,7 +12,8 @@ class MOBIInput(InputFormatPlugin): description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML' file_types = set(['mobi', 'prc', 'azw']) - def convert(self, stream, options, file_ext, parse_cache, log): + def convert(self, stream, options, file_ext, parse_cache, log, + 
accelerators): from calibre.ebooks.mobi.reader import MobiReader mr = MobiReader(stream, log, options.input_encoding, options.debug_input) @@ -22,5 +23,8 @@ class MOBIInput(InputFormatPlugin): if isinstance(raw, unicode): raw = raw.encode('utf-8') open('debug-raw.html', 'wb').write(raw) - + for f, root in parse_cache.items(): + if '.' in f: + accelerators[f] = {'pagebreaks':root.xpath( + '//div[@class="mbp_pagebreak"]')} return mr.created_opf_path \ No newline at end of file diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 86224488c0..6ebeba3739 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -9,7 +9,6 @@ __copyright__ = '2008, Marshall T. Vandegrift ' import sys import os from struct import pack -import functools import time import random from cStringIO import StringIO @@ -18,11 +17,10 @@ from itertools import izip, count from collections import defaultdict from urlparse import urldefrag import logging -from lxml import etree from PIL import Image from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ OEB_RASTER_IMAGES -from calibre.ebooks.oeb.base import xpath, barename, namespace, prefixname +from calibre.ebooks.oeb.base import namespace, prefixname from calibre.ebooks.oeb.base import urlnormalize from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.profile import Context diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index f7c472320e..59ce1f7b95 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift ' __docformat__ = 'restructuredtext en' -import os, sys, re, uuid +import os, re, uuid from mimetypes import types_map from collections import defaultdict from itertools import count @@ -203,14 +203,6 @@ class OEBError(Exception): """Generic OEB-processing error.""" pass - -class FauxLogger(object): - """Fake logging interface.""" - def __getattr__(self, name): - return self - def __call__(self, message): - print message - class NullContainer(object): """An empty container. @@ -1224,16 +1216,20 @@ class PageList(object): class OEBBook(object): """Representation of a book in the IDPF OEB data model.""" - def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): + def __init__(self, logger, parse_cache={}, encoding='utf-8', + pretty_print=False): """Create empty book. Optional arguments: + :param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute + paths to te cached files and values are lxml root objects and + cssutils stylesheets. :param:`encoding`: Default encoding for textual content read from an external container. :param:`pretty_print`: Whether or not the canonical string form of XML markup is pretty-printed. - :prama:`logger`: A Logger object to use for logging all messages + :param:`logger`: A Log object to use for logging all messages related to the processing of this book. It is accessible - via the instance data member :attr:`logger`. + via the instance data members :attr:`logger,log`. 
It provides the following public instance data members for accessing various parts of the OEB data model: @@ -1251,7 +1247,7 @@ class OEBBook(object): """ self.encoding = encoding self.pretty_print = pretty_print - self.logger = logger + self.logger = self.log = logger self.version = '2.0' self.container = NullContainer() self.metadata = Metadata(self) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 0fce1c2b0d..dbafa5afac 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -19,9 +19,9 @@ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE -from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath -from calibre.ebooks.oeb.base import urlnormalize, xml2str -from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer +from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \ + urlnormalize, BINARY_MIME, \ + OEBError, OEBBook, DirContainer from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.metadata.epub import CoverRenderer @@ -45,9 +45,6 @@ class OEBReader(object): TRANSFORMS = [] """List of transforms to apply to content read with this Reader.""" - def __init__(self): - return - @classmethod def config(cls, cfg): """Add any book-reading options to the :class:`Config` object @@ -65,7 +62,7 @@ class OEBReader(object): :param:`oeb`. 
""" self.oeb = oeb - self.logger = oeb.logger + self.logger = self.log = oeb.logger oeb.container = self.Container(path) opf = self._read_opf() self._all_from_opf(opf) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index ede2a027ed..9833b3b4d0 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -6,18 +6,14 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import sys -import os import re import operator import math -from itertools import chain from collections import defaultdict from lxml import etree from calibre.ebooks.oeb.base import XHTML, XHTML_NS from calibre.ebooks.oeb.base import CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import namespace, barename -from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.stylizer import Stylizer COLLAPSE = re.compile(r'[ \t\r\n\v]+') diff --git a/src/calibre/ebooks/oeb/transforms/htmltoc.py b/src/calibre/ebooks/oeb/transforms/htmltoc.py index 0040f39c14..4504059531 100644 --- a/src/calibre/ebooks/oeb/transforms/htmltoc.py +++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py @@ -6,9 +6,6 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import sys -import os -from lxml import etree from calibre.ebooks.oeb.base import XML, XHTML, XHTML_NS from calibre.ebooks.oeb.base import XHTML_MIME, CSS_MIME from calibre.ebooks.oeb.base import element diff --git a/src/calibre/ebooks/oeb/transforms/manglecase.py b/src/calibre/ebooks/oeb/transforms/manglecase.py index c819475a4d..4b852db6c4 100644 --- a/src/calibre/ebooks/oeb/transforms/manglecase.py +++ b/src/calibre/ebooks/oeb/transforms/manglecase.py @@ -6,13 +6,6 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift ' -import sys -import os -import re -import operator -import math -from itertools import chain -from collections import defaultdict from lxml import etree from calibre.ebooks.oeb.base import XHTML, XHTML_NS from calibre.ebooks.oeb.base import CSS_MIME diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index aef5c2c98b..2d86fe63b5 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -6,7 +6,6 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import sys import os from urlparse import urldefrag import base64 @@ -20,9 +19,9 @@ from PyQt4.QtGui import QImage from PyQt4.QtGui import QPainter from PyQt4.QtSvg import QSvgRenderer from PyQt4.QtGui import QApplication -from calibre.ebooks.oeb.base import XHTML_NS, XHTML, SVG_NS, SVG, XLINK -from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME, JPEG_MIME -from calibre.ebooks.oeb.base import xml2str, xpath, namespace, barename +from calibre.ebooks.oeb.base import XHTML, XLINK +from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME +from calibre.ebooks.oeb.base import xml2str, xpath from calibre.ebooks.oeb.base import urlnormalize from calibre.ebooks.oeb.stylizer import Stylizer @@ -88,7 +87,7 @@ class SVGRasterizer(object): hrefs = self.oeb.manifest.hrefs for elem in xpath(svg, '//svg:*[@xl:href]'): href = urlnormalize(elem.attrib[XLINK('href')]) - path, frag = urldefrag(href) + path = urldefrag(href)[0] if not path: continue abshref = item.abshref(path) From 7664abbd2a0adbb55bd31356c6fde6837f4a3fe2 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 11 Mar 2009 07:57:09 -0400 Subject: [PATCH 033/319] Auto convert when sending to device --- src/calibre/gui2/library.py | 5 +- src/calibre/gui2/main.py | 87 +++++++++++++++++++-- src/calibre/gui2/tools.py | 129 ++++++++++++++++++++++++++++++- src/calibre/library/database2.py | 1 + 4 
files changed, 211 insertions(+), 11 deletions(-) diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py index c0f8eac796..a248c356b2 100644 --- a/src/calibre/gui2/library.py +++ b/src/calibre/gui2/library.py @@ -420,6 +420,7 @@ class BooksModel(QAbstractTableModel): def get_preferred_formats(self, rows, formats, paths=False, set_metadata=False, specific_format=None): ans = [] + need_auto = [] if specific_format is not None: formats = [specific_format.lower()] for row in (row.row() for row in rows): @@ -444,8 +445,9 @@ class BooksModel(QAbstractTableModel): pt.close() if paths else pt.seek(0) ans.append(pt) else: + need_auto.append(row) ans.append(None) - return ans + return ans, need_auto def id(self, row): return self.db.id(getattr(row, 'row', lambda:row)()) @@ -1069,3 +1071,4 @@ class SearchBox(QLineEdit): self.emit(SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), txt, False) self.end(False) self.initial_state = False + diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 4ecfc08f58..6a72fb381d 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -38,7 +38,8 @@ from calibre.gui2.dialogs.metadata_bulk import MetadataBulkDialog from calibre.gui2.dialogs.jobs import JobsDialog from calibre.gui2.dialogs.conversion_error import ConversionErrorDialog from calibre.gui2.tools import convert_single_ebook, convert_bulk_ebooks, \ - set_conversion_defaults, fetch_scheduled_recipe + set_conversion_defaults, fetch_scheduled_recipe, \ + auto_convert_ebook from calibre.gui2.dialogs.config import ConfigDialog from calibre.gui2.dialogs.search import SearchDialog from calibre.gui2.dialogs.choose_format import ChooseFormatDialog @@ -904,9 +905,8 @@ class Main(MainWindow, Ui_MainWindow): on_card = config['send_to_storage_card_by_default'] self.sync_to_device(on_card, False, specific_format=fmt) - - def sync_to_device(self, on_card, delete_from_library, specific_format=None): - rows = 
self.library_view.selectionModel().selectedRows() + def sync_to_device(self, on_card, delete_from_library, specific_format=None, send_rows=None, auto_convert=True): + rows = self.library_view.selectionModel().selectedRows() if send_rows is None else send_rows if not self.device_manager or not rows or len(rows) == 0: return ids = iter(self.library_view.model().id(r) for r in rows) @@ -917,7 +917,7 @@ class Main(MainWindow, Ui_MainWindow): if cdata: mi['cover'] = self.cover_to_thumbnail(cdata) metadata, full_metadata = iter(metadata), iter(full_metadata) - _files = self.library_view.model().get_preferred_formats(rows, + _files, _auto_rows = self.library_view.model().get_preferred_formats(rows, self.device_manager.device_class.FORMATS, paths=True, set_metadata=True, specific_format=specific_format) @@ -952,10 +952,29 @@ class Main(MainWindow, Ui_MainWindow): remove = remove_ids if delete_from_library else [] self.upload_books(gf, names, good, on_card, memory=(_files, remove)) self.status_bar.showMessage(_('Sending books to device.'), 5000) + if bad: + if specific_format is None: + if 'epub' in self.device_manager.device_class.FORMATS: + format = 'epub' + elif 'mobi' in self.device_manager.device_class.FORMATS or 'prc' in self.device_manager.device_class.FORMATS: + format = 'mobi' + elif 'lrf' in self.device_manager.device_class.FORMATS: + format = 'lrf' + else: + format = specific_format + + if format not in ('epub', 'mobi'): + auto_convert = False + bad = '\n'.join('
  • %s
  • '%(i,) for i in bad) - d = warning_dialog(self, _('No suitable formats'), - _('Could not upload the following books to the device, as no suitable formats were found:
      %s
    ')%(bad,)) + if auto_convert: + d = info_dialog(self, _('No suitable formats'), + _('Auto converting the following books before uploading to the device:
      %s
    ')%(bad,)) + self.auto_convert(_auto_rows, on_card, format) + else: + d = warning_dialog(self, _('No suitable formats'), + _('Could not upload the following books to the device, as no suitable formats were found:
      %s
    ')%(bad,)) d.exec_() @@ -1048,6 +1067,32 @@ class Main(MainWindow, Ui_MainWindow): ############################### Convert #################################### + def auto_convert(self, rows, on_card, format): + previous = self.library_view.currentIndex() + + comics, others = [], [] + db = self.library_view.model().db + for r in rows: + formats = db.formats(r) + if not formats: continue + formats = formats.lower().split(',') + if 'cbr' in formats or 'cbz' in formats: + comics.append(r) + else: + others.append(r) + + jobs, changed, bad_rows = auto_convert_ebook(format, self, self.library_view.model().db, comics, others) + for func, args, desc, fmt, id, temp_files in jobs: + if id not in bad_rows: + job = self.job_manager.run_job(Dispatcher(self.book_auto_converted), + func, args=args, description=desc) + self.conversion_jobs[job] = (temp_files, fmt, id, on_card) + + if changed: + self.library_view.model().refresh_rows(rows) + current = self.library_view.currentIndex() + self.library_view.model().current_changed(current, previous) + def get_books_for_conversion(self): rows = [r.row() for r in self.library_view.selectionModel().selectedRows()] if not rows or len(rows) == 0: @@ -1108,7 +1153,32 @@ class Main(MainWindow, Ui_MainWindow): self.library_view.model().refresh_rows(rows) current = self.library_view.currentIndex() self.library_view.model().current_changed(current, previous) - + + def book_auto_converted(self, job): + temp_files, fmt, book_id, on_card = self.conversion_jobs.pop(job) + try: + if job.exception is not None: + self.job_exception(job) + return + data = open(temp_files[-1].name, 'rb') + self.library_view.model().db.add_format(book_id, fmt, data, index_is_id=True) + data.close() + self.status_bar.showMessage(job.description + (' completed'), 2000) + finally: + for f in temp_files: + try: + if os.path.exists(f.name): + os.remove(f.name) + except: + pass + self.tags_view.recount() + if self.current_view() is self.library_view: + current = 
self.library_view.currentIndex() + self.library_view.model().current_changed(current, QModelIndex()) + + r = self.library_view.model().index(self.library_view.model().db.row(book_id), 0) + self.sync_to_device(on_card, False, specific_format=fmt, send_rows=[r], auto_convert=False) + def book_converted(self, job): temp_files, fmt, book_id = self.conversion_jobs.pop(job) try: @@ -1618,3 +1688,4 @@ if __name__ == '__main__': log = open(logfile).read().decode('utf-8', 'ignore') d = QErrorMessage('Error:%s
    Traceback:
    %sLog:
    '%(unicode(err), unicode(tb), log)) d.exec_() + diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index aca2da74e2..0bf78ffaa7 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -18,7 +18,9 @@ from calibre.gui2 import warning_dialog from calibre.ptempfile import PersistentTemporaryFile from calibre.ebooks.lrf import preferred_source_formats as LRF_PREFERRED_SOURCE_FORMATS from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.epub.from_any import SOURCE_FORMATS as EPUB_PREFERRED_SOURCE_FORMATS +from calibre.ebooks.epub.from_any import SOURCE_FORMATS as EPUB_PREFERRED_SOURCE_FORMATS, config as epubconfig +from calibre.ebooks.mobi.from_any import config as mobiconfig +from calibre.ebooks.lrf.comic.convert_from import config as comicconfig def get_dialog(fmt): return { @@ -26,6 +28,122 @@ def get_dialog(fmt): 'mobi':MOBIConvert, }[fmt] +def get_config(fmt): + return { + 'epub':epubconfig, + 'mobi':mobiconfig, + }[fmt] + +def auto_convert(fmt, parent, db, comics, others): + changed = False + jobs = [] + + total = sum(map(len, (others, comics))) + if total == 0: + return + parent.status_bar.showMessage(_('Starting auto conversion of %d books')%total, 2000) + + i = 0 + bad_rows = [] + + for i, row in enumerate(others+comics): + row_id = db.id(row) + + if row in others: + temp_files = [] + + data = None + for _fmt in EPUB_PREFERRED_SOURCE_FORMATS: + try: + data = db.format(row, _fmt.upper()) + if data is not None: + break + except: + continue + if data is None: + bad_rows.append(row) + continue + + defaults = db.conversion_options(db.id(row), fmt) + defaults = defaults if defaults else '' + options = get_config(fmt)(defaults=defaults).parse() + + mi = db.get_metadata(row) + opf = OPFCreator(os.getcwdu(), mi) + opf_file = PersistentTemporaryFile('.opf') + opf.render(opf_file) + opf_file.close() + pt = PersistentTemporaryFile('.'+_fmt.lower()) + pt.write(data) + pt.close() + of = PersistentTemporaryFile('.'+fmt) + 
of.close() + cover = db.cover(row) + cf = None + if cover: + cf = PersistentTemporaryFile('.jpeg') + cf.write(cover) + cf.close() + options.cover = cf.name + options.output = of.name + options.from_opf = opf_file.name + args = [options, pt.name] + desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) + temp_files = [cf] if cf is not None else [] + temp_files.extend([opf_file, pt, of]) + jobs.append(('any2'+fmt, args, desc, fmt.upper(), row_id, temp_files)) + + changed = True + else: + defaults = db.conversion_options(db.id(row), fmt) + defaults = defaults if defaults else '' + options = comicconfig(defaults=defaults).parse() + + mi = db.get_metadata(row) + if mi.title: + options.title = mi.title + if mi.authors: + options.author = ','.join(mi.authors) + data = None + for _fmt in ['cbz', 'cbr']: + try: + data = db.format(row, _fmt.upper()) + if data is not None: + break + except: + continue + + if data is None: + bad_rows.append(row) + continue + + pt = PersistentTemporaryFile('.'+_fmt.lower()) + pt.write(data) + pt.close() + of = PersistentTemporaryFile('.'+fmt) + of.close() + setattr(options, 'output', of.name) + options.verbose = 1 + args = [pt.name, options] + desc = _('Convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) + jobs.append(('comic2'+fmt, args, desc, fmt.upper(), row_id, [pt, of])) + + changed = True + + if bad_rows: + res = [] + for row in bad_rows: + title = db.title(row) + res.append('
  • %s
  • '%title) + + msg = _('

    Could not convert %d of %d books, because no suitable source format was found.

      %s
    ')%(len(res), total, '\n'.join(res)) + warning_dialog(parent, _('Could not convert some books'), msg).exec_() + + return jobs, changed, bad_rows + +def auto_convert_lrf(fmt, parent, db, comics, others): + pass + def convert_single(fmt, parent, db, comics, others): changed = False jobs = [] @@ -386,6 +504,12 @@ def fetch_scheduled_recipe(recipe, script): args.append(script) return 'feeds2'+fmt, [args], _('Fetch news from ')+recipe.title, fmt.upper(), [pt] +def auto_convert_ebook(*args): + fmt = args[0] if args[0] else 'epub' + if fmt == 'lrf': + return auto_convert_lrf() + elif fmt in ('epub', 'mobi'): + return auto_convert(*args) def convert_single_ebook(*args): fmt = prefs['output_format'].lower() @@ -410,4 +534,5 @@ def set_conversion_defaults(comic, parent, db): def fetch_news(data): fmt = prefs['output_format'].lower() - return _fetch_news(data, fmt) \ No newline at end of file + return _fetch_news(data, fmt) + diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index cb823e6c73..28f861ae3a 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1567,3 +1567,4 @@ books_series_link feeds break return duplicates + From 29486d653e262f4174bcfb0a1189e6490166fd68 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 18 Mar 2009 19:51:35 -0400 Subject: [PATCH 034/319] Convert OEBBook to store cssutils-parsed CSS. 
--- src/calibre/ebooks/lit/writer.py | 4 +- src/calibre/ebooks/oeb/base.py | 58 ++++++++++++++----- src/calibre/ebooks/oeb/factory.py | 7 ++- src/calibre/ebooks/oeb/reader.py | 2 +- src/calibre/ebooks/oeb/stylizer.py | 26 ++++----- .../ebooks/oeb/transforms/trimmanifest.py | 2 +- src/calibre/ebooks/oeb/writer.py | 2 +- 7 files changed, 64 insertions(+), 37 deletions(-) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index bebba8938b..73216057b5 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -27,7 +27,7 @@ from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_MIME, OEB_STYLES, \ CSS_MIME, OPF_MIME, XML_NS, XML from calibre.ebooks.oeb.base import namespace, barename, prefixname, \ urlnormalize, xpath -from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.profile import Context from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener @@ -732,7 +732,7 @@ def option_parser(): return parser def oeb2lit(opts, inpath): - logger = Logger(logging.getLogger('oeb2lit')) + logger = logging.getLogger('oeb2lit') logger.setup_cli_handler(opts.verbose) outpath = opts.output if outpath is None: diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 59ce1f7b95..1e91fbe17d 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -13,8 +13,11 @@ from collections import defaultdict from itertools import count from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote +import logging from lxml import etree, html import calibre +from cssutils import CSSParser +from cssutils.css import CSSStyleSheet from calibre.translations.dynamic import translate from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS @@ -99,6 +102,8 @@ PNG_MIME = types_map['.png'] SVG_MIME = 
types_map['.svg'] BINARY_MIME = 'application/octet-stream' +XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS + OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) @@ -565,7 +570,7 @@ class Manifest(object): return 'Item(id=%r, href=%r, media_type=%r)' \ % (self.id, self.href, self.media_type) - def _force_xhtml(self, data): + def _parse_xhtml(self, data): # Convert to Unicode and normalize line endings data = self.oeb.decode(data) data = XMLDECL_RE.sub('', data) @@ -645,6 +650,27 @@ class Manifest(object): 'File %r missing element' % self.href) etree.SubElement(data, XHTML('body')) return data + + def _parse_css(self, data): + data = self.oeb.decode(data) + data = XHTML_CSS_NAMESPACE + data + parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING, + fetcher=self._fetch_css) + data = parser.parseString(data, href=self.href) + data.namespaces['h'] = XHTML_NS + return data + + def _fetch_css(self, path): + hrefs = self.oeb.manifest.hrefs + if path not in hrefs: + self.oeb.logger.warn('CSS import of missing file %r' % path) + return (None, None) + item = hrefs[path] + if item.media_type not in OEB_STYLES: + self.oeb.logger.warn('CSS import of non-CSS file %r' % path) + return (None, None) + data = item.data.cssText + return ('utf-8', data) @dynamic_property def data(self): @@ -661,15 +687,19 @@ class Manifest(object): special parsing. 
""" def fget(self): - if self._data is not None: - return self._data - data = self._loader(self.href) - if self.media_type in OEB_DOCS: - data = self._force_xhtml(data) + data = self._data + if data is None: + if self._loader is None: + return None + data = self._loader(self.href) + if not isinstance(data, basestring): + pass # already parsed + elif self.media_type in OEB_DOCS: + data = self._parse_xhtml(data) elif self.media_type[-4:] in ('+xml', '/xml'): data = etree.fromstring(data) elif self.media_type in OEB_STYLES: - data = self.oeb.decode(data) + data = self._parse_css(data) self._data = data return data def fset(self, value): @@ -677,7 +707,7 @@ class Manifest(object): def fdel(self): self._data = None return property(fget, fset, fdel, doc=doc) - + def __str__(self): data = self.data if isinstance(data, etree._Element): @@ -726,7 +756,7 @@ class Manifest(object): if frag: relhref = '#'.join((relhref, frag)) return relhref - + def abshref(self, href): """Convert the URL provided in :param:`href` from a reference relative to this manifest item to a book-absolute reference. @@ -748,7 +778,7 @@ class Manifest(object): self.items = set() self.ids = {} self.hrefs = {} - + def add(self, id, href, media_type, fallback=None, loader=None, data=None): """Add a new item to the book manifest. 
@@ -765,7 +795,7 @@ class Manifest(object): self.ids[item.id] = item self.hrefs[item.href] = item return item - + def remove(self, item): """Removes :param:`item` from the manifest.""" if item in self.ids: @@ -775,7 +805,7 @@ class Manifest(object): self.items.remove(item) if item in self.oeb.spine: self.oeb.spine.remove(item) - + def generate(self, id=None, href=None): """Generate a new unique identifier and/or internal path for use in creating a new manifest item, using the provided :param:`id` and/or @@ -803,13 +833,13 @@ class Manifest(object): def __iter__(self): for item in self.items: yield item - + def values(self): return list(self.items) def __contains__(self, item): return item in self.items - + def to_opf1(self, parent=None): elem = element(parent, 'manifest') for item in self.items: diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py index 684451044b..8add71d20d 100644 --- a/src/calibre/ebooks/oeb/factory.py +++ b/src/calibre/ebooks/oeb/factory.py @@ -8,6 +8,7 @@ __copyright__ = '2008, Marshall T. 
Vandegrift ' import sys, os, logging from itertools import chain +import calibre from calibre.ebooks.oeb.base import OEBError from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.writer import OEBWriter @@ -15,7 +16,7 @@ from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.writer import LitWriter from calibre.ebooks.mobi.reader import MobiReader from calibre.ebooks.mobi.writer import MobiWriter -from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.profile import Context from calibre.utils.config import Config @@ -77,8 +78,8 @@ def main(argv=sys.argv): if len(args) != 0: parser.print_help() return 1 - logger = Logger(logging.getLogger('ebook-convert')) - logger.setup_cli_handler(opts.verbose) + logger = logging.getLogger('ebook-convert') + calibre.setup_cli_handlers(logger, logging.DEBUG) encoding = opts.encoding pretty_print = opts.pretty_print oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index dbafa5afac..c62540e15a 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -181,7 +181,7 @@ class OEBReader(object): if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data): + for match in CSSURL_RE.finditer(item.data.cssText): href, _ = urldefrag(match.group('url')) href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 3b5c3e19d0..8bc82883e3 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -115,8 +115,7 @@ class Stylizer(object): cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [HTML_CSS_STYLESHEET] head = xpath(tree, '/h:html/h:head')[0] - parser = cssutils.CSSParser() - 
parser.setFetcher(self._fetch_css_file) + parser = cssutils.CSSParser(fetcher=self._fetch_css_file) for elem in head: if elem.tag == XHTML('style') and elem.text \ and elem.get('type', CSS_MIME) in OEB_STYLES: @@ -135,14 +134,7 @@ class Stylizer(object): 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue - if sitem in self.STYLESHEETS: - stylesheet = self.STYLESHEETS[sitem] - else: - data = self._fetch_css_file(path)[1] - stylesheet = parser.parseString(data, href=path) - stylesheet.namespaces['h'] = XHTML_NS - self.STYLESHEETS[sitem] = stylesheet - stylesheets.append(stylesheet) + stylesheets.append(sitem.data) rules = [] index = 0 self.stylesheets = set() @@ -159,9 +151,9 @@ class Stylizer(object): for _, _, cssdict, text, _ in rules: try: selector = CSSSelector(text) - except (AssertionError, ExpressionError, etree.XPathSyntaxError,\ - NameError, # gets thrown on OS X instead of SelectorSyntaxError - SelectorSyntaxError): + except (AssertionError, ExpressionError, etree.XPathSyntaxError, + NameError, # thrown on OS X instead of SelectorSyntaxError + SelectorSyntaxError): continue for elem in selector(tree): self.style(elem)._update_cssdict(cssdict) @@ -171,9 +163,13 @@ class Stylizer(object): def _fetch_css_file(self, path): hrefs = self.oeb.manifest.hrefs if path not in hrefs: + self.logger.warn('CSS import of missing file %r' % path) return (None, None) - data = hrefs[path].data - data = XHTML_CSS_NAMESPACE + data + item = hrefs[path] + if item.media_type not in OEB_STYLES: + self.logger.warn('CSS import of non-CSS file %r' % path) + return (None, None) + data = item.data.cssText return ('utf-8', data) def flatten_rule(self, rule, href, index): diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index c731800999..119ebcc73d 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -53,7 +53,7 @@ class 
ManifestTrimmer(object): if found not in used: new.add(found) elif item.media_type == CSS_MIME: - for match in CSSURL_RE.finditer(item.data): + for match in CSSURL_RE.finditer(item.data.cssText): href = match.group('url') href = item.abshref(urlnormalize(href)) if href in oeb.manifest.hrefs: diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index 235965b50f..8789d03470 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -8,7 +8,7 @@ __copyright__ = '2008, Marshall T. Vandegrift ' import sys, os, logging from calibre.ebooks.oeb.base import OPF_MIME, xml2str -from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook +from calibre.ebooks.oeb.base import DirContainer, OEBBook __all__ = ['OEBWriter'] From 36fd295ca12540e73ee7bcde2b3e896a5da53478 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 20 Mar 2009 19:39:26 -0400 Subject: [PATCH 035/319] any2txt converter --- src/calibre/ebooks/htmlsymbols.py | 219 ++++++++++++++++++++++++++ src/calibre/ebooks/txt/__init__.py | 9 ++ src/calibre/ebooks/txt/from_any.py | 74 +++++++++ src/calibre/ebooks/txt/writer.py | 237 +++++++++++++++++++++++++++++ 4 files changed, 539 insertions(+) create mode 100644 src/calibre/ebooks/htmlsymbols.py create mode 100644 src/calibre/ebooks/txt/__init__.py create mode 100644 src/calibre/ebooks/txt/from_any.py create mode 100644 src/calibre/ebooks/txt/writer.py diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py new file mode 100644 index 0000000000..9b50f20fcd --- /dev/null +++ b/src/calibre/ebooks/htmlsymbols.py @@ -0,0 +1,219 @@ +# -*- coding: utf-8 -*- +''' +Maping of non-acii symbols and their corresponding html entity number and name +''' +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' + +# http://www.w3schools.com/tags/ref_symbols.asp +HTML_SYMBOLS = { + # Math Symbols + u'∀' : ['∀', '∀'], # for all + u'∂' : ['∂', '∂'], # part + u'∃' : ['∃', '&exists;'], # exists + 
u'∅' : ['∅', '∅'], # empty + u'∇' : ['∇', '∇'], # nabla + u'∈' : ['∈', '∈'], # isin + u'∉' : ['∉', '∉'], # notin + u'∋' : ['∋', '∋'], # ni + u'∏' : ['∏', '∏'], # prod + u'∑' : ['∑', '∑'], # sum + u'−' : ['−', '−'], # minus + u'∗' : ['∗', '∗'], # lowast + u'√' : ['√', '√'], # square root + u'∝' : ['∝', '∝'], # proportional to + u'∞' : ['∞', '∞'], # infinity + u'∠' : ['∠', '∠'], # angle + u'∧' : ['∧', '∧'], # and + u'∨' : ['∨', '∨'], # or + u'∩' : ['∩', '∩'], # cap + u'∪' : ['∪', '∪'], # cup + u'∫' : ['∫', '∫'], # integral + u'∴' : ['∴', '∴'], # therefore + u'∼' : ['∼', '∼'], # simular to + u'≅' : ['≅', '≅'], # approximately equal + u'≈' : ['≈', '≈'], # almost equal + u'≠' : ['≠', '≠'], # not equal + u'≡' : ['≡', '≡'], # equivalent + u'≤' : ['≤', '≤'], # less or equal + u'≥' : ['≥', '≥'], # greater or equal + u'⊂' : ['⊂', '⊂'], # subset of + u'⊃' : ['⊃', '⊃'], # superset of + u'⊄' : ['⊄', '⊄'], # not subset of + u'⊆' : ['⊆', '⊆'], # subset or equal + u'⊇' : ['⊇', '⊇'], # superset or equal + u'⊕' : ['⊕', '⊕'], # circled plus + u'⊗' : ['⊗', '⊗'], # cirled times + u'⊥' : ['⊥', '⊥'], # perpendicular + u'⋅' : ['⋅', '⋅'], # dot operator + # Greek Letters + u'Α' : ['Α', 'Α'], # Alpha + u'Β' : ['Β', 'Β'], # Beta + u'Γ' : ['Γ', 'Γ'], # Gamma + u'Δ' : ['Δ', 'Δ'], # Delta + u'Ε' : ['Ε', 'Ε'], # Epsilon + u'Ζ' : ['Ζ', 'Ζ'], # Zeta + u'Η' : ['Η', 'Η'], # Eta + u'Θ' : ['Θ', 'Θ'], # Theta + u'Ι' : ['Ι', 'Ι'], # Iota + u'Κ' : ['Κ', 'Κ'], # Kappa + u'Λ' : ['Λ', 'Λ'], # Lambda + u'Μ' : ['Μ', 'Μ'], # Mu + u'Ν' : ['Ν', 'Ν'], # Nu + u'Ξ' : ['Ξ', 'Ξ'], # Xi + u'Ο' : ['Ο', 'Ο'], # Omicron + u'Π' : ['Π', 'Π'], # Pi + u'Ρ' : ['Ρ', 'Ρ'], # Rho + u'Σ' : ['Σ', 'Σ'], # Sigma + u'Τ' : ['Τ', 'Τ'], # Tau + u'Υ' : ['Υ', 'Υ'], # Upsilon + u'Φ' : ['Φ', 'Φ'], # Phi + u'Χ' : ['Χ', 'Χ'], # Chi + u'Ψ' : ['Ψ', 'Ψ'], # Psi + u'ω' : ['ω', 'ω'], # omega + u'ϑ' : ['ϑ', 'ϑ'], # theta symbol + u'ϒ' : ['ϒ', 'ϒ'], # upsilon symbol + u'ϖ' : ['ϖ', 'ϖ'], # pi symbol + # Other + u'Œ' : ['Œ', 'Œ'], # capital ligature 
OE + u'œ' : ['œ', 'œ'], # small ligature oe + u'Š' : ['Š', 'Š'], # capital S with caron + u'š' : ['š', 'š'], # small S with caron + u'Ÿ' : ['Ÿ', 'Ÿ'], # capital Y with diaeres + u'ƒ' : ['ƒ', 'ƒ'], # f with hook + u'ˆ' : ['ˆ', 'ˆ'], # modifier letter circumflex accent + u'˜' : ['˜', '˜'], # small tilde + u'–' : ['–', '–'], # en dash + u'—' : ['—', '—'], # em dash + u'‘' : ['‘', '‘'], # left single quotation mark + u'’' : ['’', '’'], # right single quotation mark + u'‚' : ['‚', '‚'], # single low-9 quotation mark + u'“' : ['“', '“'], # left double quotation mark + u'”' : ['”', '”'], # right double quotation mark + u'„' : ['„', '„'], # double low-9 quotation mark + u'†' : ['†', '†'], # dagger + u'‡' : ['‡', '‡'], # double dagger + u'•' : ['•', '•'], # bullet + u'…' : ['…', '…'], # horizontal ellipsis + u'‰' : ['‰', '‰'], # per mille + u'′' : ['′', '′'], # minutes + u'″' : ['″', '″'], # seconds + u'‹' : ['‹', '‹'], # single left angle quotation + u'›' : ['›', '›'], # single right angle quotation + u'‾' : ['‾', '‾'], # overline + u'€' : ['€', '€'], # euro + u'™' : ['™', '™'], # trademark + u'←' : ['←', '←'], # left arrow + u'↑' : ['↑', '↑'], # up arrow + u'→' : ['→', '→'], # right arrow + u'↓' : ['↓', '↓'], # down arrow + u'↔' : ['↔', '↔'], # left right arrow + u'↵' : ['↵', '↵'], # carriage return arrow + u'⌈' : ['⌈', '⌈'], # left ceiling + u'⌉' : ['⌉', '⌉'], # right ceiling + u'⌊' : ['⌊', '⌊'], # left floor + u'⌋' : ['⌋', '⌋'], # right floor + u'◊' : ['◊', '◊'], # lozenge + u'♠' : ['♠', '♠'], # spade + u'♣' : ['♣', '♣'], # club + u'♥' : ['♥', '♥'], # heart + u'♦' : ['♦', '♦'], # diamond + # Extra http://www.ascii.cl/htmlcodes.htm + u'<' : ['<', '<'], # less than sign + u'>' : ['>', '>'], # greater than sign + u'¡' : ['¡', '¡'], # inverted exclamation mark + u'¢' : ['¢', '¢'], # cent sign + u'£' : ['£', '£'], # pound sign + u'¤' : ['¤', '¤'], # currency sign + u'¥' : ['¥', '¥'], # yen sign + u'¦' : ['¦', '¦'], # broken vertical bar + u'§' : ['§', '§'], # section sign + 
u'¨' : ['¨', '¨'], # spacing diaeresis - umlaut + u'©' : ['©', '©'], # copyright sign + u'ª' : ['ª', 'ª'], # feminine ordinal indicator + u'«' : ['«', '«'], # left double angle quotes + u'¬' : ['¬', '¬'], # not sign + u'®' : ['®', '®'], # registered trade mark sign + u'¯' : ['¯', '¯'], # spacing macron - overline + u'°' : ['°', '°'], # degree sign + u'±' : ['±', '±'], # plus-or-minus sign + u'²' : ['²', '²'], # superscript two - squared + u'³' : ['³', '³'], # superscript three - cubed + u'´' : ['´', '´'], # acute accent - spacing acute + u'µ' : ['µ', 'µ'], # micro sign + u'¶' : ['¶', '¶'], # pilcrow sign - paragraph sign + u'·' : ['·', '·'], # middle dot - Georgian comma + u'¸' : ['¸', '¸'], # spacing cedilla + u'¹' : ['¹', '¹'], # superscript one + u'º' : ['º', 'º'], # masculine ordinal indicator + u'»' : ['»', '»'], # right double angle quotes + u'¼' : ['¼', '¼'], # fraction one quarter + u'½' : ['½', '½'], # fraction one half + u'¾' : ['¾', '¾'], # fraction three quarters + u'¿' : ['¿', '¿'], # inverted question mark + u'À' : ['À', 'À'], # latin capital letter A with grave + u'Á' : ['Á', 'Á'], # latin capital letter A with acute + u'Â' : ['Â', 'Â'], # latin capital letter A with circumflex + u'Ã' : ['Ã', 'Ã'], # latin capital letter A with tilde + u'Ä' : ['Ä', 'Ä'], # latin capital letter A with diaeresis + u'Å' : ['Å', 'Å'], # latin capital letter A with ring above + u'Æ' : ['Æ', 'Æ'], # latin capital letter AE + u'Ç' : ['Ç', 'Ç'], # latin capital letter C with cedilla + u'È' : ['È', 'È'], # latin capital letter E with grave + u'É' : ['É', 'É'], # latin capital letter E with acute + u'Ê' : ['Ê', 'Ê'], # latin capital letter E with circumflex + u'Ë' : ['Ë', 'Ë'], # latin capital letter E with diaeresis + u'Ì' : ['Ì', 'Ì'], # latin capital letter I with grave + u'Í' : ['Í', 'Í'], # latin capital letter I with acute + u'Î' : ['Î', 'Î'], # latin capital letter I with circumflex + u'Ï' : ['Ï', 'Ï'], # latin capital letter I with diaeresis + u'Ð' : ['Ð', 'Ð'], # 
latin capital letter ETH + u'Ñ' : ['Ñ', 'Ñ'], # latin capital letter N with tilde + u'Ò' : ['Ò', 'Ò'], # latin capital letter O with grave + u'Ó' : ['Ó', 'Ó'], # latin capital letter O with acute + u'Ô' : ['Ô', 'Ô'], # latin capital letter O with circumflex + u'Õ' : ['Õ', 'Õ'], # latin capital letter O with tilde + u'Ö' : ['Ö', 'Ö'], # latin capital letter O with diaeresis + u'×' : ['×', '×'], # multiplication sign + u'Ø' : ['Ø', 'Ø'], # latin capital letter O with slash + u'Ù' : ['Ù', 'Ù'], # latin capital letter U with grave + u'Ú' : ['Ú', 'Ú'], # latin capital letter U with acute + u'Û' : ['Û', 'Û'], # latin capital letter U with circumflex + u'Ü' : ['Ü', 'Ü'], # latin capital letter U with diaeresis + u'Ý' : ['Ý', 'Ý'], # latin capital letter Y with acute + u'Þ' : ['Þ', 'Þ'], # latin capital letter THORN + u'ß' : ['ß', 'ß'], # latin small letter sharp s - ess-zed + u'à' : ['à', 'à'], # latin small letter a with grave + u'á' : ['á', 'á'], # latin small letter a with acute + u'â' : ['â', 'â'], # latin small letter a with circumflex + u'ã' : ['ã', 'ã'], # latin small letter a with tilde + u'ä' : ['ä', 'ä'], # latin small letter a with diaeresis + u'å' : ['å', 'å'], # latin small letter a with ring above + u'æ' : ['æ', 'æ'], # latin small letter ae + u'ç' : ['ç', 'ç'], # latin small letter c with cedilla + u'è' : ['è', 'è'], # latin small letter e with grave + u'é' : ['é', 'é'], # latin small letter e with acute + u'ê' : ['ê', 'ê'], # latin small letter e with circumflex + u'ë' : ['ë', 'ë'], # latin small letter e with diaeresis + u'ì' : ['ì', 'ì'], # latin small letter i with grave + u'í' : ['í', 'í'], # latin small letter i with acute + u'î' : ['î', 'î'], # latin small letter i with circumflex + u'ï' : ['ï', 'ï'], # latin small letter i with diaeresis + u'ð' : ['ð', 'ð'], # latin small letter eth + u'ñ' : ['ñ', 'ñ'], # latin small letter n with tilde + u'ò' : ['ò', 'ò'], # latin small letter o with grave + u'ó' : ['ó', 'ó'], # latin small letter o with acute + 
u'ô' : ['ô', 'ô'], # latin small letter o with circumflex + u'õ' : ['õ', 'õ'], # latin small letter o with tilde + u'ö' : ['ö', 'ö'], # latin small letter o with diaeresis + u'÷' : ['÷', '÷'], # division sign + u'ø' : ['ø', 'ø'], # latin small letter o with slash + u'ù' : ['ù', 'ù'], # latin small letter u with grave + u'ú' : ['ú', 'ú'], # latin small letter u with acute + u'û' : ['û', 'û'], # latin small letter u with circumflex + u'ü' : ['ü', 'ü'], # latin small letter u with diaeresis + u'ý' : ['ý', 'ý'], # latin small letter y with acute + u'þ' : ['þ', 'þ'], # latin small letter thorn + u'ÿ' : ['ÿ', 'ÿ'], # latin small letter y with diaeresis + } + diff --git a/src/calibre/ebooks/txt/__init__.py b/src/calibre/ebooks/txt/__init__.py new file mode 100644 index 0000000000..dfdbbdb5e2 --- /dev/null +++ b/src/calibre/ebooks/txt/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, John Schember john@nachtimwald.com' +__docformat__ = 'restructuredtext en' + +''' +Used for txt output +''' + diff --git a/src/calibre/ebooks/txt/from_any.py b/src/calibre/ebooks/txt/from_any.py new file mode 100644 index 0000000000..caf5364c3c --- /dev/null +++ b/src/calibre/ebooks/txt/from_any.py @@ -0,0 +1,74 @@ +''' +Convert any ebook format to TXT. +''' + +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \ + 'and Marshall T. 
Vandegrift ' \ + 'and John Schember ' +__docformat__ = 'restructuredtext en' + +import sys, os, glob, logging + +from calibre.ebooks.epub.from_any import any2epub, formats, USAGE +from calibre.ebooks.epub import config as common_config +from calibre.ptempfile import TemporaryDirectory +from calibre.ebooks.txt.writer import oeb2txt, config as txt_config + +def config(defaults=None): + c = common_config(defaults=defaults, name='txt') + c.remove_opt('profile') + del c.option_set.groups['metadata'] + del c.option_set.groups['traversal'] + del c.option_set.groups['structure detection'] + del c.option_set.groups['toc'] + del c.option_set.groups['page layout'] + txtc = txt_config(defaults=defaults) + c.update(txtc) + return c + +def option_parser(usage=USAGE): + usage = usage % ('TXT', formats()) + parser = config().option_parser(usage=usage) + return parser + +def any2txt(opts, path, notification=None): + ext = os.path.splitext(path)[1] + if not ext: + raise ValueError('Unknown file type: '+path) + ext = ext.lower()[1:] + + if opts.output is None: + opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt' + + opts.output = os.path.abspath(opts.output) + orig_output = opts.output + + with TemporaryDirectory('_any2txt') as tdir: + oebdir = os.path.join(tdir, 'oeb') + os.mkdir(oebdir) + opts.output = os.path.join(tdir, 'dummy.epub') + opts.profile = 'None' + opts.dont_split_on_page_breaks = True + orig_bfs = opts.base_font_size2 + opts.base_font_size2 = 0 + any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir) + opts.base_font_size2 = orig_bfs + opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] + opts.output = orig_output + logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...')) + oeb2txt(opts, opf) + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + if len(args) < 2: + parser.print_help() + print 'No input file specified.' 
+ return 1 + any2txt(opts, args[1]) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py new file mode 100644 index 0000000000..0fbf4a634c --- /dev/null +++ b/src/calibre/ebooks/txt/writer.py @@ -0,0 +1,237 @@ +# -*- coding: utf-8 -*- +''' +Write content to TXT. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' + +import os, logging, re, sys + +from BeautifulSoup import BeautifulSoup + +from calibre import LoggingInterface +from calibre.ebooks.htmlsymbols import HTML_SYMBOLS +from calibre.ebooks.epub.iterator import SpineItem +from calibre.ebooks.metadata import authors_to_string +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata.opf2 import OPF +from calibre.customize.ui import run_plugins_on_postprocess +from calibre.utils.config import Config, StringConfig + +class TXTWriter(object): + def __init__(self, newline): + self.newline = newline + + def dump(self, oebpath, path, metadata): + opf = OPF(oebpath, os.path.dirname(oebpath)) + spine = [SpineItem(i.path) for i in opf.spine] + + tmpout = '' + for item in spine: + with open(item, 'r') as itemf: + content = itemf.read().decode(item.encoding) + # Convert newlines to unix style \n for processing. These + # will be changed to the specified type later in the process. 
+ content = self.unix_newlines(content) + content = self.strip_html(content) + content = self.replace_html_symbols(content) + content = self.cleanup_text(content) + content = self.specified_newlines(content) + tmpout = tmpout + content + + # Prepend metadata + if metadata.author != None and metadata.author != '': + tmpout = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + tmpout + if metadata.title != None and metadata.title != '': + tmpout = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + tmpout + + # Put two blank lines at end of file + + end = tmpout[-3 * len(self.newline):] + for i in range(3 - end.count(self.newline)): + tmpout = tmpout + self.newline + + os.remove(path) + with open(path, 'w+b') as out: + out.write(tmpout.encode('utf-8')) + + def strip_html(self, html): + stripped = u'' + + for dom_tree in BeautifulSoup(html).findAll('body'): + text = unicode(dom_tree) + + # Remove unnecessary tags + for tag in ['script', 'style']: + text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) + text = re.sub('', '', text) + + # Headings usually indicate Chapters. + # We are going to use a marker to insert the proper number of + # newline characters at the end of cleanup_text because cleanup_text + # remove excessive (more than 2 newlines). + for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text) + text = re.sub('(?imu)' % tag, '-vlgzxey-', text) + + # Separate content with space. + for tag in ['td']: + text = re.sub('(?imu)', ' ', text) + + # Separate content with empty line. + for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']: + text = re.sub('(?imu)' % tag, '\n\n', text) + + for tag in ['hr', 'br']: + text = re.sub('(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text) + + # Remove any tags that do not need special processing. 
+ text = re.sub('<.*?>', '', text) + + stripped = stripped + text + + return stripped + + def replace_html_symbols(self, content): + for symbol in HTML_SYMBOLS: + for code in HTML_SYMBOLS[symbol]: + content = content.replace(code, symbol) + return content + + def cleanup_text(self, text): + # Replace bad characters. + text = text.replace(u'\xc2', '') + text = text.replace(u'\xa0', ' ') + + # Replace tabs, vertical tags and form feeds with single space. + #text = re.sub('\xc2\xa0', '', text) + text = text.replace('\t+', ' ') + text = text.replace('\v+', ' ') + text = text.replace('\f+', ' ') + + # Single line paragraph. + r = re.compile('.\n.') + while True: + mo = r.search(text) + if mo == None: + break + text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:]) + + # Remove multiple spaces. + text = re.sub('[ ]+', ' ', text) + text = re.sub('(?imu)^[ ]+', '', text) + text = re.sub('(?imu)[ ]+$', '', text) + + # Remove excessive newlines. + text = re.sub('\n[ ]+\n', '\n\n', text) + text = re.sub('\n{3,}', '\n\n', text) + + # Replace markers with the proper characters. 
+ text = text.replace('-vzxedxy-', '\n\n\n\n\n') + text = text.replace('-vlgzxey-', '\n\n\n') + + return text + + def unix_newlines(self, text): + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + + return text + + def specified_newlines(self, text): + if self.newline == '\n': + return text + + return text.replace('\n', self.newline) + +class TxtMetadata(object): + def __init__(self): + self.author = None + self.title = None + self.series = None + + +class TxtNewlines(object): + NEWLINE_TYPES = { + 'system' : os.linesep, + 'unix' : '\n', + 'old_mac' : '\r', + 'windows' : '\r\n' + } + + def __init__(self, newline_type): + self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) + + +def config(defaults=None): + desc = _('Options to control the conversion to TXT') + if defaults is None: + c = Config('txt', desc) + else: + c = StringConfig(defaults, desc) + + txt = c.add_group('TXT', _('TXT options.')) + + txt('newline', ['--newline'], default='system', + help=_('Type of newline to use. Options are %s. Default is \'system\'. ' + 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' + 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' + 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))) + txt('prepend_author', ['--prepend-author'], default='true', + help=_('Write the author to the beginning of the file. ' + 'Default is \'true\'. Use \'false\' to disable.')) + txt('prepend_title', ['--prepend-title'], default='true', + help=_('Write the title to the beginning of the file. ' + 'Default is \'true\'. Use \'false\' to disable.')) + + return c + +def option_parser(): + c = config() + parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') + parser.add_option( + '-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option( + '-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def oeb2txt(opts, inpath): + logger = LoggingInterface(logging.getLogger('oeb2txt')) + logger.setup_cli_handler(opts.verbose) + + outpath = opts.output + if outpath is None: + outpath = os.path.basename(inpath) + outpath = os.path.splitext(outpath)[0] + '.txt' + + mi = metadata_from_formats([inpath]) + metadata = TxtMetadata() + if opts.prepend_author.lower() == 'true': + metadata.author = opts.authors if opts.authors else authors_to_string(mi.authors) + if opts.prepend_title.lower() == 'true': + metadata.title = opts.title if opts.title else mi.title + + newline = TxtNewlines(opts.newline) + + writer = TXTWriter(newline.newline) + writer.dump(inpath, outpath, metadata) + run_plugins_on_postprocess(outpath, 'txt') + logger.log_info(_('Output written to ') + outpath) + +def main(argv=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(argv[1:]) + if len(args) != 1: + parser.print_help() + return 1 + inpath = args[0] + retval = oeb2txt(opts, inpath) + return retval + +if __name__ == '__main__': + sys.exit(main()) + From 8d124f92d6ea9ac262542507330b6d19b9a0421c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 21 Mar 2009 07:37:26 -0400 Subject: [PATCH 036/319] Only remove output file if it exists before writing to it in txt output --- src/calibre/ebooks/txt/writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 0fbf4a634c..84376ca2e7 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -53,7 +53,8 @@ class TXTWriter(object): for i in range(3 - end.count(self.newline)): tmpout = tmpout + self.newline - os.remove(path) + if os.path.exists(path): + os.remove(path) with open(path, 'w+b') as out: out.write(tmpout.encode('utf-8')) From 
11013c26657fe56b2581061a8243981dc3ff0d6a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 21 Mar 2009 17:31:15 -0400 Subject: [PATCH 037/319] More html symbols --- src/calibre/ebooks/htmlsymbols.py | 91 +++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py index 9b50f20fcd..fa10873845 100644 --- a/src/calibre/ebooks/htmlsymbols.py +++ b/src/calibre/ebooks/htmlsymbols.py @@ -119,6 +119,97 @@ HTML_SYMBOLS = { u'♥' : ['♥', '♥'], # heart u'♦' : ['♦', '♦'], # diamond # Extra http://www.ascii.cl/htmlcodes.htm + u' ' : [' '], # space + u'!' : ['!'], # exclamation point + u'#' : ['#'], # number sign + u'$' : ['$'], # dollar sign + u'%' : ['%'], # percent sign + u'\'' : ['''], # single quote + u'(' : ['('], # opening parenthesis + u')' : [')'], # closing parenthesis + u'*' : ['*'], # asterisk + u'+' : ['+'], # plus sign + u',' : [','], # comma + u'-' : ['-'], # minus sign - hyphen + u'.' : ['.'], # period + u'/' : ['/'], # slash + u'0' : ['0'], # zero + u'1' : ['1'], # one + u'2' : ['2'], # two + u'3' : ['3'], # three + u'4' : ['4'], # four + u'5' : ['5'], # five + u'6' : ['6'], # six + u'7' : ['7'], # seven + u'8' : ['8'], # eight + u'9' : ['9'], # nine + u':' : [':'], # colon + u';' : [';'], # semicolon + u'=' : ['='], # equal sign + u'?' 
: ['?'], # question mark + u'@' : ['@'], # at symbol + u'A' : ['A'], # + u'B' : ['B'], # + u'C' : ['C'], # + u'D' : ['D'], # + u'E' : ['E'], # + u'F' : ['F'], # + u'G' : ['G'], # + u'H' : ['H'], # + u'I' : ['I'], # + u'J' : ['J'], # + u'K' : ['K'], # + u'L' : ['L'], # + u'M' : ['M'], # + u'N' : ['N'], # + u'O' : ['O'], # + u'P' : ['P'], # + u'Q' : ['Q'], # + u'R' : ['R'], # + u'S' : ['S'], # + u'T' : ['T'], # + u'U' : ['U'], # + u'V' : ['V'], # + u'W' : ['W'], # + u'X' : ['X'], # + u'Y' : ['Y'], # + u'Z' : ['Z'], # + u'[' : ['['], # opening bracket + u'\\' : ['\'], # backslash + u']' : [']'], # closing bracket + u'^' : ['^'], # caret - circumflex + u'_' : ['_'], # underscore + u'`' : ['`'], # grave accent + u'a' : ['a'], # + u'b' : ['b'], # + u'c' : ['c'], # + u'd' : ['d'], # + u'e' : ['e'], # + u'f' : ['f'], # + u'g' : ['g'], # + u'h' : ['h'], # + u'i' : ['i'], # + u'j' : ['j'], # + u'k' : ['k'], # + u'l' : ['l'], # + u'm' : ['m'], # + u'n' : ['n'], # + u'o' : ['o'], # + u'p' : ['p'], # + u'q' : ['q'], # + u'r' : ['r'], # + u's' : ['s'], # + u't' : ['t'], # + u'u' : ['u'], # + u'v' : ['v'], # + u'w' : ['w'], # + u'x' : ['x'], # + u'y' : ['y'], # + u'z' : ['z'], # + u'{' : ['{'], # opening brace + u'|' : ['|'], # vertical bar + u'}' : ['}'], # closing brace + u'~' : ['~'], # equivalency sign - tilde u'<' : ['<', '<'], # less than sign u'>' : ['>', '>'], # greater than sign u'¡' : ['¡', '¡'], # inverted exclamation mark From 94c5e717a15bf4faf59bf23ab74f7caf6fc161be Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 21 Mar 2009 17:58:53 -0400 Subject: [PATCH 038/319] Txt output: remove more tags, ensure no spaces at beginning and end of lines --- src/calibre/ebooks/txt/writer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 84376ca2e7..205d8423e3 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -68,6 +68,9 @@ class 
TXTWriter(object): for tag in ['script', 'style']: text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)' % (tag, tag), '', text) text = re.sub('', '', text) + text = re.sub('<\?.*?\?>', '', text) + text = re.sub('<@.*?@>', '', text) + text = re.sub('<%.*?%>', '', text) # Headings usually indicate Chapters. # We are going to use a marker to insert the proper number of @@ -107,7 +110,6 @@ class TXTWriter(object): text = text.replace(u'\xa0', ' ') # Replace tabs, vertical tags and form feeds with single space. - #text = re.sub('\xc2\xa0', '', text) text = text.replace('\t+', ' ') text = text.replace('\v+', ' ') text = text.replace('\f+', ' ') @@ -122,8 +124,6 @@ class TXTWriter(object): # Remove multiple spaces. text = re.sub('[ ]+', ' ', text) - text = re.sub('(?imu)^[ ]+', '', text) - text = re.sub('(?imu)[ ]+$', '', text) # Remove excessive newlines. text = re.sub('\n[ ]+\n', '\n\n', text) @@ -133,6 +133,10 @@ class TXTWriter(object): text = text.replace('-vzxedxy-', '\n\n\n\n\n') text = text.replace('-vlgzxey-', '\n\n\n') + # Replace spaces at the beginning and end of lines + text = re.sub('(?imu)^[ ]+', '', text) + text = re.sub('(?imu)[ ]+$', '', text) + return text def unix_newlines(self, text): From 9abca9d60feb6896ef11c0da04a755fd24feb867 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 23 Mar 2009 19:07:14 -0400 Subject: [PATCH 039/319] Do not enable edit rows in device tab for devices that do not support editing ebook metadata. --- src/calibre/devices/interface.py | 2 ++ src/calibre/devices/usbms/driver.py | 1 + src/calibre/gui2/library.py | 12 +++++++++--- src/calibre/gui2/main.py | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index ed51962236..21790e3c46 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -24,6 +24,8 @@ class Device(object): # it can be a list of the BCD numbers of all devices supported by this driver. 
BCD = None THUMBNAIL_HEIGHT = 68 # Height for thumbnails on device + # Whether the metadata on books can be set via the GUI. + CAN_SET_METADATA = True def __init__(self, key='-1', log_packets=False, report_progress=None) : """ diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index 4285881447..68041a19cd 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -35,6 +35,7 @@ class USBMS(Device): EBOOK_DIR_MAIN = '' EBOOK_DIR_CARD = '' SUPPORTS_SUB_DIRS = False + CAN_SET_METADATA = False def __init__(self, key='-1', log_packets=False, report_progress=None): Device.__init__(self, key=key, log_packets=log_packets, diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py index d7581bf458..9f82b3b318 100644 --- a/src/calibre/gui2/library.py +++ b/src/calibre/gui2/library.py @@ -708,6 +708,9 @@ class BooksView(TableView): def close(self): self._model.close() + + def set_editable(self, editable): + self._model.set_editable(editable) def connect_to_search_box(self, sb): QObject.connect(sb, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), @@ -785,7 +788,7 @@ class DeviceBooksModel(BooksModel): self.unknown = str(self.trUtf8('Unknown')) self.marked_for_deletion = {} self.search_engine = OnDeviceSearch(self) - + self.editable = True def mark_for_deletion(self, job, rows): self.marked_for_deletion[job] = self.indices(rows) @@ -793,7 +796,6 @@ class DeviceBooksModel(BooksModel): indices = self.row_indices(row) self.emit(SIGNAL('dataChanged(QModelIndex, QModelIndex)'), indices[0], indices[-1]) - def deletion_done(self, job, succeeded=True): if not self.marked_for_deletion.has_key(job): return @@ -818,7 +820,7 @@ class DeviceBooksModel(BooksModel): if self.map[index.row()] in self.indices_to_be_deleted(): return Qt.ItemIsUserCheckable # Can't figure out how to get the disabled flag in python flags = QAbstractTableModel.flags(self, index) - if index.isValid(): + if index.isValid() and 
self.editable: if index.column() in [0, 1] or (index.column() == 4 and self.db.supports_tags()): flags |= Qt.ItemIsEditable return flags @@ -999,6 +1001,10 @@ class DeviceBooksModel(BooksModel): self.sort(col, self.sorted_on[1]) done = True return done + + def set_editable(self, editable): + self.editable = editable + class SearchBox(QLineEdit): diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 76775ae9bf..f297d1465c 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -585,7 +585,9 @@ class Main(MainWindow, Ui_MainWindow): return mainlist, cardlist = job.result self.memory_view.set_database(mainlist) + self.memory_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA) self.card_view.set_database(cardlist) + self.card_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA) for view in (self.memory_view, self.card_view): view.sortByColumn(3, Qt.DescendingOrder) if not view.restore_column_widths(): From 4579b1057130ae27d6f9b312b3a94ab1a1e86107 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 28 Mar 2009 14:39:17 -0400 Subject: [PATCH 040/319] PDF merging utility --- src/calibre/ebooks/pdf/pdfmerge.py | 94 ++++++++++++++++++++++++++++++ src/calibre/linux.py | 3 +- 2 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdf/pdfmerge.py diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/pdfmerge.py new file mode 100644 index 0000000000..e8554dbc6b --- /dev/null +++ b/src/calibre/ebooks/pdf/pdfmerge.py @@ -0,0 +1,94 @@ +''' +Merge PDF files into a single PDF document. 
+''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, sys, re + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import Config, StringConfig + +from pyPdf import PdfFileWriter, PdfFileReader + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + default_crop=10 + if defaults is None: + c = Config('trimpdf', desc) + else: + c = StringConfig(defaults, desc) + c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', + help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) + c.add_opt('output', ['-o', '--output'],default='merged.pdf', + help=_('Path to output file. By default a file is created in the current directory.')) + return c + +def option_parser(): + c = config() + return c.option_parser(usage=_('''\ + %prog [options] file1.pdf file2.pdf ... + + Merges individual pdfs. Metadata will be used from the first PDF specified. 
+ ''')) + +def merge_files(in_paths, out_path, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + + for pdf_path in in_paths: + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + for page in pdf.pages: + out_pdf.addPage(page) + + with open(out_path, 'wb') as out_file: + out_pdf.write(out_file) + +def verify_files(files): + invalid = [] + + for pdf_path in files: + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted or pdf.numPages <= 0: + raise Exception + except: + invalid.append(pdf_path) + return invalid + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 2: + print 'Error: Two or more PDF files are required.\n\n' + print parser.get_usage() + return 2 + + bad_pdfs = verify_files(args) + if bad_pdfs != []: + for pdf in bad_pdfs: + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' 
% pdf + return 2 + + mi = metadata_from_formats([args[0]]) + + merge_files(args, opts.output, mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 15dcb6fed9..c7a6099623 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -39,7 +39,8 @@ entry_points = { 'calibre-fontconfig = calibre.utils.fontconfig:main', 'calibre-parallel = calibre.parallel:main', 'calibre-customize = calibre.customize.ui:main', - 'pdftrim = calibre.ebooks.pdf.pdftrim:main' , + 'pdftrim = calibre.ebooks.pdf.pdftrim:main', + 'pdfmerge = calibre.ebooks.pdf.pdfmerge:main', 'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main', ], 'gui_scripts' : [ From a5228d56d2b04fedba4c77791ac2893c5bd1c6b7 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 28 Mar 2009 21:57:42 -0400 Subject: [PATCH 041/319] Add PDF splitting utility --- src/calibre/ebooks/pdf/pdfmerge.py | 6 +- src/calibre/ebooks/pdf/pdfsplit.py | 189 +++++++++++++++++++++++++++++ src/calibre/linux.py | 1 + 3 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 src/calibre/ebooks/pdf/pdfsplit.py diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/pdfmerge.py index e8554dbc6b..4a741c4f5a 100644 --- a/src/calibre/ebooks/pdf/pdfmerge.py +++ b/src/calibre/ebooks/pdf/pdfmerge.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, sys, re +import os, sys from calibre.ebooks.metadata.meta import metadata_from_formats from calibre.ebooks.metadata import authors_to_string @@ -24,7 +24,7 @@ def config(defaults=None): c = StringConfig(defaults, desc) c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', help=_('Be verbose, useful for debugging. 
Can be specified multiple times for greater verbosity.')) - c.add_opt('output', ['-o', '--output'],default='merged.pdf', + c.add_opt('output', ['-o', '--output'], default='merged.pdf', help=_('Path to output file. By default a file is created in the current directory.')) return c @@ -33,7 +33,7 @@ def option_parser(): return c.option_parser(usage=_('''\ %prog [options] file1.pdf file2.pdf ... - Merges individual pdfs. Metadata will be used from the first PDF specified. + Merges individual PDFs. Metadata will be used from the first PDF specified. ''')) def merge_files(in_paths, out_path, metadata=None): diff --git a/src/calibre/ebooks/pdf/pdfsplit.py b/src/calibre/ebooks/pdf/pdfsplit.py new file mode 100644 index 0000000000..460dbef148 --- /dev/null +++ b/src/calibre/ebooks/pdf/pdfsplit.py @@ -0,0 +1,189 @@ +''' +Split PDF file into multiple PDF documents. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, sys, re + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import Config, StringConfig + +from pyPdf import PdfFileWriter, PdfFileReader + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + default_crop=10 + if defaults is None: + c = Config('trimpdf', desc) + else: + c = StringConfig(defaults, desc) + c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', + help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) + c.add_opt('output', ['-o', '--output'], default='split.pdf', + help=_('Path to output file. By default a file is created in the current directory. \ + The file name will be the base name for the output.')) + return c + +def option_parser(): + c = config() + return c.option_parser(usage=_('''\ + + %prog [options] file.pdf page_to_split_on ... 
+ %prog [options] file.pdf page_range_to_split_on ... + + Ex. + + %prog file.pdf 6 + %prog file.pdf 6-12 + %prog file.pdf 6-12 8 10 9-20 + + Split a PDF. + ''')) + +def split_pdf(in_path, pages, page_ranges, out_name, metadata=None): + pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb')) + total_pages = pdf.numPages - 1 + + for index in pages+page_ranges: + if index in pages: + write_pdf(pdf, out_name, '%s' % (index + 1), index, total_pages, metadata) + else: + + write_pdf(pdf, out_name, '%s-%s' % (index[0] + 1, index[1] + 1), index[0], index[1], metadata) + +def write_pdf(pdf, name, suffix, start, end, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + for page_num in range(start, end + 1): + out_pdf.addPage(pdf.getPage(page_num)) + with open('%s%s.pdf' % (name, suffix), 'wb') as out_file: + out_pdf.write(out_file) + +def split_args(args): + pdf = '' + pages = [] + page_ranges = [] + bad = [] + + for arg in args: + arg = arg.strip() + # Find the pdf input + if re.search('(?iu)^.*?\.pdf[ ]*$', arg) != None: + if pdf == '': + pdf = arg + else: + bad.append(arg) + # Find single indexes + elif re.search('^[ ]*\d+[ ]*$', arg) != None: + pages.append(arg) + # Find index ranges + elif re.search('^[ ]*\d+[ ]*-[ ]*\d+[ ]*$', arg) != None: + mo = re.search('^[ ]*(?P\d+)[ ]*-[ ]*(?P\d+)[ ]*$', arg) + start = mo.group('start') + end = mo.group('end') + + # check to see if the range is really a single index + if start == end: + pages.append(start) + else: + page_ranges.append([start, end]) + else: + bad.append(arg) + + bad = sorted(list(set(bad))) + + return pdf, pages, page_ranges, bad + +# Remove duplicates from pages and page_ranges. +# Set pages higher than the total number of pages in the pdf to the last page. +# Return pages and page_ranges as lists of ints. 
+def clean_page_list(pdf_path, pages, page_ranges): + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + + total_pages = pdf.numPages + sorted_pages = [] + sorted_ranges = [] + + for index in pages: + index = int(index) + if index > total_pages: + sorted_pages.append(total_pages - 1) + else: + sorted_pages.append(index - 1) + + for start, end in page_ranges: + start = int(start) + end = int(end) + + if start > total_pages and end > total_pages: + sorted_pages.append(total_pages - 1) + continue + + if start > total_pages: + start = total_pages + if end > total_pages: + end = total_pages + page_range = sorted([start - 1, end - 1]) + if page_range not in sorted_ranges: + sorted_ranges.append(page_range) + + # Remove duplicates and sort + pages = sorted(list(set(sorted_pages))) + page_ranges = sorted(sorted_ranges) + + return pages, page_ranges + +# Return True if the pdf is valid. +def valid_pdf(pdf_path): + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted or pdf.numPages <= 0: + raise Exception + except: + return False + return True + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + + pdf, pages, page_ranges, unknown = split_args(args[1:]) + + if pdf == '' and (pages == [] or page_ranges == []): + print 'Error: PDF and where to split is required.\n\n' + print parser.get_usage() + return 2 + + if unknown != []: + for arg in unknown: + print 'Error: Unknown argument `%s`' % arg + print parser.get_usage() + return 2 + + if not valid_pdf(pdf): + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' 
% pdf + return 2 + + pages, page_ranges = clean_page_list(pdf, pages, page_ranges) + + mi = metadata_from_formats([pdf]) + + split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index c7a6099623..3ba6f55bc8 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -41,6 +41,7 @@ entry_points = { 'calibre-customize = calibre.customize.ui:main', 'pdftrim = calibre.ebooks.pdf.pdftrim:main', 'pdfmerge = calibre.ebooks.pdf.pdfmerge:main', + 'pdfsplit = calibre.ebooks.pdf.pdfsplit:main', 'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main', ], 'gui_scripts' : [ From ffa5f36fae29af536d8eb4a8eb8082eefc917f86 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 09:10:47 -0400 Subject: [PATCH 042/319] bzr 'command sub-command' style wrapper for pdf manipulation --- src/calibre/ebooks/pdf/manipulate.py | 67 +++++++++++++++++++ .../ebooks/pdf/{pdfmerge.py => merge.py} | 13 ++-- .../ebooks/pdf/{pdfsplit.py => split.py} | 21 +++--- .../ebooks/pdf/{pdftrim.py => trim.py} | 10 +-- src/calibre/linux.py | 5 +- 5 files changed, 89 insertions(+), 27 deletions(-) create mode 100644 src/calibre/ebooks/pdf/manipulate.py rename src/calibre/ebooks/pdf/{pdfmerge.py => merge.py} (92%) rename src/calibre/ebooks/pdf/{pdfsplit.py => split.py} (93%) rename src/calibre/ebooks/pdf/{pdftrim.py => trim.py} (95%) diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py new file mode 100644 index 0000000000..0e75734bb9 --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -0,0 +1,67 @@ +''' +Command line interface to run pdf manipulation commands. 
+''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import string, sys + +from calibre.utils.config import Config, StringConfig +from calibre.ebooks.pdf import merge, split, trim + +COMMANDS = { + 'merge' : merge, + 'split' : split, + 'trim' : trim, + } + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + if defaults is None: + c = Config('trimpdf', desc) + else: + c = StringConfig(defaults, desc) + return c + +def option_parser(): + c = config() + return c.option_parser(usage=_('''\ + + %prog command ... + + command can be one of the following: + [%%commands] + + Use %prog command --help to get more information about a specific command + + Manipulate a PDF. + '''.replace('%%commands', string.join(sorted(COMMANDS.keys()), ', ')))) + +def main(args=sys.argv): + parser = option_parser() + + if len(args) < 2: + print 'Error: No command sepecified.\n' + print parser.get_usage() + return 2 + + command = args[1].lower().strip() + + if command in COMMANDS.keys(): + del args[1] + return COMMANDS[command].main(args, command) + else: + parser.parse_args(args) + print 'Unknown command %s.\n' % command + print parser.get_usage() + return 2 + + # We should never get here. 
+ return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/ebooks/pdf/pdfmerge.py b/src/calibre/ebooks/pdf/merge.py similarity index 92% rename from src/calibre/ebooks/pdf/pdfmerge.py rename to src/calibre/ebooks/pdf/merge.py index 4a741c4f5a..7ae35d1065 100644 --- a/src/calibre/ebooks/pdf/pdfmerge.py +++ b/src/calibre/ebooks/pdf/merge.py @@ -17,9 +17,8 @@ from pyPdf import PdfFileWriter, PdfFileReader def config(defaults=None): desc = _('Options to control the transformation of pdf') - default_crop=10 if defaults is None: - c = Config('trimpdf', desc) + c = Config('mergepdf', desc) else: c = StringConfig(defaults, desc) c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', @@ -28,13 +27,13 @@ def config(defaults=None): help=_('Path to output file. By default a file is created in the current directory.')) return c -def option_parser(): +def option_parser(name): c = config() return c.option_parser(usage=_('''\ - %prog [options] file1.pdf file2.pdf ... + %prog %%name [options] file1.pdf file2.pdf ... Merges individual PDFs. Metadata will be used from the first PDF specified. 
- ''')) + '''.replace('%%name', name))) def merge_files(in_paths, out_path, metadata=None): if metadata == None: @@ -67,8 +66,8 @@ def verify_files(files): invalid.append(pdf_path) return invalid -def main(args=sys.argv): - parser = option_parser() +def main(args=sys.argv, name=''): + parser = option_parser(name) opts, args = parser.parse_args(args) args = args[1:] diff --git a/src/calibre/ebooks/pdf/pdfsplit.py b/src/calibre/ebooks/pdf/split.py similarity index 93% rename from src/calibre/ebooks/pdf/pdfsplit.py rename to src/calibre/ebooks/pdf/split.py index 460dbef148..36517fb704 100644 --- a/src/calibre/ebooks/pdf/pdfsplit.py +++ b/src/calibre/ebooks/pdf/split.py @@ -17,9 +17,8 @@ from pyPdf import PdfFileWriter, PdfFileReader def config(defaults=None): desc = _('Options to control the transformation of pdf') - default_crop=10 if defaults is None: - c = Config('trimpdf', desc) + c = Config('splitpdf', desc) else: c = StringConfig(defaults, desc) c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', @@ -29,21 +28,21 @@ def config(defaults=None): The file name will be the base name for the output.')) return c -def option_parser(): +def option_parser(name): c = config() return c.option_parser(usage=_('''\ - %prog [options] file.pdf page_to_split_on ... - %prog [options] file.pdf page_range_to_split_on ... + %prog %%name [options] file.pdf page_to_split_on ... + %prog %%name [options] file.pdf page_range_to_split_on ... Ex. - %prog file.pdf 6 - %prog file.pdf 6-12 - %prog file.pdf 6-12 8 10 9-20 + %prog %%name file.pdf 6 + %prog %%name file.pdf 6-12 + %prog %%name file.pdf 6-12 8 10 9-20 Split a PDF. 
- ''')) + '''.replace('%%name', name))) def split_pdf(in_path, pages, page_ranges, out_name, metadata=None): pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb')) @@ -155,8 +154,8 @@ def valid_pdf(pdf_path): return False return True -def main(args=sys.argv): - parser = option_parser() +def main(args=sys.argv, name=''): + parser = option_parser(name) opts, args = parser.parse_args(args) pdf, pages, page_ranges, unknown = split_args(args[1:]) diff --git a/src/calibre/ebooks/pdf/pdftrim.py b/src/calibre/ebooks/pdf/trim.py similarity index 95% rename from src/calibre/ebooks/pdf/pdftrim.py rename to src/calibre/ebooks/pdf/trim.py index c1e8fa2494..c999d24a46 100644 --- a/src/calibre/ebooks/pdf/pdftrim.py +++ b/src/calibre/ebooks/pdf/trim.py @@ -33,16 +33,16 @@ def config(defaults=None): return c -def option_parser(): +def option_parser(name): c = config() return c.option_parser(usage=_('''\ - %prog [options] file.pdf + %prog %%name [options] file.pdf Crops a pdf. - ''')) + '''.replace('%%name', name))) -def main(args=sys.argv): - parser = option_parser() +def main(args=sys.argv, name=''): + parser = option_parser(name) opts, args = parser.parse_args(args) try: source = os.path.abspath(args[1]) diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 3ba6f55bc8..6bfe665557 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -39,10 +39,7 @@ entry_points = { 'calibre-fontconfig = calibre.utils.fontconfig:main', 'calibre-parallel = calibre.parallel:main', 'calibre-customize = calibre.customize.ui:main', - 'pdftrim = calibre.ebooks.pdf.pdftrim:main', - 'pdfmerge = calibre.ebooks.pdf.pdfmerge:main', - 'pdfsplit = calibre.ebooks.pdf.pdfsplit:main', - 'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main', + 'pdfmanipulate = calibre.ebooks.pdf.manipulate:main', ], 'gui_scripts' : [ __appname__+' = calibre.gui2.main:main', From 9a81882d4f9b306289ffb0dd564e2a1f2f006f9e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 09:34:43 -0400 
Subject: [PATCH 043/319] Remove unnecessary options from pdf manipulation routines --- src/calibre/ebooks/pdf/manipulate.py | 2 +- src/calibre/ebooks/pdf/merge.py | 2 -- src/calibre/ebooks/pdf/split.py | 2 -- src/calibre/ebooks/pdf/trim.py | 2 -- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py index 0e75734bb9..15c9404e25 100644 --- a/src/calibre/ebooks/pdf/manipulate.py +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -21,7 +21,7 @@ COMMANDS = { def config(defaults=None): desc = _('Options to control the transformation of pdf') if defaults is None: - c = Config('trimpdf', desc) + c = Config('manipulatepdf', desc) else: c = StringConfig(defaults, desc) return c diff --git a/src/calibre/ebooks/pdf/merge.py b/src/calibre/ebooks/pdf/merge.py index 7ae35d1065..c0385080ad 100644 --- a/src/calibre/ebooks/pdf/merge.py +++ b/src/calibre/ebooks/pdf/merge.py @@ -21,8 +21,6 @@ def config(defaults=None): c = Config('mergepdf', desc) else: c = StringConfig(defaults, desc) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) c.add_opt('output', ['-o', '--output'], default='merged.pdf', help=_('Path to output file. By default a file is created in the current directory.')) return c diff --git a/src/calibre/ebooks/pdf/split.py b/src/calibre/ebooks/pdf/split.py index 36517fb704..cc6965dd68 100644 --- a/src/calibre/ebooks/pdf/split.py +++ b/src/calibre/ebooks/pdf/split.py @@ -21,8 +21,6 @@ def config(defaults=None): c = Config('splitpdf', desc) else: c = StringConfig(defaults, desc) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) c.add_opt('output', ['-o', '--output'], default='split.pdf', help=_('Path to output file. 
By default a file is created in the current directory. \ The file name will be the base name for the output.')) diff --git a/src/calibre/ebooks/pdf/trim.py b/src/calibre/ebooks/pdf/trim.py index c999d24a46..b32312fee8 100644 --- a/src/calibre/ebooks/pdf/trim.py +++ b/src/calibre/ebooks/pdf/trim.py @@ -16,8 +16,6 @@ def config(defaults=None): c = Config('trimpdf', desc) else: c = StringConfig(defaults, desc) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.')) c.add_opt('output', ['-o', '--output'],default='cropped.pdf', help=_('Path to output file. By default a file is created in the current directory.')) c.add_opt('bottom_left_x', [ '-x', '--leftx'], default=default_crop, From 9e15e485883c6b589967a5ebe4d9f8bc58ae0982 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 10:18:29 -0400 Subject: [PATCH 044/319] PDF info command --- src/calibre/ebooks/pdf/info.py | 89 ++++++++++++++++++++++++++++ src/calibre/ebooks/pdf/manipulate.py | 3 +- 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdf/info.py diff --git a/src/calibre/ebooks/pdf/info.py b/src/calibre/ebooks/pdf/info.py new file mode 100644 index 0000000000..46f1f11681 --- /dev/null +++ b/src/calibre/ebooks/pdf/info.py @@ -0,0 +1,89 @@ +''' +Merge PDF files into a single PDF document. 
+''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, re, sys, time + +from calibre.utils.config import Config, StringConfig + +from pyPdf import PdfFileWriter, PdfFileReader + + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + if defaults is None: + c = Config('manipulatepdf', desc) + else: + c = StringConfig(defaults, desc) + return c + +def option_parser(name): + c = config() + return c.option_parser(usage=_('''\ + %prog %%name [options] file.pdf ... + + Get info about a PDF. + '''.replace('%%name', name))) + +def print_info(pdf_path): + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + print _('Title: %s' % pdf.documentInfo.title) + print _('Author: %s' % pdf.documentInfo.author) + print _('Creator: %s' % pdf.documentInfo.creator) + print _('Producer: %s' % pdf.documentInfo.producer) + print _('Creation Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path)))) + print _('Modification Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getmtime(pdf_path)))) + print _('Pages: %s' % pdf.numPages) + print _('Encrypted: %s' % pdf.isEncrypted) + try: + print _('File Size: %s bytes' % os.path.getsize(pdf_path)) + except: pass + try: + pdf_file.seek(0) + vline = pdf_file.readline() + mo = re.search('(?iu)^%...-(?P\d+\.\d+)', vline) + if mo != None: + print _('PDF Version: %s' % mo.group('version')) + except: pass + +def verify_files(files): + invalid = [] + + for pdf_path in files: + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + except: + invalid.append(pdf_path) + return invalid + +def main(args=sys.argv, name=''): + parser = option_parser(name) + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 1: + print 'Error: No PDF sepecified.\n' + print 
parser.get_usage() + return 2 + + bad_pdfs = verify_files(args) + if bad_pdfs != []: + for pdf in bad_pdfs: + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + return 2 + + for pdf in args: + print_info(pdf) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py index 15c9404e25..262aaf78d4 100644 --- a/src/calibre/ebooks/pdf/manipulate.py +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -10,9 +10,10 @@ __docformat__ = 'restructuredtext en' import string, sys from calibre.utils.config import Config, StringConfig -from calibre.ebooks.pdf import merge, split, trim +from calibre.ebooks.pdf import info, merge, split, trim COMMANDS = { + 'info' : info, 'merge' : merge, 'split' : split, 'trim' : trim, From 1ed9efeb3904075310e05d3c18b1475617428f19 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 29 Mar 2009 10:20:21 -0400 Subject: [PATCH 045/319] Added subject to pdf info command --- src/calibre/ebooks/pdf/info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/pdf/info.py b/src/calibre/ebooks/pdf/info.py index 46f1f11681..115e411ce4 100644 --- a/src/calibre/ebooks/pdf/info.py +++ b/src/calibre/ebooks/pdf/info.py @@ -35,6 +35,7 @@ def print_info(pdf_path): pdf = PdfFileReader(pdf_file) print _('Title: %s' % pdf.documentInfo.title) print _('Author: %s' % pdf.documentInfo.author) + print _('Subject: %s' % pdf.documentInfo.subject) print _('Creator: %s' % pdf.documentInfo.creator) print _('Producer: %s' % pdf.documentInfo.producer) print _('Creation Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path)))) From b98ada75f7741349614f82b073a9f8f9c7288804 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 29 Mar 2009 18:26:44 -0700 Subject: [PATCH 046/319] IGN:... 
--- src/calibre/customize/conversion.py | 134 +++++++++--------- src/calibre/ebooks/conversion/cli.py | 46 +++--- src/calibre/ebooks/conversion/plumber.py | 119 ++++++++++------ src/calibre/ebooks/oeb/base.py | 2 +- src/calibre/ebooks/oeb/output.py | 12 +- src/calibre/ebooks/oeb/transforms/__init__.py | 10 ++ .../ebooks/oeb/transforms/trimmanifest.py | 5 +- 7 files changed, 187 insertions(+), 141 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index a77e32beee..5cf497d904 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -1,6 +1,6 @@ from __future__ import with_statement ''' -Defines the plugin sytem for conversions. +Defines the plugin system for conversions. ''' import re, os, shutil @@ -10,24 +10,24 @@ from calibre import CurrentDir from calibre.customize import Plugin class ConversionOption(object): - + ''' Class representing conversion options ''' - - def __init__(self, name=None, help=None, long_switch=None, + + def __init__(self, name=None, help=None, long_switch=None, short_switch=None, choices=None): self.name = name self.help = help self.long_switch = long_switch self.short_switch = short_switch self.choices = choices - + if self.long_switch is None: self.long_switch = self.name.replace('_', '-') - + self.validate_parameters() - + def validate_parameters(self): ''' Validate the parameters passed to :method:`__init__`. @@ -36,21 +36,21 @@ class ConversionOption(object): raise ValueError(self.name + ' is not a valid Python identifier') if not self.help: raise ValueError('You must set the help text') - + def __hash__(self): return hash(self.name) - + def __eq__(self, other): return hash(self) == hash(other) - + class OptionRecommendation(object): LOW = 1 MED = 2 HIGH = 3 - + def __init__(self, recommended_value=None, level=LOW, **kwargs): ''' - An option recommendation. That is, an option as well as its recommended + An option recommendation. 
That is, an option as well as its recommended value and the level of the recommendation. ''' self.level = level @@ -58,9 +58,9 @@ class OptionRecommendation(object): self.option = kwargs.pop('option', None) if self.option is None: self.option = ConversionOption(**kwargs) - + self.validate_parameters() - + def validate_parameters(self): if self.option.choices and self.recommended_value not in \ self.option.choices: @@ -68,30 +68,30 @@ class OptionRecommendation(object): self.option.name) if not (isinstance(self.recommended_value, (int, float, str, unicode))\ or self.recommended_value is None): - raise ValueError('OpRec: %s:'%self.option.name + - repr(self.recommended_value) + + raise ValueError('OpRec: %s:'%self.option.name + + repr(self.recommended_value) + ' is not a string or a number') - + class InputFormatPlugin(Plugin): ''' - InputFormatPlugins are responsible for converting a document into + InputFormatPlugins are responsible for converting a document into HTML+OPF+CSS+etc. The results of the conversion *must* be encoded in UTF-8. The main action happens in :method:`convert`. ''' - + type = _('Conversion Input') can_be_disabled = False supported_platforms = ['windows', 'osx', 'linux'] - + #: Set of file types for which this plugin should be run #: For example: ``set(['azw', 'mobi', 'prc'])`` file_types = set([]) - + #: Options shared by all Input format plugins. Do not override #: in sub-classes. Use :member:`options` instead. Every option must be an - #: instance of :class:`OptionRecommendation`. + #: instance of :class:`OptionRecommendation`. common_options = set([ OptionRecommendation(name='debug_input', recommended_value=None, level=OptionRecommendation.LOW, @@ -101,7 +101,7 @@ class InputFormatPlugin(Plugin): 'WARNING: This completely deletes the contents of ' 'the specified directory.') ), - + OptionRecommendation(name='input_encoding', recommended_value=None, level=OptionRecommendation.LOW, help=_('Specify the character encoding of the input document. 
If ' @@ -110,73 +110,73 @@ class InputFormatPlugin(Plugin): 'do not declare an encoding or that have erroneous ' 'encoding declarations.') ), - + ]) - + #: Options to customize the behavior of this plugin. Every option must be an - #: instance of :class:`OptionRecommendation`. + #: instance of :class:`OptionRecommendation`. options = set([]) - - #: A set of 3-tuples of the form + + #: A set of 3-tuples of the form #: (option_name, recommended_value, recommendation_level) recommendations = set([]) - + def convert(self, stream, options, file_ext, parse_cache, log, accelerators): ''' This method must be implemented in sub-classes. It must return - the path to the created OPF file. All output should be contained in + the path to the created OPF file. All output should be contained in the current directory. If this plugin creates files outside the current - directory they must be deleted/marked for deletion before this method + directory they must be deleted/marked for deletion before this method returns. - + :param stream: A file like object that contains the input file. - - :param options: Options to customize the conversion process. + + :param options: Options to customize the conversion process. Guaranteed to have attributes corresponding - to all the options declared by this plugin. In + to all the options declared by this plugin. In addition, it will have a verbose attribute that takes integral values from zero upwards. Higher numbers - mean be more verbose. Another useful attribute is - ``input_profile`` that is an instance of + mean be more verbose. Another useful attribute is + ``input_profile`` that is an instance of :class:`calibre.customize.profiles.InputProfile`. - + :param file_ext: The extension (without the .) of the input file. It is guaranteed to be one of the `file_types` supported by this plugin. - + :param parse_cache: A dictionary that maps absolute file paths to parsed representations of their contents. 
For - HTML the representation is an lxml element of + HTML the representation is an lxml element of the root of the tree. For CSS it is a cssutils stylesheet. If this plugin parses any of the output files, it should add them to the cache so that later stages of the conversion wont have to re-parse them. If a parsed representation - is in the cache, there is no need to actually + is in the cache, there is no need to actually write the file to disk. - - :param log: A :class:`calibre.utils.logging.Log` object. All output + + :param log: A :class:`calibre.utils.logging.Log` object. All output should use this object. - + :param accelarators: A dictionary of various information that the input plugin can get easily that would speed up the subsequent stages of the conversion. - + ''' raise NotImplementedError - - def __call__(self, stream, options, file_ext, parse_cache, log, + + def __call__(self, stream, options, file_ext, parse_cache, log, accelerators, output_dir): log('InputFormatPlugin: %s running'%self.name, end=' ') if hasattr(stream, 'name'): log('on', stream.name) - + with CurrentDir(output_dir): for x in os.listdir('.'): shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) - - - ret = self.convert(stream, options, file_ext, parse_cache, + + + ret = self.convert(stream, options, file_ext, parse_cache, log, accelerators) for key in list(parse_cache.keys()): if os.path.abspath(key) != key: @@ -184,7 +184,7 @@ class InputFormatPlugin(Plugin): 'relative path: %s')%(self.name, key) ) parse_cache[os.path.abspath(key)] = parse_cache.pop(key) - + if options.debug_input is not None: options.debug_input = os.path.abspath(options.debug_input) if not os.path.exists(options.debug_input): @@ -194,15 +194,15 @@ class InputFormatPlugin(Plugin): if hasattr(obj, 'cssText'): raw = obj.cssText else: - raw = html.tostring(obj, encoding='utf-8', method='xml', + raw = html.tostring(obj, encoding='utf-8', method='xml', include_meta_content_type=True, pretty_print=True) if 
isinstance(raw, unicode): raw = raw.encode('utf-8') open(f, 'wb').write(raw) shutil.copytree('.', options.debug_input) - - - + + + return ret @@ -210,32 +210,32 @@ class OutputFormatPlugin(Plugin): ''' OutputFormatPlugins are responsible for converting an OEB document (OPF+HTML) into an output ebook. - + The OEB document can be assumed to be encoded in UTF-8. The main action happens in :method:`convert`. ''' - + type = _('Conversion Output') can_be_disabled = False supported_platforms = ['windows', 'osx', 'linux'] - + #: The file type (extension without leading period) that this #: plugin outputs file_type = None - + #: Options shared by all Input format plugins. Do not override #: in sub-classes. Use :member:`options` instead. Every option must be an - #: instance of :class:`OptionRecommendation`. + #: instance of :class:`OptionRecommendation`. common_options = set([]) - + #: Options to customize the behavior of this plugin. Every option must be an - #: instance of :class:`OptionRecommendation`. + #: instance of :class:`OptionRecommendation`. options = set([]) - - #: A set of 3-tuples of the form + + #: A set of 3-tuples of the form #: (option_name, recommended_value, recommendation_level) recommendations = set([]) - def convert(self, oeb_book, input_plugin, options, parse_cache, log): + def convert(self, oeb_book, input_plugin, options, context, log): raise NotImplementedError - + diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index f52264f8d0..211761e415 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -30,7 +30,7 @@ options. the available options depend on the input and output file types. \ To get help on them specify the input and output file and then use the -h \ option. 
-For full documentation of the conversion system see +For full documentation of the conversion system see ''') + 'http://calibre.kovidgoyal.net/user_manual/conversion.html' import sys, os @@ -50,22 +50,22 @@ def check_command_line_options(parser, args, log): print_help(parser) log.error('\n\nYou must specify the input AND output files') raise SystemExit(1) - + input = os.path.abspath(args[1]) if not os.access(input, os.R_OK): log.error('Cannot read from', input) raise SystemExit(1) - + output = args[2] if output.startswith('.'): output = os.path.splitext(os.path.basename(input))[0]+output output = os.path.abspath(output) - + if '.' in output: if os.path.exists(output): log.warn('WARNING:', output, 'exists. Deleting.') os.remove(output) - + return input, output def option_recommendation_to_cli_option(add_option, rec): @@ -79,18 +79,18 @@ def option_recommendation_to_cli_option(add_option, rec): def add_input_output_options(parser, plumber): input_options, output_options = \ plumber.input_options, plumber.output_options - + def add_options(group, options): for opt in options: option_recommendation_to_cli_option(group, opt) - + if input_options: title = _('INPUT OPTIONS') io = OptionGroup(parser, title, _('Options to control the processing' ' of the input %s file')%plumber.input_fmt) add_options(io.add_option, input_options) parser.add_option_group(io) - + if output_options: title = plumber.output_fmt.upper() + ' ' + _('OPTIONS') oo = OptionGroup(parser, title, _('Options to control the processing' @@ -106,7 +106,7 @@ def add_pipeline_options(parser, plumber): 'output_profile', ] ), - + 'METADATA' : (_('Options to set metadata in the output'), plumber.metadata_option_names, ), @@ -114,19 +114,19 @@ def add_pipeline_options(parser, plumber): [ 'verbose', ]), - - + + } - + group_order = ['', 'METADATA', 'DEBUG'] - + for group in group_order: desc, options = groups[group] if group: group = OptionGroup(parser, group, desc) parser.add_option_group(group) add_option = 
group.add_option if group != '' else parser.add_option - + for name in options: rec = plumber.get_option_by_name(name) if rec.level < rec.HIGH: @@ -141,27 +141,27 @@ def main(args=sys.argv): if len(args) < 3: print_help(parser, log) return 1 - + input, output = check_command_line_options(parser, args, log) - + from calibre.ebooks.conversion.plumber import Plumber - + plumber = Plumber(input, output, log) add_input_output_options(parser, plumber) add_pipeline_options(parser, plumber) - + opts = parser.parse_args(args)[0] - recommendations = [(n.dest, getattr(opts, n.dest), + recommendations = [(n.dest, getattr(opts, n.dest), OptionRecommendation.HIGH) \ for n in parser.options_iter() if n.dest] plumber.merge_ui_recommendations(recommendations) - + plumber.run() - + log(_('Output saved to'), ' ', plumber.output) - + return 0 - + if __name__ == '__main__': sys.exit(main()) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 75a6687c4e..44e2fda0c3 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -5,7 +5,7 @@ __docformat__ = 'restructuredtext en' import os -from calibre.customize.conversion import OptionRecommendation +from calibre.customize.conversion import OptionRecommendation from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format @@ -13,23 +13,35 @@ class OptionValues(object): pass class Plumber(object): - + ''' + The `Plumber` manages the conversion pipeline. An UI should call the methods + :method:`merge_ui_recommendations` and then :method:`run`. The plumber will + take care of the rest. 
+ ''' + metadata_option_names = [ - 'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments', - 'publisher', 'series', 'series_index', 'rating', 'isbn', + 'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments', + 'publisher', 'series', 'series_index', 'rating', 'isbn', 'tags', 'book_producer', 'language' ] - + def __init__(self, input, output, log): + ''' + :param input: Path to input file. + :param output: Path to output file/directory + ''' self.input = input self.output = output self.log = log - + + # Initialize the conversion options that are independent of input and + # output formats. The input and output plugins can still disable these + # options via recommendations. self.pipeline_options = [ -OptionRecommendation(name='verbose', +OptionRecommendation(name='verbose', recommended_value=0, level=OptionRecommendation.LOW, - short_switch='v', + short_switch='v', help=_('Level of verbosity. Specify multiple times for greater ' 'verbosity.') ), @@ -54,15 +66,15 @@ OptionRecommendation(name='output_profile', 'will work on a device. For example EPUB on the SONY reader.' ) ), - -OptionRecommendation(name='read_metadata_from_opf', + +OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, - short_switch='m', + short_switch='m', help=_('Read metadata from the specified OPF file. 
Metadata read ' - 'from this file will override any metadata in the source ' + 'from this file will override any metadata in the source ' 'file.') ), - + OptionRecommendation(name='title', recommended_value=None, level=OptionRecommendation.LOW, help=_('Set the title.')), @@ -120,57 +132,70 @@ OptionRecommendation(name='language', help=_('Set the language.')), ] - + input_fmt = os.path.splitext(input)[1] if not input_fmt: raise ValueError('Input file must have an extension') input_fmt = input_fmt[1:].lower() - + output_fmt = os.path.splitext(output)[1] if not output_fmt: output_fmt = '.oeb' output_fmt = output_fmt[1:].lower() - + self.input_plugin = plugin_for_input_format(input_fmt) self.output_plugin = plugin_for_output_format(output_fmt) - + if self.input_plugin is None: raise ValueError('No plugin to handle input format: '+input_fmt) - + if self.output_plugin is None: raise ValueError('No plugin to handle output format: '+output_fmt) - + self.input_fmt = input_fmt self.output_fmt = output_fmt - + + # Build set of all possible options. Two options are equal iff their + # names are the same. self.input_options = self.input_plugin.options.union( self.input_plugin.common_options) self.output_options = self.output_plugin.options.union( - self.output_plugin.common_options) - + self.output_plugin.common_options) + + # Remove the options that have been disabled by recommendations from the + # plugins. 
self.merge_plugin_recommendations() def get_option_by_name(self, name): - for group in (self.input_options, self.pipeline_options, + for group in (self.input_options, self.pipeline_options, self.output_options): for rec in group: if rec.option == name: return rec - + def merge_plugin_recommendations(self): for source in (self.input_plugin, self.output_plugin): for name, val, level in source.recommendations: rec = self.get_option_by_name(name) if rec is not None and rec.level <= level: rec.recommended_value = val - + def merge_ui_recommendations(self, recommendations): + ''' + Merge recommendations from the UI. As long as the UI recommendation + level is >= the baseline recommended level, the UI value is used, + *except* if the baseline has a recommendation level of `HIGH`. + ''' for name, val, level in recommendations: rec = self.get_option_by_name(name) if rec is not None and rec.level <= level and rec.level < rec.HIGH: rec.recommended_value = val - + def read_user_metadata(self): + ''' + Read all metadata specified by the user. Command line options override + metadata from a specified OPF file. + ''' from calibre.ebooks.metadata import MetaInformation, string_to_authors from calibre.ebooks.metadata.opf2 import OPF mi = MetaInformation(None, []) @@ -194,43 +219,55 @@ OptionRecommendation(name='language', mi.cover_data = ('', open(mi.cover, 'rb').read()) mi.cover = None self.user_metadata = mi - - + + def setup_options(self): + ''' + Setup the `self.opts` object. 
+ ''' self.opts = OptionValues() - for group in (self.input_options, self.pipeline_options, + for group in (self.input_options, self.pipeline_options, self.output_options): for rec in group: setattr(self.opts, rec.option.name, rec.recommended_value) - + for x in input_profiles(): if x.short_name == self.opts.input_profile: self.opts.input_profile = x break - + for x in output_profiles(): if x.short_name == self.opts.output_profile: self.opts.output_profile = x break - + self.read_user_metadata() - + def run(self): + ''' + Run the conversion pipeline + ''' + # Setup baseline option values self.setup_options() + + # Run any preprocess plugins from calibre.customize.ui import run_plugins_on_preprocess self.input = run_plugins_on_preprocess(self.input) - + + # Create an OEBBook from the input file. The input plugin does all the + # heavy lifting. from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.base import OEBBook parse_cache, accelerators = {}, {} - - opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, + + opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, self.input_fmt, parse_cache, self.log, accelerators) - + self.reader = OEBReader() - self.oeb = OEBBook(self.log, parse_cache=parse_cache) + self.oeb = OEBBook(self.log, parse_cache=parse_cache) + # Read OEB Book into OEBBook self.reader(self.oeb, opfpath) - - - \ No newline at end of file + + + diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 60328b6c81..c1e3549b10 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1260,7 +1260,7 @@ class OEBBook(object): """Create empty book. Optional arguments: :param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute - paths to te cached files and values are lxml root objects and + paths to the cached files and values are lxml root objects and cssutils stylesheets. :param:`encoding`: Default encoding for textual content read from an external container. 
diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index 0a74f488cf..d8d52859eb 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -6,12 +6,12 @@ __docformat__ = 'restructuredtext en' from calibre.customize.conversion import OutputFormatPlugin class OEBOutput(OutputFormatPlugin): - + name = 'OEB Output' author = 'Kovid Goyal' file_type = 'oeb' - - - def convert(self, oeb_book, input_plugin, options, parse_cache, log): - pass - + + + def convert(self, oeb_book, input_plugin, options, context, log): + pass + diff --git a/src/calibre/ebooks/oeb/transforms/__init__.py b/src/calibre/ebooks/oeb/transforms/__init__.py index e69de29bb2..3d1a86922e 100644 --- a/src/calibre/ebooks/oeb/transforms/__init__.py +++ b/src/calibre/ebooks/oeb/transforms/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + + diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index 119ebcc73d..cae56315e5 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -20,11 +20,10 @@ class ManifestTrimmer(object): @classmethod def generate(cls, opts): return cls() - + def __call__(self, oeb, context): oeb.logger.info('Trimming unused files from manifest...') used = set() - hrefs = oeb.manifest.hrefs for term in oeb.metadata: for item in oeb.metadata[term]: if item.value in oeb.manifest.hrefs: @@ -42,7 +41,7 @@ class ManifestTrimmer(object): while unchecked: new = set() for item in unchecked: - if (item.media_type in OEB_DOCS or + if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')) and \ item.data is not None: hrefs = [sel(item.data) for sel in LINK_SELECTORS] From 
44799e05efc6a4696f98a8fcf4f7350876427bb2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 29 Mar 2009 21:09:04 -0700 Subject: [PATCH 047/319] Conversion pipeline: Dont choke on HTML/CSS files that fail to parse correctly. Instead remove them from the mainfest. Preprocessing code migrated from epub layer to OEBBook. --- session.vim | 2 +- src/calibre/customize/conversion.py | 37 +----- src/calibre/customize/profiles.py | 28 ++--- src/calibre/ebooks/conversion/plumber.py | 10 +- src/calibre/ebooks/conversion/preprocess.py | 123 ++++++++++++++++++++ src/calibre/ebooks/mobi/input.py | 21 ++-- src/calibre/ebooks/oeb/base.py | 42 ++++--- src/calibre/ebooks/oeb/reader.py | 86 +++++++++----- 8 files changed, 242 insertions(+), 107 deletions(-) create mode 100644 src/calibre/ebooks/conversion/preprocess.py diff --git a/session.vim b/session.vim index 9d326c5822..454b468ae0 100644 --- a/session.vim +++ b/session.vim @@ -1,5 +1,5 @@ " Project wide builtins -let g:pyflakes_builtins += ["dynamic_property"] +let g:pyflakes_builtins += ["dynamic_property", '__'] python << EOFPY import os diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 5cf497d904..3ebabc4d52 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -4,8 +4,6 @@ Defines the plugin system for conversions. ''' import re, os, shutil -from lxml import html - from calibre import CurrentDir from calibre.customize import Plugin @@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin): #: (option_name, recommended_value, recommendation_level) recommendations = set([]) - def convert(self, stream, options, file_ext, parse_cache, log, accelerators): + def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return the path to the created OPF file. 
All output should be contained in @@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin): is guaranteed to be one of the `file_types` supported by this plugin. - :param parse_cache: A dictionary that maps absolute file paths to - parsed representations of their contents. For - HTML the representation is an lxml element of - the root of the tree. For CSS it is a cssutils - stylesheet. If this plugin parses any of the - output files, it should add them to the cache - so that later stages of the conversion wont - have to re-parse them. If a parsed representation - is in the cache, there is no need to actually - write the file to disk. - :param log: A :class:`calibre.utils.logging.Log` object. All output should use this object. @@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError - def __call__(self, stream, options, file_ext, parse_cache, log, + def __call__(self, stream, options, file_ext, log, accelerators, output_dir): log('InputFormatPlugin: %s running'%self.name, end=' ') if hasattr(stream, 'name'): @@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin): shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) - ret = self.convert(stream, options, file_ext, parse_cache, + ret = self.convert(stream, options, file_ext, log, accelerators) - for key in list(parse_cache.keys()): - if os.path.abspath(key) != key: - log.warn(('InputFormatPlugin: %s returned a ' - 'relative path: %s')%(self.name, key) - ) - parse_cache[os.path.abspath(key)] = parse_cache.pop(key) - if options.debug_input is not None: options.debug_input = os.path.abspath(options.debug_input) if not os.path.exists(options.debug_input): os.makedirs(options.debug_input) shutil.rmtree(options.debug_input) - for f, obj in parse_cache.items(): - if hasattr(obj, 'cssText'): - raw = obj.cssText - else: - raw = html.tostring(obj, encoding='utf-8', method='xml', - include_meta_content_type=True, pretty_print=True) - if isinstance(raw, unicode): - raw = raw.encode('utf-8') - open(f, 
'wb').write(raw) shutil.copytree('.', options.debug_input) - - return ret diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index a3a7e22298..bd11a89bed 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -7,7 +7,7 @@ import sys, re from calibre.customize import Plugin class InputProfile(Plugin): - + author = 'Kovid Goyal' supported_platforms = set(['windows', 'osx', 'linux']) can_be_disabled = False @@ -20,40 +20,40 @@ class InputProfile(Plugin): short_name = 'default' # Used in the CLI so dont use spaces etc. in it description = _('This profile tries to provide sane defaults and is useful ' 'if you know nothing about the input document.') - + input_profiles = [InputProfile] - + class OutputProfile(Plugin): - + author = 'Kovid Goyal' supported_platforms = set(['windows', 'osx', 'linux']) can_be_disabled = False type = _('Output profile') - + name = 'Default Output Profile' short_name = 'default' # Used in the CLI so dont use spaces etc. in it description = _('This profile tries to provide sane defaults and is useful ' 'if you want to produce a document intended to be read at a ' 'computer or on a range of devices.') - + epub_flow_size = sys.maxint screen_size = None - remove_special_chars = False + remove_special_chars = None remove_object_tags = False - + class SonyReader(OutputProfile): - + name = 'Sony Reader' short_name = 'sony' description = _('This profile is intended for the SONY PRS line. 
' 'The 500/505/700 etc.') - + epub_flow_size = 270000 screen_size = (590, 765) remove_special_chars = re.compile(u'[\u200b\u00ad]') remove_object_tags = True - - - -output_profiles = [OutputProfile, SonyReader] \ No newline at end of file + + + +output_profiles = [OutputProfile, SonyReader] diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 44e2fda0c3..0e2f98fde4 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -8,6 +8,7 @@ import os from calibre.customize.conversion import OptionRecommendation from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format +from calibre.ebooks.conversion.preprocess import HTMLPreProcessor class OptionValues(object): pass @@ -258,16 +259,17 @@ OptionRecommendation(name='language', # heavy lifting. from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.base import OEBBook - parse_cache, accelerators = {}, {} + accelerators = {} opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, - self.input_fmt, parse_cache, self.log, + self.input_fmt, self.log, accelerators) - + html_preprocessor = HTMLPreProcessor() self.reader = OEBReader() - self.oeb = OEBBook(self.log, parse_cache=parse_cache) + self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor) # Read OEB Book into OEBBook self.reader(self.oeb, opfpath) + diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py new file mode 100644 index 0000000000..f544a331d8 --- /dev/null +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re, functools + +from calibre import entity_to_unicode + +XMLDECL_RE = 
re.compile(r'^\s*<[?]xml.*?[?]>') +SVG_NS = 'http://www.w3.org/2000/svg' +XLINK_NS = 'http://www.w3.org/1999/xlink' + +convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp']) +_span_pat = re.compile('', re.DOTALL|re.IGNORECASE) + + +def sanitize_head(match): + x = match.group(1) + x = _span_pat.sub('', x) + return '\n'+x+'\n' + + +class CSSPreProcessor(object): + + PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') + + def __call__(self, data): + data = self.PAGE_PAT.sub('', data) + return data + +class HTMLPreProcessor(object): + + PREPROCESS = [ + # Some idiotic HTML generators (Frontpage I'm looking at you) + # Put all sorts of crap into . This messes up lxml + (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL), + sanitize_head), + # Convert all entities, since lxml doesn't handle them well + (re.compile(r'&(\S+?);'), convert_entities), + # Remove the ', re.IGNORECASE), + lambda match: ''), + ] + + # Fix pdftohtml markup + PDFTOHTML = [ + # Remove
    tags + (re.compile(r'', re.IGNORECASE), lambda match: '
    '), + # Remove page numbers + (re.compile(r'\d+
    ', re.IGNORECASE), lambda match: ''), + # Remove
    and replace

    with

    + (re.compile(r'\s*', re.IGNORECASE), lambda match: '

    '), + (re.compile(r'(.*)', re.IGNORECASE), + lambda match: match.group() if \ + re.match('<', match.group(1).lstrip()) or \ + len(match.group(1)) < 40 else match.group(1)), + # Remove hyphenation + (re.compile(r'-\n\r?'), lambda match: ''), + + # Remove gray background + (re.compile(r']+>'), lambda match : ''), + + # Remove non breaking spaces + (re.compile(ur'\u00a0'), lambda match : ' '), + + ] + + # Fix Book Designer markup + BOOK_DESIGNER = [ + # HR + (re.compile('


    ', re.IGNORECASE), + lambda match : ' '), + # Create header tags + (re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + lambda match : '

    %s

    '%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), + lambda match : '

    %s

    '%(match.group(2) if match.group(2) else 'center', match.group(3))), + (re.compile('<]*?id=title[^><]*?>(.*?)
    ', re.IGNORECASE|re.DOTALL), + lambda match : '

    %s

    '%(match.group(1),)), + (re.compile('<]*?id=subtitle[^><]*?>(.*?)
    ', re.IGNORECASE|re.DOTALL), + lambda match : '

    %s

    '%(match.group(1),)), + ] + + def is_baen(self, src): + return re.compile(r'<]*id=BookTitle', raw) is not None + + def is_pdftohtml(self, src): + return '' in src[:1000] + + def __call__(self, html, remove_special_chars=None): + if remove_special_chars is not None: + html = remove_special_chars.sub('', html) + if self.is_baen(html): + rules = [] + elif self.is_book_designer(html): + rules = self.BOOK_DESIGNER + elif self.is_pdftohtml(html): + rules = self.PDFTOHTML + else: + rules = [] + for rule in self.PREPROCESS + rules: + html = rule[0].sub(rule[1], html) + + # Handle broken XHTML w/ SVG (ugh) + if 'svg:' in html and SVG_NS not in html: + html = html.replace( + ' Date: Sun, 29 Mar 2009 22:35:03 -0700 Subject: [PATCH 048/319] Conversion pipeline: Create input and output profiles --- src/calibre/customize/profiles.py | 195 +++++++++++++++++++++++++++--- 1 file changed, 180 insertions(+), 15 deletions(-) diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index bd11a89bed..5e1ff297bb 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -4,7 +4,36 @@ __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' import sys, re -from calibre.customize import Plugin +from itertools import izip + +from calibre.customize import Plugin as _Plugin + +FONT_SIZES = [('xx-small', 1), + ('x-small', None), + ('small', 2), + ('medium', 3), + ('large', 4), + ('x-large', 5), + ('xx-large', 6), + (None, 7)] + + +class Plugin(_Plugin): + + fbase = 12 + fsizes = [5, 7, 9, 12, 13.5, 17, 20, 22, 24] + screen_size = (800, 600) + dpi = 100 + + def initialize(self): + self.width, self.height = self.screen_size + fsizes = list(self.fsizes) + self.fsizes = [] + for (name, num), size in izip(FONT_SIZES, fsizes): + self.fsizes.append((name, num, float(size))) + self.fnames = dict((name, sz) for name, _, sz in self.fsizes if name) + self.fnums = dict((num, sz) for _, num, sz in self.fsizes if num) + class 
InputProfile(Plugin): @@ -13,15 +42,88 @@ class InputProfile(Plugin): can_be_disabled = False type = _('Input profile') -# TODO: Add some real information to this profile. All other profiles must -# inherit from this profile and override as needed - name = 'Default Input Profile' short_name = 'default' # Used in the CLI so dont use spaces etc. in it description = _('This profile tries to provide sane defaults and is useful ' 'if you know nothing about the input document.') -input_profiles = [InputProfile] + +class SonyReaderInput(InputProfile): + + name = 'Sony Reader' + short_name = 'sony' + description = _('This profile is intended for the SONY PRS line. ' + 'The 500/505/700 etc.') + + screen_size = (584, 754) + dpi = 168.451 + fbase = 12 + fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24] + + +class MSReaderInput(InputProfile): + + name = 'Microsoft Reader' + short_name = 'msreader' + description = _('This profile is intended for the Microsoft Reader.') + + screen_size = (480, 652) + dpi = 96 + fbase = 13 + fsizes = [10, 11, 13, 16, 18, 20, 22, 26] + +class MobipocketInput(InputProfile): + + name = 'Mobipocket Books' + short_name = 'mobipocket' + description = _('This profile is intended for the Mobipocket books.') + + # Unfortunately MOBI books are not narrowly targeted, so this information is + # quite likely to be spurious + screen_size = (600, 800) + dpi = 96 + fbase = 18 + fsizes = [14, 14, 16, 18, 20, 22, 24, 26] + +class HanlinV3Input(InputProfile): + + name = 'Hanlin V3' + short_name = 'hanlinv3' + description = _('This profile is intended for the Hanlin V3 and its clones.') + + # Screen size is a best guess + screen_size = (584, 754) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +class CybookG3Input(InputProfile): + + name = 'Cybook G3' + short_name = 'cybookg3' + description = _('This profile is intended for the Cybook G3.') + + # Screen size is a best guess + screen_size = (600, 800) + dpi = 168.451 + fbase = 16 + fsizes = [12, 
12, 14, 16, 18, 20, 22, 24] + +class KindleInput(InputProfile): + + name = 'Kindle' + short_name = 'kindle' + description = _('This profile is intended for the Amazon Kindle.') + + # Screen size is a best guess + screen_size = (525, 640) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + + +input_profiles = [InputProfile, SonyReaderInput, MSReaderInput, + MobipocketInput, HanlinV3Input, CybookG3Input, KindleInput] class OutputProfile(Plugin): @@ -37,23 +139,86 @@ class OutputProfile(Plugin): 'if you want to produce a document intended to be read at a ' 'computer or on a range of devices.') - epub_flow_size = sys.maxint - screen_size = None - remove_special_chars = None - remove_object_tags = False + # ADE dies an agonizing, long drawn out death if HTML files have more + # bytes than this. + flow_size = sys.maxint + # ADE runs screaming when it sees these characters + remove_special_chars = re.compile(u'[\u200b\u00ad]') + # ADE falls to the ground in a dead faint when it sees an + remove_object_tags = True -class SonyReader(OutputProfile): +class SonyReaderOutput(OutputProfile): name = 'Sony Reader' short_name = 'sony' description = _('This profile is intended for the SONY PRS line. 
' 'The 500/505/700 etc.') - epub_flow_size = 270000 - screen_size = (590, 765) - remove_special_chars = re.compile(u'[\u200b\u00ad]') - remove_object_tags = True + flow_size = 270000 + screen_size = (600, 775) + dpi = 168.451 + fbase = 12 + fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24] +class MSReaderOutput(OutputProfile): + name = 'Microsoft Reader' + short_name = 'msreader' + description = _('This profile is intended for the Microsoft Reader.') -output_profiles = [OutputProfile, SonyReader] + screen_size = (480, 652) + dpi = 96 + fbase = 13 + fsizes = [10, 11, 13, 16, 18, 20, 22, 26] + +class MobipocketOutput(OutputProfile): + + name = 'Mobipocket Books' + short_name = 'mobipocket' + description = _('This profile is intended for the Mobipocket books.') + + # Unfortunately MOBI books are not narrowly targeted, so this information is + # quite likely to be spurious + screen_size = (600, 800) + dpi = 96 + fbase = 18 + fsizes = [14, 14, 16, 18, 20, 22, 24, 26] + +class HanlinV3Output(OutputProfile): + + name = 'Hanlin V3' + short_name = 'hanlinv3' + description = _('This profile is intended for the Hanlin V3 and its clones.') + + # Screen size is a best guess + screen_size = (584, 754) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +class CybookG3Output(OutputProfile): + + name = 'Cybook G3' + short_name = 'cybookg3' + description = _('This profile is intended for the Cybook G3.') + + # Screen size is a best guess + screen_size = (600, 800) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +class KindleOutput(OutputProfile): + + name = 'Kindle' + short_name = 'kindle' + description = _('This profile is intended for the Amazon Kindle.') + + # Screen size is a best guess + screen_size = (525, 640) + dpi = 168.451 + fbase = 16 + fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + +output_profiles = [OutputProfile, SonyReaderOutput, MSReaderOutput, + MobipocketOutput, HanlinV3Output, CybookG3Output, KindleOutput] From 
87580e27ba9e270e1c104b32b9fdc6d0b41fd283 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 30 Mar 2009 19:03:49 -0400 Subject: [PATCH 049/319] TXT metadata reader --- src/calibre/customize/builtins.py | 10 ++++++++++ src/calibre/ebooks/metadata/txt.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 src/calibre/ebooks/metadata/txt.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index d37e241891..2cbf036c1f 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -159,6 +159,16 @@ class ODTMetadataReader(MetadataReaderPlugin): def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.odt import get_metadata return get_metadata(stream) + +class TXTMetadataReader(MetaReaderPlugin): + + name = 'Read TXT metadata' + file_types = set(['txt']) + description = _('Read metadata from %s files') % 'TXT' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.txt import get_metadata + return get_metadata(stream) class LRXMetadataReader(MetadataReaderPlugin): diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py new file mode 100644 index 0000000000..5a5ab13ae9 --- /dev/null +++ b/src/calibre/ebooks/metadata/txt.py @@ -0,0 +1,30 @@ +'''Read meta information from TXT files''' + +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' + +import re + +from calibre.ebooks.metadata import MetaInformation + +def get_metadata(stream, extract_cover=True): + """ Return metadata as a L{MetaInfo} object """ + mi = MetaInformation(_('Unknown'), [_('Unknown')]) + stream.seek(0) + + mdata = '' + for x in range(0, 4): + line = stream.readline() + if line == '': + break + else: + mdata += line + + mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata) + if mo != None: + mi.title = mo.group('title') + mi.authors = mo.group('author').split(',') + + return 
mi From 9aa2fbfbecd0ba57226a9930dad11ab47f52daf6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 30 Mar 2009 16:26:24 -0700 Subject: [PATCH 050/319] The complete framework for the conversion pipeline --- src/calibre/customize/conversion.py | 19 ++++++- src/calibre/ebooks/conversion/cli.py | 11 +++- src/calibre/ebooks/conversion/plumber.py | 67 +++++++++++++++++++++++- src/calibre/ebooks/oeb/output.py | 25 ++++++++- 4 files changed, 117 insertions(+), 5 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 3ebabc4d52..b25704569b 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -205,6 +205,23 @@ class OutputFormatPlugin(Plugin): #: (option_name, recommended_value, recommendation_level) recommendations = set([]) - def convert(self, oeb_book, input_plugin, options, context, log): + def convert(self, oeb_book, output, input_plugin, opts, log): + ''' + Render the contents of `oeb_book` (which is an instance of + :class:`calibre.ebooks.oeb.OEBBook` to the file specified by output. + + :param output: Either a file like object or a string. If it is a string + it is the path to a directory that may or may not exist. The output + plugin should write its output into that directory. If it is a file like + object, the output plugin should write its output into the file. + + :param input_plugin: The input plugin that was used at the beginning of + the conversion pipeline. + + :param opts: Conversion options. Guaranteed to have attributes + corresponding to the OptionRecommendations of this plugin. + + :param log: The logger. Print debug/info messages etc. using this. 
+ ''' raise NotImplementedError diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 211761e415..9994b61a7c 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -106,6 +106,15 @@ def add_pipeline_options(parser, plumber): 'output_profile', ] ), + 'LOOK AND FEEL' : ( + _('Options to control the look and feel of the output'), + [ + 'base_font_size', + 'font_size_mapping', + 'line_height', + 'linearize_tables', + ] + ), 'METADATA' : (_('Options to set metadata in the output'), plumber.metadata_option_names, @@ -118,7 +127,7 @@ def add_pipeline_options(parser, plumber): } - group_order = ['', 'METADATA', 'DEBUG'] + group_order = ['', 'LOOK AND FEEL', 'METADATA', 'DEBUG'] for group in group_order: desc, options = groups[group] diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 0e2f98fde4..5393aaf034 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -68,6 +68,45 @@ OptionRecommendation(name='output_profile', ) ), +OptionRecommendation(name='base_font_size', + recommended_value=0, level=OptionRecommendation.LOW, + help=_('The base font size in pts. All font sizes in the produced book ' + 'will be rescaled based on this size. By choosing a larger ' + 'size you can make the fonts in the output bigger and vice ' + 'versa. By default, the base font size is chosen based on ' + 'the output profile you chose.' + ) + ), + +OptionRecommendation(name='font_size_mapping', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Mapping from CSS font names to font sizes in pts. ' + 'An example setting is 12,12,14,16,18,20,22,24. ' + 'These are the mappings for the sizes xx-small to xx-large, ' + 'with the final size being for huge fonts. The font ' + 'rescaling algorithm uses these sizes to intelligently ' + 'rescale fonts. 
The default is to use a mapping based on ' + 'the output profile you chose.' + ) + ), + +OptionRecommendation(name='line_height', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('The line height in pts. Controls spacing between consecutive ' + 'lines of text. By default ??' + ) + ), + +OptionRecommendation(name='linearize_tables', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Some badly designed documents use tables to control the ' + 'layout of text on the page. When converted these documents ' + 'often have text that runs of the page and other artifacts. ' + 'This option will extract the content from the tables and ' + 'present it in a linear fashion.' + ) + ), + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, short_switch='m', @@ -268,8 +307,34 @@ OptionRecommendation(name='language', self.reader = OEBReader() self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor) # Read OEB Book into OEBBook + self.log.info('Parsing all content...') self.reader(self.oeb, opfpath) - + self.opts.source = self.opts.input_profile + self.opts.dest = self.opts.output_profile + + from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener + fbase = self.opts.base_font_size + if fbase == 0: + fbase = self.opts.dest.fbase + fkey = self.opts.font_size_mapping + if fkey is None: + fkey = self.opts.dest.fsizes + + flattener = CSSFlattener(fbase=fbase, fkey=fkey, + lineh=self.opts.line_height, + untable=self.opts.linearize_tables) + self.log.info('Flattening CSS...') + flattener(self.oeb, self.opts) + + from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer + + self.log.info('Cleaning up manifest...') + trimmer = ManifestTrimmer() + trimmer(self.oeb, self.opts) + + self.log.info('Creating %s output...'%self.output_plugin.name) + self.output_plugin(self.oeb, self.output, self.input_plugin, self.opts, + self.log) diff --git a/src/calibre/ebooks/oeb/output.py 
b/src/calibre/ebooks/oeb/output.py index d8d52859eb..b26934e18a 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -3,7 +3,12 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import os + +from lxml import etree + from calibre.customize.conversion import OutputFormatPlugin +from calibre import CurrentDir class OEBOutput(OutputFormatPlugin): @@ -12,6 +17,22 @@ class OEBOutput(OutputFormatPlugin): file_type = 'oeb' - def convert(self, oeb_book, input_plugin, options, context, log): - pass + def convert(self, oeb_book, output_path, input_plugin, opts, log): + self.log, self.opts = log, opts + if not os.path.exists(output_path): + os.makedirs(output_path) + from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME + with CurrentDir(output_path): + results = oeb_book.to_opf2(page_map=True) + for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME): + href, root = results.pop(key, None) + if root is not None: + raw = etree.tostring(root, pretty_print=True, + encoding='utf-8') + with open(href, 'wb') as f: + f.write(raw) + + for item in oeb_book.manifest: + print item.href + From 90362ab56ae0594651571117c0e934e108c7b877 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 31 Mar 2009 18:41:49 -0400 Subject: [PATCH 051/319] txt output now uses new conversion pipeline --- src/calibre/customize/builtins.py | 5 +- src/calibre/ebooks/conversion/plumber.py | 2 +- src/calibre/ebooks/metadata/txt.py | 2 +- src/calibre/ebooks/txt/from_any.py | 74 ------------- src/calibre/ebooks/txt/output.py | 62 +++++++++++ src/calibre/ebooks/txt/writer.py | 130 ++++------------------- 6 files changed, 90 insertions(+), 185 deletions(-) delete mode 100644 src/calibre/ebooks/txt/from_any.py create mode 100644 src/calibre/ebooks/txt/output.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 2cbf036c1f..acc7ba71ec 100644 
--- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -160,7 +160,7 @@ class ODTMetadataReader(MetadataReaderPlugin): from calibre.ebooks.metadata.odt import get_metadata return get_metadata(stream) -class TXTMetadataReader(MetaReaderPlugin): +class TXTMetadataReader(MetadataReaderPlugin): name = 'Read TXT metadata' file_types = set(['txt']) @@ -266,9 +266,10 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.oeb.output import OEBOutput +from calibre.ebooks.txt.output import TXTOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 5393aaf034..da41423750 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -195,7 +195,7 @@ OptionRecommendation(name='language', self.input_fmt = input_fmt self.output_fmt = output_fmt - # Build set of all possible options. Two options are equal iff their + # Build set of all possible options. Two options are equal if their # names are the same. 
self.input_options = self.input_plugin.options.union( self.input_plugin.common_options) diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py index 5a5ab13ae9..6283c72256 100644 --- a/src/calibre/ebooks/metadata/txt.py +++ b/src/calibre/ebooks/metadata/txt.py @@ -22,7 +22,7 @@ def get_metadata(stream, extract_cover=True): else: mdata += line - mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*\n\n\n[ ]*(?P<author>.+)[ ]*\n$', mdata) + mo = re.search('(?u)^[ ]*(?P<title>.+)[ ]*(\n{3}|(\r\n){3}|\r{3})[ ]*(?P<author>.+)[ ]*(\n|\r\n|\r)$', mdata) if mo != None: mi.title = mo.group('title') mi.authors = mo.group('author').split(',') diff --git a/src/calibre/ebooks/txt/from_any.py b/src/calibre/ebooks/txt/from_any.py deleted file mode 100644 index caf5364c3c..0000000000 --- a/src/calibre/ebooks/txt/from_any.py +++ /dev/null @@ -1,74 +0,0 @@ -''' -Convert any ebook format to TXT. -''' - -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \ - 'and Marshall T. 
Vandegrift <llasram@gmail.com>' \ - 'and John Schember <john@nachtimwald.com>' -__docformat__ = 'restructuredtext en' - -import sys, os, glob, logging - -from calibre.ebooks.epub.from_any import any2epub, formats, USAGE -from calibre.ebooks.epub import config as common_config -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.txt.writer import oeb2txt, config as txt_config - -def config(defaults=None): - c = common_config(defaults=defaults, name='txt') - c.remove_opt('profile') - del c.option_set.groups['metadata'] - del c.option_set.groups['traversal'] - del c.option_set.groups['structure detection'] - del c.option_set.groups['toc'] - del c.option_set.groups['page layout'] - txtc = txt_config(defaults=defaults) - c.update(txtc) - return c - -def option_parser(usage=USAGE): - usage = usage % ('TXT', formats()) - parser = config().option_parser(usage=usage) - return parser - -def any2txt(opts, path, notification=None): - ext = os.path.splitext(path)[1] - if not ext: - raise ValueError('Unknown file type: '+path) - ext = ext.lower()[1:] - - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(path))[0]+'.txt' - - opts.output = os.path.abspath(opts.output) - orig_output = opts.output - - with TemporaryDirectory('_any2txt') as tdir: - oebdir = os.path.join(tdir, 'oeb') - os.mkdir(oebdir) - opts.output = os.path.join(tdir, 'dummy.epub') - opts.profile = 'None' - opts.dont_split_on_page_breaks = True - orig_bfs = opts.base_font_size2 - opts.base_font_size2 = 0 - any2epub(opts, path, create_epub=False, oeb_cover=False, extract_to=oebdir) - opts.base_font_size2 = orig_bfs - opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] - opts.output = orig_output - logging.getLogger('html2epub').info(_('Creating TXT file from EPUB...')) - oeb2txt(opts, opf) - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No input file specified.' 
- return 1 - any2txt(opts, args[1]) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py new file mode 100644 index 0000000000..21498074ac --- /dev/null +++ b/src/calibre/ebooks/txt/output.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin, \ + OptionRecommendation +from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata +from calibre.ebooks.metadata import authors_to_string + +class TXTOutput(OutputFormatPlugin): + + name = 'TXT Output' + author = 'John Schember' + file_type = 'txt' + + options = set([ + OptionRecommendation(name='newline', recommended_value='system', + level=OptionRecommendation.LOW, long_switch='newline', + short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(), + help=_('Type of newline to use. Options are %s. Default is \'system\'. ' + 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' + 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' + 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))), + OptionRecommendation(name='prepend_author', recommended_value='true', + level=OptionRecommendation.LOW, long_switch='prepend_author', + choices=['true', 'false'], + help=_('Write the author to the beginning of the file. ' + 'Default is \'true\'. Use \'false\' to disable.')), + OptionRecommendation(name='prepend_title', recommended_value='true', + choices=['true', 'false'], + level=OptionRecommendation.LOW, long_switch='prepend_title', + help=_('Write the title to the beginning of the file. ' + 'Default is \'true\'. 
Use \'false\' to disable.')) + ]) + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + metadata = TxtMetadata() + if opts.prepend_author.lower() == 'true': + metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors) + if opts.prepend_title.lower() == 'true': + metadata.title = opts.title if opts.title else oeb_book.metadata.title + + writer = TxtWriter(TxtNewlines(opts.newline).newline, log) + txt = writer.dump(oeb_book.spine, metadata) + + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.write(txt) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 205d8423e3..eabc2d64ed 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -1,34 +1,26 @@ # -*- coding: utf-8 -*- +from __future__ import with_statement ''' Write content to TXT. 
''' -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' -import os, logging, re, sys +import os, re, sys + +from calibre.ebooks.htmlsymbols import HTML_SYMBOLS from BeautifulSoup import BeautifulSoup -from calibre import LoggingInterface -from calibre.ebooks.htmlsymbols import HTML_SYMBOLS -from calibre.ebooks.epub.iterator import SpineItem -from calibre.ebooks.metadata import authors_to_string -from calibre.ebooks.metadata.meta import metadata_from_formats -from calibre.ebooks.metadata.opf2 import OPF -from calibre.customize.ui import run_plugins_on_postprocess -from calibre.utils.config import Config, StringConfig - -class TXTWriter(object): - def __init__(self, newline): +class TxtWriter(object): + def __init__(self, newline, log): self.newline = newline + self.log = log - def dump(self, oebpath, path, metadata): - opf = OPF(oebpath, os.path.dirname(oebpath)) - spine = [SpineItem(i.path) for i in opf.spine] - - tmpout = '' + def dump(self, spine, metadata): + out = u'' for item in spine: with open(item, 'r') as itemf: content = itemf.read().decode(item.encoding) @@ -39,25 +31,21 @@ class TXTWriter(object): content = self.replace_html_symbols(content) content = self.cleanup_text(content) content = self.specified_newlines(content) - tmpout = tmpout + content + out += content # Prepend metadata if metadata.author != None and metadata.author != '': - tmpout = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + tmpout + out = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + out if metadata.title != None and metadata.title != '': - tmpout = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + tmpout + out = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + out # Put two blank lines at end of file - - end = tmpout[-3 * 
len(self.newline):] + end = out[-3 * len(self.newline):] for i in range(3 - end.count(self.newline)): - tmpout = tmpout + self.newline + out += self.newline + + return out - if os.path.exists(path): - os.remove(path) - with open(path, 'w+b') as out: - out.write(tmpout.encode('utf-8')) - def strip_html(self, html): stripped = u'' @@ -149,14 +137,8 @@ class TXTWriter(object): if self.newline == '\n': return text - return text.replace('\n', self.newline) - -class TxtMetadata(object): - def __init__(self): - self.author = None - self.title = None - self.series = None - + return text.replace('\n', self.newline) + class TxtNewlines(object): NEWLINE_TYPES = { @@ -170,73 +152,7 @@ class TxtNewlines(object): self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) -def config(defaults=None): - desc = _('Options to control the conversion to TXT') - if defaults is None: - c = Config('txt', desc) - else: - c = StringConfig(defaults, desc) - - txt = c.add_group('TXT', _('TXT options.')) - - txt('newline', ['--newline'], default='system', - help=_('Type of newline to use. Options are %s. Default is \'system\'. ' - 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' - 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' - 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))) - txt('prepend_author', ['--prepend-author'], default='true', - help=_('Write the author to the beginning of the file. ' - 'Default is \'true\'. Use \'false\' to disable.')) - txt('prepend_title', ['--prepend-title'], default='true', - help=_('Write the title to the beginning of the file. ' - 'Default is \'true\'. Use \'false\' to disable.')) - - return c - -def option_parser(): - c = config() - parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') - parser.add_option( - '-o', '--output', default=None, - help=_('Output file. 
Default is derived from input filename.')) - parser.add_option( - '-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def oeb2txt(opts, inpath): - logger = LoggingInterface(logging.getLogger('oeb2txt')) - logger.setup_cli_handler(opts.verbose) - - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] + '.txt' - - mi = metadata_from_formats([inpath]) - metadata = TxtMetadata() - if opts.prepend_author.lower() == 'true': - metadata.author = opts.authors if opts.authors else authors_to_string(mi.authors) - if opts.prepend_title.lower() == 'true': - metadata.title = opts.title if opts.title else mi.title - - newline = TxtNewlines(opts.newline) - - writer = TXTWriter(newline.newline) - writer.dump(inpath, outpath, metadata) - run_plugins_on_postprocess(outpath, 'txt') - logger.log_info(_('Output written to ') + outpath) - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = oeb2txt(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) - +class TxtMetadata(object): + def __init__(self): + self.title = None + self.author = None From 79e509eeb48bf7156e62bae9ca9291311dd25778 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 31 Mar 2009 20:23:49 -0400 Subject: [PATCH 052/319] Move PDF output to use new conversion framework --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/pdf/from_any.py | 69 --------------------- src/calibre/ebooks/pdf/output.py | 62 +++++++++++++++++++ src/calibre/ebooks/pdf/writer.py | 99 +++++------------------------- src/calibre/ebooks/txt/output.py | 1 + 5 files changed, 79 insertions(+), 155 deletions(-) delete mode 100644 src/calibre/ebooks/pdf/from_any.py create mode 100644 src/calibre/ebooks/pdf/output.py diff --git 
a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index acc7ba71ec..932261c45d 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -267,9 +267,10 @@ from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput +from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput, PDFOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/pdf/from_any.py b/src/calibre/ebooks/pdf/from_any.py deleted file mode 100644 index e4fb937cdb..0000000000 --- a/src/calibre/ebooks/pdf/from_any.py +++ /dev/null @@ -1,69 +0,0 @@ -''' -Convert any ebook format to PDF. -''' - -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net ' \ - 'and Marshall T. 
Vandegrift <llasram@gmail.com>' \ - 'and John Schember <john@nachtimwald.com>' -__docformat__ = 'restructuredtext en' - -import sys, os, glob, logging - -from calibre.ebooks.epub.from_any import any2epub, formats, USAGE -from calibre.ebooks.epub import config as common_config -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.pdf.writer import oeb2pdf, config as pdf_config - -def config(defaults=None): - c = common_config(defaults=defaults, name='pdf') - c.remove_opt('profile') - pdfc = pdf_config(defaults=defaults) - c.update(pdfc) - return c - -def option_parser(usage=USAGE): - usage = usage % ('PDF', formats()) - parser = config().option_parser(usage=usage) - return parser - -def any2pdf(opts, path, notification=None): - ext = os.path.splitext(path)[1] - if not ext: - raise ValueError('Unknown file type: '+path) - ext = ext.lower()[1:] - - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(path))[0]+'.pdf' - - opts.output = os.path.abspath(opts.output) - orig_output = opts.output - - with TemporaryDirectory('_any2pdf') as tdir: - oebdir = os.path.join(tdir, 'oeb') - os.mkdir(oebdir) - opts.output = os.path.join(tdir, 'dummy.epub') - opts.profile = 'None' - opts.dont_split_on_page_breaks = True - orig_bfs = opts.base_font_size2 - opts.base_font_size2 = 0 - any2epub(opts, path, create_epub=False, oeb_cover=True, extract_to=oebdir) - opts.base_font_size2 = orig_bfs - opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] - opts.output = orig_output - logging.getLogger('html2epub').info(_('Creating PDF file from EPUB...')) - oeb2pdf(opts, opf) - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No input file specified.' 
- return 1 - any2pdf(opts, args[1]) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py new file mode 100644 index 0000000000..71bd77ee73 --- /dev/null +++ b/src/calibre/ebooks/pdf/output.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Convert OEB ebook format to PDF. +''' + +#unit, papersize, orientation, custom_size, profile + +import os + +from calibre.customize.conversion import OutputFormatPlugin, \ + OptionRecommendation +from calibre.ebooks.pdf.writer import PDFWriter, PDFMargins + +class PDFOutput(OutputFormatPlugin): + + name = 'PDF Output' + author = 'John Schember' + file_type = 'pdf' + + options = set([ + OptionRecommendation(name='margin_top', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_top', + help=_('The top margin around the document.')), + OptionRecommendation(name='margin_bottom', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_bottom', + help=_('The bottom margin around the document.')), + OptionRecommendation(name='margin_left', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_left', + help=_('The left margin around the document.')), + OptionRecommendation(name='margin_right', recommended_value='1', + level=OptionRecommendation.LOW, long_switch='margin_right', + help=_('The right margin around the document.')), + ]) + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + margins = PDFMargins() + margins.top = opts.margin_top + margins.bottom = opts.margin_bottom + margins.left = opts.margin_left + margins.right = opts.margin_right + + writer = PDFWriter(log, margins) + + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + 
os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + writer.dump(oeb_book.spine, out_stream) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index c189407dac..511c968a20 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -1,20 +1,17 @@ -''' -Write content to PDF. -''' +# -*- coding: utf-8 -*- from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' -import os, logging, shutil, sys +''' +Write content to PDF. +''' + +import os, shutil, sys -from calibre import LoggingInterface -from calibre.ebooks.epub.iterator import SpineItem -from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.customize.ui import run_plugins_on_postprocess -from calibre.utils.config import Config, StringConfig - from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ QMetaObject, Qt @@ -29,13 +26,14 @@ class PDFMargins: self.left = margin self.right = margin + class PDFWriter(QObject): - def __init__(self, margins=PDFMargins()): + def __init__(self, log, margins=PDFMargins()): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) - self.logger = logging.getLogger('oeb2pdf') + self.logger = log self.loop = QEventLoop() self.view = QWebView() @@ -45,13 +43,12 @@ class PDFWriter(QObject): self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') self.margins = margins - def dump(self, oebpath, path): + def dump(self, spine, out_stream): self._delete_tmpdir() - opf = OPF(oebpath, os.path.dirname(oebpath)) - self.render_queue = [SpineItem(i.path) for i in opf.spine] + self.render_queue = spine[:] self.combine_queue = [] - self.path = path + 
self.out_stream = out_stream QMetaObject.invokeMethod(self, "_render_book", Qt.QueuedConnection) self.loop.exec_() @@ -98,75 +95,7 @@ class PDFWriter(QObject): inputPDF = PdfFileReader(file(item, 'rb')) for page in inputPDF.pages: outPDF.addPage(page) - outputStream = file(self.path, 'wb') - outPDF.write(outputStream) - outputStream.close() + outPDF.write(self.out_stream) finally: self._delete_tmpdir() self.loop.exit(0) - - -def config(defaults=None): - desc = _('Options to control the conversion to PDF') - if defaults is None: - c = Config('pdf', desc) - else: - c = StringConfig(defaults, desc) - - pdf = c.add_group('PDF', _('PDF options.')) - - pdf('margin_top', ['--margin_top'], default=1, - help=_('The top margin around the document in inches.')) - pdf('margin_bottom', ['--margin_bottom'], default=1, - help=_('The bottom margin around the document in inches.')) - pdf('margin_left', ['--margin_left'], default=1, - help=_('The left margin around the document in inches.')) - pdf('margin_right', ['--margin_right'], default=1, - help=_('The right margin around the document in inches.')) - - return c - -def option_parser(): - c = config() - parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') - parser.add_option( - '-o', '--output', default=None, - help=_('Output file. 
Default is derived from input filename.')) - parser.add_option( - '-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def oeb2pdf(opts, inpath): - logger = LoggingInterface(logging.getLogger('oeb2pdf')) - logger.setup_cli_handler(opts.verbose) - - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] + '.pdf' - - margins = PDFMargins() - margins.top = opts.margin_top - margins.bottom = opts.margin_bottom - margins.left = opts.margin_left - margins.right = opts.margin_right - - writer = PDFWriter(margins) - writer.dump(inpath, outpath) - run_plugins_on_postprocess(outpath, 'pdf') - logger.log_info(_('Output written to ') + outpath) - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = oeb2pdf(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 21498074ac..7d44172b3f 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -56,6 +56,7 @@ class TXTOutput(OutputFormatPlugin): out_stream = output_path out_stream.seek(0) + out_stream.truncate() out_stream.write(txt) if close: From e624b088d7e3067f4c546b66b6671059cefd2c4c Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 31 Mar 2009 18:51:46 -0700 Subject: [PATCH 053/319] Conversion pipeline now works for conversion from MOBI to OEB --- src/calibre/customize/conversion.py | 12 +++++-- src/calibre/customize/profiles.py | 4 ++- src/calibre/ebooks/conversion/cli.py | 4 +-- src/calibre/ebooks/conversion/plumber.py | 16 ++++++--- src/calibre/ebooks/mobi/reader.py | 12 +++---- src/calibre/ebooks/oeb/base.py | 4 +-- src/calibre/ebooks/oeb/output.py | 19 +++++++++-- src/calibre/ebooks/oeb/reader.py | 2 +- 
src/calibre/ebooks/oeb/transforms/flatcss.py | 12 +++---- src/calibre/utils/logging.py | 34 ++++++++++---------- 10 files changed, 74 insertions(+), 45 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index b25704569b..6530e5f16c 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -170,7 +170,8 @@ class InputFormatPlugin(Plugin): if not os.path.exists(options.debug_input): os.makedirs(options.debug_input) shutil.rmtree(options.debug_input) - shutil.copytree('.', options.debug_input) + shutil.copytree(output_dir, options.debug_input) + log.info('Input debug saved to:', options.debug_input) return ret @@ -195,7 +196,14 @@ class OutputFormatPlugin(Plugin): #: Options shared by all Input format plugins. Do not override #: in sub-classes. Use :member:`options` instead. Every option must be an #: instance of :class:`OptionRecommendation`. - common_options = set([]) + common_options = set([ + OptionRecommendation(name='pretty_print', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('If specified, the output plugin will try to create output ' + 'that is as human readable as possible. May not have any effect ' + 'for some output plugins.') + ), + ]) #: Options to customize the behavior of this plugin. Every option must be an #: instance of :class:`OptionRecommendation`. 
diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 5e1ff297bb..8623a94ddd 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -25,9 +25,11 @@ class Plugin(_Plugin): screen_size = (800, 600) dpi = 100 - def initialize(self): + def __init__(self, *args, **kwargs): + _Plugin.__init__(self, *args, **kwargs) self.width, self.height = self.screen_size fsizes = list(self.fsizes) + self.fkey = list(self.fsizes) self.fsizes = [] for (name, num), size in izip(FONT_SIZES, fsizes): self.fsizes.append((name, num, float(size))) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 9994b61a7c..e8f4aa68e2 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -92,9 +92,9 @@ def add_input_output_options(parser, plumber): parser.add_option_group(io) if output_options: - title = plumber.output_fmt.upper() + ' ' + _('OPTIONS') + title = _('OUTPUT OPTIONS') oo = OptionGroup(parser, title, _('Options to control the processing' - ' of the output %s file')%plumber.input_fmt) + ' of the output %s')%plumber.output_fmt) add_options(oo.add_option, output_options) parser.add_option_group(oo) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 5393aaf034..fe20e5877f 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -9,6 +9,7 @@ from calibre.customize.conversion import OptionRecommendation from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format from calibre.ebooks.conversion.preprocess import HTMLPreProcessor +from calibre.ptempfile import PersistentTemporaryDirectory class OptionValues(object): pass @@ -289,6 +290,8 @@ OptionRecommendation(name='language', ''' # Setup baseline option values self.setup_options() + if self.opts.verbose: + self.log.filter_level = self.log.DEBUG 
# Run any preprocess plugins from calibre.customize.ui import run_plugins_on_preprocess @@ -300,9 +303,11 @@ OptionRecommendation(name='language', from calibre.ebooks.oeb.base import OEBBook accelerators = {} + tdir = PersistentTemporaryDirectory('_plumber') + opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, self.input_fmt, self.log, - accelerators) + accelerators, tdir) html_preprocessor = HTMLPreProcessor() self.reader = OEBReader() self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor) @@ -316,15 +321,16 @@ OptionRecommendation(name='language', from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener fbase = self.opts.base_font_size if fbase == 0: - fbase = self.opts.dest.fbase + fbase = float(self.opts.dest.fbase) fkey = self.opts.font_size_mapping if fkey is None: - fkey = self.opts.dest.fsizes + fkey = self.opts.dest.fkey + else: + fkey = map(float, fkey.split(',')) flattener = CSSFlattener(fbase=fbase, fkey=fkey, lineh=self.opts.line_height, untable=self.opts.linearize_tables) - self.log.info('Flattening CSS...') flattener(self.oeb, self.opts) from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer @@ -334,7 +340,7 @@ OptionRecommendation(name='language', trimmer(self.oeb, self.opts) self.log.info('Creating %s output...'%self.output_plugin.name) - self.output_plugin(self.oeb, self.output, self.input_plugin, self.opts, + self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 32e0126b12..fcd09d13c7 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' Read data from .mobi files ''' -import struct, os, cStringIO, re, functools, datetime +import struct, os, cStringIO, re, functools, datetime, textwrap try: from PIL import Image as PILImage @@ -162,7 +162,7 @@ class MobiReader(object): 
self.log = log self.debug = debug self.embedded_mi = None - self.base_css_rules = ''' + self.base_css_rules = textwrap.dedent(''' blockquote { margin: 0em 0em 0em 1.25em; text-align: justify } p { margin: 0em; text-align: justify } @@ -174,7 +174,7 @@ class MobiReader(object): .mbp_pagebreak { page-break-after: always; margin: 0; display: block } - ''' + ''') self.tag_css_rules = [] if hasattr(filename_or_stream, 'read'): @@ -223,7 +223,7 @@ class MobiReader(object): processed_records = self.extract_text() if self.debug is not None: - self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html + parse_cache['calibre_raw_mobi_markup'] = self.mobi_html self.add_anchors() self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') @@ -265,7 +265,6 @@ class MobiReader(object): pass parse_cache[htmlfile] = root self.htmlfile = htmlfile - self.log.debug('Creating OPF...') ncx = cStringIO.StringIO() opf = self.create_opf(htmlfile, guide, root) self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' @@ -283,8 +282,7 @@ class MobiReader(object): if self.book_header.exth is not None or self.embedded_mi is not None: - if self.verbose: - print 'Creating OPF...' 
+ self.log.debug('Creating OPF...') ncx = cStringIO.StringIO() opf = self.create_opf(htmlfile, guide, root) opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 4ce984b9a8..e96de5112f 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -658,9 +658,9 @@ class Manifest(object): def _parse_css(self, data): data = self.oeb.decode(data) - data = self.CSSPreProcessor(data) + data = self.oeb.css_preprocessor(data) data = XHTML_CSS_NAMESPACE + data - parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING, + parser = CSSParser(loglevel=logging.WARNING, fetcher=self._fetch_css) data = parser.parseString(data, href=self.href) data.namespaces['h'] = XHTML_NS diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index b26934e18a..2716ff57cd 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -25,7 +25,7 @@ class OEBOutput(OutputFormatPlugin): with CurrentDir(output_path): results = oeb_book.to_opf2(page_map=True) for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME): - href, root = results.pop(key, None) + href, root = results.pop(key, [None, None]) if root is not None: raw = etree.tostring(root, pretty_print=True, encoding='utf-8') @@ -33,6 +33,21 @@ class OEBOutput(OutputFormatPlugin): f.write(raw) for item in oeb_book.manifest: - print item.href + path = os.path.abspath(item.href) + dir = os.path.dirname(path) + if not os.path.exists(dir): + os.makedirs(dir) + raw = item.data + if not isinstance(raw, basestring): + if hasattr(raw, 'cssText'): + raw = raw.cssText + else: + raw = etree.tostring(raw, encoding='utf-8', + pretty_print=opts.pretty_print) + raw = raw + '<?xml version="1.0" encoding="utf-8" ?>\n' + if isinstance(raw, unicode): + raw = raw.encode('utf-8') + with open(path, 'wb') as f: + f.write(raw) diff --git a/src/calibre/ebooks/oeb/reader.py 
b/src/calibre/ebooks/oeb/reader.py index 60c2cf23bf..f4430ac07c 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -168,7 +168,7 @@ class OEBReader(object): data. ''' bad = [] - check = OEB_DOCS+OEB_STYLES + check = OEB_DOCS.union(OEB_STYLES) for item in list(self.oeb.manifest.values()): if item.media_type in check: try: diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 9833b3b4d0..b33042e10b 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -44,7 +44,7 @@ class KeyMapper(object): logb = abs(base - endp) result = sign * math.log(diff, logb) return result - + def __getitem__(self, ssize): ssize = asfloat(ssize, 0) if ssize in self.cache: @@ -75,7 +75,7 @@ class NullMapper(object): def __getitem__(self, ssize): return ssize - + def FontMapper(sbase=None, dbase=None, dkey=None): if sbase and dbase and dkey: return KeyMapper(sbase, dbase, dkey) @@ -101,7 +101,7 @@ class CSSFlattener(object): @classmethod def generate(cls, opts): return cls() - + def __call__(self, oeb, context): oeb.logger.info('Flattening CSS and remapping font sizes...') self.oeb = oeb @@ -127,7 +127,7 @@ class CSSFlattener(object): self.baseline_node(child, stylizer, sizes, csize) if child.tail: sizes[csize] += len(COLLAPSE.sub(' ', child.tail)) - + def baseline_spine(self): sizes = defaultdict(float) for item in self.oeb.spine: @@ -157,7 +157,7 @@ class CSSFlattener(object): else: value = round(value / slineh) * dlineh cssdict[property] = "%0.5fem" % (value / fsize) - + def flatten_node(self, node, stylizer, names, styles, psize, left=0): if not isinstance(node.tag, basestring) \ or namespace(node.tag) != XHTML_NS: @@ -267,7 +267,7 @@ class CSSFlattener(object): manifest.remove(item) item = manifest.add(id, href, CSS_MIME, data=css) return href - + def flatten_spine(self): names = defaultdict(int) styles = {} diff --git 
a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py index d5a55ac48b..911e69f745 100644 --- a/src/calibre/utils/logging.py +++ b/src/calibre/utils/logging.py @@ -17,18 +17,18 @@ from functools import partial class Stream(object): - + def __init__(self, stream): from calibre import prints self._prints = prints self.stream = stream - + def flush(self): self.stream.flush() class ANSIStream(Stream): - + def __init__(self, stream=sys.stdout): Stream.__init__(self, stream) from calibre.utils.terminfo import TerminalController @@ -40,18 +40,18 @@ class ANSIStream(Stream): ERROR: tc.RED } self.normal = tc.NORMAL - + def prints(self, level, *args, **kwargs): self.stream.write(self.color[level]) kwargs['file'] = self.stream self._prints(*args, **kwargs) self.stream.write(self.normal) - + def flush(self): self.stream.flush() - + class HTMLStream(Stream): - + def __init__(self, stream=sys.stdout): Stream.__init__(self, stream) self.color = { @@ -61,13 +61,13 @@ class HTMLStream(Stream): ERROR: '<span style="color:red">' } self.normal = '</span>' - + def prints(self, level, *args, **kwargs): self.stream.write(self.color[level]) kwargs['file'] = self.stream self._prints(*args, **kwargs) self.stream.write(self.normal) - + def flush(self): self.stream.flush() @@ -77,28 +77,28 @@ class Log(object): INFO = INFO WARN = WARN ERROR = ERROR - + def __init__(self, level=INFO): self.filter_level = level default_output = ANSIStream() self.outputs = [default_output] - - self.debug = partial(self.prints, DEBUG) + + self.debug = partial(self.prints, DEBUG) self.info = partial(self.prints, INFO) self.warn = self.warning = partial(self.prints, WARN) - self.error = partial(self.prints, ERROR) - - + self.error = partial(self.prints, ERROR) + + def prints(self, level, *args, **kwargs): if level < self.filter_level: return for output in self.outputs: output.prints(level, *args, **kwargs) - + def exception(self, *args, **kwargs): limit = kwargs.pop('limit', None) self.prints(ERROR, *args, 
**kwargs) self.prints(DEBUG, traceback.format_exc(limit)) def __call__(self, *args, **kwargs): - self.prints(INFO, *args, **kwargs) \ No newline at end of file + self.prints(INFO, *args, **kwargs) From 596e3f71388cef57c1e7593c796431a984e66233 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 1 Apr 2009 07:39:41 -0400 Subject: [PATCH 054/319] More robust pdf output --- src/calibre/ebooks/pdf/output.py | 37 ++++++++-- src/calibre/ebooks/pdf/pageoptions.py | 98 +++++++++++++++++++++++++++ src/calibre/ebooks/pdf/writer.py | 18 ++--- 3 files changed, 135 insertions(+), 18 deletions(-) create mode 100644 src/calibre/ebooks/pdf/pageoptions.py diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 71bd77ee73..5af4e4bed7 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -13,7 +13,9 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation -from calibre.ebooks.pdf.writer import PDFWriter, PDFMargins +from calibre.ebooks.pdf.writer import PDFWriter +from calibre.ebooks.pdf.pageoptions import UNITS, unit, PAPER_SIZES, \ + paper_size, ORIENTATIONS, orientation, PageOptions class PDFOutput(OutputFormatPlugin): @@ -34,16 +36,37 @@ class PDFOutput(OutputFormatPlugin): OptionRecommendation(name='margin_right', recommended_value='1', level=OptionRecommendation.LOW, long_switch='margin_right', help=_('The right margin around the document.')), + + OptionRecommendation(name='unit', recommended_value='inch', + level=OptionRecommendation.LOW, short_switch='u', + long_switch='unit', choices=UNITS.keys(), + help=_('The unit of measure. Default is inch. Choices ' + 'are %s' % UNITS.keys())), + OptionRecommendation(name='paper_size', recommended_value='letter', + level=OptionRecommendation.LOW, + long_switch='paper_size', choices=PAPER_SIZES.keys(), + help=_('The size of the paper. Default is letter. 
Choices ' + 'are %s' % PAPER_SIZES.keys())), + OptionRecommendation(name='orientation', recommended_value='portrait', + level=OptionRecommendation.LOW, + long_switch='orientation', choices=ORIENTATIONS.keys(), + help=_('The orientation of the page. Default is portrait. Choices ' + 'are %s' % ORIENTATIONS.keys())), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - margins = PDFMargins() - margins.top = opts.margin_top - margins.bottom = opts.margin_bottom - margins.left = opts.margin_left - margins.right = opts.margin_right + popts = PageOptions() + + popts.set_margin_top(opts.margin_top) + popts.set_margin_bottom(opts.margin_bottom) + popts.set_margin_left(opts.margin_left) + popts.set_margin_right(opts.margin_right) + + popts.unit = unit(opts.unit) + popts.paper_size = paper_size(opts.paper_size) + popts.orientation = orientation(opts.orientation) - writer = PDFWriter(log, margins) + writer = PDFWriter(log, popts) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/pdf/pageoptions.py b/src/calibre/ebooks/pdf/pageoptions.py new file mode 100644 index 0000000000..26fae81662 --- /dev/null +++ b/src/calibre/ebooks/pdf/pageoptions.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +from PyQt4.Qt import QPrinter + +UNITS = { + 'millimeter' : QPrinter.Millimeter, + 'point' : QPrinter.Point, + 'inch' : QPrinter.Inch, + 'pica' : QPrinter.Pica, + 'didot' : QPrinter.Didot, + 'cicero' : QPrinter.Cicero, + 'devicepixel' : QPrinter.DevicePixel, + } + +def unit(unit): + return UNITS.get(unit, QPrinter.Inch) + +PAPER_SIZES = { + 'a0' : QPrinter.A0, # 841 x 1189 mm + 'a1' : QPrinter.A1, # 594 x 841 mm + 'a2' : QPrinter.A2, # 420 x 594 mm + 'a3' : QPrinter.A3, # 297 x 420 mm + 'a4' : QPrinter.A4, # 210 x 297 mm, 8.26 x 11.69 inches + 'a5' : QPrinter.A5, # 148 x 210 mm + 'a6' : QPrinter.A6, # 105 x 148 mm + 'a7' 
: QPrinter.A7, # 74 x 105 mm + 'a8' : QPrinter.A8, # 52 x 74 mm + 'a9' : QPrinter.A9, # 37 x 52 mm + 'b0' : QPrinter.B0, # 1030 x 1456 mm + 'b1' : QPrinter.B1, # 728 x 1030 mm + 'b2' : QPrinter.B2, # 515 x 728 mm + 'b3' : QPrinter.B3, # 364 x 515 mm + 'b4' : QPrinter.B4, # 257 x 364 mm + 'b5' : QPrinter.B5, # 182 x 257 mm, 7.17 x 10.13 inches + 'b6' : QPrinter.B6, # 128 x 182 mm + 'b7' : QPrinter.B7, # 91 x 128 mm + 'b8' : QPrinter.B8, # 64 x 91 mm + 'b9' : QPrinter.B9, # 45 x 64 mm + 'b10' : QPrinter.B10, # 32 x 45 mm + 'c5e' : QPrinter.C5E, # 163 x 229 mm + 'comm10e' : QPrinter.Comm10E, # 105 x 241 mm, U.S. Common 10 Envelope + 'dle' : QPrinter.DLE, # 110 x 220 mm + 'executive' : QPrinter.Executive, # 7.5 x 10 inches, 191 x 254 mm + 'folio' : QPrinter.Folio, # 210 x 330 mm + 'ledger' : QPrinter.Ledger, # 432 x 279 mm + 'legal' : QPrinter.Legal, # 8.5 x 14 inches, 216 x 356 mm + 'letter' : QPrinter.Letter, # 8.5 x 11 inches, 216 x 279 mm + 'tabloid' : QPrinter.Tabloid, # 279 x 432 mm + #'custom' : QPrinter.Custom, # Unknown, or a user defined size. 
+ } + +def paper_size(size): + return PAPER_SIZES.get(size, QPrinter.Letter) + +ORIENTATIONS = { + 'portrait' : QPrinter.Portrait, + 'landscape' : QPrinter.Landscape, + } + +def orientation(orientation): + return ORIENTATIONS.get(orientation, QPrinter.Portrait) + + +class PageOptions(object): + margin_top = 1 + margin_bottom = 1 + margin_left = 1 + margin_right = 1 + unit = QPrinter.Inch + paper_size = QPrinter.Letter + orientation = QPrinter.Portrait + + def set_margin_top(self, size): + try: + self.margin_top = int(size) + except: + self.margin_top = 1 + + def set_margin_bottom(self, size): + try: + self.margin_bottom = int(size) + except: + self.margin_bottom = 1 + + def set_margin_left(self, size): + try: + self.margin_left = int(size) + except: + self.margin_left = 1 + + def set_margin_right(self, size): + try: + self.margin_right = int(size) + except: + self.margin_right = 1 diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 511c968a20..cf77aebc14 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -12,23 +12,17 @@ Write content to PDF. 
import os, shutil, sys from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.ebooks.pdf.pageoptions import PageOptions + from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ QMetaObject, Qt from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader - -class PDFMargins: - def __init__(self, margin=1): - self.top = margin - self.bottom = margin - self.left = margin - self.right = margin - class PDFWriter(QObject): - def __init__(self, log, margins=PDFMargins()): + def __init__(self, log, popts=PageOptions()): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) @@ -41,7 +35,7 @@ class PDFWriter(QObject): self.render_queue = [] self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') - self.margins = margins + self.popts = popts def dump(self, spine, out_stream): self._delete_tmpdir() @@ -75,7 +69,9 @@ class PDFWriter(QObject): self.logger.debug('\tRendering item as %s' % item_path) printer = QPrinter(QPrinter.HighResolution) - printer.setPageMargins(self.margins.left, self.margins.top, self.margins.right, self.margins.bottom, QPrinter.Inch) + printer.setPageMargins(self.popts.margin_left, self.popts.margin_top, self.popts.margin_right, self.popts.margin_bottom, self.popts.unit) + printer.setPaperSize(self.popts.paper_size) + printer.setOrientation(self.popts.orientation) printer.setOutputFormat(QPrinter.PdfFormat) printer.setOutputFileName(item_path) self.view.print_(printer) From 118fd6ece0625f9bb95657df74401abe46f775ad Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 1 Apr 2009 08:08:03 -0400 Subject: [PATCH 055/319] reverse pdfmanipulate command --- src/calibre/ebooks/pdf/manipulate.py | 11 ++-- src/calibre/ebooks/pdf/reverse.py | 88 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 src/calibre/ebooks/pdf/reverse.py diff --git 
a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate.py index 262aaf78d4..8c49650730 100644 --- a/src/calibre/ebooks/pdf/manipulate.py +++ b/src/calibre/ebooks/pdf/manipulate.py @@ -10,13 +10,14 @@ __docformat__ = 'restructuredtext en' import string, sys from calibre.utils.config import Config, StringConfig -from calibre.ebooks.pdf import info, merge, split, trim +from calibre.ebooks.pdf import info, merge, reverse, split, trim COMMANDS = { - 'info' : info, - 'merge' : merge, - 'split' : split, - 'trim' : trim, + 'info' : info, + 'merge' : merge, + 'reverse' : reverse, + 'split' : split, + 'trim' : trim, } def config(defaults=None): diff --git a/src/calibre/ebooks/pdf/reverse.py b/src/calibre/ebooks/pdf/reverse.py new file mode 100644 index 0000000000..87bb9018c1 --- /dev/null +++ b/src/calibre/ebooks/pdf/reverse.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Reverse content of PDF. +''' + +import os, sys + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import Config, StringConfig + +from pyPdf import PdfFileWriter, PdfFileReader + +def config(defaults=None): + desc = _('Options to control the transformation of pdf') + if defaults is None: + c = Config('reversepdf', desc) + else: + c = StringConfig(defaults, desc) + c.add_opt('output', ['-o', '--output'], default='reversed.pdf', + help=_('Path to output file. By default a file is created in the current directory.')) + return c + +def option_parser(name): + c = config() + return c.option_parser(usage=_('''\ + %prog %%name [options] file1.pdf + + Reverse PDF. 
+ '''.replace('%%name', name))) + +def reverse(pdf_path, out_path, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + for page in reversed(pdf.pages): + out_pdf.addPage(page) + + with open(out_path, 'wb') as out_file: + out_pdf.write(out_file) + +# Return True if the pdf is valid. +def valid_pdf(pdf_path): + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted or pdf.numPages <= 0: + raise Exception + except: + return False + return True + + +def main(args=sys.argv, name=''): + parser = option_parser(name) + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 1: + print 'Error: A PDF file is required.\n\n' + print parser.get_usage() + return 2 + + if not valid_pdf(args[0]): + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' 
% args[0] + return 2 + + mi = metadata_from_formats([args[0]]) + + reverse(args[0], opts.output, mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) From fca3f98e6a2b3e7dbd98da3618b1f3d805ad8c1e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 1 Apr 2009 20:05:34 -0400 Subject: [PATCH 056/319] Text input plugin --- src/calibre/customize/builtins.py | 3 ++- src/calibre/ebooks/txt/input.py | 42 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/txt/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 932261c45d..ab6d772121 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -265,12 +265,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, OEBOutput, TXTOutput, PDFOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, TXTInput, OEBOutput, TXTOutput, PDFOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py new file mode 100644 index 0000000000..a42c72866f --- /dev/null +++ b/src/calibre/ebooks/txt/input.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import 
InputFormatPlugin +from calibre.ebooks.markdown import markdown +from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata import MetaInformation +#from calibre.ebooks.metadata.meta import metadata_from_formats + +class TXTInput(InputFormatPlugin): + + name = 'TXT Input' + author = 'John Schember' + description = 'Convert TXT files to HTML' + file_types = set(['txt']) + + def convert(self, stream, options, file_ext, log, + accelerators): + txt = stream.read() + + md = markdown.Markdown( + extensions=['footnotes', 'tables', 'toc'], + safe_mode=False,) + html = '<html><body>'+md.convert(txt)+'</body></html>' + with open('index.html', 'wb') as index: + index.write(html.encode('utf-8')) + + #mi = metadata_from_formats([stream.name]) + mi = MetaInformation(_('Unknown'), _('Unknown')) + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest([('index.html', None)]) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(os.getcwd(), 'metadata.opf') From e6e9a2058657c7cb5b7e9cac52553d58ae010e99 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 1 Apr 2009 21:21:26 -0400 Subject: [PATCH 057/319] Txt output mostly fixed --- src/calibre/ebooks/txt/output.py | 6 +++--- src/calibre/ebooks/txt/writer.py | 19 +++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 7d44172b3f..5e58d47ef1 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -39,10 +39,10 @@ class TXTOutput(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): metadata = TxtMetadata() if opts.prepend_author.lower() == 'true': - metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors) + metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors.value) 
if oeb_book.metadata.authors != [] else _('Unknown') if opts.prepend_title.lower() == 'true': - metadata.title = opts.title if opts.title else oeb_book.metadata.title - + metadata.title = opts.title if opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') + writer = TxtWriter(TxtNewlines(opts.newline).newline, log) txt = writer.dump(oeb_book.spine, metadata) diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index eabc2d64ed..2fb5f9550a 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -22,16 +22,15 @@ class TxtWriter(object): def dump(self, spine, metadata): out = u'' for item in spine: - with open(item, 'r') as itemf: - content = itemf.read().decode(item.encoding) - # Convert newlines to unix style \n for processing. These - # will be changed to the specified type later in the process. - content = self.unix_newlines(content) - content = self.strip_html(content) - content = self.replace_html_symbols(content) - content = self.cleanup_text(content) - content = self.specified_newlines(content) - out += content + content = unicode(item) + # Convert newlines to unix style \n for processing. These + # will be changed to the specified type later in the process. 
+ content = self.unix_newlines(content) + content = self.strip_html(content) + content = self.replace_html_symbols(content) + content = self.cleanup_text(content) + content = self.specified_newlines(content) + out += content # Prepend metadata if metadata.author != None and metadata.author != '': From d7e9ca4bee2f2609c0ff636f1aacf052c5f08607 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Wed, 1 Apr 2009 20:42:02 -0700 Subject: [PATCH 058/319] MOBI Input:Fix passthrough of TOC to conversion pipeline --- src/calibre/ebooks/conversion/plumber.py | 2 +- src/calibre/ebooks/mobi/reader.py | 20 +++++++++++++------- src/calibre/ebooks/oeb/output.py | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index c2fefa29db..6142cb555a 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -339,7 +339,7 @@ OptionRecommendation(name='language', trimmer = ManifestTrimmer() trimmer(self.oeb, self.opts) - self.log.info('Creating %s output...'%self.output_plugin.name) + self.log.info('Creating %s...'%self.output_plugin.name) self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index fcd09d13c7..a78b5085d9 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -266,12 +266,14 @@ class MobiReader(object): parse_cache[htmlfile] = root self.htmlfile = htmlfile ncx = cStringIO.StringIO() - opf = self.create_opf(htmlfile, guide, root) + opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' - opf.render(open(self.created_opf_path, 'wb'), ncx) + opf.render(open(self.created_opf_path, 'wb'), ncx, + ncx_manifest_entry=ncx_manifest_entry) ncx = ncx.getvalue() if ncx: - 
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx') + open(ncx_path, 'wb').write(ncx) with open('styles.css', 'wb') as s: s.write(self.base_css_rules+'\n\n') @@ -284,8 +286,9 @@ class MobiReader(object): if self.book_header.exth is not None or self.embedded_mi is not None: self.log.debug('Creating OPF...') ncx = cStringIO.StringIO() - opf = self.create_opf(htmlfile, guide, root) - opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx) + opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) + opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx, + ncx_manifest_entry ) ncx = ncx.getvalue() if ncx: open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) @@ -434,7 +437,10 @@ class MobiReader(object): for ref in opf.guide: if ref.type.lower() == 'toc': toc = ref.href() + + ncx_manifest_entry = None if toc: + ncx_manifest_entry = 'toc.ncx' elems = root.xpath('//*[@id="%s"]'%toc.partition('#')[-1]) tocobj = None ent_pat = re.compile(r'&(\S+?);') @@ -461,7 +467,7 @@ class MobiReader(object): if tocobj is not None: opf.set_toc(tocobj) - return opf + return opf, ncx_manifest_entry def sizeof_trailing_entries(self, data): @@ -589,7 +595,7 @@ def get_metadata(stream): if mr.book_header.exth is None: mi = MetaInformation(mr.name, [_('Unknown')]) else: - mi = mr.create_opf('dummy.html') + mi = mr.create_opf('dummy.html')[0] try: if hasattr(mr.book_header.exth, 'cover_offset'): cover_index = mr.book_header.first_image_index + \ diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index 2716ff57cd..fc1366fbcd 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -44,7 +44,7 @@ class OEBOutput(OutputFormatPlugin): else: raw = etree.tostring(raw, encoding='utf-8', pretty_print=opts.pretty_print) - raw = raw + '<?xml version="1.0" encoding="utf-8" ?>\n' + raw = '<?xml version="1.0" encoding="utf-8" ?>\n'+raw if 
isinstance(raw, unicode): raw = raw.encode('utf-8') with open(path, 'wb') as f: From dc3a6a1f618aa964801762dc7ad8dccb945044fc Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 06:10:50 -0400 Subject: [PATCH 059/319] Finish txt output --- src/calibre/ebooks/txt/output.py | 14 ++++---------- src/calibre/ebooks/txt/writer.py | 6 +++--- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 5e58d47ef1..c1e48d98fd 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -24,23 +24,17 @@ class TXTOutput(OutputFormatPlugin): 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))), - OptionRecommendation(name='prepend_author', recommended_value='true', - level=OptionRecommendation.LOW, long_switch='prepend_author', + OptionRecommendation(name='prepend_metadata', recommended_value='false', + level=OptionRecommendation.LOW, long_switch='prepend_metadata', choices=['true', 'false'], - help=_('Write the author to the beginning of the file. ' + help=_('Write the title and author to the beginning of the file. ' 'Default is \'true\'. Use \'false\' to disable.')), - OptionRecommendation(name='prepend_title', recommended_value='true', - choices=['true', 'false'], - level=OptionRecommendation.LOW, long_switch='prepend_title', - help=_('Write the title to the beginning of the file. ' - 'Default is \'true\'. 
Use \'false\' to disable.')) ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): metadata = TxtMetadata() - if opts.prepend_author.lower() == 'true': + if opts.prepend_metadata.lower() == 'true': metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors.value) if oeb_book.metadata.authors != [] else _('Unknown') - if opts.prepend_title.lower() == 'true': metadata.title = opts.title if opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') writer = TxtWriter(TxtNewlines(opts.newline).newline, log) diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 2fb5f9550a..efd3ec0a2f 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -34,9 +34,9 @@ class TxtWriter(object): # Prepend metadata if metadata.author != None and metadata.author != '': - out = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + out - if metadata.title != None and metadata.title != '': - out = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + out + if metadata.title != None and metadata.title != '': + out = (u'%s%s%s%s' % (metadata.author.upper(), self.newline, self.newline, self.newline)) + out + out = (u'%s%s%s%s' % (metadata.title.upper(), self.newline, self.newline, self.newline)) + out # Put two blank lines at end of file end = out[-3 * len(self.newline):] From 1d669699ba76d856e5af7b2e4199af0cf30c9343 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 06:38:55 -0400 Subject: [PATCH 060/319] PDF output complete --- src/calibre/ebooks/pdf/output.py | 45 +++++++++++++++++++------------- src/calibre/ebooks/pdf/writer.py | 12 +++++---- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 5af4e4bed7..e76bcdd3d7 100644 --- 
a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +from __future__ import with_statement + __license__ = 'GPL 3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' @@ -9,10 +11,12 @@ Convert OEB ebook format to PDF. #unit, papersize, orientation, custom_size, profile -import os +import os, glob from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation +from calibre.ebooks.oeb.output import OEBOutput +from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.pdf.writer import PDFWriter from calibre.ebooks.pdf.pageoptions import UNITS, unit, PAPER_SIZES, \ paper_size, ORIENTATIONS, orientation, PageOptions @@ -65,21 +69,26 @@ class PDFOutput(OutputFormatPlugin): popts.unit = unit(opts.unit) popts.paper_size = paper_size(opts.paper_size) popts.orientation = orientation(opts.orientation) - - writer = PDFWriter(log, popts) + + with TemporaryDirectory('_any2pdf') as oebdir: + OEBOutput(None).convert(oeb_book, oebdir, input_plugin, opts, log) + + opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] + + writer = PDFWriter(log, popts) - close = False - if not hasattr(output_path, 'write'): - close = True - if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': - os.makedirs(os.path.dirname(output_path)) - out_stream = open(output_path, 'wb') - else: - out_stream = output_path - - out_stream.seek(0) - out_stream.truncate() - writer.dump(oeb_book.spine, out_stream) - - if close: - out_stream.close() + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + writer.dump(opf, out_stream) + + if close: + out_stream.close() diff 
--git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index cf77aebc14..a618213189 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -13,6 +13,7 @@ import os, shutil, sys from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ebooks.pdf.pageoptions import PageOptions +from calibre.ebooks.metadata.opf2 import OPF from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ @@ -37,10 +38,11 @@ class PDFWriter(QObject): self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') self.popts = popts - def dump(self, spine, out_stream): + def dump(self, opfpath, out_stream): self._delete_tmpdir() - self.render_queue = spine[:] + opf = OPF(opfpath, os.path.dirname(opfpath)) + self.render_queue = [i.path for i in opf.spine] self.combine_queue = [] self.out_stream = out_stream @@ -56,7 +58,7 @@ class PDFWriter(QObject): def _render_next(self): item = str(self.render_queue.pop(0)) - self.combine_queue.append(os.path.join(self.tmp_path, '%s_%i.pdf' % (os.path.basename(item), len(self.combine_queue)))) + self.combine_queue.append(os.path.join(self.tmp_path, '%i.pdf' % (len(self.combine_queue) + 1))) self.logger.info('Processing %s...' 
% item) @@ -64,9 +66,9 @@ class PDFWriter(QObject): def _render_html(self, ok): if ok: - item_path = os.path.join(self.tmp_path, '%s_%i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue) - 1)) + item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue)) - self.logger.debug('\tRendering item as %s' % item_path) + self.logger.debug('\tRendering item %s as %i' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) printer = QPrinter(QPrinter.HighResolution) printer.setPageMargins(self.popts.margin_left, self.popts.margin_top, self.popts.margin_right, self.popts.margin_bottom, self.popts.unit) From 57e643caf6614688a8e88c453d5e3951fcbf02a9 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 06:44:55 -0400 Subject: [PATCH 061/319] remove any2pdf references --- src/calibre/ebooks/pdf/output.py | 2 +- src/calibre/ebooks/pdf/writer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index e76bcdd3d7..230beed9ae 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -70,7 +70,7 @@ class PDFOutput(OutputFormatPlugin): popts.paper_size = paper_size(opts.paper_size) popts.orientation = orientation(opts.orientation) - with TemporaryDirectory('_any2pdf') as oebdir: + with TemporaryDirectory('_pdf_out') as oebdir: OEBOutput(None).convert(oeb_book, oebdir, input_plugin, opts, log) opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index a618213189..2aebd7322c 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -82,7 +82,7 @@ class PDFWriter(QObject): def _delete_tmpdir(self): if os.path.exists(self.tmp_path): shutil.rmtree(self.tmp_path, True) - self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') + self.tmp_path = 
PersistentTemporaryDirectory('_pdf_out_parts') def _write(self): self.logger.info('Combining individual PDF parts...') From 54e7822128eedc2ee2950f4a4cbe4af18e8c7a2d Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 18:40:42 -0400 Subject: [PATCH 062/319] PDF input and txt output tweaks --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/oeb/base.py | 11 +++++ src/calibre/ebooks/pdf/input.py | 38 +++++++++++++++ src/calibre/ebooks/pdf/pdftohtml.py | 75 +++++++++++++++++++++++++++++ src/calibre/ebooks/txt/output.py | 2 +- src/calibre/ebooks/txt/writer.py | 7 +-- 6 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 src/calibre/ebooks/pdf/input.py create mode 100644 src/calibre/ebooks/pdf/pdftohtml.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ab6d772121..30f423fce3 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -265,13 +265,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, TXTInput, OEBOutput, TXTOutput, PDFOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index e96de5112f..7d489ec3ae 100644 --- a/src/calibre/ebooks/oeb/base.py +++ 
b/src/calibre/ebooks/oeb/base.py @@ -173,6 +173,9 @@ def xml2str(root, pretty_print=False): return etree.tostring(root, encoding='utf-8', xml_declaration=True, pretty_print=pretty_print) +def xml2unicode(root, pretty_print=False): + return etree.tostring(root, pretty_print=pretty_print) + ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -721,6 +724,14 @@ class Manifest(object): if isinstance(data, unicode): return data.encode('utf-8') return str(data) + + def __unicode__(self): + data = self.data + if isinstance(data, etree._Element): + return xml2unicode(data, pretty_print=self.oeb.pretty_print) + if isinstance(data, unicode): + return data + return unicode(data) def __eq__(self, other): return id(self) == id(other) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py new file mode 100644 index 0000000000..060b9f5367 --- /dev/null +++ b/src/calibre/ebooks/pdf/input.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.pdf.pdftohtml import pdftohtml +from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata import MetaInformation +#from calibre.ebooks.metadata.meta import metadata_from_formats + +class PDFInput(InputFormatPlugin): + + name = 'PDF Input' + author = 'John Schember' + description = 'Convert PDF files to HTML' + file_types = set(['pdf']) + + def convert(self, stream, options, file_ext, log, + accelerators): + html = pdftohtml(stream.name) + + with open('index.html', 'wb') as index: + index.write(html.encode('utf-8')) + + #mi = metadata_from_formats([stream.name]) + mi = MetaInformation(_('Unknown'), _('Unknown')) + opf = OPFCreator(os.getcwd(), mi) + 
opf.create_manifest([('index.html', None)]) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(os.getcwd(), 'metadata.opf') diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py new file mode 100644 index 0000000000..275cfadb08 --- /dev/null +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL 3' +__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> \ + 2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import errno, os, sys, subprocess +from functools import partial + +from calibre.ebooks import ConversionError, DRMError +from calibre import isosx, iswindows, islinux +from calibre import CurrentDir +from calibre.ptempfile import TemporaryDirectory + +PDFTOHTML = 'pdftohtml' +popen = subprocess.Popen +if isosx and hasattr(sys, 'frameworks_dir'): + PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML) +if iswindows and hasattr(sys, 'frozen'): + PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe') + popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up +if islinux and getattr(sys, 'frozen_path', False): + PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') + +def pdftohtml(pdf_path): + ''' + Convert the pdf into html using the pdftohtml app. + @return: The HTML as a unicode string. 
+ ''' + + if isinstance(pdf_path, unicode): + pdf_path = pdf_path.encode(sys.getfilesystemencoding()) + if not os.access(pdf_path, os.R_OK): + raise ConversionError, 'Cannot read from ' + pdf_path + + with TemporaryDirectory('_pdftohtml') as tdir: + index = os.path.join(tdir, 'index.html') + # This is neccessary as pdftohtml doesn't always (linux) respect absolute paths + pdf_path = os.path.abspath(pdf_path) + cmd = (PDFTOHTML, '-noframes', '-p', '-nomerge', pdf_path, os.path.basename(index)) + cwd = os.getcwd() + + with CurrentDir(tdir): + try: + p = popen(cmd, stderr=subprocess.PIPE) + except OSError, err: + if err.errno == 2: + raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True) + else: + raise + + while True: + try: + ret = p.wait() + break + except OSError, e: + if e.errno == errno.EINTR: + continue + else: + raise + + if ret != 0: + err = p.stderr.read() + raise ConversionError, err + if not os.path.exists(index) or os.stat(index).st_size < 100: + raise DRMError() + + with open(index, 'rb') as i: + raw = i.read().decode('latin-1') + if not '<br' in raw[:4000]: + raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True) + + return u'<!-- created by calibre\'s pdftohtml -->\n' + raw diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index c1e48d98fd..2d1ef98662 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -51,7 +51,7 @@ class TXTOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - out_stream.write(txt) + out_stream.write(txt.encode('utf-8')) if close: out_stream.close() diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index efd3ec0a2f..0f84c32804 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -102,12 +102,7 @@ class TxtWriter(object): text = text.replace('\f+', ' ') # Single line paragraph. 
- r = re.compile('.\n.') - while True: - mo = r.search(text) - if mo == None: - break - text = '%s %s' % (text[:mo.start()+1], text[mo.end()-1:]) + text = re.sub('(?<=.)\n(?=.)', ' ', text) # Remove multiple spaces. text = re.sub('[ ]+', ' ', text) From 044d1d65fbe8726ac31aef74116fa411c60a044e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 20:12:45 -0400 Subject: [PATCH 063/319] Get avaliable input/output file ext --- src/calibre/customize/ui.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index d8b7ebf6d8..ee5dc03713 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -254,16 +254,31 @@ def plugin_for_input_format(fmt): if fmt.lower() in plugin.file_types: return plugin +def available_input_formats(): + formats = [] + for plugin in input_format_plugins(): + if not is_disabled(plugin): + for format in plugin.file_types: + formats.append(format) + return formats + def output_format_plugins(): for plugin in _initialized_plugins: if isinstance(plugin, OutputFormatPlugin): - yield plugin + yield plugin.file_type def plugin_for_output_format(fmt): for plugin in output_format_plugins(): if fmt.lower() == plugin.file_type: return plugin - + +def available_output_formats(): + formats = [] + for plugin in _initialized_plugins: + if isinstance(plugin, OutputFormatPlugin): + if not is_disabled(plugin): + formats.append(plugin.file_type) + return formats def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) From a9a74acbdec1f843e91cecd29135a6a3827bd08b Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 20:33:14 -0400 Subject: [PATCH 064/319] tweaks --- src/calibre/customize/ui.py | 9 ++++----- src/calibre/ebooks/pdf/pdftohtml.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/calibre/customize/ui.py 
b/src/calibre/customize/ui.py index ee5dc03713..af85ca523d 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -265,7 +265,7 @@ def available_input_formats(): def output_format_plugins(): for plugin in _initialized_plugins: if isinstance(plugin, OutputFormatPlugin): - yield plugin.file_type + yield plugin def plugin_for_output_format(fmt): for plugin in output_format_plugins(): @@ -274,10 +274,9 @@ def plugin_for_output_format(fmt): def available_output_formats(): formats = [] - for plugin in _initialized_plugins: - if isinstance(plugin, OutputFormatPlugin): - if not is_disabled(plugin): - formats.append(plugin.file_type) + for plugin in output_format_plugins(): + if not is_disabled(plugin): + formats.append(plugin.file_type) return formats def disable_plugin(plugin_or_name): diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 275cfadb08..168923ad1a 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -39,7 +39,7 @@ def pdftohtml(pdf_path): index = os.path.join(tdir, 'index.html') # This is neccessary as pdftohtml doesn't always (linux) respect absolute paths pdf_path = os.path.abspath(pdf_path) - cmd = (PDFTOHTML, '-noframes', '-p', '-nomerge', pdf_path, os.path.basename(index)) + cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-i', '-q', pdf_path, os.path.basename(index)) cwd = os.getcwd() with CurrentDir(tdir): From 754923ce07cbd268039b70bb9c8563f217b17730 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 20:44:48 -0400 Subject: [PATCH 065/319] pdf input to txt output giving correct output --- src/calibre/ebooks/pdf/input.py | 2 +- src/calibre/ebooks/pdf/pdftohtml.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 060b9f5367..6f55b71dd5 100644 --- a/src/calibre/ebooks/pdf/input.py +++ 
b/src/calibre/ebooks/pdf/input.py @@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin): html = pdftohtml(stream.name) with open('index.html', 'wb') as index: - index.write(html.encode('utf-8')) + index.write(html) #mi = metadata_from_formats([stream.name]) mi = MetaInformation(_('Unknown'), _('Unknown')) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 168923ad1a..27cdb3f691 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -68,8 +68,8 @@ def pdftohtml(pdf_path): raise DRMError() with open(index, 'rb') as i: - raw = i.read().decode('latin-1') + raw = i.read() if not '<br' in raw[:4000]: raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True) - return u'<!-- created by calibre\'s pdftohtml -->\n' + raw + return '<!-- created by calibre\'s pdftohtml -->\n' + raw From 70e3ea15bba8c1143508b435b3871cdb6b2cebee Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 2 Apr 2009 20:55:46 -0400 Subject: [PATCH 066/319] move pdf manipulate into its own module --- src/calibre/ebooks/pdf/{manipulate.py => manipulate/cli.py} | 2 +- src/calibre/ebooks/pdf/{ => manipulate}/info.py | 0 src/calibre/ebooks/pdf/{ => manipulate}/merge.py | 0 src/calibre/ebooks/pdf/{ => manipulate}/reverse.py | 0 src/calibre/ebooks/pdf/{ => manipulate}/split.py | 0 src/calibre/ebooks/pdf/{ => manipulate}/trim.py | 0 src/calibre/linux.py | 2 +- 7 files changed, 2 insertions(+), 2 deletions(-) rename src/calibre/ebooks/pdf/{manipulate.py => manipulate/cli.py} (95%) rename src/calibre/ebooks/pdf/{ => manipulate}/info.py (100%) rename src/calibre/ebooks/pdf/{ => manipulate}/merge.py (100%) rename src/calibre/ebooks/pdf/{ => manipulate}/reverse.py (100%) rename src/calibre/ebooks/pdf/{ => manipulate}/split.py (100%) rename src/calibre/ebooks/pdf/{ => manipulate}/trim.py (100%) diff --git 
a/src/calibre/ebooks/pdf/manipulate.py b/src/calibre/ebooks/pdf/manipulate/cli.py similarity index 95% rename from src/calibre/ebooks/pdf/manipulate.py rename to src/calibre/ebooks/pdf/manipulate/cli.py index 8c49650730..e82946f2ea 100644 --- a/src/calibre/ebooks/pdf/manipulate.py +++ b/src/calibre/ebooks/pdf/manipulate/cli.py @@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en' import string, sys from calibre.utils.config import Config, StringConfig -from calibre.ebooks.pdf import info, merge, reverse, split, trim +from calibre.ebooks.pdf.manipulate import info, merge, reverse, split, trim COMMANDS = { 'info' : info, diff --git a/src/calibre/ebooks/pdf/info.py b/src/calibre/ebooks/pdf/manipulate/info.py similarity index 100% rename from src/calibre/ebooks/pdf/info.py rename to src/calibre/ebooks/pdf/manipulate/info.py diff --git a/src/calibre/ebooks/pdf/merge.py b/src/calibre/ebooks/pdf/manipulate/merge.py similarity index 100% rename from src/calibre/ebooks/pdf/merge.py rename to src/calibre/ebooks/pdf/manipulate/merge.py diff --git a/src/calibre/ebooks/pdf/reverse.py b/src/calibre/ebooks/pdf/manipulate/reverse.py similarity index 100% rename from src/calibre/ebooks/pdf/reverse.py rename to src/calibre/ebooks/pdf/manipulate/reverse.py diff --git a/src/calibre/ebooks/pdf/split.py b/src/calibre/ebooks/pdf/manipulate/split.py similarity index 100% rename from src/calibre/ebooks/pdf/split.py rename to src/calibre/ebooks/pdf/manipulate/split.py diff --git a/src/calibre/ebooks/pdf/trim.py b/src/calibre/ebooks/pdf/manipulate/trim.py similarity index 100% rename from src/calibre/ebooks/pdf/trim.py rename to src/calibre/ebooks/pdf/manipulate/trim.py diff --git a/src/calibre/linux.py b/src/calibre/linux.py index b680ecc304..592a25c170 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -39,7 +39,7 @@ entry_points = { 'calibre-fontconfig = calibre.utils.fontconfig:main', 'calibre-parallel = calibre.parallel:main', 'calibre-customize = 
calibre.customize.ui:main', - 'pdfmanipulate = calibre.ebooks.pdf.manipulate:main', + 'pdfmanipulate = calibre.ebooks.pdf.manipulate.cli:main', 'fetch-ebook-metadata = calibre.ebooks.metadata.fetch:main', 'calibre-smtp = calibre.utils.smtp:main', ], From 7f5a619ad9e65cdf64b3d2a825d4516575c0e7c6 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 3 Apr 2009 21:06:31 -0400 Subject: [PATCH 067/319] pdfmanipulate moved to new command line option framework --- src/calibre/ebooks/__init__.py | 1 + src/calibre/ebooks/pdf/manipulate/__init__.py | 0 src/calibre/ebooks/pdf/manipulate/cli.py | 62 +++---- src/calibre/ebooks/pdf/manipulate/crop.py | 155 ++++++++++++++++++ src/calibre/ebooks/pdf/manipulate/info.py | 48 +++--- src/calibre/ebooks/pdf/manipulate/merge.py | 74 ++++++--- src/calibre/ebooks/pdf/manipulate/reverse.py | 64 +++++--- src/calibre/ebooks/pdf/manipulate/split.py | 92 +++++++---- src/calibre/ebooks/pdf/manipulate/trim.py | 93 ----------- src/calibre/ebooks/pdf/pdftohtml.py | 4 +- 10 files changed, 368 insertions(+), 225 deletions(-) create mode 100644 src/calibre/ebooks/pdf/manipulate/__init__.py create mode 100644 src/calibre/ebooks/pdf/manipulate/crop.py delete mode 100644 src/calibre/ebooks/pdf/manipulate/trim.py diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index e208b5a688..26d2394818 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -1,3 +1,4 @@ +from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' diff --git a/src/calibre/ebooks/pdf/manipulate/__init__.py b/src/calibre/ebooks/pdf/manipulate/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/ebooks/pdf/manipulate/cli.py b/src/calibre/ebooks/pdf/manipulate/cli.py index e82946f2ea..e3fcef559c 100644 --- a/src/calibre/ebooks/pdf/manipulate/cli.py +++ b/src/calibre/ebooks/pdf/manipulate/cli.py @@ -1,69 +1,69 @@ 
-''' -Command line interface to run pdf manipulation commands. -''' from __future__ import with_statement +# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' +''' +Command line interface to run pdf manipulation commands. +''' + import string, sys -from calibre.utils.config import Config, StringConfig -from calibre.ebooks.pdf.manipulate import info, merge, reverse, split, trim +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.manipulate import crop, info, merge, reverse, split COMMANDS = { + 'crop' : crop, 'info' : info, 'merge' : merge, 'reverse' : reverse, 'split' : split, - 'trim' : trim, } -def config(defaults=None): - desc = _('Options to control the transformation of pdf') - if defaults is None: - c = Config('manipulatepdf', desc) - else: - c = StringConfig(defaults, desc) - return c +USAGE = '%prog ' + _('''command ... + +command can be one of the following: +[%%commands] + +Use %prog command --help to get more information about a specific command + +Manipulate a PDF. +'''.replace('%%commands', string.join(sorted(COMMANDS.keys()), ', '))) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) def option_parser(): - c = config() - return c.option_parser(usage=_('''\ - - %prog command ... - - command can be one of the following: - [%%commands] - - Use %prog command --help to get more information about a specific command - - Manipulate a PDF. 
- '''.replace('%%commands', string.join(sorted(COMMANDS.keys()), ', ')))) + return OptionParser(usage=USAGE) def main(args=sys.argv): + log = Log() parser = option_parser() if len(args) < 2: print 'Error: No command sepecified.\n' - print parser.get_usage() - return 2 + print_help(parser, log) + return 1 command = args[1].lower().strip() - if command in COMMANDS.keys(): + if command in COMMANDS.keys(): del args[1] return COMMANDS[command].main(args, command) else: parser.parse_args(args) print 'Unknown command %s.\n' % command - print parser.get_usage() - return 2 + print_help(parser, log) + return 1 # We should never get here. return 0 if __name__ == '__main__': sys.exit(main()) - diff --git a/src/calibre/ebooks/pdf/manipulate/crop.py b/src/calibre/ebooks/pdf/manipulate/crop.py new file mode 100644 index 0000000000..c3eb70c56d --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate/crop.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, James Beal <james_@catbus.co.uk>, ' \ + '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Crop a pdf file +''' + +import os, sys, re +from optparse import OptionGroup, Option + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation + +from pyPdf import PdfFileWriter, PdfFileReader + +DEFAULT_CROP = '10' + +USAGE = '%prog %%name ' + _(''' +[options] file.pdf + +Crop a PDF file. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='cropped.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. 
By default a file is created in the current directory.')), + OptionRecommendation(name='bottom_left_x', recommended_value=DEFAULT_CROP, + level=OptionRecommendation.LOW, long_switch='leftx', short_switch='x', + help=_('Number of pixels to crop from the left most x (default is %s) ' % DEFAULT_CROP)), + OptionRecommendation(name='bottom_left_y', recommended_value=DEFAULT_CROP, + level=OptionRecommendation.LOW, long_switch='lefty', short_switch='y', + help=_('Number of pixels to crop from the left most y (default is %s) ' % DEFAULT_CROP)), + OptionRecommendation(name='top_right_x', recommended_value=DEFAULT_CROP, + level=OptionRecommendation.LOW, long_switch='rightx', short_switch='v', + help=_('Number of pixels to crop from the right most x (default is %s) ' % DEFAULT_CROP)), + OptionRecommendation(name='top_right_y', recommended_value=DEFAULT_CROP, + level=OptionRecommendation.LOW, long_switch='right y', short_switch='w', + help=_('Number of pixels to crop from the right most y (default is %s)' % DEFAULT_CROP)), + OptionRecommendation(name='bounding', recommended_value=None, + level=OptionRecommendation.LOW, long_switch='bounding', short_switch='b', + help=_('A file generated by ghostscript which allows each page to be individually cropped `gs -dSAFER -dNOPAUSE -dBATCH -sDEVICE=bbox file.pdf 2> bounding`')), +]) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def option_parser(name): + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Crop Options:'), _('Options to control the transformation of 
pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) + +def crop_pdf(pdf_path, opts, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + input_pdf = PdfFileReader(open(pdf_path, 'rb')) + + bounding_lines = [] + if opts.bounding != None: + try: + bounding = open(opts.bounding , 'r') + bounding_regex = re.compile('%%BoundingBox: (?P<bottom_x>\d+) (?P<bottom_y>\d+) (?P<top_x>\d+) (?P<top_y>\d+)') + except: + raise Exception('Error reading %s' % opts.bounding) + + lines = bounding.readlines() + for line in lines: + if line.startswith('%%BoundingBox:'): + bounding_lines.append(line) + if len(bounding_lines) != input_pdf.numPages: + raise Exception('Error bounding file %s page count does not correspond to specified pdf' % opts.bounding) + + output_pdf = PdfFileWriter(title=title,author=author) + blines = iter(bounding_lines) + for page in input_pdf.pages: + if bounding_lines != []: + mo = bounding_regex.search(blines.next()) + if mo == None: + raise Exception('Error in bounding file %s' % opts.bounding) + page.mediaBox.upperRight = (mo.group('top_x'), mo.group('top_y')) + page.mediaBox.lowerLeft = (mo.group('bottom_x'), mo.group('bottom_y')) + else: + page.mediaBox.upperRight = (page.bleedBox.getUpperRight_x() - opts.top_right_x, page.bleedBox.getUpperRight_y() - opts.top_right_y) + page.mediaBox.lowerLeft = (page.bleedBox.getLowerLeft_x() + opts.bottom_left_x, page.bleedBox.getLowerLeft_y() + opts.bottom_left_y) + output_pdf.addPage(page) + + with open(opts.output, 'wb') as output_file: + output_pdf.write(output_file) + +# Return True if the pdf is valid. 
+def valid_pdf(pdf_path): + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted or pdf.numPages <= 0: + raise Exception + except: + return False + return True + +def main(args=sys.argv, name=''): + log = Log() + parser = option_parser(name) + add_options(parser) + + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 1: + print 'Error: A PDF file is required.\n' + print_help(parser, log) + return 1 + + if not valid_pdf(args[0]): + print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] + return 1 + + mi = metadata_from_formats([args[0]]) + + crop_pdf(args[0], opts, mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/info.py b/src/calibre/ebooks/pdf/manipulate/info.py index 115e411ce4..4aff524330 100644 --- a/src/calibre/ebooks/pdf/manipulate/info.py +++ b/src/calibre/ebooks/pdf/manipulate/info.py @@ -1,34 +1,37 @@ -''' -Merge PDF files into a single PDF document. -''' from __future__ import with_statement +# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import os, re, sys, time +''' +Merge PDF files into a single PDF document. +''' -from calibre.utils.config import Config, StringConfig +import os, re, sys, time +from optparse import OptionGroup, Option + +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation from pyPdf import PdfFileWriter, PdfFileReader +USAGE = '%prog %%name ' + _(''' +file.pdf ... -def config(defaults=None): - desc = _('Options to control the transformation of pdf') - if defaults is None: - c = Config('manipulatepdf', desc) - else: - c = StringConfig(defaults, desc) - return c +Get info about a PDF. 
+''') + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) def option_parser(name): - c = config() - return c.option_parser(usage=_('''\ - %prog %%name [options] file.pdf ... - - Get info about a PDF. - '''.replace('%%name', name))) + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) def print_info(pdf_path): with open(os.path.abspath(pdf_path), 'rb') as pdf_file: @@ -65,20 +68,22 @@ def verify_files(files): return invalid def main(args=sys.argv, name=''): + log = Log() parser = option_parser(name) + opts, args = parser.parse_args(args) args = args[1:] if len(args) < 1: print 'Error: No PDF sepecified.\n' - print parser.get_usage() - return 2 + print_help(parser, log) + return 1 bad_pdfs = verify_files(args) if bad_pdfs != []: for pdf in bad_pdfs: print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf - return 2 + return 1 for pdf in args: print_info(pdf) @@ -87,4 +92,3 @@ def main(args=sys.argv, name=''): if __name__ == '__main__': sys.exit(main()) - diff --git a/src/calibre/ebooks/pdf/manipulate/merge.py b/src/calibre/ebooks/pdf/manipulate/merge.py index c0385080ad..f0ecb9bd7a 100644 --- a/src/calibre/ebooks/pdf/manipulate/merge.py +++ b/src/calibre/ebooks/pdf/manipulate/merge.py @@ -1,37 +1,63 @@ -''' -Merge PDF files into a single PDF document. -''' from __future__ import with_statement +# -*- coding: utf-8 -*- __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' +''' +Merge PDF files into a single PDF document. 
+''' + import os, sys +from optparse import OptionGroup, Option from calibre.ebooks.metadata.meta import metadata_from_formats from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import Config, StringConfig +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation from pyPdf import PdfFileWriter, PdfFileReader -def config(defaults=None): - desc = _('Options to control the transformation of pdf') - if defaults is None: - c = Config('mergepdf', desc) - else: - c = StringConfig(defaults, desc) - c.add_opt('output', ['-o', '--output'], default='merged.pdf', - help=_('Path to output file. By default a file is created in the current directory.')) - return c +USAGE = '%prog %%name ' + _(''' +[options] file1.pdf file2.pdf ... + +Metadata will be used from the first PDF specified. + +Merges individual PDFs. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='merged.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. By default a file is created in the current directory.')), +]) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) def option_parser(name): - c = config() - return c.option_parser(usage=_('''\ - %prog %%name [options] file1.pdf file2.pdf ... + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) - Merges individual PDFs. Metadata will be used from the first PDF specified. 
- '''.replace('%%name', name))) +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Merge Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) def merge_files(in_paths, out_path, metadata=None): if metadata == None: @@ -65,20 +91,23 @@ def verify_files(files): return invalid def main(args=sys.argv, name=''): + log = Log() parser = option_parser(name) + add_options(parser) + opts, args = parser.parse_args(args) args = args[1:] if len(args) < 2: - print 'Error: Two or more PDF files are required.\n\n' - print parser.get_usage() - return 2 + print 'Error: Two or more PDF files are required.\n' + print_help(parser, log) + return 1 bad_pdfs = verify_files(args) if bad_pdfs != []: for pdf in bad_pdfs: print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf - return 2 + return 1 mi = metadata_from_formats([args[0]]) @@ -88,4 +117,3 @@ def main(args=sys.argv, name=''): if __name__ == '__main__': sys.exit(main()) - diff --git a/src/calibre/ebooks/pdf/manipulate/reverse.py b/src/calibre/ebooks/pdf/manipulate/reverse.py index 87bb9018c1..189cbf009b 100644 --- a/src/calibre/ebooks/pdf/manipulate/reverse.py +++ b/src/calibre/ebooks/pdf/manipulate/reverse.py @@ -10,30 +10,52 @@ Reverse content of PDF. 
''' import os, sys +from optparse import OptionGroup, Option from calibre.ebooks.metadata.meta import metadata_from_formats from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import Config, StringConfig +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation from pyPdf import PdfFileWriter, PdfFileReader -def config(defaults=None): - desc = _('Options to control the transformation of pdf') - if defaults is None: - c = Config('reversepdf', desc) - else: - c = StringConfig(defaults, desc) - c.add_opt('output', ['-o', '--output'], default='reversed.pdf', - help=_('Path to output file. By default a file is created in the current directory.')) - return c +USAGE = '%prog %%name ' + _(''' +[options] file.pdf + +Reverse PDF. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='reversed.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. By default a file is created in the current directory.')), +]) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) def option_parser(name): - c = config() - return c.option_parser(usage=_('''\ - %prog %%name [options] file1.pdf + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) - Reverse PDF. 
- '''.replace('%%name', name))) +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Reverse Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) def reverse(pdf_path, out_path, metadata=None): if metadata == None: @@ -63,20 +85,22 @@ def valid_pdf(pdf_path): return False return True - def main(args=sys.argv, name=''): + log = Log() parser = option_parser(name) + add_options(parser) + opts, args = parser.parse_args(args) args = args[1:] if len(args) < 1: - print 'Error: A PDF file is required.\n\n' - print parser.get_usage() - return 2 + print 'Error: A PDF file is required.\n' + print_help(parser, log) + return 1 if not valid_pdf(args[0]): print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] - return 2 + return 1 mi = metadata_from_formats([args[0]]) diff --git a/src/calibre/ebooks/pdf/manipulate/split.py b/src/calibre/ebooks/pdf/manipulate/split.py index cc6965dd68..8996a4cb6b 100644 --- a/src/calibre/ebooks/pdf/manipulate/split.py +++ b/src/calibre/ebooks/pdf/manipulate/split.py @@ -1,46 +1,68 @@ -''' -Split PDF file into multiple PDF documents. -''' +# -*- coding: utf-8 -*- from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' +''' +Split PDF file into multiple PDF documents. 
+''' + import os, sys, re +from optparse import OptionGroup, Option from calibre.ebooks.metadata.meta import metadata_from_formats from calibre.ebooks.metadata import authors_to_string -from calibre.utils.config import Config, StringConfig +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation from pyPdf import PdfFileWriter, PdfFileReader -def config(defaults=None): - desc = _('Options to control the transformation of pdf') - if defaults is None: - c = Config('splitpdf', desc) - else: - c = StringConfig(defaults, desc) - c.add_opt('output', ['-o', '--output'], default='split.pdf', - help=_('Path to output file. By default a file is created in the current directory. \ - The file name will be the base name for the output.')) - return c +USAGE = _(''' +%prog %%name [options] file.pdf page_to_split_on ... +%prog %%name [options] file.pdf page_range_to_split_on ... + +Ex. + +%prog %%name file.pdf 6 +%prog %%name file.pdf 6-12 +%prog %%name file.pdf 6-12 8 10 9-20 + +Split a PDF. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='split.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. By default a file is created in the current directory.')), +]) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) def option_parser(name): - c = config() - return c.option_parser(usage=_('''\ - - %prog %%name [options] file.pdf page_to_split_on ... - %prog %%name [options] file.pdf page_range_to_split_on ... - - Ex. - - %prog %%name file.pdf 6 - %prog %%name file.pdf 6-12 - %prog %%name file.pdf 6-12 8 10 9-20 + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) - Split a PDF. 
- '''.replace('%%name', name))) +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Split Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) def split_pdf(in_path, pages, page_ranges, out_name, metadata=None): pdf = PdfFileReader(open(os.path.abspath(in_path), 'rb')) @@ -153,25 +175,28 @@ def valid_pdf(pdf_path): return True def main(args=sys.argv, name=''): + log = Log() parser = option_parser(name) + add_options(parser) + opts, args = parser.parse_args(args) pdf, pages, page_ranges, unknown = split_args(args[1:]) if pdf == '' and (pages == [] or page_ranges == []): - print 'Error: PDF and where to split is required.\n\n' - print parser.get_usage() - return 2 + print 'Error: PDF and where to split is required.\n' + print_help(parser, log) + return 1 if unknown != []: for arg in unknown: print 'Error: Unknown argument `%s`' % arg - print parser.get_usage() - return 2 + print_help(parser, log) + return 1 if not valid_pdf(pdf): print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' 
% pdf - return 2 + return 1 pages, page_ranges = clean_page_list(pdf, pages, page_ranges) @@ -183,4 +208,3 @@ def main(args=sys.argv, name=''): if __name__ == '__main__': sys.exit(main()) - diff --git a/src/calibre/ebooks/pdf/manipulate/trim.py b/src/calibre/ebooks/pdf/manipulate/trim.py deleted file mode 100644 index b32312fee8..0000000000 --- a/src/calibre/ebooks/pdf/manipulate/trim.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2009, James Beal, james_@catbus.co.uk' -__docformat__ = 'restructuredtext en' - -'crop a pdf file' - -import os, sys, re -from calibre.utils.config import Config, StringConfig -from pyPdf import PdfFileWriter, PdfFileReader - -def config(defaults=None): - desc = _('Options to control the transformation of pdf') - default_crop=10 - if defaults is None: - c = Config('trimpdf', desc) - else: - c = StringConfig(defaults, desc) - c.add_opt('output', ['-o', '--output'],default='cropped.pdf', - help=_('Path to output file. 
By default a file is created in the current directory.')) - c.add_opt('bottom_left_x', [ '-x', '--leftx'], default=default_crop, - help=_('Number of pixels to crop from the left most x (default is %d) ')%default_crop ) - c.add_opt('bottom_left_y', [ '-y', '--lefty'], default=default_crop, - help=_('Number of pixels to crop from the left most y (default is %d) ')%default_crop ) - c.add_opt('top_right_x', [ '-v', '--rightx'], default=default_crop, - help=_('Number of pixels to crop from the right most x (default is %d) ')%default_crop ) - c.add_opt('top_right_y', [ '-w', '--righty'], default=default_crop, - help=_('Number of pixels to crop from the right most y (default is %d)')%default_crop ) - c.add_opt('bounding', ['-b', '--bounding'], - help=_('A file generated by ghostscript which allows each page to be individually cropped [gs -dSAFER -dNOPAUSE -dBATCH -sDEVICE=bbox > bounding] ')) - return c - - -def option_parser(name): - c = config() - return c.option_parser(usage=_('''\ - %prog %%name [options] file.pdf - - Crops a pdf. 
- '''.replace('%%name', name))) - -def main(args=sys.argv, name=''): - parser = option_parser(name) - opts, args = parser.parse_args(args) - try: - source = os.path.abspath(args[1]) - input_pdf = PdfFileReader(file(source, "rb")) - except: - print "Unable to read input" - return 2 - title = _('Unknown') - author = _('Unknown') - try: - info = input_pdf.getDocumentInfo() - if info.title: - title = info.title - if info.author: - author = info.author - except: - pass - if opts.bounding != None: - try: - bounding = open( opts.bounding , 'r' ) - bounding_regex= re.compile('%%BoundingBox: (?P<bottom_x>[0-9]+) (?P<bottom_y>[0-9]+) (?P<top_x>[0-9]+) (?P<top_y>[0-9]+)') - except: - print 'Error opening %s' % opts.bounding - return 1 - output_pdf = PdfFileWriter(title=title,author=author) - for page_number in range (0, input_pdf.getNumPages() ): - page = input_pdf.getPage(page_number) - if opts.bounding != None: - while True: - line=bounding.readline() - match=bounding_regex.search(line) - if match !=None: - break - page.mediaBox.upperRight = (match.group('top_x'),match.group('top_y')) - page.mediaBox.lowerLeft = (match.group('bottom_x'),match.group('bottom_y')) - else: - page.mediaBox.upperRight = (page.bleedBox.getUpperRight_x()-opts.top_right_x,page.bleedBox.getUpperRight_y()-opts.top_right_y) - page.mediaBox.lowerLeft = (page.bleedBox.getLowerLeft_x()+opts.bottom_left_x,page.bleedBox.getLowerLeft_y()+opts.bottom_left_y) - output_pdf.addPage(page) - if opts.bounding != None: - bounding.close() - output_file = file(opts.output, "wb") - output_pdf.write(output_file) - output_file.close() - - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 27cdb3f691..e7707479c3 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -2,8 +2,8 @@ from __future__ import with_statement __license__ = 'GPL 3' -__copyright__ = '2008, Kovid Goyal <kovid at 
kovidgoyal.net> \ - 2009, John Schember <john@nachtimwald.com>' +__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \ + '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' import errno, os, sys, subprocess From 697eabe9ae74a897f1ca2026fb9d0e45b8caf7ce Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Fri, 3 Apr 2009 22:05:41 -0400 Subject: [PATCH 068/319] Refactor pdf manipulate commands --- src/calibre/ebooks/pdf/manipulate/crop.py | 14 ++------ src/calibre/ebooks/pdf/manipulate/info.py | 14 ++------ src/calibre/ebooks/pdf/manipulate/merge.py | 16 ++------- src/calibre/ebooks/pdf/manipulate/reverse.py | 14 ++------ src/calibre/ebooks/pdf/manipulate/split.py | 14 ++------ src/calibre/ebooks/pdf/verify.py | 37 ++++++++++++++++++++ 6 files changed, 47 insertions(+), 62 deletions(-) create mode 100644 src/calibre/ebooks/pdf/verify.py diff --git a/src/calibre/ebooks/pdf/manipulate/crop.py b/src/calibre/ebooks/pdf/manipulate/crop.py index c3eb70c56d..fa996b754f 100644 --- a/src/calibre/ebooks/pdf/manipulate/crop.py +++ b/src/calibre/ebooks/pdf/manipulate/crop.py @@ -19,6 +19,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf from pyPdf import PdfFileWriter, PdfFileReader @@ -116,17 +117,6 @@ def crop_pdf(pdf_path, opts, metadata=None): with open(opts.output, 'wb') as output_file: output_pdf.write(output_file) - -# Return True if the pdf is valid. 
-def valid_pdf(pdf_path): - try: - with open(os.path.abspath(pdf_path), 'rb') as pdf_file: - pdf = PdfFileReader(pdf_file) - if pdf.isEncrypted or pdf.numPages <= 0: - raise Exception - except: - return False - return True def main(args=sys.argv, name=''): log = Log() @@ -141,7 +131,7 @@ def main(args=sys.argv, name=''): print_help(parser, log) return 1 - if not valid_pdf(args[0]): + if not is_valid_pdf(args[0]): print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] return 1 diff --git a/src/calibre/ebooks/pdf/manipulate/info.py b/src/calibre/ebooks/pdf/manipulate/info.py index 4aff524330..21a07fdeff 100644 --- a/src/calibre/ebooks/pdf/manipulate/info.py +++ b/src/calibre/ebooks/pdf/manipulate/info.py @@ -16,6 +16,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdfs from pyPdf import PdfFileWriter, PdfFileReader @@ -56,17 +57,6 @@ def print_info(pdf_path): print _('PDF Version: %s' % mo.group('version')) except: pass -def verify_files(files): - invalid = [] - - for pdf_path in files: - try: - with open(os.path.abspath(pdf_path), 'rb') as pdf_file: - pdf = PdfFileReader(pdf_file) - except: - invalid.append(pdf_path) - return invalid - def main(args=sys.argv, name=''): log = Log() parser = option_parser(name) @@ -79,7 +69,7 @@ def main(args=sys.argv, name=''): print_help(parser, log) return 1 - bad_pdfs = verify_files(args) + bad_pdfs = is_valid_pdfs(args) if bad_pdfs != []: for pdf in bad_pdfs: print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' 
% pdf diff --git a/src/calibre/ebooks/pdf/manipulate/merge.py b/src/calibre/ebooks/pdf/manipulate/merge.py index f0ecb9bd7a..1e285e3bdf 100644 --- a/src/calibre/ebooks/pdf/manipulate/merge.py +++ b/src/calibre/ebooks/pdf/manipulate/merge.py @@ -18,6 +18,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdfs from pyPdf import PdfFileWriter, PdfFileReader @@ -76,19 +77,6 @@ def merge_files(in_paths, out_path, metadata=None): with open(out_path, 'wb') as out_file: out_pdf.write(out_file) - -def verify_files(files): - invalid = [] - - for pdf_path in files: - try: - with open(os.path.abspath(pdf_path), 'rb') as pdf_file: - pdf = PdfFileReader(pdf_file) - if pdf.isEncrypted or pdf.numPages <= 0: - raise Exception - except: - invalid.append(pdf_path) - return invalid def main(args=sys.argv, name=''): log = Log() @@ -103,7 +91,7 @@ def main(args=sys.argv, name=''): print_help(parser, log) return 1 - bad_pdfs = verify_files(args) + bad_pdfs = is_valid_pdfs(args) if bad_pdfs != []: for pdf in bad_pdfs: print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' 
% pdf diff --git a/src/calibre/ebooks/pdf/manipulate/reverse.py b/src/calibre/ebooks/pdf/manipulate/reverse.py index 189cbf009b..564e523ae3 100644 --- a/src/calibre/ebooks/pdf/manipulate/reverse.py +++ b/src/calibre/ebooks/pdf/manipulate/reverse.py @@ -18,6 +18,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf from pyPdf import PdfFileWriter, PdfFileReader @@ -74,17 +75,6 @@ def reverse(pdf_path, out_path, metadata=None): with open(out_path, 'wb') as out_file: out_pdf.write(out_file) -# Return True if the pdf is valid. -def valid_pdf(pdf_path): - try: - with open(os.path.abspath(pdf_path), 'rb') as pdf_file: - pdf = PdfFileReader(pdf_file) - if pdf.isEncrypted or pdf.numPages <= 0: - raise Exception - except: - return False - return True - def main(args=sys.argv, name=''): log = Log() parser = option_parser(name) @@ -98,7 +88,7 @@ def main(args=sys.argv, name=''): print_help(parser, log) return 1 - if not valid_pdf(args[0]): + if not is_valid_pdf(args[0]): print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] return 1 diff --git a/src/calibre/ebooks/pdf/manipulate/split.py b/src/calibre/ebooks/pdf/manipulate/split.py index 8996a4cb6b..fb7e4d06d7 100644 --- a/src/calibre/ebooks/pdf/manipulate/split.py +++ b/src/calibre/ebooks/pdf/manipulate/split.py @@ -18,6 +18,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf from pyPdf import PdfFileWriter, PdfFileReader @@ -163,17 +164,6 @@ def clean_page_list(pdf_path, pages, page_ranges): return pages, page_ranges -# Return True if the pdf is valid. 
-def valid_pdf(pdf_path): - try: - with open(os.path.abspath(pdf_path), 'rb') as pdf_file: - pdf = PdfFileReader(pdf_file) - if pdf.isEncrypted or pdf.numPages <= 0: - raise Exception - except: - return False - return True - def main(args=sys.argv, name=''): log = Log() parser = option_parser(name) @@ -194,7 +184,7 @@ def main(args=sys.argv, name=''): print_help(parser, log) return 1 - if not valid_pdf(pdf): + if not is_valid_pdf(pdf): print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf return 1 diff --git a/src/calibre/ebooks/pdf/verify.py b/src/calibre/ebooks/pdf/verify.py new file mode 100644 index 0000000000..35f7edf0be --- /dev/null +++ b/src/calibre/ebooks/pdf/verify.py @@ -0,0 +1,37 @@ +from __future__ import with_statement +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Verify PDF files. +''' + +import os + +from pyPdf import PdfFileWriter, PdfFileReader + +def is_valid_pdf(pdf_path): + ''' + Returns True if the pdf file is valid. + ''' + + try: + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + except: + return False + return True + +def is_valid_pdfs(pdf_paths): + ''' + Returns a list of invalid pdf files. + ''' + + invalid = [] + for pdf_path in pdf_paths: + if not is_valid_pdf(pdf_path): + invalid.append(pdf_path) + return invalid From 08971e831637122a1307d1aa8307775a887cba91 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Apr 2009 10:37:00 -0400 Subject: [PATCH 069/319] New pdf manipulate commands. Remove old option parser from pdf metadata. 
--- src/calibre/ebooks/metadata/pdf.py | 39 +------ src/calibre/ebooks/pdf/manipulate/cli.py | 5 +- src/calibre/ebooks/pdf/manipulate/crop.py | 8 +- src/calibre/ebooks/pdf/manipulate/decrypt.py | 115 +++++++++++++++++++ src/calibre/ebooks/pdf/manipulate/encrypt.py | 105 +++++++++++++++++ src/calibre/ebooks/pdf/manipulate/info.py | 16 ++- src/calibre/ebooks/pdf/manipulate/merge.py | 14 ++- src/calibre/ebooks/pdf/manipulate/reverse.py | 10 +- src/calibre/ebooks/pdf/manipulate/split.py | 6 +- src/calibre/ebooks/pdf/verify.py | 7 ++ src/calibre/ebooks/txt/input.py | 3 +- 11 files changed, 274 insertions(+), 54 deletions(-) create mode 100644 src/calibre/ebooks/pdf/manipulate/decrypt.py create mode 100644 src/calibre/ebooks/pdf/manipulate/encrypt.py diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 8f73e04050..6b94b07275 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -7,7 +7,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' import sys, os, re, StringIO -from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser +from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory from pyPdf import PdfFileReader, PdfFileWriter import Image @@ -96,40 +96,3 @@ def get_cover(stream): traceback.print_exc() return data.getvalue() - -def option_parser(): - p = get_parser('pdf') - p.remove_option('--category') - p.remove_option('--comment') - p.add_option('--get-cover', default=False, action='store_true', - help=_('Extract the cover')) - return p - -def main(args=sys.argv): - p = option_parser() - opts, args = p.parse_args(args) - - with open(os.path.abspath(os.path.expanduser(args[1])), 'r+b') as stream: - mi = get_metadata(stream, extract_cover=opts.get_cover) - changed = False - if opts.title: - mi.title = opts.title - changed = True - if opts.authors: - mi.authors = opts.authors.split(',') - changed 
= True - - if changed: - set_metadata(stream, mi) - print unicode(get_metadata(stream, extract_cover=False)).encode('utf-8') - - if mi.cover_data[1] is not None: - cpath = os.path.splitext(os.path.basename(args[1]))[0] + '_cover.jpg' - with open(cpath, 'wb') as f: - f.write(mi.cover_data[1]) - print 'Cover saved to', f.name - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/cli.py b/src/calibre/ebooks/pdf/manipulate/cli.py index e3fcef559c..edbba54a8d 100644 --- a/src/calibre/ebooks/pdf/manipulate/cli.py +++ b/src/calibre/ebooks/pdf/manipulate/cli.py @@ -15,10 +15,13 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.manipulate import crop, info, merge, reverse, split +from calibre.ebooks.pdf.manipulate import crop, decrypt, encrypt, \ + info, merge, reverse, split COMMANDS = { 'crop' : crop, + 'decrypt' : decrypt, + 'encrypt' : encrypt, 'info' : info, 'merge' : merge, 'reverse' : reverse, diff --git a/src/calibre/ebooks/pdf/manipulate/crop.py b/src/calibre/ebooks/pdf/manipulate/crop.py index fa996b754f..7627823a89 100644 --- a/src/calibre/ebooks/pdf/manipulate/crop.py +++ b/src/calibre/ebooks/pdf/manipulate/crop.py @@ -25,7 +25,7 @@ from pyPdf import PdfFileWriter, PdfFileReader DEFAULT_CROP = '10' -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ [options] file.pdf Crop a PDF file. @@ -132,7 +132,11 @@ def main(args=sys.argv, name=''): return 1 if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is encrypted.' 
% args[0] return 1 mi = metadata_from_formats([args[0]]) diff --git a/src/calibre/ebooks/pdf/manipulate/decrypt.py b/src/calibre/ebooks/pdf/manipulate/decrypt.py new file mode 100644 index 0000000000..5f4265b5ed --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate/decrypt.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Decrypt content of PDF. +''' + +import os, sys +from optparse import OptionGroup, Option + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted + +from pyPdf import PdfFileWriter, PdfFileReader + +USAGE = '\n%prog %%name ' + _('''\ +[options] file.pdf password + +Decrypt a PDF. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='decrypted.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. By default a file is created in the current directory.')), +]) + +class DecryptionError(Exception): + def __init__(self, pdf_path): + self.value = 'Unable to decrypt file `%s`.' 
% value + + def __str__(self): + return repr(self.value) + + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def option_parser(name): + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Decrypt Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) + +def decrypt(pdf_path, out_path, password): + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + + if pdf.decrypt(str(password)) == 0: + raise DecryptionError(pdf_path) + + title = pdf.documentInfo.title if pdf.documentInfo.title else _('Unknown') + author = pdf.documentInfo.author if pdf.documentInfo.author else _('Unknown') + out_pdf = PdfFileWriter(title=title, author=author) + + for page in pdf.pages: + out_pdf.addPage(page) + + with open(out_path, 'wb') as out_file: + out_pdf.write(out_file) + +def main(args=sys.argv, name=''): + log = Log() + parser = option_parser(name) + add_options(parser) + + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 2: + print 'Error: A PDF file and decryption password is required.\n' + print_help(parser, log) + return 1 + + if not is_valid_pdf(args[0]): + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if not is_encrypted(args[0]): + print 'Error: file `%s` is not encrypted.' 
% args[0] + return 1 + + try: + decrypt(args[0], opts.output, args[1]) + except DecryptionError, e: + print e.value + return 1 + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/encrypt.py b/src/calibre/ebooks/pdf/manipulate/encrypt.py new file mode 100644 index 0000000000..15600fb07c --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate/encrypt.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +''' +Encrypt a PDF. +''' + +import os, sys +from optparse import OptionGroup, Option + +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted + +from pyPdf import PdfFileWriter, PdfFileReader + +USAGE = '\n%prog %%name ' + _('''\ +[options] file.pdf password + +Encrypt a PDF. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='encrypted.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. 
By default a file is created in the current directory.')), +]) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def option_parser(name): + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Encrypt Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) + +def encrypt(pdf_path, out_path, password, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + for page in pdf.pages: + out_pdf.addPage(page) + + with open(out_path, 'wb') as out_file: + out_pdf.encrypt(str(password)) + out_pdf.write(out_file) + +def main(args=sys.argv, name=''): + log = Log() + parser = option_parser(name) + add_options(parser) + + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 2: + print 'Error: A PDF file and decryption password is required.\n' + print_help(parser, log) + return 1 + + if not is_valid_pdf(args[0]): + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is already encrypted.' 
% args[0] + return 1 + + mi = metadata_from_formats([args[0]]) + + encrypt(args[0], opts.output, args[1], mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/manipulate/info.py b/src/calibre/ebooks/pdf/manipulate/info.py index 21a07fdeff..d1b52a602c 100644 --- a/src/calibre/ebooks/pdf/manipulate/info.py +++ b/src/calibre/ebooks/pdf/manipulate/info.py @@ -16,11 +16,11 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdfs +from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted from pyPdf import PdfFileWriter, PdfFileReader -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ file.pdf ... Get info about a PDF. @@ -72,9 +72,17 @@ def main(args=sys.argv, name=''): bad_pdfs = is_valid_pdfs(args) if bad_pdfs != []: for pdf in bad_pdfs: - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + print 'Error: Could not read file `%s`.' % pdf return 1 - + + enc = False + for pdf in args: + if is_encrypted(pdf): + enc = True + print 'Error: file `%s` is encrypted. Please decrypt first.' % pdf + if enc: + return 1 + for pdf in args: print_info(pdf) diff --git a/src/calibre/ebooks/pdf/manipulate/merge.py b/src/calibre/ebooks/pdf/manipulate/merge.py index 1e285e3bdf..fce7076e85 100644 --- a/src/calibre/ebooks/pdf/manipulate/merge.py +++ b/src/calibre/ebooks/pdf/manipulate/merge.py @@ -22,7 +22,7 @@ from calibre.ebooks.pdf.verify import is_valid_pdfs from pyPdf import PdfFileWriter, PdfFileReader -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ [options] file1.pdf file2.pdf ... Metadata will be used from the first PDF specified. 
@@ -94,9 +94,17 @@ def main(args=sys.argv, name=''): bad_pdfs = is_valid_pdfs(args) if bad_pdfs != []: for pdf in bad_pdfs: - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + print 'Error: Could not read file `%s`.' % pdf return 1 - + + enc = False + for pdf in args: + if is_encrypted(pdf): + enc = True + print 'Error: file `%s` is encrypted.' % pdf + if enc: + return 1 + mi = metadata_from_formats([args[0]]) merge_files(args, opts.output, mi) diff --git a/src/calibre/ebooks/pdf/manipulate/reverse.py b/src/calibre/ebooks/pdf/manipulate/reverse.py index 564e523ae3..f2f3fa16da 100644 --- a/src/calibre/ebooks/pdf/manipulate/reverse.py +++ b/src/calibre/ebooks/pdf/manipulate/reverse.py @@ -22,10 +22,10 @@ from calibre.ebooks.pdf.verify import is_valid_pdf from pyPdf import PdfFileWriter, PdfFileReader -USAGE = '%prog %%name ' + _(''' +USAGE = '\n%prog %%name ' + _('''\ [options] file.pdf -Reverse PDF. +Reverse a PDF. ''') OPTIONS = set([ @@ -89,7 +89,11 @@ def main(args=sys.argv, name=''): return 1 if not is_valid_pdf(args[0]): - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % args[0] + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is encrypted.' % args[0] return 1 mi = metadata_from_formats([args[0]]) diff --git a/src/calibre/ebooks/pdf/manipulate/split.py b/src/calibre/ebooks/pdf/manipulate/split.py index fb7e4d06d7..19012797ae 100644 --- a/src/calibre/ebooks/pdf/manipulate/split.py +++ b/src/calibre/ebooks/pdf/manipulate/split.py @@ -185,7 +185,11 @@ def main(args=sys.argv, name=''): return 1 if not is_valid_pdf(pdf): - print 'Error: Could not read file `%s`. Is it a vaild PDF file or is it encrypted/DRMed?.' % pdf + print 'Error: Could not read file `%s`.' % pdf + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is encrypted.' 
% args[0] return 1 pages, page_ranges = clean_page_list(pdf, pages, page_ranges) diff --git a/src/calibre/ebooks/pdf/verify.py b/src/calibre/ebooks/pdf/verify.py index 35f7edf0be..3a8a8073ce 100644 --- a/src/calibre/ebooks/pdf/verify.py +++ b/src/calibre/ebooks/pdf/verify.py @@ -35,3 +35,10 @@ def is_valid_pdfs(pdf_paths): if not is_valid_pdf(pdf_path): invalid.append(pdf_path) return invalid + +def is_encrypted(pdf_path): + with open(os.path.abspath(pdf_path), 'rb') as pdf_file: + pdf = PdfFileReader(pdf_file) + if pdf.isEncrypted: + return True + return False diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index a42c72866f..fdc2851342 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,6 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.metadata import MetaInformation #from calibre.ebooks.metadata.meta import metadata_from_formats class TXTInput(InputFormatPlugin): @@ -32,7 +31,7 @@ class TXTInput(InputFormatPlugin): index.write(html.encode('utf-8')) #mi = metadata_from_formats([stream.name]) - mi = MetaInformation(_('Unknown'), _('Unknown')) + mi = None opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) From a60cd4c5672dbbf0e3273a2612232a0b12c57403 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 4 Apr 2009 19:59:43 -0400 Subject: [PATCH 070/319] Auto convert in GUI started --- src/calibre/gui2/device.py | 49 ++++++++++++++++++++++++++++++++------ src/calibre/gui2/main.py | 1 - src/calibre/gui2/tools.py | 6 +---- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index de11366b3b..ed001c30ba 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -10,12 +10,14 @@ from binascii 
import unhexlify from PyQt4.Qt import QMenu, QAction, QActionGroup, QIcon, SIGNAL, QPixmap, \ Qt +from calibre.customize.ui import available_input_formats, available_output_formats from calibre.devices import devices from calibre.gui2.dialogs.choose_format import ChooseFormatDialog from calibre.parallel import Job from calibre.devices.scanner import DeviceScanner from calibre.gui2 import config, error_dialog, Dispatcher, dynamic, \ - pixmap_to_data, warning_dialog + pixmap_to_data, warning_dialog, \ + info_dialog from calibre.ebooks.metadata import authors_to_string from calibre.gui2.dialogs.conversion_error import ConversionErrorDialog from calibre.devices.interface import Device @@ -575,10 +577,17 @@ class DeviceGUI(object): def sync_to_device(self, on_card, delete_from_library, - specific_format=None): - rows = self.library_view.selectionModel().selectedRows() + specific_format=None, send_rows=None, auto_convert=True): + rows = self.library_view.selectionModel().selectedRows() if send_rows is None else send_rows if not self.device_manager or not rows or len(rows) == 0: return + + _files, _auto_rows = self.library_view.model().get_preferred_formats(rows, + self.device_manager.device_class.FORMATS, + paths=True, set_metadata=True, + specific_format=specific_format) + rows = list(set(rows).difference(_auto_rows)) + ids = iter(self.library_view.model().id(r) for r in rows) metadata = self.library_view.model().get_metadata(rows) for mi in metadata: @@ -586,10 +595,7 @@ class DeviceGUI(object): if cdata: mi['cover'] = self.cover_to_thumbnail(cdata) metadata = iter(metadata) - _files = self.library_view.model().get_preferred_formats(rows, - self.device_manager.device_class.FORMATS, - paths=True, set_metadata=True, - specific_format=specific_format) + files = [getattr(f, 'name', None) for f in _files] bad, good, gf, names, remove_ids = [], [], [], [], [] for f in files: @@ -615,6 +621,35 @@ class DeviceGUI(object): remove = remove_ids if delete_from_library else [] 
self.upload_books(gf, names, good, on_card, memory=(_files, remove)) self.status_bar.showMessage(_('Sending books to device.'), 5000) + + auto = [] + if _auto_rows != []: + for row in _auto_rows: + if specific_format == None: + formats = self.library_view.model().db.formats(row).split(',') + formats = formats if formats != None else [] + if set(formats).intersection(available_input_formats()) is not None and set(self.device_manager.device_class.FORMATS).intersection(available_output_formats()) is not None: + auto.append(row) + else: + bad.append(self.library_view.model().title(row)) + else: + if specific_format in available_output_formats(): + auto.append(row) + else: + bad.append(self.library_view.model().title(row)) + + if auto != []: + autos = [self.library_view.model().title(row) for row in auto] + autos = '\n'.join('<li>%s</li>'%(i,) for i in autos) + d = info_dialog(self, _('No suitable formats'), + _('Auto converting the following books before uploading to the device:<br><ul>%s</ul>')%(autos,)) + for fmt in self.device_manager.device_class.FORMATS: + if fmt in list(set(self.device_manager.device_class.FORMATS).intersection(set(available_output_formats()))): + format = fmt + break + self.auto_convert(_auto_rows, on_card, format) + d.exec_() + if bad: bad = '\n'.join('<li>%s</li>'%(i,) for i in bad) d = warning_dialog(self, _('No suitable formats'), diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index f1f1e674b7..dcece08a3e 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -979,7 +979,6 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): comics.append(r) else: others.append(r) - jobs, changed, bad_rows = auto_convert_ebook(format, self, self.library_view.model().db, comics, others) for func, args, desc, fmt, id, temp_files in jobs: if id not in bad_rows: diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 0bf78ffaa7..07587d3c25 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py 
@@ -505,11 +505,7 @@ def fetch_scheduled_recipe(recipe, script): return 'feeds2'+fmt, [args], _('Fetch news from ')+recipe.title, fmt.upper(), [pt] def auto_convert_ebook(*args): - fmt = args[0] if args[0] else 'epub' - if fmt == 'lrf': - return auto_convert_lrf() - elif fmt in ('epub', 'mobi'): - return auto_convert(*args) + return auto_convert(*args) def convert_single_ebook(*args): fmt = prefs['output_format'].lower() From c1a37749a67e08a9b699298d644fd78a6810336e Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 5 Apr 2009 00:01:37 -0400 Subject: [PATCH 071/319] Auto convert in GUI working --- src/calibre/gui2/device.py | 22 +++--- src/calibre/gui2/library.py | 6 +- src/calibre/gui2/main.py | 18 ++--- src/calibre/gui2/tools.py | 147 ++++++++++++++++-------------------- src/calibre/parallel.py | 3 + 5 files changed, 88 insertions(+), 108 deletions(-) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index ed001c30ba..46cf9895d4 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -577,7 +577,7 @@ class DeviceGUI(object): def sync_to_device(self, on_card, delete_from_library, - specific_format=None, send_rows=None, auto_convert=True): + specific_format=None, send_rows=None, do_auto_convert=True): rows = self.library_view.selectionModel().selectedRows() if send_rows is None else send_rows if not self.device_manager or not rows or len(rows) == 0: return @@ -585,8 +585,12 @@ class DeviceGUI(object): _files, _auto_rows = self.library_view.model().get_preferred_formats(rows, self.device_manager.device_class.FORMATS, paths=True, set_metadata=True, - specific_format=specific_format) - rows = list(set(rows).difference(_auto_rows)) + specific_format=specific_format, + exclude_auto=do_auto_convert) + if do_auto_convert: + rows = list(set(rows).difference(_auto_rows)) + else: + _auto_rows = [] ids = iter(self.library_view.model().id(r) for r in rows) metadata = self.library_view.model().get_metadata(rows) 
@@ -626,9 +630,9 @@ class DeviceGUI(object): if _auto_rows != []: for row in _auto_rows: if specific_format == None: - formats = self.library_view.model().db.formats(row).split(',') - formats = formats if formats != None else [] - if set(formats).intersection(available_input_formats()) is not None and set(self.device_manager.device_class.FORMATS).intersection(available_output_formats()) is not None: + formats = [f.lower() for f in self.library_view.model().db.formats(row).split(',')] + formats = formats if formats != None else [] + if list(set(formats).intersection(available_input_formats())) != [] and list(set(self.device_manager.device_class.FORMATS).intersection(available_output_formats())) != []: auto.append(row) else: bad.append(self.library_view.model().title(row)) @@ -646,10 +650,10 @@ class DeviceGUI(object): for fmt in self.device_manager.device_class.FORMATS: if fmt in list(set(self.device_manager.device_class.FORMATS).intersection(set(available_output_formats()))): format = fmt - break + break + d.exec_() self.auto_convert(_auto_rows, on_card, format) - d.exec_() - + if bad: bad = '\n'.join('<li>%s</li>'%(i,) for i in bad) d = warning_dialog(self, _('No suitable formats'), diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py index 1f3ed31478..c67f9bc1b0 100644 --- a/src/calibre/gui2/library.py +++ b/src/calibre/gui2/library.py @@ -420,7 +420,8 @@ class BooksModel(QAbstractTableModel): def get_preferred_formats(self, rows, formats, paths=False, - set_metadata=False, specific_format=None): + set_metadata=False, specific_format=None, + exclude_auto=False): ans = [] need_auto = [] if specific_format is not None: @@ -448,7 +449,8 @@ class BooksModel(QAbstractTableModel): ans.append(pt) else: need_auto.append(row) - ans.append(None) + if not exclude_auto: + ans.append(None) return ans, need_auto def id(self, row): diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index dcece08a3e..fee500bdb9 100644 --- 
a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -969,17 +969,9 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): def auto_convert(self, rows, on_card, format): previous = self.library_view.currentIndex() - comics, others = [], [] - db = self.library_view.model().db - for r in rows: - formats = db.formats(r) - if not formats: continue - formats = formats.lower().split(',') - if 'cbr' in formats or 'cbz' in formats: - comics.append(r) - else: - others.append(r) - jobs, changed, bad_rows = auto_convert_ebook(format, self, self.library_view.model().db, comics, others) + jobs, changed, bad_rows = auto_convert_ebook(format, self, self.library_view.model().db, rows) + if jobs is None: + return for func, args, desc, fmt, id, temp_files in jobs: if id not in bad_rows: job = self.job_manager.run_job(Dispatcher(self.book_auto_converted), @@ -1063,7 +1055,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): if job.exception is not None: self.job_exception(job) return - data = open(temp_files[-1].name, 'rb') + data = open(temp_files[0].name, 'rb') self.library_view.model().db.add_format(book_id, fmt, data, index_is_id=True) data.close() self.status_bar.showMessage(job.description + (' completed'), 2000) @@ -1080,7 +1072,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): self.library_view.model().current_changed(current, QModelIndex()) r = self.library_view.model().index(self.library_view.model().db.row(book_id), 0) - self.sync_to_device(on_card, False, specific_format=fmt, send_rows=[r], auto_convert=False) + self.sync_to_device(on_card, False, specific_format=fmt, send_rows=[r], do_auto_convert=False) def book_converted(self, job): temp_files, fmt, book_id = self.conversion_jobs.pop(job) diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 07587d3c25..e6bbf543e1 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -9,6 +9,7 @@ Logic for setting up conversion jobs import os from PyQt4.Qt import QDialog +from 
calibre.customize.ui import available_input_formats from calibre.utils.config import prefs from calibre.gui2.dialogs.lrf_single import LRFSingleDialog, LRFBulkDialog from calibre.gui2.dialogs.epub import Config as EPUBConvert @@ -22,6 +23,11 @@ from calibre.ebooks.epub.from_any import SOURCE_FORMATS as EPUB_PREFERRED_SOURCE from calibre.ebooks.mobi.from_any import config as mobiconfig from calibre.ebooks.lrf.comic.convert_from import config as comicconfig +# Ordered list of source formats. Items closer to the beginning are +# preferred for conversion over those toward the end. +PREFERRED_SOURCE_FORMATS = ['epub', 'lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', + 'txt', 'pdf', 'oebzip', 'htm', 'html'] + def get_dialog(fmt): return { 'epub':EPUBConvert, @@ -34,101 +40,77 @@ def get_config(fmt): 'mobi':mobiconfig, }[fmt] -def auto_convert(fmt, parent, db, comics, others): +def auto_convert(fmt, parent, db, rows): changed = False jobs = [] - total = sum(map(len, (others, comics))) + total = len(rows) if total == 0: - return + return None, None, None parent.status_bar.showMessage(_('Starting auto conversion of %d books')%total, 2000) i = 0 bad_rows = [] - for i, row in enumerate(others+comics): + for i, row in enumerate(rows): row_id = db.id(row) - if row in others: - temp_files = [] - - data = None - for _fmt in EPUB_PREFERRED_SOURCE_FORMATS: - try: - data = db.format(row, _fmt.upper()) - if data is not None: - break - except: - continue - if data is None: - bad_rows.append(row) - continue + temp_files = [] - defaults = db.conversion_options(db.id(row), fmt) - defaults = defaults if defaults else '' - options = get_config(fmt)(defaults=defaults).parse() - - mi = db.get_metadata(row) - opf = OPFCreator(os.getcwdu(), mi) - opf_file = PersistentTemporaryFile('.opf') - opf.render(opf_file) - opf_file.close() - pt = PersistentTemporaryFile('.'+_fmt.lower()) - pt.write(data) - pt.close() - of = PersistentTemporaryFile('.'+fmt) - of.close() - cover = db.cover(row) - cf = 
None - if cover: - cf = PersistentTemporaryFile('.jpeg') - cf.write(cover) - cf.close() - options.cover = cf.name - options.output = of.name - options.from_opf = opf_file.name - args = [options, pt.name] - desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) - temp_files = [cf] if cf is not None else [] - temp_files.extend([opf_file, pt, of]) - jobs.append(('any2'+fmt, args, desc, fmt.upper(), row_id, temp_files)) - - changed = True - else: - defaults = db.conversion_options(db.id(row), fmt) - defaults = defaults if defaults else '' - options = comicconfig(defaults=defaults).parse() - - mi = db.get_metadata(row) - if mi.title: - options.title = mi.title - if mi.authors: - options.author = ','.join(mi.authors) - data = None - for _fmt in ['cbz', 'cbr']: - try: - data = db.format(row, _fmt.upper()) - if data is not None: - break - except: - continue - - if data is None: + data = None + in_formats = [f.lower() for f in db.formats(row).split(',')] + in_formats = list(set(in_formats).intersection(available_input_formats())) + for _fmt in PREFERRED_SOURCE_FORMATS: + if _fmt in in_formats: + data = _fmt + break + if data is None: + if in_formats != []: + data = list(in_formats)[0] + else: bad_rows.append(row) continue - - pt = PersistentTemporaryFile('.'+_fmt.lower()) - pt.write(data) - pt.close() - of = PersistentTemporaryFile('.'+fmt) - of.close() - setattr(options, 'output', of.name) - options.verbose = 1 - args = [pt.name, options] - desc = _('Convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) - jobs.append(('comic2'+fmt, args, desc, fmt.upper(), row_id, [pt, of])) - - changed = True + +# defaults = db.conversion_options(db.id(row), fmt) +# defaults = defaults if defaults else '' +# options = get_config(fmt)(defaults=defaults).parse() + +# mi = db.get_metadata(row) +# opf = OPFCreator(os.getcwdu(), mi) +# opf_file = PersistentTemporaryFile('.opf') +# opf.render(opf_file) +# opf_file.close() +# pt = PersistentTemporaryFile('.'+_fmt.lower()) 
+# pt.write(data) +# pt.close() +# of = PersistentTemporaryFile('.'+fmt) +# of.close() +# cover = db.cover(row) +# cf = None +# if cover: +# cf = PersistentTemporaryFile('.jpeg') +# cf.write(cover) +# cf.close() +# options.cover = cf.name +# options.output = of.name +# options.from_opf = opf_file.name +# args = [options, pt.name] +# desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) +# temp_files = [cf] if cf is not None else [] +# temp_files.extend([opf_file, pt, of]) +# jobs.append(('any2'+fmt, args, desc, fmt.upper(), row_id, temp_files)) + + mi = db.get_metadata(row) + in_file = db.format_abspath(row, data) + out_file = PersistentTemporaryFile('.'+fmt.lower()) + out_file.write(data) + out_file.close() + desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) + args = [['', in_file, out_file.name]] + temp_files = [out_file] + jobs.append(('ebook-convert', args, desc, fmt.upper(), row_id, temp_files)) + + changed = True if bad_rows: res = [] @@ -141,9 +123,6 @@ def auto_convert(fmt, parent, db, comics, others): return jobs, changed, bad_rows -def auto_convert_lrf(fmt, parent, db, comics, others): - pass - def convert_single(fmt, parent, db, comics, others): changed = False jobs = [] diff --git a/src/calibre/parallel.py b/src/calibre/parallel.py index 4969877da9..90a2969c86 100644 --- a/src/calibre/parallel.py +++ b/src/calibre/parallel.py @@ -79,6 +79,9 @@ PARALLEL_FUNCS = { 'comic2mobi' : ('calibre.ebooks.mobi.from_comic', 'convert', {}, 'notification'), + + 'ebook-convert' : + ('calibre.ebooks.conversion.cli', 'main', {}, None), } From 011e2811d28c991e5a4a4715999a6b15827c8a39 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 5 Apr 2009 09:08:43 -0400 Subject: [PATCH 072/319] Metadata reading and writing for TXT/PDF input/output. 
--- src/calibre/ebooks/pdf/input.py | 6 ++---- src/calibre/ebooks/pdf/output.py | 4 ++-- src/calibre/ebooks/pdf/writer.py | 20 +++++++++++++++++--- src/calibre/ebooks/txt/input.py | 7 +++---- src/calibre/ebooks/txt/output.py | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 6f55b71dd5..edbc2d6b30 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -10,8 +10,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdf.pdftohtml import pdftohtml from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.metadata import MetaInformation -#from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.customize.builtins import PDFMetadataReader class PDFInput(InputFormatPlugin): @@ -27,8 +26,7 @@ class PDFInput(InputFormatPlugin): with open('index.html', 'wb') as index: index.write(html) - #mi = metadata_from_formats([stream.name]) - mi = MetaInformation(_('Unknown'), _('Unknown')) + mi = PDFMetadataReader(None).get_metadata(stream, 'pdf') opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 230beed9ae..65af40dc51 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -17,7 +17,7 @@ from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation from calibre.ebooks.oeb.output import OEBOutput from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.pdf.writer import PDFWriter +from calibre.ebooks.pdf.writer import PDFWriter, PDFMetadata from calibre.ebooks.pdf.pageoptions import UNITS, unit, PAPER_SIZES, \ paper_size, ORIENTATIONS, orientation, PageOptions @@ -88,7 +88,7 @@ class PDFOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - writer.dump(opf, 
out_stream) + writer.dump(opf, out_stream, PDFMetadata(oeb_book.metadata)) if close: out_stream.close() diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 2aebd7322c..7d0a690856 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -13,6 +13,7 @@ import os, shutil, sys from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ebooks.pdf.pageoptions import PageOptions +from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata.opf2 import OPF from PyQt4 import QtCore @@ -22,6 +23,18 @@ from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader +class PDFMetadata(object): + def __init__(self, oeb_metadata=None): + self.title = _('Unknown') + self.author = _('Unknown') + + if oeb_metadata != None: + if len(oeb_metadata.title) >= 1: + self.title = oeb_metadata.title[0].value + if len(oeb_metadata.creator) >= 1: + self.author = authors_to_string([x.value for x in oeb_metadata.creator]) + + class PDFWriter(QObject): def __init__(self, log, popts=PageOptions()): if QApplication.instance() is None: @@ -37,8 +50,9 @@ class PDFWriter(QObject): self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') self.popts = popts - - def dump(self, opfpath, out_stream): + + def dump(self, opfpath, out_stream, pdf_metadata): + self.metadata = pdf_metadata self._delete_tmpdir() opf = OPF(opfpath, os.path.dirname(opfpath)) @@ -88,7 +102,7 @@ class PDFWriter(QObject): self.logger.info('Combining individual PDF parts...') try: - outPDF = PdfFileWriter() + outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: inputPDF = PdfFileReader(file(item, 'rb')) for page in inputPDF.pages: diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index fdc2851342..69d9c09da5 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 
+10,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf import OPFCreator -#from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.customize.builtins import TXTMetadataReader class TXTInput(InputFormatPlugin): @@ -26,12 +26,11 @@ class TXTInput(InputFormatPlugin): md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], safe_mode=False,) - html = '<html><body>'+md.convert(txt)+'</body></html>' + html = '<html><head><title /></head><body>'+md.convert(txt)+'</body></html>' with open('index.html', 'wb') as index: index.write(html.encode('utf-8')) - #mi = metadata_from_formats([stream.name]) - mi = None + mi = TXTMetadataReader(None).get_metadata(stream, 'txt') opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 2d1ef98662..423e668a56 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -34,7 +34,7 @@ class TXTOutput(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): metadata = TxtMetadata() if opts.prepend_metadata.lower() == 'true': - metadata.author = opts.authors if opts.authors else authors_to_string(oeb_book.metadata.authors.value) if oeb_book.metadata.authors != [] else _('Unknown') + metadata.author = opts.authors if opts.authors else authors_to_string([x.value for x in oeb_book.metadata.creator]) if oeb_book.metadata.creator != [] else _('Unknown') metadata.title = opts.title if opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') writer = TxtWriter(TxtNewlines(opts.newline).newline, log) From 3b09d017016ab40685b32232f32294ca26d701c8 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 5 Apr 2009 19:43:59 -0400 Subject: [PATCH 073/319] TXT input 
encoding option honored --- src/calibre/ebooks/txt/input.py | 5 ++++- src/calibre/gui2/tools.py | 29 ----------------------------- 2 files changed, 4 insertions(+), 30 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 69d9c09da5..e161f6b9bd 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -21,7 +21,10 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - txt = stream.read() + ienc = stream.encoding if stream.encoding else 'utf-8' + if options.input_encoding: + ienc = options.input_encoding + txt = stream.read().decode(ienc) md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index e6bbf543e1..d004dcb502 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -70,35 +70,6 @@ def auto_convert(fmt, parent, db, rows): else: bad_rows.append(row) continue - -# defaults = db.conversion_options(db.id(row), fmt) -# defaults = defaults if defaults else '' -# options = get_config(fmt)(defaults=defaults).parse() - -# mi = db.get_metadata(row) -# opf = OPFCreator(os.getcwdu(), mi) -# opf_file = PersistentTemporaryFile('.opf') -# opf.render(opf_file) -# opf_file.close() -# pt = PersistentTemporaryFile('.'+_fmt.lower()) -# pt.write(data) -# pt.close() -# of = PersistentTemporaryFile('.'+fmt) -# of.close() -# cover = db.cover(row) -# cf = None -# if cover: -# cf = PersistentTemporaryFile('.jpeg') -# cf.write(cover) -# cf.close() -# options.cover = cf.name -# options.output = of.name -# options.from_opf = opf_file.name -# args = [options, pt.name] -# desc = _('Auto convert book %d of %d (%s)')%(i+1, total, repr(mi.title)) -# temp_files = [cf] if cf is not None else [] -# temp_files.extend([opf_file, pt, of]) -# jobs.append(('any2'+fmt, args, desc, fmt.upper(), row_id, temp_files)) mi = db.get_metadata(row) in_file = db.format_abspath(row, data) From 
b2bfab32cfab2cda45a1473e1fc5678e667c0774 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Wed, 8 Apr 2009 13:35:51 -0700 Subject: [PATCH 074/319] Make iterating over links in XML and CSS documents more robust --- src/calibre/ebooks/oeb/base.py | 136 +++++++++++++++++-- src/calibre/ebooks/oeb/output.py | 4 +- src/calibre/ebooks/oeb/reader.py | 17 ++- src/calibre/ebooks/oeb/transforms/package.py | 52 +++++++ src/calibre/ebooks/oeb/writer.py | 8 +- 5 files changed, 190 insertions(+), 27 deletions(-) create mode 100644 src/calibre/ebooks/oeb/transforms/package.py diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 7d489ec3ae..2abf658697 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -7,14 +7,16 @@ __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __docformat__ = 'restructuredtext en' -import os, re, uuid +import os, re, uuid, logging from mimetypes import types_map from collections import defaultdict from itertools import count from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote -import logging +from urlparse import urljoin + from lxml import etree, html + import calibre from cssutils import CSSParser from calibre.translations.dynamic import translate @@ -77,16 +79,117 @@ def XLINK(name): def CALIBRE(name): return '{%s}%s' % (CALIBRE_NS, name) -def LINK_SELECTORS(): - results = [] - for expr in ('h:head/h:link/@href', 'h:body//h:a/@href', - 'h:body//h:img/@src', 'h:body//h:object/@data', - 'h:body//*/@xl:href', '//ncx:content/@src', - 'o2:page/@href'): - results.append(etree.XPath(expr, namespaces=XPNSMAP)) - return results +_css_url_re = re.compile(r'url\((.*?)\)', re.I) +_css_import_re = re.compile(r'@import "(.*?)"') +_archive_re = re.compile(r'[^ ]+') + +def iterlinks(root): + ''' + Iterate over all links in a OEB Document. + + :param root: A valid lxml.etree element. 
+ ''' + assert etree.iselement(root) + link_attrs = set(html.defs.link_attrs) + link_attrs.add(XLINK('href')) + + for el in root.iter(): + attribs = el.attrib + + if el.tag == XHTML('object'): + codebase = None + ## <object> tags have attributes that are relative to + ## codebase + if 'codebase' in attribs: + codebase = el.get('codebase') + yield (el, 'codebase', codebase, 0) + for attrib in 'classid', 'data': + if attrib in attribs: + value = el.get(attrib) + if codebase is not None: + value = urljoin(codebase, value) + yield (el, attrib, value, 0) + if 'archive' in attribs: + for match in _archive_re.finditer(el.get('archive')): + value = match.group(0) + if codebase is not None: + value = urljoin(codebase, value) + yield (el, 'archive', value, match.start()) + else: + for attr in attribs: + if attr in link_attrs: + yield (el, attr, attribs[attr], 0) + + + if el.tag == XHTML('style') and el.text: + for match in _css_url_re.finditer(el.text): + yield (el, None, match.group(1), match.start(1)) + for match in _css_import_re.finditer(el.text): + yield (el, None, match.group(1), match.start(1)) + if 'style' in attribs: + for match in _css_url_re.finditer(attribs['style']): + yield (el, 'style', match.group(1), match.start(1)) + +def make_links_absolute(root, base_url): + ''' + Make all links in the document absolute, given the + ``base_url`` for the document (the full URL where the document + came from) + ''' + def link_repl(href): + return urljoin(base_url, href) + rewrite_links(root, link_repl) + +def resolve_base_href(root): + base_href = None + basetags = root.xpath('//base[@href]|//h:base[@href]', + namespaces=XPNSMAP) + for b in basetags: + base_href = b.get('href') + b.drop_tree() + if not base_href: + return + make_links_absolute(root, base_href, resolve_base_href=False) + +def rewrite_links(root, link_repl_func, resolve_base_href=True): + ''' + Rewrite all the links in the document. 
For each link + ``link_repl_func(link)`` will be called, and the return value + will replace the old link. + + Note that links may not be absolute (unless you first called + ``make_links_absolute()``), and may be internal (e.g., + ``'#anchor'``). They can also be values like + ``'mailto:email'`` or ``'javascript:expr'``. + + If the ``link_repl_func`` returns None, the attribute or + tag text will be removed completely. + ''' + if resolve_base_href: + resolve_base_href(root) + for el, attrib, link, pos in iterlinks(root): + new_link = link_repl_func(link.strip()) + if new_link == link: + continue + if new_link is None: + # Remove the attribute or element content + if attrib is None: + el.text = '' + else: + del el.attrib[attrib] + continue + if attrib is None: + new = el.text[:pos] + new_link + el.text[pos+len(link):] + el.text = new + else: + cur = el.attrib[attrib] + if not pos and len(cur) == len(link): + # Most common case + el.attrib[attrib] = new_link + else: + new = cur[:pos] + new_link + cur[pos+len(link):] + el.attrib[attrib] = new -LINK_SELECTORS = LINK_SELECTORS() EPUB_MIME = types_map['.epub'] XHTML_MIME = types_map['.xhtml'] @@ -199,7 +302,7 @@ def urlnormalize(href): characters URL quoted. """ parts = urlparse(href) - if not parts.scheme: + if not parts.scheme or parts.scheme == 'file': path, frag = urldefrag(href) parts = ('', '', path, '', '', frag) parts = (part.replace('\\', '/') for part in parts) @@ -724,7 +827,7 @@ class Manifest(object): if isinstance(data, unicode): return data.encode('utf-8') return str(data) - + def __unicode__(self): data = self.data if isinstance(data, etree._Element): @@ -778,8 +881,13 @@ class Manifest(object): """Convert the URL provided in :param:`href` from a reference relative to this manifest item to a book-absolute reference. 
""" - if urlparse(href).scheme: + purl = urlparse(href) + scheme = purl.scheme + if scheme and scheme != 'file': return href + purl = list(purl) + purl[0] = '' + href = urlunparse(purl) path, frag = urldefrag(href) if not path: return '#'.join((self.href, frag)) diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index fc1366fbcd..ea986f49fa 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -22,6 +22,7 @@ class OEBOutput(OutputFormatPlugin): if not os.path.exists(output_path): os.makedirs(output_path) from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME + from calibre.ebooks.html import tostring as html_tostring with CurrentDir(output_path): results = oeb_book.to_opf2(page_map=True) for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME): @@ -42,9 +43,8 @@ class OEBOutput(OutputFormatPlugin): if hasattr(raw, 'cssText'): raw = raw.cssText else: - raw = etree.tostring(raw, encoding='utf-8', + raw = html_tostring(raw, pretty_print=opts.pretty_print) - raw = '<?xml version="1.0" encoding="utf-8" ?>\n'+raw if isinstance(raw, unicode): raw = raw.encode('utf-8') with open(path, 'wb') as f: diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index f4430ac07c..0c5a4ad97c 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -7,18 +7,21 @@ __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift <llasram@gmail.com>' import sys, os, uuid, copy -from itertools import izip, chain +from itertools import izip from urlparse import urldefrag, urlparse from urllib import unquote as urlunquote from mimetypes import guess_type from collections import defaultdict + from lxml import etree +import cssutils + from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ DC_NSES, OPF from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME -from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ - ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE +from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \ + ENTITY_RE, MS_COVER_TYPE, iterlinks from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \ urlnormalize, BINARY_MIME, \ OEBError, OEBBook, DirContainer @@ -191,8 +194,8 @@ class OEBReader(object): if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')) and \ item.data is not None: - hrefs = [sel(item.data) for sel in LINK_SELECTORS] - for href in chain(*hrefs): + hrefs = [r[2] for r in iterlinks(item.data)] + for href in hrefs: href, _ = urldefrag(href) if not href: continue @@ -201,8 +204,8 @@ class OEBReader(object): if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data.cssText): - href, _ = urldefrag(match.group('url')) + for url in cssutils.getUrls(item.data): + href, _ = urldefrag(url) href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme if not scheme and href not in known: diff --git a/src/calibre/ebooks/oeb/transforms/package.py b/src/calibre/ebooks/oeb/transforms/package.py new file mode 100644 index 0000000000..d8fb485dde --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/package.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + 
+__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +import os, shutil + +from calibre.ebooks.oeb.base import OEB_DOCS + +class Package(object): + + ''' + Move all the parts of an OEB into a folder structure rooted + at the specified folder. All links in recognized content types + are processed, the linked to resources are copied into the local + folder tree and all references to those resources are updated. + + The created folder structure is + + Base directory(OPF, NCX) -- content (XHTML) -- resources (CSS, Images, etc) + + ''' + + def __init__(self, base='.'): + ':param base: The base folder at which the OEB will be rooted' + self.new_base_path = os.path.abspath(base) + + def rewrite_links_in(self, item): + new_items = [] + return new_items + + def move_manifest_item(self, item): + item.data # Make sure the data has been loaded and cached + old_abspath = os.path.join(self.old_base_path, *item.href.split('/')) + bname = item.href.split('/')[-1] + new_href = 'content/' + \ + ('resources/' if item.media_type in OEB_DOCS else '')+bname + + def __call__(self, oeb, context): + self.map = {} + self.old_base_path = os.path.abspath(oeb.container.rootdir) + + for item in self.oeb.manifest: + self.move_manifest_item(item) + + for item in self.oeb.manifest: + self.rewrite_links_in(item) + + diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index 8789d03470..1e5e5aea11 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -6,9 +6,9 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift <llasram@gmail.com>' -import sys, os, logging +import os from calibre.ebooks.oeb.base import OPF_MIME, xml2str -from calibre.ebooks.oeb.base import DirContainer, OEBBook +from calibre.ebooks.oeb.base import DirContainer, OEBError __all__ = ['OEBWriter'] @@ -18,7 +18,7 @@ class OEBWriter(object): TRANSFORMS = [] """List of transforms to apply to content written with this Writer.""" - + def __init__(self, version='2.0', page_map=False, pretty_print=False): self.version = version self.page_map = page_map @@ -46,7 +46,7 @@ class OEBWriter(object): pretty_print = opts.pretty_print return cls(version=version, page_map=page_map, pretty_print=pretty_print) - + def __call__(self, oeb, path): """Read the book in the :class:`OEBBook` object :param:`oeb` to a file at :param:`path`. From 383fe33adb0921f5355b901a1039b7848262b406 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 8 Apr 2009 19:51:56 -0400 Subject: [PATCH 075/319] process pdf input html output a bit. 
--- src/calibre/ebooks/pdf/pdftohtml.py | 35 +++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index e7707479c3..0f6581dea6 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -6,7 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \ '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import errno, os, sys, subprocess +import errno, os, re, sys, subprocess from functools import partial from calibre.ebooks import ConversionError, DRMError @@ -24,6 +24,32 @@ if iswindows and hasattr(sys, 'frozen'): if islinux and getattr(sys, 'frozen_path', False): PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') +# Fix pdftohtml markup +PDFTOHTML_RULES = [ + # Remove <hr> tags + (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), + # Remove page numbers + (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''), + # Remove <br> and replace <br><br> with <p> + (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), + (re.compile(r'(.*)<br.*?>', re.IGNORECASE), + lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 + else match.group(1)), + # Remove hyphenation + (re.compile(r'-\n\r?'), lambda match: ''), + + # Remove gray background + (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), + + # Remove non breaking spaces + (re.compile(ur'\u00a0'), lambda match : ' '), + + # Add second <br /> after first to allow paragraphs to show better + (re.compile(r'<br.*?>'), lambda match : '<br /><br />'), + + ] + + def pdftohtml(pdf_path): ''' Convert the pdf into html using the pdftohtml app. @@ -72,4 +98,9 @@ def pdftohtml(pdf_path): if not '<br' in raw[:4000]: raise ConversionError(os.path.basename(pdf_path) + _(' is an image based PDF. 
Only conversion of text based PDFs is supported.'), True) - return '<!-- created by calibre\'s pdftohtml -->\n' + raw + return '<!-- created by calibre\'s pdftohtml -->\n' + processed_html(raw) + +def processed_html(html): + for rule in PDFTOHTML_RULES: + html = rule[0].sub(rule[1], html) + return html From 093b98a9f1d7bd2be91e3cb0cdd4726493ed51b2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Wed, 8 Apr 2009 17:44:29 -0700 Subject: [PATCH 076/319] Untested implementation of HTML input. Uses a new transform that 'packages' an OEB book into a folder structure (the same folder structure that was used in the old codebase for EPUB output). This may have broken other things, so use with care. --- src/calibre/customize/conversion.py | 5 +- src/calibre/ebooks/conversion/plumber.py | 29 +- src/calibre/ebooks/epub/__init__.py | 42 +-- src/calibre/ebooks/epub/fonts.py | 80 ++-- src/calibre/ebooks/epub/from_html.py | 2 +- src/calibre/ebooks/epub/iterator.py | 2 +- src/calibre/ebooks/epub/split.py | 138 +++---- src/calibre/ebooks/html/__init__.py | 30 ++ src/calibre/ebooks/html/input.py | 342 ++++++++++++++++++ src/calibre/ebooks/{html.py => html_old.py} | 0 src/calibre/ebooks/metadata/opf2.py | 37 +- src/calibre/ebooks/mobi/input.py | 2 +- src/calibre/ebooks/mobi/reader.py | 2 +- src/calibre/ebooks/oeb/base.py | 2 +- src/calibre/ebooks/oeb/transforms/package.py | 79 +++- .../ebooks/oeb/transforms/trimmanifest.py | 14 +- src/calibre/linux.py | 9 - 17 files changed, 609 insertions(+), 206 deletions(-) create mode 100644 src/calibre/ebooks/html/__init__.py create mode 100644 src/calibre/ebooks/html/input.py rename src/calibre/ebooks/{html.py => html_old.py} (100%) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 6530e5f16c..c531a15e34 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin): def convert(self, stream, options, 
file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return - the path to the created OPF file. All output should be contained in - the current directory. If this plugin creates files outside the current + the path to the created OPF file or an :class:`OEBBook` instance. + All output should be contained in the current directory. + If this plugin creates files outside the current directory they must be deleted/marked for deletion before this method returns. diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 6142cb555a..41d5f0abd9 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -299,21 +299,15 @@ OptionRecommendation(name='language', # Create an OEBBook from the input file. The input plugin does all the # heavy lifting. - from calibre.ebooks.oeb.reader import OEBReader - from calibre.ebooks.oeb.base import OEBBook accelerators = {} tdir = PersistentTemporaryDirectory('_plumber') - opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, + self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts, self.input_fmt, self.log, accelerators, tdir) - html_preprocessor = HTMLPreProcessor() - self.reader = OEBReader() - self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor) - # Read OEB Book into OEBBook - self.log.info('Parsing all content...') - self.reader(self.oeb, opfpath) + if not hasattr(self.oeb, 'manifest'): + self.oeb = create_oebbook(self.log, self.oeb) self.opts.source = self.opts.input_profile self.opts.dest = self.opts.output_profile @@ -340,7 +334,20 @@ OptionRecommendation(name='language', trimmer(self.oeb, self.opts) self.log.info('Creating %s...'%self.output_plugin.name) - self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, - self.log) + self.output_plugin.convert(self.oeb, self.output, self.input_plugin, + self.opts, self.log) +def create_oebbook(log, opfpath): + 
''' + Create an OEBBook from an OPF file. + ''' + from calibre.ebooks.oeb.reader import OEBReader + from calibre.ebooks.oeb.base import OEBBook + html_preprocessor = HTMLPreProcessor() + reader = OEBReader() + oeb = OEBBook(log, html_preprocessor=html_preprocessor) + # Read OEB Book into OEBBook + log.info('Parsing all content...') + reader(oeb, opfpath) + return oeb diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 0be88da070..2bc076a8ad 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -10,23 +10,23 @@ import sys, textwrap, re, os, uuid from itertools import cycle from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED -from calibre.ebooks.html import config as common_config, tostring +from calibre.ebooks.html import tostring from lxml import etree class DefaultProfile(object): - + flow_size = sys.maxint screen_size = None remove_special_chars = False remove_object_tags = False - + class PRS505(DefaultProfile): - + flow_size = 270000 screen_size = (590, 765) remove_special_chars = re.compile(u'[\u200b\u00ad]') remove_object_tags = True - + PROFILES = { 'PRS505' : PRS505, @@ -64,11 +64,11 @@ def config(defaults=None, name='epub'): c = Config(name, desc) else: c = StringConfig(defaults, desc) - + c.update(common_config()) c.remove_opt('output') c.remove_opt('zip') - + c.add_opt('output', ['-o', '--output'], default=None, help=_('The output EPUB file. If not specified, it is ' 'derived from the input file name.')) @@ -81,22 +81,22 @@ def config(defaults=None, name='epub'): help=_('Either the path to a CSS stylesheet or raw CSS. 
' 'This CSS will override any existing CSS ' 'declarations in the source files.')) - structure = c.add_group('structure detection', + structure = c.add_group('structure detection', _('Control auto-detection of document structure.')) - structure('chapter', ['--chapter'], + structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and " "re:test(., 'chapter|book|section|part', 'i')] | " "//*[@class = 'chapter']", help=_('''\ An XPath expression to detect chapter titles. The default is to consider <h1> or -<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as -well as any tags that have class="chapter". +<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as +well as any tags that have class="chapter". The expression used must evaluate to a list of elements. To disable chapter detection, use the expression "/". See the XPath Tutorial in the calibre User Manual for further help on using this feature. ''').replace('\n', ' ')) structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'], - default='pagebreak', + default='pagebreak', help=_('Specify how to mark detected chapters. A value of ' '"pagebreak" will insert page breaks before chapters. ' 'A value of "rule" will insert a line before chapters. ' @@ -129,13 +129,13 @@ help on using this feature. help=_('XPath expression to find the name of each page in the ' 'pagination map relative to its boundary element. ' 'Default is to number all pages staring with 1.')) - toc = c.add_group('toc', + toc = c.add_group('toc', _('''\ Control the automatic generation of a Table of Contents. If an OPF file is detected and it specifies a Table of Contents, then that will be used rather than trying to auto-generate a Table of Contents. 
''').replace('\n', ' ')) - toc('max_toc_links', ['--max-toc-links'], default=50, + toc('max_toc_links', ['--max-toc-links'], default=50, help=_('Maximum number of links to insert into the TOC. Set to 0 ' 'to disable. Default is: %default. Links are only added to the ' 'TOC if less than the --toc-threshold number of chapters were detected.')) @@ -166,15 +166,15 @@ to auto-generate a Table of Contents. help=_('Normally, if the source file already has a Table of Contents, ' 'it is used in preference to the auto-generated one. ' 'With this option, the auto-generated one is always used.')) - + layout = c.add_group('page layout', _('Control page layout')) - layout('margin_top', ['--margin-top'], default=5.0, + layout('margin_top', ['--margin-top'], default=5.0, help=_('Set the top margin in pts. Default is %default')) - layout('margin_bottom', ['--margin-bottom'], default=5.0, + layout('margin_bottom', ['--margin-bottom'], default=5.0, help=_('Set the bottom margin in pts. Default is %default')) - layout('margin_left', ['--margin-left'], default=5.0, + layout('margin_left', ['--margin-left'], default=5.0, help=_('Set the left margin in pts. Default is %default')) - layout('margin_right', ['--margin-right'], default=5.0, + layout('margin_right', ['--margin-right'], default=5.0, help=_('Set the right margin in pts. Default is %default')) layout('base_font_size2', ['--base-font-size'], default=12.0, help=_('The base font size in pts. Default is %defaultpt. ' @@ -195,12 +195,12 @@ to auto-generate a Table of Contents. 'This is only neccessary if the HTML files contain CSS that ' 'uses sibling selectors. 
Enabling this greatly slows down ' 'processing of large HTML files.')) - + c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', help=_('Print generated OPF file to stdout')) c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', help=_('Print generated NCX file to stdout')) - c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', + c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', default=False, help=_('Keep intermediate files during processing by html2epub')) c.add_opt('extract_to', ['--extract-to'], group='debug', default=None, diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py index 5d0887f2d0..67e6066ed1 100644 --- a/src/calibre/ebooks/epub/fonts.py +++ b/src/calibre/ebooks/epub/fonts.py @@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector from lxml import etree from lxml.html import HtmlElement -from calibre.ebooks.html import fromstring +from calibre.ebooks.html_old import fromstring from calibre.ebooks.epub import rules from cssutils import CSSParser @@ -24,7 +24,7 @@ absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)' relative_size = r'(?P<rel>smaller|larger)' font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I) -line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) +line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) PTU = { 'in' : 72., @@ -37,12 +37,12 @@ PTU = { DEFAULT_FONT_SIZE = 12 class Rationalizer(object): - + @classmethod def specificity(cls, s): '''Map CSS specificity tuple to a single integer''' - return sum([10**(4-i) + x for i,x in enumerate(s)]) - + return sum([10**(4-i) + x for i,x in enumerate(s)]) + @classmethod def compute_font_size(cls, elem): ''' @@ -59,7 +59,7 @@ class Rationalizer(object): elem.computed_font_size = sfs(parent.computed_font_size) else: elem.computed_font_size = sfs - + @classmethod def calculate_font_size(cls, style): 
'Return font size in pts from style object. For relative units returns a callable' @@ -69,7 +69,7 @@ class Rationalizer(object): fs = match.group() if style.fontSize: fs = style.fontSize - + match = font_size_pat.search(fs) if match is None: return None @@ -89,8 +89,8 @@ class Rationalizer(object): return 12 * x if match.get('zero', False): return 0. - return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) - + return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) + @classmethod def resolve_rules(cls, stylesheets): for sheet in stylesheets: @@ -104,12 +104,12 @@ class Rationalizer(object): if font_size is not None: for s in r.selectorList: sheet.fs_rules.append([CSSSelector(s.selectorText), font_size]) - orig = line_height_pat.search(r.style.lineHeight) + orig = line_height_pat.search(r.style.lineHeight) if orig is not None: for s in r.selectorList: sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]]) - - + + @classmethod def apply_font_size_rules(cls, stylesheets, root): 'Add a ``specified_font_size`` attribute to every element that has a specified font size' @@ -119,7 +119,7 @@ class Rationalizer(object): elems = selector(root) for elem in elems: elem.specified_font_size = font_size - + @classmethod def remove_font_size_information(cls, stylesheets): for r in rules(stylesheets): @@ -134,17 +134,17 @@ class Rationalizer(object): r.style.removeProperty('font') if line_height_pat.search(r.style.lineHeight) is not None: r.style.removeProperty('line-height') - + @classmethod def compute_font_sizes(cls, root, stylesheets, base=12): stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')] cls.apply_font_size_rules(stylesheets, root) - + # Compute the effective font size of all tags root.computed_font_size = DEFAULT_FONT_SIZE for elem in root.iter(etree.Element): cls.compute_font_size(elem) - + 
extra_css = {} if base > 0: # Calculate the "base" (i.e. most common) font size @@ -157,20 +157,20 @@ class Rationalizer(object): if t: t = t.strip() if t: font_sizes[elem.computed_font_size] += len(t) - + t = getattr(elem, 'tail', '') if t: t = t.strip() if t: parent = elem.getparent() if parent.tag not in IGNORE: font_sizes[parent.computed_font_size] += len(t) - + try: most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0] scale = base/most_common if most_common > 0 else 1. except ValueError: scale = 1. - + # rescale absolute line-heights counter = 0 for sheet in stylesheets: @@ -181,17 +181,17 @@ class Rationalizer(object): if not extra_css.has_key(elem.get('id')): extra_css[elem.get('id')] = [] extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale)) - - - + + + # Rescale all computed font sizes for elem in body.iter(etree.Element): if isinstance(elem, HtmlElement): elem.computed_font_size *= scale - - # Remove all font size specifications from the last stylesheet + + # Remove all font size specifications from the last stylesheet cls.remove_font_size_information(stylesheets[-1:]) - + # Create the CSS to implement the rescaled font sizes for elem in body.iter(etree.Element): cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent())) @@ -201,12 +201,12 @@ class Rationalizer(object): if not extra_css.has_key(elem.get('id')): extra_css[elem.get('id')] = [] extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs))) - + css = CSSParser(loglevel=logging.ERROR).parseString('') for id, r in extra_css.items(): css.add('#%s {%s}'%(id, ';'.join(r))) return css - + @classmethod def rationalize(cls, stylesheets, root, opts): logger = logging.getLogger('html2epub') @@ -229,7 +229,7 @@ class Rationalizer(object): ################################################################################ class FontTest(unittest.TestCase): - + def setUp(self): from calibre.ebooks.epub import config self.opts = 
config(defaults='').parse() @@ -246,10 +246,10 @@ class FontTest(unittest.TestCase): <p id="p2">Some other <span class="it">text</span>.</p> <p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p> </body> - </html> + </html> ''' self.root = fromstring(self.html) - + def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): root1 = copy.deepcopy(self.root) root1.computed_font_size = DEFAULT_FONT_SIZE @@ -262,39 +262,39 @@ class FontTest(unittest.TestCase): for elem in root2.iter(etree.Element): Rationalizer.compute_font_size(elem) for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): - self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, + self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\ (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size)) return stylesheet2.cssText - + def testStripping(self): 'Test that any original entries are removed from the CSS' css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }' css = CSSParser(loglevel=logging.ERROR).parseString(css) Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css]) - self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), + self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), 'p{font:bolditalic}') - + def testIdentity(self): 'Test that no unnecessary font size changes are made' extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}') self.assertEqual(extra_css.strip(), '') - + def testRelativization(self): 'Test conversion of absolute to relative sizes' self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}') - + def testResizing(self): 'Test resizing of fonts' self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}') - + def suite(): return 
unittest.TestLoader().loadTestsFromTestCase(FontTest) - + def test(): unittest.TextTestRunner(verbosity=2).run(suite()) if __name__ == '__main__': - sys.exit(test()) - \ No newline at end of file + sys.exit(test()) + diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 318cf5cc02..0ce4629062 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -38,7 +38,7 @@ from lxml.etree import XPath from lxml import html, etree from PyQt4.Qt import QApplication, QPixmap -from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\ +from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\ opf_traverse, create_metadata, rebase_toc, Link, parser from calibre.ebooks.epub import config as common_config, tostring from calibre.ptempfile import TemporaryDirectory diff --git a/src/calibre/ebooks/epub/iterator.py b/src/calibre/ebooks/epub/iterator.py index e55d402bef..5d47c93ea3 100644 --- a/src/calibre/ebooks/epub/iterator.py +++ b/src/calibre/ebooks/epub/iterator.py @@ -16,7 +16,7 @@ from calibre.ebooks.epub import config from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.html import create_dir +from calibre.ebooks.html_old import create_dir from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.config import DynamicConfig diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index c39fe6d181..8ff62a1c4b 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en' Split the flows in an epub file to conform to size limitations. 
''' -import os, math, logging, functools, collections, re, copy, sys +import os, math, functools, collections, re, copy, sys from lxml.etree import XPath as _XPath from lxml import etree, html @@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' class SplitError(ValueError): - + def __init__(self, path, root): size = len(tostring(root))/1024. - ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% + ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% (os.path.basename(path), size)) - + class Splitter(object): - + def __init__(self, path, opts, stylesheet_map, opf): self.setup_cli_handler(opts.verbose) self.path = path @@ -44,10 +44,10 @@ class Splitter(object): self.orig_size = os.stat(content(path)).st_size self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) root = html.fromstring(open(content(path)).read()) - + self.page_breaks, self.trees = [], [] self.split_size = 0 - + # Split on page breaks self.splitting_on_page_breaks = True if not opts.dont_split_on_page_breaks: @@ -59,7 +59,7 @@ class Splitter(object): else: self.trees = [root.getroottree()] trees = list(self.trees) - + # Split any remaining over-sized trees self.splitting_on_page_breaks = False if self.opts.profile.flow_size < sys.maxint: @@ -67,7 +67,7 @@ class Splitter(object): self.log_info('\tLooking for large trees...') for i, tree in enumerate(list(trees)): self.trees = [] - size = len(tostring(tree.getroot())) + size = len(tostring(tree.getroot())) if size > self.opts.profile.flow_size: lt_found = True try: @@ -81,7 +81,7 @@ class Splitter(object): trees[i:i+1] = list(self.trees) if not lt_found: self.log_info('\tNo large trees found') - + self.trees = trees self.was_split = len(self.trees) > 1 if self.was_split: @@ -91,17 +91,17 @@ class Splitter(object): for f in self.files: self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.) 
self.fix_opf(opf) - + self.trees = None - - + + def split_text(self, text, root, size): self.log_debug('\t\t\tSplitting text of length: %d'%len(text)) rest = text.replace('\r', '') parts = re.split('\n\n', rest) self.log_debug('\t\t\t\tFound %d parts'%len(parts)) if max(map(len, parts)) > size: - raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root) + raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root) ans = [] buf = '' for part in parts: @@ -111,8 +111,8 @@ class Splitter(object): ans.append(buf) buf = part return ans - - + + def split_to_size(self, tree): self.log_debug('\t\tSplitting...') root = tree.getroot() @@ -134,7 +134,7 @@ class Splitter(object): p = pre.getparent() i = p.index(pre) p[i:i+1] = new_pres - + split_point, before = self.find_split_point(root) if split_point is None or self.split_size > 6*self.orig_size: if not self.always_remove: @@ -142,7 +142,7 @@ class Splitter(object): 'structure preservation. This may cause ' 'incorrect rendering.')) raise SplitError(self.path, root) - + for t in self.do_split(tree, split_point, before): r = t.getroot() if self.is_page_empty(r): @@ -151,12 +151,12 @@ class Splitter(object): if size <= self.opts.profile.flow_size: self.trees.append(t) #print tostring(t.getroot(), pretty_print=True) - self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', + self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(self.trees), size/1024.) self.split_size += size else: self.split_to_size(t) - + def is_page_empty(self, root): body = root.find('body') if body is None: @@ -170,14 +170,14 @@ class Splitter(object): if img.get('style', '') != 'display:none': return False return True - + def do_split(self, tree, split_point, before): ''' - Split ``tree`` into a *before* and *after* tree at ``split_point``, - preserving tag structure, but not duplicating any text. 
+ Split ``tree`` into a *before* and *after* tree at ``split_point``, + preserving tag structure, but not duplicating any text. All tags that have had their text and tail removed have the attribute ``calibre_split`` set to 1. - + :param before: If True tree is split before split_point, otherwise after split_point :return: before_tree, after_tree ''' @@ -188,7 +188,7 @@ class Splitter(object): body, body2 = root.body, root2.body split_point = root.xpath(path)[0] split_point2 = root2.xpath(path)[0] - + def nix_element(elem, top=True): if self.always_remove: parent = elem.getparent() @@ -198,18 +198,18 @@ class Splitter(object): else: index = parent.index(elem) parent[index:index+1] = list(elem.iterchildren()) - + else: elem.text = u'' elem.tail = u'' elem.set(SPLIT_ATTR, '1') if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']: elem.set('style', 'display:none') - + def fix_split_point(sp): if not self.splitting_on_page_breaks: - sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid') - + sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid') + # Tree 1 hit_split_point = False for elem in list(body.iterdescendants(etree.Element)): @@ -223,8 +223,8 @@ class Splitter(object): continue if hit_split_point: nix_element(elem) - - + + # Tree 2 hit_split_point = False for elem in list(body2.iterdescendants(etree.Element)): @@ -238,17 +238,17 @@ class Splitter(object): continue if not hit_split_point: nix_element(elem, top=False) - + return tree, tree2 - - + + def split_on_page_breaks(self, orig_tree): ordered_ids = [] for elem in orig_tree.xpath('//*[@id]'): id = elem.get('id') if id in self.page_break_ids: ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)]) - + self.trees = [] tree = orig_tree for pattern, before in ordered_ids: @@ -260,13 +260,13 @@ class Splitter(object): tree = after self.trees.append(tree) self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())] - 
- - + + + def find_page_breaks(self, stylesheets, root): ''' Find all elements that have either page-break-before or page-break-after set. - Populates `self.page_breaks` with id based XPath selectors (for elements that don't + Populates `self.page_breaks` with id based XPath selectors (for elements that don't have ids, an id is created). ''' page_break_selectors = set([]) @@ -283,16 +283,16 @@ class Splitter(object): page_break_selectors.add((CSSSelector(rule.selectorText), False)) except: pass - + page_breaks = set([]) for selector, before in page_break_selectors: for elem in selector(root): elem.pb_before = before page_breaks.add(elem) - + for i, elem in enumerate(root.iter()): elem.pb_order = i - + page_breaks = list(page_breaks) page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order)) self.page_break_ids = [] @@ -300,12 +300,12 @@ class Splitter(object): x.set('id', x.get('id', 'calibre_pb_%d'%i)) id = x.get('id') self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before)) - self.page_break_ids.append(id) - - + self.page_break_ids.append(id) + + def find_split_point(self, root): ''' - Find the tag at which to split the tree rooted at `root`. + Find the tag at which to split the tree rooted at `root`. Search order is: * Heading tags * <div> tags @@ -314,7 +314,7 @@ class Splitter(object): * <p> tags * <br> tags * <li> tags - + We try to split in the "middle" of the file (as defined by tag counts. 
''' def pick_elem(elems): @@ -325,18 +325,18 @@ class Splitter(object): i = int(math.floor(len(elems)/2.)) elems[i].set(SPLIT_POINT_ATTR, '1') return elems[i] - + for path in ( - '//*[re:match(name(), "h[1-6]", "i")]', + '//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//pre', - '//hr', + '//hr', '//p', '//div', '//br', '//li', ): - elems = root.xpath(path, + elems = root.xpath(path, namespaces={'re':'http://exslt.org/regular-expressions'}) elem = pick_elem(elems) if elem is not None: @@ -345,9 +345,9 @@ class Splitter(object): except: continue return elem, True - + return None, True - + def commit(self): ''' Commit all changes caused by the split. This removes the previously @@ -357,7 +357,7 @@ class Splitter(object): ''' self.anchor_map = collections.defaultdict(lambda :self.base%0) self.files = [] - + for i, tree in enumerate(self.trees): root = tree.getroot() self.files.append(self.base%i) @@ -367,7 +367,7 @@ class Splitter(object): for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)): elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_POINT_ATTR, '0') - + for current, tree in zip(self.files, self.trees): for a in tree.getroot().xpath('//a[@href]'): href = a.get('href').strip() @@ -375,10 +375,10 @@ class Splitter(object): anchor = href[1:] file = self.anchor_map[anchor] if file != current: - a.set('href', file+href) + a.set('href', file+href) open(content(current), 'wb').\ write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print)) - + os.remove(content(self.path)) @@ -391,12 +391,12 @@ class Splitter(object): id_map = {} for item in items: id_map[item.get('id')] = opf.replace_manifest_item(item, new_items) - + for id in id_map.keys(): opf.replace_spine_items_by_idref(id, id_map[id]) - + for ref in opf.iterguide(): - href = ref.get('href', '') + href = ref.get('href', '') if href.startswith('content/'+self.path): href = href.split('#') frag = None @@ -408,8 +408,8 @@ class Splitter(object): new_file = 
self.anchor_map[frag] ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag))) - - + + def fix_content_links(html_files, changes, opts): split_files = [f.path for f in changes] anchor_maps = [f.anchor_map for f in changes] @@ -420,7 +420,7 @@ def fix_content_links(html_files, changes, opts): files[i:i+1] = changes[j].files except ValueError: continue - + for htmlfile in files: changed = False root = html.fromstring(open(content(htmlfile), 'rb').read()) @@ -439,7 +439,7 @@ def fix_content_links(html_files, changes, opts): frag = ('#'+anchor) if anchor else '' a.set('href', newf+frag) changed = True - + if changed: open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print)) @@ -448,7 +448,7 @@ def fix_ncx(path, changes): anchor_maps = [f.anchor_map for f in changes] tree = etree.parse(path) changed = False - for content in tree.getroot().xpath('//x:content[@src]', + for content in tree.getroot().xpath('//x:content[@src]', namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}): href = content.get('src') if not href.startswith('#'): @@ -481,21 +481,21 @@ def find_html_files(opf): if os.path.exists(content(f)): html_files.append(f) return html_files - + def split(pathtoopf, opts, stylesheet_map): pathtoopf = os.path.abspath(pathtoopf) opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) - + with CurrentDir(os.path.dirname(pathtoopf)): html_files = find_html_files(opf) changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files] changes = [c for c in changes if c.was_split] - + fix_content_links(html_files, changes, opts) for item in opf.itermanifest(): if item.get('media-type', '') == 'application/x-dtbncx+xml': fix_ncx(item.get('href'), changes) - break + break open(pathtoopf, 'wb').write(opf.render()) diff --git a/src/calibre/ebooks/html/__init__.py b/src/calibre/ebooks/html/__init__.py new file mode 100644 index 0000000000..9a8f8e2d20 --- /dev/null +++ b/src/calibre/ebooks/html/__init__.py @@ -0,0 +1,30 @@ 
+#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +import re + +from lxml.etree import tostring as _tostring + +def tostring(root, strip_comments=False, pretty_print=False): + ''' + Serialize processed XHTML. + ''' + root.set('xmlns', 'http://www.w3.org/1999/xhtml') + root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink') + for x in root.iter(): + if x.tag.rpartition('}')[-1].lower() == 'svg': + x.set('xmlns', 'http://www.w3.org/2000/svg') + + ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print) + if strip_comments: + ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans) + ans = '<?xml version="1.0" encoding="utf-8" ?>\n'+ans + + return ans + + diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py new file mode 100644 index 0000000000..dd9aa0285c --- /dev/null +++ b/src/calibre/ebooks/html/input.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + + +''' +Input plugin for HTML or OPF ebooks. +''' + +import os, re, sys, cStringIO +from urlparse import urlparse, urlunparse +from urllib import unquote + +from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.metadata.meta import get_metadata +from calibre.ebooks.metadata.opf2 import OPF, OPFCreator +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.chardet import xml_to_unicode +from calibre.customize.conversion import OptionRecommendation +from calibre import unicode_path + +class Link(object): + ''' + Represents a link in a HTML file. 
+ ''' + + @classmethod + def url_to_local_path(cls, url, base): + path = urlunparse(('', '', url.path, url.params, url.query, '')) + path = unquote(path) + if os.path.isabs(path): + return path + return os.path.abspath(os.path.join(base, path)) + + def __init__(self, url, base): + ''' + :param url: The url this link points to. Must be an unquoted unicode string. + :param base: The base directory that relative URLs are with respect to. + Must be a unicode string. + ''' + assert isinstance(url, unicode) and isinstance(base, unicode) + self.url = url + self.parsed_url = urlparse(self.url) + self.is_local = self.parsed_url.scheme in ('', 'file') + self.is_internal = self.is_local and not bool(self.parsed_url.path) + self.path = None + self.fragment = unquote(self.parsed_url.fragment) + if self.is_local and not self.is_internal: + self.path = self.url_to_local_path(self.parsed_url, base) + + def __hash__(self): + if self.path is None: + return hash(self.url) + return hash(self.path) + + def __eq__(self, other): + return self.path == getattr(other, 'path', other) + + def __str__(self): + return u'Link: %s --> %s'%(self.url, self.path) + + +class IgnoreFile(Exception): + + def __init__(self, msg, errno): + Exception.__init__(self, msg) + self.doesnt_exist = errno == 2 + self.errno = errno + +class HTMLFile(object): + ''' + Contains basic information about an HTML file. This + includes a list of links to other files as well as + the encoding of each file. Also tries to detect if the file is not a HTML + file in which case :member:`is_binary` is set to True. + + The encoding of the file is available as :member:`encoding`. 
+ ''' + + HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) + TITLE_PAT = re.compile('<title>([^<>]+)', re.IGNORECASE) + LINK_PAT = re.compile( + r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))', + re.DOTALL|re.IGNORECASE) + + def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): + ''' + :param level: The level of this file. Should be 0 for the root file. + :param encoding: Use `encoding` to decode HTML. + :param referrer: The :class:`HTMLFile` that first refers to this file. + ''' + self.path = unicode_path(path_to_html_file, abs=True) + self.title = os.path.splitext(os.path.basename(self.path))[0] + self.base = os.path.dirname(self.path) + self.level = level + self.referrer = referrer + self.links = [] + + try: + with open(self.path, 'rb') as f: + src = f.read() + except IOError, err: + msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err)) + if level == 0: + raise IOError(msg) + raise IgnoreFile(msg, err.errno) + + self.is_binary = not bool(self.HTML_PAT.search(src[:1024])) + if not self.is_binary: + if encoding is None: + encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] + self.encoding = encoding + else: + self.encoding = encoding + + src = src.decode(encoding, 'replace') + match = self.TITLE_PAT.search(src) + self.title = match.group(1) if match is not None else self.title + self.find_links(src) + + + + def __eq__(self, other): + return self.path == getattr(other, 'path', other) + + def __str__(self): + return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path) + + def __repr__(self): + return str(self) + + + def find_links(self, src): + for match in self.LINK_PAT.finditer(src): + url = None + for i in ('url1', 'url2', 'url3'): + url = match.group(i) + if url: + break + link = self.resolve(url) + if link not in self.links: + self.links.append(link) + + def resolve(self, url): + return Link(url, self.base) + + +def depth_first(root, flat, 
visited=set([])): + yield root + visited.add(root) + for link in root.links: + if link.path is not None and link not in visited: + try: + index = flat.index(link) + except ValueError: # Can happen if max_levels is used + continue + hf = flat[index] + if hf not in visited: + yield hf + visited.add(hf) + for hf in depth_first(hf, flat, visited): + if hf not in visited: + yield hf + visited.add(hf) + + +def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None): + ''' + Recursively traverse all links in the HTML file. + + :param max_levels: Maximum levels of recursion. Must be non-negative. 0 + implies that no links in the root HTML file are followed. + :param encoding: Specify character encoding of HTML files. If `None` it is + auto-detected. + :return: A pair of lists (breadth_first, depth_first). Each list contains + :class:`HTMLFile` objects. + ''' + assert max_levels >= 0 + level = 0 + flat = [HTMLFile(path_to_html_file, level, encoding, verbose)] + next_level = list(flat) + while level < max_levels and len(next_level) > 0: + level += 1 + nl = [] + for hf in next_level: + rejects = [] + for link in hf.links: + if link.path is None or link.path in flat: + continue + try: + nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf) + if nf.is_binary: + raise IgnoreFile('%s is a binary file'%nf.path, -1) + nl.append(nf) + flat.append(nf) + except IgnoreFile, err: + rejects.append(link) + if not err.doesnt_exist or verbose > 1: + print repr(err) + for link in rejects: + hf.links.remove(link) + + next_level = list(nl) + orec = sys.getrecursionlimit() + sys.setrecursionlimit(500000) + try: + return flat, list(depth_first(flat[0], flat)) + finally: + sys.setrecursionlimit(orec) + + +def opf_traverse(opf_reader, verbose=0, encoding=None): + ''' + Return a list of :class:`HTMLFile` objects in the order specified by the + `` element of the OPF. + + :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance. 
+ :param encoding: Specify character encoding of HTML files. If `None` it is + auto-detected. + ''' + if not opf_reader.spine: + raise ValueError('OPF does not have a spine') + flat = [] + for path in opf_reader.spine.items(): + path = os.path.abspath(path) + if path not in flat: + flat.append(os.path.abspath(path)) + for item in opf_reader.manifest: + if 'html' in item.mime_type: + path = os.path.abspath(item.path) + if path not in flat: + flat.append(path) + for i, path in enumerate(flat): + if not os.path.exists(path): + path = path.replace('&', '%26') + if os.path.exists(path): + flat[i] = path + for item in opf_reader.itermanifest(): + item.set('href', item.get('href').replace('&', '%26')) + ans = [] + for path in flat: + if os.path.exists(path): + ans.append(HTMLFile(path, 0, encoding, verbose)) + else: + print 'WARNING: OPF spine item %s does not exist'%path + ans = [f for f in ans if not f.is_binary] + return ans + +def search_for_opf(dir): + for f in os.listdir(dir): + if f.lower().endswith('.opf'): + return OPF(open(os.path.join(dir, f), 'rb'), dir) + +def get_filelist(htmlfile, dir, opts, log): + ''' + Build list of files referenced by html file or try to detect and use an + OPF file instead. + ''' + print 'Building file list...' 
+ opf = search_for_opf(dir) + filelist = None + if opf is not None: + try: + filelist = opf_traverse(opf, verbose=opts.verbose, + encoding=opts.input_encoding) + except: + pass + if not filelist: + filelist = traverse(htmlfile, max_levels=int(opts.max_levels), + verbose=opts.verbose, + encoding=opts.input_encoding)\ + [0 if opts.breadth_first else 1] + if opts.verbose: + log.debug('\tFound files...') + for f in filelist: + log.debug('\t\t', f) + return opf, filelist + + +class HTMLInput(InputFormatPlugin): + + name = 'HTML Input' + author = 'Kovid Goyal' + description = 'Convert HTML and OPF files to an OEB' + file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm']) + + options = set([ + OptionRecommendation(name='breadth_first', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Traverse links in HTML files breadth first. Normally, ' + 'they are traversed depth first.' + ) + ), + + OptionRecommendation(name='max_levels', + recommended_value=5, level=OptionRecommendation.LOW, + help=_('Maximum levels of recursion when following links in ' + 'HTML files. Must be non-negative. 0 implies that no ' + 'links in the root HTML file are followed. Default is ' + '%default.' 
+ ) + ), + + ]) + + def convert(self, stream, opts, file_ext, log, + accelerators): + basedir = os.getcwd() + if hasattr(stream, 'name'): + basedir = os.path.dirname(stream.name) + if file_ext == 'opf': + opf = OPF(stream, basedir) + filelist = opf_traverse(opf, verbose=opts.verbose, + encoding=opts.input_encoding) + mi = MetaInformation(opf) + else: + opf, filelist = get_filelist(stream.name, basedir, opts, log) + mi = MetaInformation(opf) + mi.smart_update(get_metadata(stream, 'html')) + + mi = OPFCreator(os.getcwdu(), mi) + mi.guide = None + entries = [(f.path, 'application/xhtml+xml') for f in filelist] + mi.create_manifest(entries) + mi.create_spine([f.path for f in filelist]) + + tocbuf = cStringIO.StringIO() + mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx') + toc = tocbuf.getvalue() + if toc: + open('toc.ncx', 'wb').write(toc) + + from calibre.ebooks.conversion.plumber import create_oebbook + return create_oebbook(log, os.path.abspath('metadata.opf')) + + + + diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html_old.py similarity index 100% rename from src/calibre/ebooks/html.py rename to src/calibre/ebooks/html_old.py diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index f6b5a9bd1a..4b7648d81f 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -683,26 +683,6 @@ class OPF(object): return property(fget=fget, fset=fset) - @dynamic_property - def title_sort(self): - - def fget(self): - matches = self.title_path(self.metadata) - if matches: - for match in matches: - ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None) - if not ans: - ans = match.get('file-as', None) - if ans: - return ans - - def fset(self, val): - matches = self.title_path(self.metadata) - if matches: - matches[0].set('file-as', unicode(val)) - - return property(fget=fget, fset=fset) - @dynamic_property def tags(self): @@ -943,9 +923,10 @@ class OPFCreator(MetaInformation): from 
calibre.resources import opf_template from calibre.utils.genshi.template import MarkupTemplate template = MarkupTemplate(opf_template) + toc = getattr(self, 'toc', None) if self.manifest: self.manifest.set_basedir(self.base_path) - if ncx_manifest_entry is not None: + if ncx_manifest_entry is not None and toc is not None: if not os.path.isabs(ncx_manifest_entry): ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry) remove = [i for i in self.manifest if i.id == 'ncx'] @@ -965,7 +946,6 @@ class OPFCreator(MetaInformation): opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml') opf_stream.write(opf) opf_stream.flush() - toc = getattr(self, 'toc', None) if toc is not None and ncx_stream is not None: toc.render(ncx_stream, self.application_id) ncx_stream.flush() @@ -1030,19 +1010,8 @@ class OPFTest(unittest.TestCase): self.opf.smart_update(MetaInformation(self.opf)) self.testReading() - def testCreator(self): - opf = OPFCreator(os.getcwd(), self.opf) - buf = cStringIO.StringIO() - opf.render(buf) - raw = buf.getvalue() - self.testReading(opf=OPF(cStringIO.StringIO(raw), os.getcwd())) - - def testSmartUpdate(self): - self.opf.smart_update(self.opf) - self.testReading() - def suite(): return unittest.TestLoader().loadTestsFromTestCase(OPFTest) def test(): - unittest.TextTestRunner(verbosity=2).run(suite()) \ No newline at end of file + unittest.TextTestRunner(verbosity=2).run(suite()) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 8f2e24a831..2eb45c9161 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin): with open(f, 'wb') as q: q.write(html.tostring(root, encoding='utf-8', method='xml', include_meta_content_type=False)) - accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'} + accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'} return mr.created_opf_path diff --git 
a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index a78b5085d9..6032ae549a 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -522,7 +522,7 @@ class MobiReader(object): else: raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type)) if self.book_header.ancient and ' Date: Wed, 8 Apr 2009 20:53:45 -0400 Subject: [PATCH 077/319] a bit of preprocessing work --- src/calibre/ebooks/conversion/preprocess.py | 4 ++- src/calibre/ebooks/pdf/pdftohtml.py | 33 +-------------------- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f544a331d8..bb8ee90364 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -66,7 +66,9 @@ class HTMLPreProcessor(object): # Remove non breaking spaces (re.compile(ur'\u00a0'), lambda match : ' '), - + + # Have paragraphs show better + (re.compile(r''), lambda match : '

    '), ] # Fix Book Designer markup diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 0f6581dea6..e03d7d0647 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -24,32 +24,6 @@ if iswindows and hasattr(sys, 'frozen'): if islinux and getattr(sys, 'frozen_path', False): PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml') -# Fix pdftohtml markup -PDFTOHTML_RULES = [ - # Remove


    tags - (re.compile(r'', re.IGNORECASE), lambda match: '
    '), - # Remove page numbers - (re.compile(r'\d+
    ', re.IGNORECASE), lambda match: ''), - # Remove
    and replace

    with

    - (re.compile(r'\s*', re.IGNORECASE), lambda match: '

    '), - (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 - else match.group(1)), - # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), - - # Remove gray background - (re.compile(r']+>'), lambda match : ''), - - # Remove non breaking spaces - (re.compile(ur'\u00a0'), lambda match : ' '), - - # Add second
    after first to allow paragraphs to show better - (re.compile(r''), lambda match : '

    '), - - ] - - def pdftohtml(pdf_path): ''' Convert the pdf into html using the pdftohtml app. @@ -98,9 +72,4 @@ def pdftohtml(pdf_path): if not '\n' + processed_html(raw) - -def processed_html(html): - for rule in PDFTOHTML_RULES: - html = rule[0].sub(rule[1], html) - return html + return '\n' + raw From 2c3e8cccb81fa7ddc71493b7645ce3aaa9f53575 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 9 Apr 2009 06:26:22 -0400 Subject: [PATCH 078/319] Better metadata detection --- src/calibre/ebooks/pdf/input.py | 4 ++-- src/calibre/ebooks/txt/input.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index edbc2d6b30..6733d3aadc 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -10,7 +10,6 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdf.pdftohtml import pdftohtml from calibre.ebooks.metadata.opf import OPFCreator -from calibre.customize.builtins import PDFMetadataReader class PDFInput(InputFormatPlugin): @@ -26,7 +25,8 @@ class PDFInput(InputFormatPlugin): with open('index.html', 'wb') as index: index.write(html) - mi = PDFMetadataReader(None).get_metadata(stream, 'pdf') + from calibre.ebooks.metadata.meta import get_metadata + mi = get_metadata(stream, 'pdf') opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e161f6b9bd..aafc36989e 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,6 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf import OPFCreator -from calibre.customize.builtins import TXTMetadataReader class TXTInput(InputFormatPlugin): @@ -33,7 +32,8 @@ class TXTInput(InputFormatPlugin): with 
open('index.html', 'wb') as index: index.write(html.encode('utf-8')) - mi = TXTMetadataReader(None).get_metadata(stream, 'txt') + from calibre.ebooks.metadata.meta import get_metadata + mi = get_metadata(stream, 'txt') opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([('index.html', None)]) opf.create_spine(['index.html']) From f7ec532d578f874bf915c5f2bbd5077e667c074c Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 9 Apr 2009 17:31:07 -0400 Subject: [PATCH 079/319] pdftohtml processing: better line re-wrapping --- src/calibre/ebooks/conversion/preprocess.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index bb8ee90364..3fbbb47d13 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -48,6 +48,8 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ + # Remove page links + (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove


    tags (re.compile(r'', re.IGNORECASE), lambda match: '
    '), # Remove page numbers @@ -69,6 +71,12 @@ class HTMLPreProcessor(object): # Have paragraphs show better (re.compile(r''), lambda match : '

    '), + + # Re wrap lines + (re.compile(r'(?<=\w)\s*\s*\s*\s*(?=\w)'), lambda match: ' '), + (re.compile(r'(?<=\w)\s*\s*(?=\w)', re.UNICODE), lambda match: ' '), + # Clean up spaces + (re.compile(ru'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), ] # Fix Book Designer markup From ac0af1b844293f9c5720f9783fac6342e324724d Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 9 Apr 2009 19:16:49 -0400 Subject: [PATCH 080/319] pdftohtml processing: fix spaces rule --- src/calibre/ebooks/conversion/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3fbbb47d13..6b58d2d18d 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,11 +72,11 @@ class HTMLPreProcessor(object): # Have paragraphs show better (re.compile(r''), lambda match : '

    '), - # Re wrap lines + # Un wrap lines (re.compile(r'(?<=\w)\s*\s*\s*\s*(?=\w)'), lambda match: ' '), (re.compile(r'(?<=\w)\s*\s*(?=\w)', re.UNICODE), lambda match: ' '), # Clean up spaces - (re.compile(ru'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), + (re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), ] # Fix Book Designer markup From c6ffebff9017f258b299936bc6ab23204c8547a9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 9 Apr 2009 20:54:53 -0400 Subject: [PATCH 081/319] PDFOutput: add profile support --- src/calibre/ebooks/pdf/output.py | 2 +- src/calibre/ebooks/pdf/writer.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 65af40dc51..5506316cba 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -75,7 +75,7 @@ class PDFOutput(OutputFormatPlugin): opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] - writer = PDFWriter(log, popts) + writer = PDFWriter(log, popts, opts.output_profile) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 7d0a690856..410787bd4f 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -12,13 +12,14 @@ Write content to PDF. 
import os, shutil, sys from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.customize.profiles import OutputProfile from calibre.ebooks.pdf.pageoptions import PageOptions from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata.opf2 import OPF from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ - QMetaObject, Qt + QMetaObject, QSizeF, Qt from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader @@ -36,7 +37,7 @@ class PDFMetadata(object): class PDFWriter(QObject): - def __init__(self, log, popts=PageOptions()): + def __init__(self, log, popts=PageOptions(), profile=OutputProfile(None)): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) @@ -50,6 +51,7 @@ class PDFWriter(QObject): self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') self.popts = popts + self.profile = profile def dump(self, opfpath, out_stream, pdf_metadata): self.metadata = pdf_metadata @@ -85,8 +87,14 @@ class PDFWriter(QObject): self.logger.debug('\tRendering item %s as %i' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) printer = QPrinter(QPrinter.HighResolution) + + if self.profile.short_name == 'default': + printer.setPaperSize(self.popts.paper_size) + else: + #printer.setResolution(self.profile.dpi) + printer.setPaperSize(QSizeF(self.profile.width / self.profile.dpi, self.profile.height / self.profile.dpi), QPrinter.Inch) + printer.setPageMargins(self.popts.margin_left, self.popts.margin_top, self.popts.margin_right, self.popts.margin_bottom, self.popts.unit) - printer.setPaperSize(self.popts.paper_size) printer.setOrientation(self.popts.orientation) printer.setOutputFormat(QPrinter.PdfFormat) printer.setOutputFileName(item_path) From 1d19b2611e3acde0ea520738fb7b311139464dd8 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 10 Apr 2009 07:36:03 -0400 Subject: [PATCH 
082/319] delete .dat files created by pdfs when removing books --- src/calibre/devices/cybookg3/driver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 0998a60451..c6186b3c3a 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -123,9 +123,11 @@ class CYBOOKG3(USBMS): filepath, ext = os.path.splitext(path) - # Delete the ebook auxiliary file + # Delete the ebook auxiliary files if os.path.exists(filepath + '.mbp'): os.unlink(filepath + '.mbp') + if os.path.exists(filepath + '.dat'): + os.unlink(filepath + '.dat') # Delete the thumbnails file auto generated for the ebook if os.path.exists(filepath + '_6090.t2b'): From 41dd6ddf9fd38371ab0046c64ba74e9ebdf5b713 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 10 Apr 2009 21:10:40 -0400 Subject: [PATCH 083/319] PDFOutput: Custom document size, clean up pageoptions. --- src/calibre/ebooks/pdf/output.py | 38 +++++++++--------------- src/calibre/ebooks/pdf/pageoptions.py | 38 ++++-------------------- src/calibre/ebooks/pdf/writer.py | 42 ++++++++++++++++++--------- 3 files changed, 47 insertions(+), 71 deletions(-) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 5506316cba..20ba5028b0 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -18,8 +18,8 @@ from calibre.customize.conversion import OutputFormatPlugin, \ from calibre.ebooks.oeb.output import OEBOutput from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.pdf.writer import PDFWriter, PDFMetadata -from calibre.ebooks.pdf.pageoptions import UNITS, unit, PAPER_SIZES, \ - paper_size, ORIENTATIONS, orientation, PageOptions +from calibre.ebooks.pdf.pageoptions import UNITS, PAPER_SIZES, \ + ORIENTATIONS class PDFOutput(OutputFormatPlugin): @@ -29,53 +29,43 @@ class PDFOutput(OutputFormatPlugin): options = set([ 
OptionRecommendation(name='margin_top', recommended_value='1', - level=OptionRecommendation.LOW, long_switch='margin_top', + level=OptionRecommendation.LOW, help=_('The top margin around the document.')), OptionRecommendation(name='margin_bottom', recommended_value='1', - level=OptionRecommendation.LOW, long_switch='margin_bottom', + level=OptionRecommendation.LOW, help=_('The bottom margin around the document.')), OptionRecommendation(name='margin_left', recommended_value='1', - level=OptionRecommendation.LOW, long_switch='margin_left', + level=OptionRecommendation.LOW, help=_('The left margin around the document.')), OptionRecommendation(name='margin_right', recommended_value='1', - level=OptionRecommendation.LOW, long_switch='margin_right', + level=OptionRecommendation.LOW, help=_('The right margin around the document.')), OptionRecommendation(name='unit', recommended_value='inch', - level=OptionRecommendation.LOW, short_switch='u', - long_switch='unit', choices=UNITS.keys(), + level=OptionRecommendation.LOW, short_switch='u', choices=UNITS.keys(), help=_('The unit of measure. Default is inch. Choices ' 'are %s' % UNITS.keys())), OptionRecommendation(name='paper_size', recommended_value='letter', - level=OptionRecommendation.LOW, - long_switch='paper_size', choices=PAPER_SIZES.keys(), + level=OptionRecommendation.LOW, choices=PAPER_SIZES.keys(), help=_('The size of the paper. Default is letter. Choices ' 'are %s' % PAPER_SIZES.keys())), + OptionRecommendation(name='custom_size', recommended_value=None, + help=_('Custom size of the document. Use the form widthxheight ' + 'EG. `123x321` to specify the width and height. ' + 'This overrides any specified paper-size.')), OptionRecommendation(name='orientation', recommended_value='portrait', - level=OptionRecommendation.LOW, - long_switch='orientation', choices=ORIENTATIONS.keys(), + level=OptionRecommendation.LOW, choices=ORIENTATIONS.keys(), help=_('The orientation of the page. Default is portrait. 
Choices ' 'are %s' % ORIENTATIONS.keys())), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - popts = PageOptions() - - popts.set_margin_top(opts.margin_top) - popts.set_margin_bottom(opts.margin_bottom) - popts.set_margin_left(opts.margin_left) - popts.set_margin_right(opts.margin_right) - - popts.unit = unit(opts.unit) - popts.paper_size = paper_size(opts.paper_size) - popts.orientation = orientation(opts.orientation) - with TemporaryDirectory('_pdf_out') as oebdir: OEBOutput(None).convert(oeb_book, oebdir, input_plugin, opts, log) opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] - writer = PDFWriter(log, popts, opts.output_profile) + writer = PDFWriter(log, opts) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/pdf/pageoptions.py b/src/calibre/ebooks/pdf/pageoptions.py index 26fae81662..b115ac0f34 100644 --- a/src/calibre/ebooks/pdf/pageoptions.py +++ b/src/calibre/ebooks/pdf/pageoptions.py @@ -63,36 +63,8 @@ ORIENTATIONS = { def orientation(orientation): return ORIENTATIONS.get(orientation, QPrinter.Portrait) - -class PageOptions(object): - margin_top = 1 - margin_bottom = 1 - margin_left = 1 - margin_right = 1 - unit = QPrinter.Inch - paper_size = QPrinter.Letter - orientation = QPrinter.Portrait - - def set_margin_top(self, size): - try: - self.margin_top = int(size) - except: - self.margin_top = 1 - - def set_margin_bottom(self, size): - try: - self.margin_bottom = int(size) - except: - self.margin_bottom = 1 - - def set_margin_left(self, size): - try: - self.margin_left = int(size) - except: - self.margin_left = 1 - - def set_margin_right(self, size): - try: - self.margin_right = int(size) - except: - self.margin_right = 1 +def size(size): + try: + return int(size) + except: + return 1 diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 410787bd4f..0f8cbf50c0 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -13,13 +13,14 @@ 
import os, shutil, sys from calibre.ptempfile import PersistentTemporaryDirectory from calibre.customize.profiles import OutputProfile -from calibre.ebooks.pdf.pageoptions import PageOptions +from calibre.ebooks.pdf.pageoptions import unit, paper_size, \ + orientation, size from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata.opf2 import OPF from PyQt4 import QtCore -from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, QPrinter, \ - QMetaObject, QSizeF, Qt +from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, \ + QApplication, QPrinter, QMetaObject, QSizeF, Qt from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader @@ -37,7 +38,7 @@ class PDFMetadata(object): class PDFWriter(QObject): - def __init__(self, log, popts=PageOptions(), profile=OutputProfile(None)): + def __init__(self, log, opts): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) @@ -49,9 +50,20 @@ class PDFWriter(QObject): self.connect(self.view, SIGNAL('loadFinished(bool)'), self._render_html) self.render_queue = [] self.combine_queue = [] - self.tmp_path = PersistentTemporaryDirectory('_any2pdf_parts') - self.popts = popts - self.profile = profile + self.tmp_path = PersistentTemporaryDirectory('_pdf_output_parts') + + self.custom_size = None + if opts.custom_size != None: + width, sep, height = opts.custom_size.partition('x') + if height != '': + try: + width = int(width) + height = int(height) + self.custom_size = (width, height) + except: + self.custom_size = None + + self.opts = opts def dump(self, opfpath, out_stream, pdf_metadata): self.metadata = pdf_metadata @@ -88,14 +100,16 @@ class PDFWriter(QObject): printer = QPrinter(QPrinter.HighResolution) - if self.profile.short_name == 'default': - printer.setPaperSize(self.popts.paper_size) + if self.opts.output_profile.short_name == 'default': + if self.custom_size == None: + printer.setPaperSize(paper_size(self.opts.paper_size)) + else: + 
printer.setPaperSize(QSizeF(self.custom_size[0], self.custom_size[1]), unit(self.opts.unit)) else: - #printer.setResolution(self.profile.dpi) - printer.setPaperSize(QSizeF(self.profile.width / self.profile.dpi, self.profile.height / self.profile.dpi), QPrinter.Inch) + printer.setPaperSize(QSizeF(self.opts.output_profile.width / self.opts.output_profile.dpi, self.opts.output_profile.height / self.opts.output_profile.dpi), QPrinter.Inch) - printer.setPageMargins(self.popts.margin_left, self.popts.margin_top, self.popts.margin_right, self.popts.margin_bottom, self.popts.unit) - printer.setOrientation(self.popts.orientation) + printer.setPageMargins(size(self.opts.margin_left), size(self.opts.margin_top), size(self.opts.margin_right), size(self.opts.margin_bottom), unit(self.opts.unit)) + printer.setOrientation(orientation(self.opts.orientation)) printer.setOutputFormat(QPrinter.PdfFormat) printer.setOutputFileName(item_path) self.view.print_(printer) @@ -104,7 +118,7 @@ class PDFWriter(QObject): def _delete_tmpdir(self): if os.path.exists(self.tmp_path): shutil.rmtree(self.tmp_path, True) - self.tmp_path = PersistentTemporaryDirectory('_pdf_out_parts') + self.tmp_path = PersistentTemporaryDirectory('_pdf_output_parts') def _write(self): self.logger.info('Combining individual PDF parts...') From 95d1b58ae301335879a9b6576fffe168cfc012dc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 10 Apr 2009 21:12:27 -0700 Subject: [PATCH 084/319] Working HTML/OPF input plugin. 
Also fixed feeds download and removed cover processing from OEBBook --- src/calibre/customize/builtins.py | 107 ++++++------ src/calibre/customize/conversion.py | 2 +- src/calibre/ebooks/html/__init__.py | 2 +- src/calibre/ebooks/html/input.py | 109 +++--------- src/calibre/ebooks/oeb/reader.py | 2 +- src/calibre/ebooks/oeb/transforms/package.py | 68 ++++++-- src/calibre/ebooks/oeb/writer.py | 3 +- src/calibre/web/feeds/main.py | 78 ++++----- src/calibre/web/feeds/news.py | 90 +++++----- src/calibre/web/fetch/simple.py | 171 +++++++++---------- 10 files changed, 295 insertions(+), 337 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 30f423fce3..484d46dc36 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -18,7 +18,7 @@ every time you add an HTML file to the library.\ file_types = set(['html', 'htm', 'xhtml', 'xhtm']) supported_platforms = ['windows', 'osx', 'linux'] on_import = True - + def run(self, htmlfile): of = self.temporary_file('_plugin_html2zip.zip') from calibre.ebooks.html import gui_main as html2oeb @@ -26,172 +26,173 @@ every time you add an HTML file to the library.\ return of.name class OPFMetadataReader(MetadataReaderPlugin): - + name = 'Read OPF metadata' file_types = set(['opf']) description = _('Read metadata from %s files')%'OPF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata import MetaInformation return MetaInformation(OPF(stream, os.getcwd())) class RTFMetadataReader(MetadataReaderPlugin): - - name = 'Read RTF metadata' + + name = 'Read RTF metadata' file_types = set(['rtf']) description = _('Read metadata from %s files')%'RTF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.rtf import get_metadata return get_metadata(stream) class FB2MetadataReader(MetadataReaderPlugin): - + name = 'Read FB2 metadata' file_types = set(['fb2']) description = _('Read metadata 
from %s files')%'FB2' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.fb2 import get_metadata return get_metadata(stream) class LRFMetadataReader(MetadataReaderPlugin): - + name = 'Read LRF metadata' file_types = set(['lrf']) description = _('Read metadata from %s files')%'LRF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.lrf.meta import get_metadata return get_metadata(stream) class PDFMetadataReader(MetadataReaderPlugin): - + name = 'Read PDF metadata' file_types = set(['pdf']) description = _('Read metadata from %s files')%'PDF' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.pdf import get_metadata return get_metadata(stream) class LITMetadataReader(MetadataReaderPlugin): - + name = 'Read LIT metadata' file_types = set(['lit']) description = _('Read metadata from %s files')%'LIT' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.lit import get_metadata return get_metadata(stream) class IMPMetadataReader(MetadataReaderPlugin): - + name = 'Read IMP metadata' file_types = set(['imp']) description = _('Read metadata from %s files')%'IMP' author = 'Ashish Kulkarni' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.imp import get_metadata return get_metadata(stream) class RBMetadataReader(MetadataReaderPlugin): - + name = 'Read RB metadata' file_types = set(['rb']) description = _('Read metadata from %s files')%'RB' author = 'Ashish Kulkarni' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.rb import get_metadata return get_metadata(stream) class EPUBMetadataReader(MetadataReaderPlugin): - + name = 'Read EPUB metadata' file_types = set(['epub']) description = _('Read metadata from %s files')%'EPUB' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.epub import get_metadata return get_metadata(stream) class HTMLMetadataReader(MetadataReaderPlugin): - + name = 'Read HTML metadata' file_types = set(['html']) 
description = _('Read metadata from %s files')%'HTML' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.html import get_metadata return get_metadata(stream) class MOBIMetadataReader(MetadataReaderPlugin): - + name = 'Read MOBI metadata' file_types = set(['mobi', 'prc', 'azw']) description = _('Read metadata from %s files')%'MOBI' - + def get_metadata(self, stream, ftype): from calibre.ebooks.mobi.reader import get_metadata return get_metadata(stream) class TOPAZMetadataReader(MetadataReaderPlugin): - + name = 'Read Topaz metadata' file_types = set(['tpz', 'azw1']) description = _('Read metadata from %s files')%'MOBI' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.topaz import get_metadata return get_metadata(stream) class ODTMetadataReader(MetadataReaderPlugin): - + name = 'Read ODT metadata' file_types = set(['odt']) description = _('Read metadata from %s files')%'ODT' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.odt import get_metadata return get_metadata(stream) - + class TXTMetadataReader(MetadataReaderPlugin): - + name = 'Read TXT metadata' file_types = set(['txt']) description = _('Read metadata from %s files') % 'TXT' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.txt import get_metadata return get_metadata(stream) class LRXMetadataReader(MetadataReaderPlugin): - + name = 'Read LRX metadata' file_types = set(['lrx']) description = _('Read metadata from %s files')%'LRX' - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.lrx import get_metadata return get_metadata(stream) class ComicMetadataReader(MetadataReaderPlugin): - + name = 'Read comic metadata' file_types = set(['cbr', 'cbz']) description = _('Extract cover from comic files') - + def get_metadata(self, stream, ftype): if ftype == 'cbr': from calibre.libunrar import extract_member as extract_first + extract_first else: from calibre.libunzip import extract_member as extract_first - from 
calibre.ebooks.metadata import MetaInformation + from calibre.ebooks.metadata import MetaInformation ret = extract_first(stream) mi = MetaInformation(None, None) if ret is not None: @@ -199,65 +200,65 @@ class ComicMetadataReader(MetadataReaderPlugin): ext = os.path.splitext(path)[1][1:] mi.cover_data = (ext.lower(), data) return mi - + class ZipMetadataReader(MetadataReaderPlugin): - + name = 'Read ZIP metadata' file_types = set(['zip', 'oebzip']) description = _('Read metadata from ebooks in ZIP archives') - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.zip import get_metadata return get_metadata(stream) class RARMetadataReader(MetadataReaderPlugin): - + name = 'Read RAR metadata' file_types = set(['rar']) description = _('Read metadata from ebooks in RAR archives') - + def get_metadata(self, stream, ftype): from calibre.ebooks.metadata.rar import get_metadata return get_metadata(stream) class EPUBMetadataWriter(MetadataWriterPlugin): - + name = 'Set EPUB metadata' file_types = set(['epub']) description = _('Set metadata in %s files')%'EPUB' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.epub import set_metadata set_metadata(stream, mi) - + class LRFMetadataWriter(MetadataWriterPlugin): - + name = 'Set LRF metadata' file_types = set(['lrf']) description = _('Set metadata in %s files')%'LRF' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.lrf.meta import set_metadata set_metadata(stream, mi) class RTFMetadataWriter(MetadataWriterPlugin): - + name = 'Set RTF metadata' file_types = set(['rtf']) description = _('Set metadata in %s files')%'RTF' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.rtf import set_metadata set_metadata(stream, mi) class MOBIMetadataWriter(MetadataWriterPlugin): - + name = 'Set MOBI metadata' file_types = set(['mobi', 'prc', 'azw']) description = _('Set metadata in %s files')%'MOBI' author = 'Marshall T. 
Vandegrift' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.mobi import set_metadata set_metadata(stream, mi) @@ -267,14 +268,16 @@ from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput +from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, TXTInput, OEBOutput, TXTOutput, PDFOutput] +plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, + TXTInput, OEBOutput, TXTOutput, PDFOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataWriter')] -plugins += input_profiles + output_profiles \ No newline at end of file +plugins += input_profiles + output_profiles diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index c531a15e34..77cdb0b7da 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -163,9 +163,9 @@ class InputFormatPlugin(Plugin): for x in os.listdir('.'): shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) - ret = self.convert(stream, options, file_ext, log, accelerators) + if options.debug_input is not None: options.debug_input = os.path.abspath(options.debug_input) if not os.path.exists(options.debug_input): diff --git a/src/calibre/ebooks/html/__init__.py b/src/calibre/ebooks/html/__init__.py index 9a8f8e2d20..d026256ee8 100644 --- a/src/calibre/ebooks/html/__init__.py +++ b/src/calibre/ebooks/html/__init__.py @@ -17,7 +17,7 @@ def tostring(root, strip_comments=False, 
pretty_print=False): root.set('xmlns', 'http://www.w3.org/1999/xhtml') root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink') for x in root.iter(): - if x.tag.rpartition('}')[-1].lower() == 'svg': + if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg': x.set('xmlns', 'http://www.w3.org/2000/svg') ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index dd9aa0285c..951b0824a5 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -11,14 +11,12 @@ __docformat__ = 'restructuredtext en' Input plugin for HTML or OPF ebooks. ''' -import os, re, sys, cStringIO +import os, re, sys from urlparse import urlparse, urlunparse from urllib import unquote from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.metadata.opf2 import OPF, OPFCreator -from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.chardet import xml_to_unicode from calibre.customize.conversion import OptionRecommendation from calibre import unicode_path @@ -213,72 +211,21 @@ def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None) sys.setrecursionlimit(orec) -def opf_traverse(opf_reader, verbose=0, encoding=None): - ''' - Return a list of :class:`HTMLFile` objects in the order specified by the - `` element of the OPF. - - :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance. - :param encoding: Specify character encoding of HTML files. If `None` it is - auto-detected. 
- ''' - if not opf_reader.spine: - raise ValueError('OPF does not have a spine') - flat = [] - for path in opf_reader.spine.items(): - path = os.path.abspath(path) - if path not in flat: - flat.append(os.path.abspath(path)) - for item in opf_reader.manifest: - if 'html' in item.mime_type: - path = os.path.abspath(item.path) - if path not in flat: - flat.append(path) - for i, path in enumerate(flat): - if not os.path.exists(path): - path = path.replace('&', '%26') - if os.path.exists(path): - flat[i] = path - for item in opf_reader.itermanifest(): - item.set('href', item.get('href').replace('&', '%26')) - ans = [] - for path in flat: - if os.path.exists(path): - ans.append(HTMLFile(path, 0, encoding, verbose)) - else: - print 'WARNING: OPF spine item %s does not exist'%path - ans = [f for f in ans if not f.is_binary] - return ans - -def search_for_opf(dir): - for f in os.listdir(dir): - if f.lower().endswith('.opf'): - return OPF(open(os.path.join(dir, f), 'rb'), dir) - def get_filelist(htmlfile, dir, opts, log): ''' Build list of files referenced by html file or try to detect and use an OPF file instead. ''' - print 'Building file list...' 
- opf = search_for_opf(dir) - filelist = None - if opf is not None: - try: - filelist = opf_traverse(opf, verbose=opts.verbose, - encoding=opts.input_encoding) - except: - pass - if not filelist: - filelist = traverse(htmlfile, max_levels=int(opts.max_levels), - verbose=opts.verbose, - encoding=opts.input_encoding)\ - [0 if opts.breadth_first else 1] + log.info('Building file list...') + filelist = traverse(htmlfile, max_levels=int(opts.max_levels), + verbose=opts.verbose, + encoding=opts.input_encoding)\ + [0 if opts.breadth_first else 1] if opts.verbose: log.debug('\tFound files...') for f in filelist: log.debug('\t\t', f) - return opf, filelist + return filelist class HTMLInput(InputFormatPlugin): @@ -309,34 +256,32 @@ class HTMLInput(InputFormatPlugin): def convert(self, stream, opts, file_ext, log, accelerators): + from calibre.ebooks.metadata.meta import get_metadata + basedir = os.getcwd() + if hasattr(stream, 'name'): basedir = os.path.dirname(stream.name) if file_ext == 'opf': - opf = OPF(stream, basedir) - filelist = opf_traverse(opf, verbose=opts.verbose, - encoding=opts.input_encoding) - mi = MetaInformation(opf) + opfpath = stream.name else: - opf, filelist = get_filelist(stream.name, basedir, opts, log) - mi = MetaInformation(opf) - mi.smart_update(get_metadata(stream, 'html')) + filelist = get_filelist(stream.name, basedir, opts, log) + mi = get_metadata(stream, 'html') + mi = OPFCreator(os.getcwdu(), mi) + mi.guide = None + entries = [(f.path, 'application/xhtml+xml') for f in filelist] + mi.create_manifest(entries) + mi.create_spine([f.path for f in filelist]) - mi = OPFCreator(os.getcwdu(), mi) - mi.guide = None - entries = [(f.path, 'application/xhtml+xml') for f in filelist] - mi.create_manifest(entries) - mi.create_spine([f.path for f in filelist]) - - tocbuf = cStringIO.StringIO() - mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx') - toc = tocbuf.getvalue() - if toc: - open('toc.ncx', 'wb').write(toc) + mi.render(open('metadata.opf', 
'wb')) + opfpath = os.path.abspath('metadata.opf') from calibre.ebooks.conversion.plumber import create_oebbook - return create_oebbook(log, os.path.abspath('metadata.opf')) - - + oeb = create_oebbook(log, opfpath) + + from calibre.ebooks.oeb.transforms.package import Package + Package(os.getcwdu())(oeb, opts) + + return oeb diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 0c5a4ad97c..faeff4b825 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -573,7 +573,7 @@ class OEBReader(object): item = self._find_ncx(opf) self._toc_from_opf(opf, item) self._pages_from_opf(opf, item) - self._ensure_cover_image() + #self._ensure_cover_image() def main(argv=sys.argv): diff --git a/src/calibre/ebooks/oeb/transforms/package.py b/src/calibre/ebooks/oeb/transforms/package.py index de775f8865..faf5486475 100644 --- a/src/calibre/ebooks/oeb/transforms/package.py +++ b/src/calibre/ebooks/oeb/transforms/package.py @@ -6,13 +6,14 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os +import os, re from urllib import unquote as urlunquote from functools import partial from lxml import etree import cssutils +from calibre import sanitize_file_name from calibre.constants import islinux from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \ rewrite_links @@ -36,15 +37,21 @@ class Package(object): self.new_base_path = os.path.abspath(base) def rewrite_links_in(self, item): - base = os.path.join(self.new_base_path, *item.href.split('/')) + old_href = item.old_href.split('#')[0] + new_href = item.href.split('#')[0] + base = os.path.join(self.old_base_path, *old_href.split('/')) base = os.path.dirname(base) + self.log.debug('\tRewriting links in', base+'/'+ + item.href.rpartition('/')[-1]) + new_base = os.path.join(self.new_base_path, *new_href.split('/')) + new_base = os.path.dirname(new_base) if etree.iselement(item.data): - 
self.rewrite_links_in_xml(item.data, base) + self.rewrite_links_in_xml(item.data, base, new_base) elif hasattr(item.data, 'cssText'): - self.rewrite_links_in_css(item.data, base) + self.rewrite_links_in_css(item.data, base, new_base) - def link_replacer(self, link_, base=''): + def link_replacer(self, link_, base='', new_base=''): link = urlnormalize(link_) link, frag = urldefrag(link) link = urlunquote(link).replace('/', os.sep) @@ -55,20 +62,33 @@ class Package(object): link = link.lower() if link not in self.map: return link_ - nlink = os.path.relpath(self.map[link], base) + nlink = os.path.relpath(self.map[link], new_base) if frag: - nlink = '#'.join(nlink, frag) + nlink = '#'.join((nlink, frag)) return nlink.replace(os.sep, '/') - def rewrite_links_in_css(self, sheet, base): - repl = partial(self.link_replacer, base=base) + def rewrite_links_in_css(self, sheet, base, new_base): + repl = partial(self.link_replacer, base=base, new_base=new_base) cssutils.replaceUrls(sheet, repl) - def rewrite_links_in_xml(self, root, base): - repl = partial(self.link_replacer, base=base) + def rewrite_links_in_xml(self, root, base, new_base): + repl = partial(self.link_replacer, base=base, new_base=new_base) rewrite_links(root, repl) - def move_manifest_item(self, item): + def uniqify_name(self, new_href, hrefs): + c = 0 + while new_href in hrefs: + c += 1 + parts = new_href.split('/') + name, ext = os.path.splitext(parts[-1]) + name = re.sub(r'_\d+$', '', name) + name += '_%d'%c + parts[-1] = name + ext + new_href = '/'.join(parts) + return new_href + + + def move_manifest_item(self, item, hrefs): item.data # Make sure the data has been loaded and cached old_abspath = os.path.join(self.old_base_path, *(urldefrag(item.href)[0].split('/'))) @@ -79,11 +99,17 @@ class Package(object): new_href = 'content/' elif item.href.lower().endswith('.ncx'): new_href = '' - new_href += bname + new_href += sanitize_file_name(bname) + + if new_href in hrefs: + new_href = 
self.uniqify_name(new_href, hrefs) + hrefs.add(new_href) new_abspath = os.path.join(self.new_base_path, *new_href.split('/')) new_abspath = os.path.abspath(new_abspath) + item.old_href = self.oeb.manifest.hrefs.pop(item.href).href item.href = new_href + self.oeb.manifest.hrefs[item.href] = item if not islinux: old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower() if old_abspath != new_abspath: @@ -91,25 +117,33 @@ class Package(object): def rewrite_links_in_toc(self, toc): if toc.href: - toc.href = self.link_replacer(toc.href, base=self.new_base_path) + toc.href = self.link_replacer(toc.href, base=self.old_base_path, + new_base=self.new_base_path) for x in toc: self.rewrite_links_in_toc(x) def __call__(self, oeb, context): self.map = {} - self.log = self.oeb.log + self.log = oeb.log + self.oeb = oeb self.old_base_path = os.path.abspath(oeb.container.rootdir) + hrefs = set([]) for item in self.oeb.manifest: - self.move_manifest_item(item) + self.move_manifest_item(item, hrefs) + self.log.debug('Rewriting links in OEB documents...') for item in self.oeb.manifest: self.rewrite_links_in(item) if getattr(oeb.toc, 'nodes', False): + self.log.debug('Rewriting links in TOC...') self.rewrite_links_in_toc(oeb.toc) if hasattr(oeb, 'guide'): + self.log.debug('Rewriting links in guide...') for ref in oeb.guide.values(): - ref.href = self.link_replacer(ref.href, base=self.new_base_path) + ref.href = self.link_replacer(ref.href, + base=self.old_base_path, + new_base=self.new_base_path) diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index 1e5e5aea11..ef72414f5a 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -48,7 +48,8 @@ class OEBWriter(object): pretty_print=pretty_print) def __call__(self, oeb, path): - """Read the book in the :class:`OEBBook` object :param:`oeb` to a file + """ + Read the book in the :class:`OEBBook` object :param:`oeb` to a file at :param:`path`. 
""" version = int(self.version[0]) diff --git a/src/calibre/web/feeds/main.py b/src/calibre/web/feeds/main.py index faa132bef4..61bfa97e11 100644 --- a/src/calibre/web/feeds/main.py +++ b/src/calibre/web/feeds/main.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python +#!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' CLI for downloading feeds. ''' -import sys, os, logging +import sys, os from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles from calibre.web.fetch.simple import option_parser as _option_parser from calibre.web.feeds.news import BasicNewsRecipe @@ -14,13 +14,13 @@ from calibre.utils.config import Config, StringConfig def config(defaults=None): desc = _('Options to control the fetching of periodical content from the web.') c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc) - + web2disk = c.add_group('web2disk', _('Customize the download engine')) - web2disk('timeout', ['-t', '--timeout'], default=10.0, + web2disk('timeout', ['-t', '--timeout'], default=10.0, help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),) - web2disk('delay', ['--delay'], default=0, + web2disk('delay', ['--delay'], default=0, help=_('Minimum interval in seconds between consecutive fetches. Default is %default s')) - web2disk('encoding', ['--encoding'], default=None, + web2disk('encoding', ['--encoding'], default=None, help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.')) web2disk('match_regexps', ['--match-regexp'], default=[], action='append', help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')) @@ -28,42 +28,42 @@ def config(defaults=None): help=_('Any link that matches this regular expression will be ignored. 
This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')) web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False, help=_('Do not download CSS stylesheets.')) - + c.add_opt('feeds', ['--feeds'], default=None, - help=_('''Specify a list of feeds to download. For example: + help=_('''Specify a list of feeds to download. For example: "['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']" If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.''')) c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', help=_('''Be more verbose while processing.''')) c.add_opt('title', ['--title'], default=None, help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')) - c.add_opt('username', ['-u', '--username'], default=None, + c.add_opt('username', ['-u', '--username'], default=None, help=_('Username for sites that require a login to access content.')) - c.add_opt('password', ['-p', '--password'], default=None, + c.add_opt('password', ['-p', '--password'], default=None, help=_('Password for sites that require a login to access content.')) - c.add_opt('lrf', ['--lrf'], default=False, action='store_true', + c.add_opt('lrf', ['--lrf'], default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.') - c.add_opt('epub', ['--epub'], default=False, action='store_true', + c.add_opt('epub', ['--epub'], default=False, action='store_true', help='Optimize fetching for subsequent conversion to EPUB.') - c.add_opt('mobi', ['--mobi'], default=False, action='store_true', + c.add_opt('mobi', ['--mobi'], default=False, action='store_true', help='Optimize fetching for subsequent 
conversion to MOBI.') c.add_opt('recursions', ['--recursions'], default=0, help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default')) - c.add_opt('output_dir', ['--output-dir'], default='.', + c.add_opt('output_dir', ['--output-dir'], default='.', help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.')) c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true', help=_("Don't show the progress bar")) c.add_opt('debug', ['--debug'], action='store_true', default=False, help=_('Very verbose output, useful for debugging.')) - c.add_opt('test', ['--test'], action='store_true', default=False, + c.add_opt('test', ['--test'], action='store_true', default=False, help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.')) - + return c - + USAGE=_('''\ %%prog [options] ARG -%%prog parses an online source of articles, like an RSS or ATOM feed and +%%prog parses an online source of articles, like an RSS or ATOM feed and fetches the article contents organized in a nice hierarchy. ARG can be one of: @@ -85,9 +85,9 @@ def option_parser(usage=USAGE): p.remove_option('--verbose') p.remove_option('--max-files') p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)')) - + p.add_option('--feeds', default=None, - help=_('''Specify a list of feeds to download. For example: + help=_('''Specify a list of feeds to download. 
For example: "['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']" If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.''')) p.add_option('--verbose', default=False, action='store_true', @@ -99,70 +99,62 @@ If you specify this option, any argument to %prog is ignored and a default recip p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.') p.add_option('--recursions', default=0, type='int', help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default')) - p.add_option('--output-dir', default=os.getcwd(), + p.add_option('--output-dir', default=os.getcwd(), help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.')) p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true', help=_('Dont show the progress bar')) p.add_option('--debug', action='store_true', default=False, help=_('Very verbose output, useful for debugging.')) - p.add_option('--test', action='store_true', default=False, + p.add_option('--test', action='store_true', default=False, help=_('Useful for recipe development. 
Forces max_articles_per_feed to 2 and downloads at most 2 feeds.')) - + return p - + class RecipeError(Exception): pass -def run_recipe(opts, recipe_arg, parser, notification=None, handler=None): +def run_recipe(opts, recipe_arg, parser, notification=None): if notification is None: from calibre.utils.terminfo import TerminalController, ProgressBar term = TerminalController(sys.stdout) pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar) notification = pb.update - + recipe = None if opts.feeds is not None: recipe = BasicNewsRecipe else: try: if os.access(recipe_arg, os.R_OK): - recipe = compile_recipe(open(recipe_arg).read()) + recipe = compile_recipe(open(recipe_arg).read()) else: raise Exception('not file') except: recipe = get_builtin_recipe(recipe_arg) if recipe is None: recipe = compile_recipe(recipe_arg) - + if recipe is None: raise RecipeError(recipe_arg+ ' is an invalid recipe') - - - if handler is None: - from calibre import ColoredFormatter - handler = logging.StreamHandler(sys.stdout) - handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN) - handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar - logging.getLogger('feeds2disk').addHandler(handler) - + recipe = recipe(opts, parser, notification) - + if not os.path.exists(recipe.output_dir): os.makedirs(recipe.output_dir) recipe.download(for_lrf=True) - + return recipe -def main(args=sys.argv, notification=None, handler=None): +def main(args=sys.argv, notification=None): p = option_parser() opts, args = p.parse_args(args=args[1:]) - + if len(args) != 1 and opts.feeds is None: p.print_help() return 1 recipe_arg = args[0] if len(args) > 0 else None - run_recipe(opts, recipe_arg, p, notification=notification, handler=handler) - + run_recipe(opts, recipe_arg, p, notification=notification) + return 0 if __name__ == '__main__': diff --git 
a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index efcfdff94b..4ee6753180 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -7,7 +7,7 @@ Defines various abstract base classes that can be subclassed to create powerful __docformat__ = "restructuredtext en" -import logging, os, cStringIO, time, traceback, re, urlparse, sys +import os, time, traceback, re, urlparse, sys from collections import defaultdict from functools import partial from contextlib import nested, closing @@ -27,6 +27,7 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed from calibre.web.fetch.simple import option_parser as web2disk_option_parser from calibre.web.fetch.simple import RecursiveFetcher from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending +from calibre.utils.logging import Log from calibre.ptempfile import PersistentTemporaryFile, \ PersistentTemporaryDirectory @@ -423,7 +424,7 @@ class BasicNewsRecipe(object): ''' raise NotImplementedError - def get_obfuscated_article(self, url, logger): + def get_obfuscated_article(self, url): ''' If you set :member:`articles_are_obfuscated` this method is called with every article URL. It should return the path to a file on the filesystem @@ -443,6 +444,7 @@ class BasicNewsRecipe(object): :param parser: Command line option parser. Used to intelligently merge options. :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. 
''' + self.log = Log() if not isinstance(self.title, unicode): self.title = unicode(self.title, 'utf-8', 'replace') @@ -455,7 +457,6 @@ class BasicNewsRecipe(object): if self.debug: - logging.getLogger('feeds2disk').setLevel(logging.DEBUG) self.verbose = True self.report_progress = progress_reporter @@ -560,20 +561,20 @@ class BasicNewsRecipe(object): res = self.build_index() self.report_progress(1, _('Download finished')) if self.failed_downloads: - self.log_warning(_('Failed to download the following articles:')) + self.log.warning(_('Failed to download the following articles:')) for feed, article, debug in self.failed_downloads: - self.log_warning(article.title+_(' from ')+feed.title) - self.log_debug(article.url) - self.log_debug(debug) + self.log.warning(article.title+_(' from ')+feed.title) + self.log.debug(article.url) + self.log.debug(debug) if self.partial_failures: - self.log_warning(_('Failed to download parts of the following articles:')) + self.log.warning(_('Failed to download parts of the following articles:')) for feed, atitle, aurl, debug in self.partial_failures: - self.log_warning(atitle + _(' from ') + feed) - self.log_debug(aurl) - self.log_warning(_('\tFailed links:')) + self.log.warning(atitle + _(' from ') + feed) + self.log.debug(aurl) + self.log.warning(_('\tFailed links:')) for l, tb in debug: - self.log_warning(l) - self.log_debug(tb) + self.log.warning(l) + self.log.debug(tb) return res finally: self.cleanup() @@ -636,20 +637,11 @@ class BasicNewsRecipe(object): extra_css=self.extra_css).render(doctype='xhtml') - def create_logger(self, feed_number, article_number): - logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number)) - out = cStringIO.StringIO() - handler = logging.StreamHandler(out) - handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) - handler.setLevel(logging.INFO if self.verbose else logging.WARNING) - if self.debug: - handler.setLevel(logging.DEBUG) - logger.addHandler(handler) 
- return logger, out - - def _fetch_article(self, url, dir, logger, f, a, num_of_feeds): + def _fetch_article(self, url, dir, f, a, num_of_feeds): self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser - fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds)) + fetcher = RecursiveFetcher(self.web2disk_options, self.log, + self.image_map, self.css_map, + (url, f, a, num_of_feeds)) fetcher.base_dir = dir fetcher.current_dir = dir fetcher.show_progress = False @@ -661,21 +653,21 @@ class BasicNewsRecipe(object): raise Exception(_('Could not fetch article. Run with --debug to see the reason')) return res, path, failures - def fetch_article(self, url, dir, logger, f, a, num_of_feeds): - return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + def fetch_article(self, url, dir, f, a, num_of_feeds): + return self._fetch_article(url, dir, f, a, num_of_feeds) - def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds): - path = os.path.abspath(self.get_obfuscated_article(url, logger)) + def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds): + path = os.path.abspath(self.get_obfuscated_article(url)) url = ('file:'+path) if iswindows else ('file://'+path) - return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + return self._fetch_article(url, dir, f, a, num_of_feeds) - def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds): + def fetch_embedded_article(self, article, dir, f, a, num_of_feeds): templ = templates.EmbeddedContent() raw = templ.generate(article).render('html') with PersistentTemporaryFile('_feeds2disk.html') as pt: pt.write(raw) url = ('file:'+pt.name) if iswindows else ('file://'+pt.name) - return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + return self._fetch_article(url, dir, f, a, num_of_feeds) def build_index(self): @@ -716,7 +708,6 @@ class BasicNewsRecipe(object): art_dir = 
os.path.join(feed_dir, 'article_%d'%a) if not os.path.isdir(art_dir): os.makedirs(art_dir) - logger, stream = self.create_logger(f, a) try: url = self.print_version(article.url) except NotImplementedError: @@ -726,10 +717,9 @@ class BasicNewsRecipe(object): func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \ ((self.fetch_obfuscated_article if self.articles_are_obfuscated \ else self.fetch_article), url) - req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)), + req = WorkRequest(func, (arg, art_dir, f, a, len(feed)), {}, (f, a), self.article_downloaded, self.error_in_article_download) - req.stream = stream req.feed = feed req.article = article req.feed_dir = feed_dir @@ -768,8 +758,8 @@ class BasicNewsRecipe(object): cu = self.get_cover_url() except Exception, err: cu = None - self.log_error(_('Could not download cover: %s')%str(err)) - self.log_debug(traceback.format_exc()) + self.log.error(_('Could not download cover: %s')%str(err)) + self.log.debug(traceback.format_exc()) if cu is not None: ext = cu.rpartition('.')[-1] if '?' 
in ext: @@ -841,8 +831,8 @@ class BasicNewsRecipe(object): f.write(html.encode('utf-8')) renderer = render_html(hf) if renderer.tb is not None: - self.logger.warning('Failed to render default cover') - self.logger.debug(renderer.tb) + self.log.warning('Failed to render default cover') + self.log.debug(renderer.tb) else: cover_file.write(renderer.data) cover_file.flush() @@ -863,7 +853,7 @@ class BasicNewsRecipe(object): manifest.append(os.path.join(dir, 'index.ncx')) cpath = getattr(self, 'cover_path', None) if cpath is None: - pf = PersistentTemporaryFile('_recipe_cover.jpg') + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') self.default_cover(pf) cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): @@ -944,7 +934,7 @@ class BasicNewsRecipe(object): a = request.requestID[1] article = request.article - self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore'))) + self.log.debug(_('\nDownloaded article %s from %s')%(article.title, article.url)) article.orig_url = article.url article.url = 'article_%d/index.html'%a article.downloaded = True @@ -956,11 +946,11 @@ class BasicNewsRecipe(object): def error_in_article_download(self, request, traceback): self.jobs_done += 1 - self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url)) + self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url)) debug = request.stream.getvalue().decode('utf-8', 'ignore') - self.log_debug(debug) - self.log_debug(traceback) - self.log_debug('\n') + self.log.debug(debug) + self.log.debug(traceback) + self.log.debug('\n') self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title) self.failed_downloads.append((request.feed, request.article, debug)) @@ -990,7 +980,7 @@ class BasicNewsRecipe(object): feed.populate_from_preparsed_feed(msg, []) 
feed.description = unicode(err) parsed_feeds.append(feed) - self.log_exception(msg) + self.log.exception(msg) return parsed_feeds @@ -1057,7 +1047,7 @@ class CustomIndexRecipe(BasicNewsRecipe): index = os.path.abspath(self.custom_index()) url = 'file:'+index if iswindows else 'file://'+index self.web2disk_options.browser = self.browser - fetcher = RecursiveFetcher(self.web2disk_options, self.logger) + fetcher = RecursiveFetcher(self.web2disk_options, self.log) fetcher.base_dir = self.output_dir fetcher.current_dir = self.output_dir fetcher.show_progress = False @@ -1069,7 +1059,7 @@ class AutomaticNewsRecipe(BasicNewsRecipe): keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])] - def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds): + def fetch_embedded_article(self, article, dir, f, a, num_of_feeds): if self.use_embedded_content: self.web2disk_options.keep_only_tags = [] - return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds) + return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds) diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 51a4554a50..2ae705e01a 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -7,18 +7,19 @@ __copyright__ = '2008, Kovid Goyal ' Fetch a webpage and its links recursively. The webpages are saved to disk in UTF-8 encoding with any charset declarations removed. 
''' -import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback +import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback from urllib import url2pathname, quote from threading import RLock from httplib import responses from PIL import Image from cStringIO import StringIO -from calibre import setup_cli_handlers, browser, sanitize_file_name, \ +from calibre import browser, sanitize_file_name, \ relpath, unicode_path from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.config import OptionParser +from calibre.utils.logging import Log class FetchError(Exception): pass @@ -28,10 +29,10 @@ class closing(object): def __init__(self, thing): self.thing = thing - + def __enter__(self): return self.thing - + def __exit__(self, *exc_info): try: self.thing.close() @@ -55,47 +56,48 @@ def save_soup(soup, target): for meta in metas: if 'charset' in meta.get('content', '').lower(): meta.replaceWith(nm) - + selfdir = os.path.dirname(target) - + for tag in soup.findAll(['img', 'link', 'a']): for key in ('src', 'href'): path = tag.get(key, None) if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path): tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/')) - + html = unicode(soup) with open(target, 'wb') as f: f.write(html.encode('utf-8')) - + class response(str): - + def __new__(cls, *args): obj = super(response, cls).__new__(cls, *args) obj.newurl = None return obj - + class DummyLock(object): - + def __enter__(self, *args): return self def __exit__(self, *args): pass class RecursiveFetcher(object): - LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in + LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$')) #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in # ( - # + # # ) # ) CSS_IMPORT_PATTERN = 
re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE) default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__ DUMMY_LOCK = DummyLock() - - def __init__(self, options, logger, image_map={}, css_map={}, job_info=None): + + def __init__(self, options, log, image_map={}, css_map={}, job_info=None): self.base_dir = os.path.abspath(os.path.expanduser(options.dir)) if not os.path.exists(self.base_dir): os.makedirs(self.base_dir) + self.log = log self.default_timeout = socket.getdefaulttimeout() socket.setdefaulttimeout(options.timeout) self.verbose = options.verbose @@ -122,19 +124,19 @@ class RecursiveFetcher(object): self.remove_tags_after = getattr(options, 'remove_tags_after', None) self.remove_tags_before = getattr(options, 'remove_tags_before', None) self.keep_only_tags = getattr(options, 'keep_only_tags', []) - self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) + self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) self.postprocess_html_ext= getattr(options, 'postprocess_html', None) self.download_stylesheets = not options.no_stylesheets self.show_progress = True self.failed_links = [] self.job_info = job_info - + def get_soup(self, src): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(self.preprocess_regexps) nmassage += [(re.compile(r'', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage) - + if self.keep_only_tags: body = Tag(soup, 'body') try: @@ -146,7 +148,7 @@ class RecursiveFetcher(object): soup.find('body').replaceWith(body) except AttributeError: # soup has no body element pass - + def remove_beyond(tag, next): while tag is not None and tag.name != 'body': after = getattr(tag, next) @@ -155,27 +157,27 @@ class RecursiveFetcher(object): after.extract() after = ns tag = tag.parent - + if 
self.remove_tags_after is not None: rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after for spec in rt: tag = soup.find(**spec) remove_beyond(tag, 'nextSibling') - + if self.remove_tags_before is not None: tag = soup.find(**self.remove_tags_before) remove_beyond(tag, 'previousSibling') - + for kwds in self.remove_tags: for tag in soup.findAll(**kwds): tag.extract() return self.preprocess_html_ext(soup) - - + + def fetch_url(self, url): data = None - self.log_debug('Fetching %s', url) - delta = time.time() - self.last_fetch_at + self.log.debug('Fetching', url) + delta = time.time() - self.last_fetch_at if delta < self.delay: time.sleep(delta) if re.search(r'\s+', url) is not None: @@ -190,43 +192,43 @@ class RecursiveFetcher(object): raise FetchError, responses[err.code] if getattr(err, 'reason', [0])[0] == 104 or \ getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know - self.log_debug('Temporary error, retrying in 1 second') + self.log.debug('Temporary error, retrying in 1 second') time.sleep(1) with closing(self.browser.open(url)) as f: data = response(f.read()+f.read()) data.newurl = f.geturl() - else: + else: raise err finally: self.last_fetch_at = time.time() return data - + def start_fetch(self, url): soup = BeautifulSoup(u'') - self.log_info('Downloading') + self.log.debug('Downloading') res = self.process_links(soup, url, 0, into_dir='') - self.log_info('%s saved to %s', url, res) + self.log.debug('%s saved to %s'%( url, res)) return res - + def is_link_ok(self, url): for i in self.__class__.LINK_FILTER: if i.search(url): return False return True - + def is_link_wanted(self, url): if self.filter_regexps: for f in self.filter_regexps: if f.search(url): - return False + return False if self.match_regexps: for m in self.match_regexps: if m.search(url): return True return False return True - + def process_stylesheets(self, soup, baseurl): diskpath = 
unicode_path(os.path.join(self.current_dir, 'stylesheets')) if not os.path.exists(diskpath): @@ -243,8 +245,7 @@ class RecursiveFetcher(object): try: data = self.fetch_url(iurl) except Exception, err: - self.log_debug('Could not fetch stylesheet %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch stylesheet %s'% iurl) continue stylepath = os.path.join(diskpath, 'style'+str(c)+'.css') with self.stylemap_lock: @@ -253,7 +254,7 @@ class RecursiveFetcher(object): x.write(data) tag['href'] = stylepath else: - for ns in tag.findAll(text=True): + for ns in tag.findAll(text=True): src = str(ns) m = self.__class__.CSS_IMPORT_PATTERN.search(src) if m: @@ -267,8 +268,7 @@ class RecursiveFetcher(object): try: data = self.fetch_url(iurl) except Exception, err: - self.log_warning('Could not fetch stylesheet %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch stylesheet %s'% iurl) continue c += 1 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css') @@ -277,9 +277,9 @@ class RecursiveFetcher(object): with open(stylepath, 'wb') as x: x.write(data) ns.replaceWith(src.replace(m.group(1), stylepath)) - - - + + + def process_images(self, soup, baseurl): diskpath = unicode_path(os.path.join(self.current_dir, 'images')) if not os.path.exists(diskpath): @@ -291,9 +291,6 @@ class RecursiveFetcher(object): iurl = self.image_url_processor(baseurl, iurl) ext = os.path.splitext(iurl)[1] ext = ext[:5] - #if not ext: - # self.log_debug('Skipping extensionless image %s', iurl) - # continue if not urlparse.urlsplit(iurl).scheme: iurl = urlparse.urljoin(baseurl, iurl, False) with self.imagemap_lock: @@ -303,8 +300,7 @@ class RecursiveFetcher(object): try: data = self.fetch_url(iurl) except Exception, err: - self.log_warning('Could not fetch image %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch image %s'% iurl) continue c += 1 fname = 
sanitize_file_name('img'+str(c)+ext) @@ -322,7 +318,7 @@ class RecursiveFetcher(object): traceback.print_exc() continue - def absurl(self, baseurl, tag, key, filter=True): + def absurl(self, baseurl, tag, key, filter=True): iurl = tag[key] parts = urlparse.urlsplit(iurl) if not parts.netloc and not parts.path: @@ -330,32 +326,32 @@ class RecursiveFetcher(object): if not parts.scheme: iurl = urlparse.urljoin(baseurl, iurl, False) if not self.is_link_ok(iurl): - self.log_debug('Skipping invalid link: %s', iurl) + self.log.debug('Skipping invalid link:', iurl) return None if filter and not self.is_link_wanted(iurl): - self.log_debug('Filtered link: '+iurl) + self.log.debug('Filtered link: '+iurl) return None return iurl - + def normurl(self, url): parts = list(urlparse.urlsplit(url)) parts[4] = '' return urlparse.urlunsplit(parts) - + def localize_link(self, tag, key, path): parts = urlparse.urlsplit(tag[key]) suffix = '#'+parts.fragment if parts.fragment else '' tag[key] = path+suffix - + def process_return_links(self, soup, baseurl): for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')): - iurl = self.absurl(baseurl, tag, 'href') + iurl = self.absurl(baseurl, tag, 'href') if not iurl: continue nurl = self.normurl(iurl) if self.filemap.has_key(nurl): self.localize_link(tag, 'href', self.filemap[nurl]) - + def process_links(self, soup, baseurl, recursion_level, into_dir='links'): res = '' diskpath = os.path.join(self.current_dir, into_dir) @@ -365,7 +361,7 @@ class RecursiveFetcher(object): try: self.current_dir = diskpath tags = list(soup.findAll('a', href=True)) - + for c, tag in enumerate(tags): if self.show_progress: print '.', @@ -395,17 +391,17 @@ class RecursiveFetcher(object): dsrc = dsrc.decode(self.encoding, 'ignore') else: dsrc = xml_to_unicode(dsrc, self.verbose)[0] - + soup = self.get_soup(dsrc) - + base = soup.find('base', href=True) if base is not None: newbaseurl = base['href'] - self.log_debug('Processing images...') + 
self.log.debug('Processing images...') self.process_images(soup, newbaseurl) if self.download_stylesheets: self.process_stylesheets(soup, newbaseurl) - + _fname = basename(iurl) if not isinstance(_fname, unicode): _fname.decode('latin1', 'replace') @@ -416,56 +412,55 @@ class RecursiveFetcher(object): self.downloaded_paths.append(res) self.filemap[nurl] = res if recursion_level < self.max_recursions: - self.log_debug('Processing links...') + self.log.debug('Processing links...') self.process_links(soup, newbaseurl, recursion_level+1) else: - self.process_return_links(soup, newbaseurl) - self.log_debug('Recursion limit reached. Skipping links in %s', iurl) - + self.process_return_links(soup, newbaseurl) + self.log.debug('Recursion limit reached. Skipping links in', iurl) + if callable(self.postprocess_html_ext): - soup = self.postprocess_html_ext(soup, + soup = self.postprocess_html_ext(soup, c==0 and recursion_level==0 and not getattr(self, 'called_first', False), self.job_info) - + if c==0 and recursion_level == 0: self.called_first = True - + save_soup(soup, res) self.localize_link(tag, 'href', res) except Exception, err: self.failed_links.append((iurl, traceback.format_exc())) - self.log_warning('Could not fetch link %s', iurl) - self.log_debug('Error: %s', str(err), exc_info=True) + self.log.exception('Could not fetch link', iurl) finally: self.current_dir = diskpath - self.files += 1 + self.files += 1 finally: self.current_dir = prev_dir if self.show_progress: print return res - + def __del__(self): dt = getattr(self, 'default_timeout', None) if dt is not None: socket.setdefaulttimeout(dt) - + def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')): parser = OptionParser(usage=usage) - parser.add_option('-d', '--base-dir', + parser.add_option('-d', '--base-dir', help=_('Base directory into which URL is saved. 
Default is %default'), default='.', type='string', dest='dir') - parser.add_option('-t', '--timeout', + parser.add_option('-t', '--timeout', help=_('Timeout in seconds to wait for a response from the server. Default: %default s'), default=10.0, type='float', dest='timeout') - parser.add_option('-r', '--max-recursions', default=1, + parser.add_option('-r', '--max-recursions', default=1, help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'), type='int', dest='max_recursions') parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files', help=_('The maximum number of files to download. This only applies to files from tags. Default is %default')) parser.add_option('--delay', default=0, dest='delay', type='int', help=_('Minimum interval in seconds between consecutive fetches. Default is %default s')) - parser.add_option('--encoding', default=None, + parser.add_option('--encoding', default=None, help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.')) parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps', help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. 
By default all links are followed.')) @@ -478,23 +473,21 @@ def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.c return parser -def create_fetcher(options, logger=None, image_map={}): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('web2disk') - setup_cli_handlers(logger, level) - return RecursiveFetcher(options, logger, image_map={}) +def create_fetcher(options, image_map={}, log=None): + if log is None: + log = Log() + return RecursiveFetcher(options, log, image_map={}) def main(args=sys.argv): - parser = option_parser() + parser = option_parser() options, args = parser.parse_args(args) if len(args) != 2: parser.print_help() return 1 - - fetcher = create_fetcher(options) - fetcher.start_fetch(args[1]) - -if __name__ == '__main__': + fetcher = create_fetcher(options) + fetcher.start_fetch(args[1]) + + +if __name__ == '__main__': sys.exit(main()) From c06f894229a15d35cc88c98b52177a76399dcd7f Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Apr 2009 10:31:47 -0400 Subject: [PATCH 085/319] USBMS: Do not check for books in subdirs on devices that do not support subdirs. 
--- src/calibre/devices/usbms/driver.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index 68041a19cd..0a66b78014 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -57,12 +57,17 @@ class USBMS(Device): prefix = self._card_prefix if oncard else self._main_prefix ebook_dir = self.EBOOK_DIR_CARD if oncard else self.EBOOK_DIR_MAIN - # Get all books in all directories under the root ebook_dir directory - for path, dirs, files in os.walk(os.path.join(prefix, ebook_dir)): - # Filter out anything that isn't in the list of supported ebook - # types - for book_type in self.FORMATS: - for filename in fnmatch.filter(files, '*.%s' % (book_type)): + # Get all books in the ebook_dir directory + if self.SUPPORTS_SUB_DIRS: + for path, dirs, files in os.walk(os.path.join(prefix, ebook_dir)): + # Filter out anything that isn't in the list of supported ebook types + for book_type in self.FORMATS: + for filename in fnmatch.filter(files, '*.%s' % (book_type)): + bl.append(self.__class__.book_from_path(os.path.join(path, filename))) + else: + path = os.path.join(prefix, ebook_dir) + for filename in os.listdir(path): + if path_to_ext(filename) in self.FORMATS: bl.append(self.__class__.book_from_path(os.path.join(path, filename))) return bl From 8d774b6e7c6c3cc41f2d41594622fa5bf6ba26c0 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Apr 2009 11:46:55 -0400 Subject: [PATCH 086/319] EPubInput: make convert conform to interface. 
--- src/calibre/ebooks/epub/input.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 4c1cdbfcf5..5c8a5c9d89 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -51,8 +51,7 @@ class EPUBInput(InputFormatPlugin): traceback.print_exc() return False - def convert(self, stream, options, file_ext, parse_cache, log, - accelerators): + def convert(self, stream, options, file_ext, log, accelerators): from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError @@ -72,6 +71,5 @@ class EPUBInput(InputFormatPlugin): if os.path.exists(encfile): if not self.process_encryption(encfile, opf, log): raise DRMError(os.path.basename(path)) - - return opf - + + return os.path.join(os.getcwd(), opf) From dc5299b8a81537cd85422500e34906b26e4275af Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 11 Apr 2009 12:24:52 -0700 Subject: [PATCH 087/319] Port ebook-viewer to the new architecture --- src/calibre/customize/ui.py | 66 +++++++++--------- src/calibre/ebooks/conversion/plumber.py | 11 +-- src/calibre/ebooks/{epub => oeb}/iterator.py | 70 +++++++++++++------- src/calibre/gui2/viewer/main.py | 7 +- src/calibre/web/feeds/news.py | 2 + 5 files changed, 93 insertions(+), 63 deletions(-) rename src/calibre/ebooks/{epub => oeb}/iterator.py (79%) diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index af85ca523d..99c74ce5f0 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -30,7 +30,7 @@ def _config(): c.add_opt('filetype_mapping', default={}, help=_('Mapping for filetype plugins')) c.add_opt('plugin_customization', default={}, help=_('Local plugin customization')) c.add_opt('disabled_plugins', default=set([]), help=_('Disabled plugins')) - + return ConfigProxy(c) config = _config() @@ -45,7 +45,7 @@ class PluginNotFound(ValueError): def 
load_plugin(path_to_zip_file): ''' Load plugin from zip file or raise InvalidPlugin error - + :return: A :class:`Plugin` instance. ''' print 'Loading plugin from', path_to_zip_file @@ -61,9 +61,9 @@ def load_plugin(path_to_zip_file): if x.minimum_calibre_version > version or \ platform not in x.supported_platforms: continue - + return x - + raise InvalidPlugin(_('No valid plugin found in ')+path_to_zip_file) _initialized_plugins = [] @@ -122,8 +122,8 @@ def reread_metadata_plugins(): for ft in plugin.file_types: if not _metadata_writers.has_key(ft): _metadata_writers[ft] = [] - _metadata_writers[ft].append(plugin) - + _metadata_writers[ft].append(plugin) + def metadata_readers(): ans = set([]) for plugins in _metadata_readers.values(): @@ -136,8 +136,8 @@ def metadata_writers(): for plugins in _metadata_writers.values(): for plugin in plugins: ans.add(plugin) - return ans - + return ans + def get_file_type_metadata(stream, ftype): mi = MetaInformation(None, None) ftype = ftype.lower().strip() @@ -163,21 +163,21 @@ def set_file_type_metadata(stream, mi, ftype): plugin.set_metadata(stream, mi, ftype.lower().strip()) break except: - print 'Failed to set metadata for', repr(getattr(mi, 'title', '')) + print 'Failed to set metadata for', repr(getattr(mi, 'title', '')) traceback.print_exc() - - + + def _run_filetype_plugins(path_to_file, ft=None, occasion='preprocess'): - occasion = {'import':_on_import, 'preprocess':_on_preprocess, + occasion = {'import':_on_import, 'preprocess':_on_preprocess, 'postprocess':_on_postprocess}[occasion] customization = config['plugin_customization'] if ft is None: - ft = os.path.splitext(path_to_file)[-1].lower().replace('.', '') + ft = os.path.splitext(path_to_file)[-1].lower().replace('.', '') nfp = path_to_file for plugin in occasion.get(ft, []): if is_disabled(plugin): continue - plugin.site_customization = customization.get(plugin.name, '') + plugin.site_customization = customization.get(plugin.name, '') with plugin: try: nfp = 
plugin.run(path_to_file) @@ -190,13 +190,13 @@ def _run_filetype_plugins(path_to_file, ft=None, occasion='preprocess'): nfp = path_to_file return nfp -run_plugins_on_import = functools.partial(_run_filetype_plugins, +run_plugins_on_import = functools.partial(_run_filetype_plugins, occasion='import') -run_plugins_on_preprocess = functools.partial(_run_filetype_plugins, +run_plugins_on_preprocess = functools.partial(_run_filetype_plugins, occasion='preprocess') -run_plugins_on_postprocess = functools.partial(_run_filetype_plugins, +run_plugins_on_postprocess = functools.partial(_run_filetype_plugins, occasion='postprocess') - + def initialize_plugin(plugin, path_to_zip_file): try: @@ -206,7 +206,7 @@ def initialize_plugin(plugin, path_to_zip_file): tb = traceback.format_exc() raise InvalidPlugin((_('Initialization of plugin %s failed with traceback:') %tb) + '\n'+tb) - + def add_plugin(path_to_zip_file): make_config_dir() @@ -248,18 +248,18 @@ def input_format_plugins(): for plugin in _initialized_plugins: if isinstance(plugin, InputFormatPlugin): yield plugin - + def plugin_for_input_format(fmt): for plugin in input_format_plugins(): if fmt.lower() in plugin.file_types: return plugin def available_input_formats(): - formats = [] + formats = set([]) for plugin in input_format_plugins(): if not is_disabled(plugin): for format in plugin.file_types: - formats.append(format) + formats.add(format) return formats def output_format_plugins(): @@ -273,10 +273,10 @@ def plugin_for_output_format(fmt): return plugin def available_output_formats(): - formats = [] + formats = set([]) for plugin in output_format_plugins(): if not is_disabled(plugin): - formats.append(plugin.file_type) + formats.add(plugin.file_type) return formats def disable_plugin(plugin_or_name): @@ -309,21 +309,21 @@ def initialize_plugins(): except: print 'Failed to initialize plugin...' 
traceback.print_exc() - _initialized_plugins.sort(cmp=lambda x,y:cmp(x.priority, y.priority), reverse=True) + _initialized_plugins.sort(cmp=lambda x,y:cmp(x.priority, y.priority), reverse=True) reread_filetype_plugins() reread_metadata_plugins() - + initialize_plugins() def option_parser(): parser = OptionParser(usage=_('''\ %prog options - + Customize calibre by loading external plugins. ''')) - parser.add_option('-a', '--add-plugin', default=None, + parser.add_option('-a', '--add-plugin', default=None, help=_('Add a plugin by specifying the path to the zip file containing it.')) - parser.add_option('-r', '--remove-plugin', default=None, + parser.add_option('-r', '--remove-plugin', default=None, help=_('Remove a custom plugin by name. Has no effect on builtin plugins')) parser.add_option('--customize-plugin', default=None, help=_('Customize plugin. Specify name of plugin and customization string separated by a comma.')) @@ -377,16 +377,16 @@ def main(args=sys.argv): print for plugin in initialized_plugins(): print fmt%( - plugin.type, plugin.name, - plugin.version, is_disabled(plugin), + plugin.type, plugin.name, + plugin.version, is_disabled(plugin), plugin_customization(plugin) ) print '\t', plugin.description if plugin.is_customizable(): print '\t', plugin.customization_help() print - + return 0 - + if __name__ == '__main__': sys.exit(main()) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 41d5f0abd9..ab30e71ba1 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -179,10 +179,13 @@ OptionRecommendation(name='language', raise ValueError('Input file must have an extension') input_fmt = input_fmt[1:].lower() - output_fmt = os.path.splitext(output)[1] - if not output_fmt: - output_fmt = '.oeb' - output_fmt = output_fmt[1:].lower() + if os.path.exists(output) and os.path.isdir(output): + output_fmt = 'oeb' + else: + output_fmt = os.path.splitext(output)[1] + if 
not output_fmt: + output_fmt = '.oeb' + output_fmt = output_fmt[1:].lower() self.input_plugin = plugin_for_input_format(input_fmt) self.output_plugin = plugin_for_output_format(output_fmt) diff --git a/src/calibre/ebooks/epub/iterator.py b/src/calibre/ebooks/oeb/iterator.py similarity index 79% rename from src/calibre/ebooks/epub/iterator.py rename to src/calibre/ebooks/oeb/iterator.py index 5d47c93ea3..ec0eda908a 100644 --- a/src/calibre/ebooks/epub/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -5,20 +5,20 @@ __copyright__ = '2008 Kovid Goyal ' Iterate over the HTML files in an ebook. Useful for writing viewers. ''' -import re, os, math, copy +import re, os, math from cStringIO import StringIO from PyQt4.Qt import QFontDatabase -from calibre.ebooks.epub.from_any import MAP +from calibre.customize.ui import available_input_formats from calibre.ebooks.epub.from_html import TITLEPAGE -from calibre.ebooks.epub import config -from calibre.ebooks.metadata.opf2 import OPF +from calibre.ebooks.metadata.opf2 import OPF, OPFCreator from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.html_old import create_dir from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.config import DynamicConfig +from calibre.utils.logging import Log +from calibre import CurrentDir def character_count(html): ''' @@ -50,11 +50,28 @@ class SpineItem(unicode): obj.max_page = -1 return obj -def html2opf(path, tdir, opts): - opts = copy.copy(opts) - opts.output = tdir - create_dir(path, opts) - return os.path.join(tdir, 'metadata.opf') +class FakeOpts(object): + verbose = 0 + breadth_first = False + max_levels = 5 + input_encoding = None + +def html2opf(path, tdir, log): + from calibre.ebooks.html.input import get_filelist + from calibre.ebooks.metadata.meta import get_metadata + with CurrentDir(tdir): + fl = get_filelist(path, tdir, FakeOpts(), log) + mi = get_metadata(open(path, 'rb'), 'html') + mi = 
OPFCreator(os.getcwdu(), mi) + mi.guide = None + entries = [(f.path, 'application/xhtml+xml') for f in fl] + mi.create_manifest(entries) + mi.create_spine([f.path for f in fl]) + + mi.render(open('metadata.opf', 'wb')) + opfpath = os.path.abspath('metadata.opf') + + return opfpath def opf2opf(path, tdir, opts): return path @@ -62,24 +79,22 @@ def opf2opf(path, tdir, opts): def is_supported(path): ext = os.path.splitext(path)[1].replace('.', '').lower() ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) - return ext in list(MAP.keys())+['html', 'opf'] + return ext in available_input_formats() class EbookIterator(object): CHARACTERS_PER_PAGE = 1000 - def __init__(self, pathtoebook): + def __init__(self, pathtoebook, log=None): + self.log = log + if log is None: + self.log = Log() pathtoebook = pathtoebook.strip() self.pathtoebook = os.path.abspath(pathtoebook) self.config = DynamicConfig(name='iterator') ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower() ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) - map = dict(MAP) - map['html'] = html2opf - map['opf'] = opf2opf - if ext not in map.keys(): - raise UnsupportedFormatError(ext) - self.to_opf = map[ext] + self.ebook_ext = ext def search(self, text, index): text = text.lower() @@ -115,14 +130,24 @@ class EbookIterator(object): def __enter__(self): self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() - opts = config('').parse() - self.pathtoopf = self.to_opf(self.pathtoebook, self.base, opts) + if self.ebook_ext == 'opf': + self.pathtoopf = self.pathtoebook + elif self.ebook_ext == 'html': + self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log) + else: + from calibre.ebooks.conversion.plumber import Plumber + plumber = Plumber(self.pathtoebook, self.base, self.log) + plumber.setup_options() + self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), + plumber.opts, plumber.input_fmt, self.log, + {}, self.base) + + self.opf = OPF(self.pathtoopf, 
os.path.dirname(self.pathtoopf)) self.spine = [SpineItem(i.path) for i in self.opf.spine] cover = self.opf.cover - if os.path.splitext(self.pathtoebook)[1].lower() in \ - ('.lit', '.mobi', '.prc') and cover: + if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover: cfile = os.path.join(os.path.dirname(self.spine[0]), 'calibre_ei_cover.html') open(cfile, 'wb').write(TITLEPAGE%cover) self.spine[0:0] = [SpineItem(cfile)] @@ -131,7 +156,6 @@ class EbookIterator(object): self.opf.path_to_html_toc not in self.spine: self.spine.append(SpineItem(self.opf.path_to_html_toc)) - sizes = [i.character_count for i in self.spine] self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes] for p, s in zip(self.pages, self.spine): diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index 543d92904b..0b8800035a 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -17,14 +17,14 @@ from calibre.gui2.viewer.bookmarkmanager import BookmarkManager from calibre.gui2.main_window import MainWindow from calibre.gui2 import Application, ORG_NAME, APP_UID, choose_files, \ info_dialog, error_dialog -from calibre.ebooks.epub.iterator import EbookIterator -from calibre.ebooks.epub.from_any import SOURCE_FORMATS +from calibre.ebooks.oeb.iterator import EbookIterator from calibre.ebooks import DRMError from calibre.gui2.dialogs.conversion_error import ConversionErrorDialog from calibre.constants import islinux from calibre.utils.config import Config, StringConfig from calibre.gui2.library import SearchBox from calibre.ebooks.metadata import MetaInformation +from calibre.customize.ui import available_input_formats class TOCItem(QStandardItem): @@ -362,7 +362,8 @@ class EbookViewer(MainWindow, Ui_EbookViewer): def open_ebook(self, checked): files = choose_files(self, 'ebook viewer open dialog', _('Choose ebook'), - [(_('Ebooks'), SOURCE_FORMATS)], all_files=False, + [(_('Ebooks'), available_input_formats())], + 
all_files=False, select_only_single_file=True) if files: self.load_ebook(files[0]) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 71529b79e9..c7a39cbc4b 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -445,6 +445,8 @@ class BasicNewsRecipe(object): :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. ''' self.log = Log() + if options.verbose: + self.log.filter_level = self.log.DEBUG if not isinstance(self.title, unicode): self.title = unicode(self.title, 'utf-8', 'replace') From 316b5670c5030ec61df704c15dbeb1791b49d3af Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Apr 2009 17:38:12 -0400 Subject: [PATCH 088/319] space symbol --- src/calibre/ebooks/htmlsymbols.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/htmlsymbols.py b/src/calibre/ebooks/htmlsymbols.py index fa10873845..d46e4c707a 100644 --- a/src/calibre/ebooks/htmlsymbols.py +++ b/src/calibre/ebooks/htmlsymbols.py @@ -306,5 +306,7 @@ HTML_SYMBOLS = { u'ý' : ['ý', 'ý'], # latin small letter y with acute u'þ' : ['þ', 'þ'], # latin small letter thorn u'ÿ' : ['ÿ', 'ÿ'], # latin small letter y with diaeresis + # More + u' ' : [' '], } From b963cdc58104300189b5bb258b09e7065e0e0639 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Apr 2009 19:32:39 -0400 Subject: [PATCH 089/319] MobiReader only reads part of file when getting metadata --- src/calibre/ebooks/mobi/reader.py | 101 +++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index fa43a7af42..38d8255348 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -157,6 +157,62 @@ class BookHeader(object): self.exth.mi.language = self.language +class MetadataHeader(BookHeader): + def __init__(self, stream, log): 
+ self.stream = stream + + self.ident = self.identity() + self.num_sections = self.section_count() + + if self.num_sections >= 2: + header = self.header() + BookHeader.__init__(self, header, self.ident, None, log) + else: + self.exth = None + + def identity(self): + self.stream.seek(60) + ident = self.stream.read(8).upper() + + if ident not in ['BOOKMOBI', 'TEXTREAD']: + raise MobiError('Unknown book type: %s' % ident) + return ident + + def section_count(self): + self.stream.seek(76) + return struct.unpack('>H', self.stream.read(2))[0] + + def section_offset(self, number): + self.stream.seek(78+number*8) + return struct.unpack('>LBBBB', self.stream.read(8))[0] + + def header(self): + section_headers = [] + + # First section with the metadata + section_headers.append(self.section_offset(0)) + # Second section used to get the lengh of the first + section_headers.append(self.section_offset(1)) + + end_off = section_headers[1] + off = section_headers[0] + + self.stream.seek(off) + return self.stream.read(end_off - off) + + def section_data(self, number): + start = self.section_offset(number) + + if number == self.num_sections -1: + end = os.stat(self.stream.name).st_size + else: + end = self.section_offset(number + 1) + + self.stream.seek(start) + + return self.stream.read(end - start) + + class MobiReader(object): PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') @@ -414,7 +470,7 @@ class MobiReader(object): def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) if mi is None: - mi = MetaInformation(self.title, [_('Unknown')]) + mi = MetaInformation(self.book_header.title, [_('Unknown')]) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) @@ -595,25 +651,26 @@ class MobiReader(object): def 
get_metadata(stream): from calibre.utils.logging import Log log = Log() - mr = MobiReader(stream, log) - if mr.book_header.exth is None: - mi = MetaInformation(mr.name, [_('Unknown')]) - else: - mi = mr.create_opf('dummy.html')[0] - try: - if hasattr(mr.book_header.exth, 'cover_offset'): - cover_index = mr.book_header.first_image_index + \ - mr.book_header.exth.cover_offset - data = mr.sections[int(cover_index)][0] - else: - data = mr.sections[mr.book_header.first_image_index][0] - buf = cStringIO.StringIO(data) - im = PILImage.open(buf) - obuf = cStringIO.StringIO() - im.convert('RGBA').save(obuf, format='JPEG') - mi.cover_data = ('jpg', obuf.getvalue()) - except: - log.exception() + + mi = MetaInformation(stream.name, [_('Unknown')]) + try: + mh = MetadataHeader(stream, log) + + if mh.exth is not None: + if mh.exth.mi is not None: + mi = mh.exth.mi + + if hasattr(mh.exth, 'cover_offset'): + cover_index = mh.first_image_index + mh.exth.cover_offset + data = mh.section_data(int(cover_index)) + else: + data = mh.section_data(mh.first_image_index) + buf = cStringIO.StringIO(data) + im = PILImage.open(buf) + obuf = cStringIO.StringIO() + im.convert('RGBA').save(obuf, format='JPEG') + mi.cover_data = ('jpg', obuf.getvalue()) + except: + log.exception() + return mi - - From eed1f6923191b86f53ed8c489d98f4385384a0e9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Apr 2009 20:22:03 -0400 Subject: [PATCH 090/319] MobiReader read metadata from content with older prc files. 
--- src/calibre/ebooks/mobi/reader.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 38d8255348..9e29ea09b3 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -15,7 +15,8 @@ except ImportError: from lxml import html, etree -from calibre import entity_to_unicode +from calibre import entity_to_unicode, sanitize_file_name +from calibre.ptempfile import TemporaryDirectory from calibre.ebooks import DRMError from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.mobi import MobiError @@ -25,7 +26,6 @@ from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.ebooks.metadata.toc import TOC -from calibre import sanitize_file_name class EXTHHeader(object): @@ -659,6 +659,13 @@ def get_metadata(stream): if mh.exth is not None: if mh.exth.mi is not None: mi = mh.exth.mi + else: + with TemporaryDirectory('_mobi_meta_reader') as tdir: + mr = MobiReader(stream, log) + parse_cache = {} + mr.extract_content(tdir, parse_cache) + if mr.embedded_mi is not None: + mi = mr.embedded_mi if hasattr(mh.exth, 'cover_offset'): cover_index = mh.first_image_index + mh.exth.cover_offset From 632db425a2be1ccb36749bb3f029b75e3cb8a26e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 11 Apr 2009 20:32:00 -0400 Subject: [PATCH 091/319] MobiReader do not include file path in default metadata title. 
--- src/calibre/ebooks/mobi/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 9e29ea09b3..161a6995ba 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -652,7 +652,7 @@ def get_metadata(stream): from calibre.utils.logging import Log log = Log() - mi = MetaInformation(stream.name, [_('Unknown')]) + mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')]) try: mh = MetadataHeader(stream, log) From a48dd172db1e35f5cf54628051e8e7d44852217d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Apr 2009 10:03:46 -0400 Subject: [PATCH 092/319] PDFMetadataWriter working --- src/calibre/customize/builtins.py | 11 +++++++++++ src/calibre/ebooks/metadata/pdf.py | 31 +++++++++++++++++++----------- src/calibre/ebooks/oeb/iterator.py | 1 + 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 484d46dc36..a9fc342059 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -262,6 +262,17 @@ class MOBIMetadataWriter(MetadataWriterPlugin): def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.mobi import set_metadata set_metadata(stream, mi) + +class PDFMetadataWriter(MetadataWriterPlugin): + + name = 'Set PDF metadata' + file_types = set(['pdf']) + description = _('Set metadata in %s files') % 'PDF' + author = 'John Schember' + + def set_metadata(self, stream, mi, type): + from calibre.ebooks.metadata.pdf import set_metadata + set_metadata(stream, mi) from calibre.ebooks.epub.input import EPUBInput diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 6b94b07275..06a02939ba 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -5,7 +5,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = 
'2008, Kovid Goyal ' -import sys, os, re, StringIO +import sys, os, StringIO from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory @@ -52,18 +52,27 @@ def get_metadata(stream, extract_cover=True): def set_metadata(stream, mi): stream.seek(0) - raw = stream.read() - if mi.title: - tit = mi.title.encode('utf-8') if isinstance(mi.title, unicode) else mi.title - raw = re.compile(r'<<.*?/Title\((.+?)\)', re.DOTALL).sub(lambda m: m.group().replace(m.group(1), tit), raw) - if mi.authors: - au = authors_to_string(mi.authors) - if isinstance(au, unicode): - au = au.encode('utf-8') - raw = re.compile(r'<<.*?/Author\((.+?)\)', re.DOTALL).sub(lambda m: m.group().replace(m.group(1), au), raw) + + # Use a StringIO object for the pdf because we will want to over + # write it later and if we are working on the stream directly it + # could cause some issues. + raw = StringIO.StringIO(stream.read()) + orig_pdf = PdfFileReader(raw) + + title = mi.title if mi.title else orig_pdf.documentInfo.title + author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author + + out_pdf = PdfFileWriter(title=title, author=author) + for page in orig_pdf.pages: + out_pdf.addPage(page) + + out_str = StringIO.StringIO() + out_pdf.write(out_str) + stream.seek(0) stream.truncate() - stream.write(raw) + out_str.seek(0) + stream.write(out_str.read()) stream.seek(0) def get_cover(stream): diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index ec0eda908a..88fffc604a 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -1,3 +1,4 @@ +from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008 Kovid Goyal ' From 1bd3829c6b4442283541c31e3630c0ca16f643da Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 12 Apr 2009 12:48:16 -0400 Subject: [PATCH 093/319] Don't allow editing of config category names. Fix import. 
--- src/calibre/gui2/dialogs/config.py | 2 +- src/calibre/gui2/dialogs/config.ui | 794 +++++++++++++++-------------- 2 files changed, 399 insertions(+), 397 deletions(-) diff --git a/src/calibre/gui2/dialogs/config.py b/src/calibre/gui2/dialogs/config.py index 831d44251e..1b2a2b8702 100644 --- a/src/calibre/gui2/dialogs/config.py +++ b/src/calibre/gui2/dialogs/config.py @@ -18,7 +18,7 @@ from calibre.utils.config import prefs from calibre.gui2.widgets import FilenamePattern from calibre.gui2.library import BooksModel from calibre.ebooks import BOOK_EXTENSIONS -from calibre.ebooks.epub.iterator import is_supported +from calibre.ebooks.oeb.iterator import is_supported from calibre.library import server_config from calibre.customize.ui import initialized_plugins, is_disabled, enable_plugin, \ disable_plugin, customize_plugin, \ diff --git a/src/calibre/gui2/dialogs/config.ui b/src/calibre/gui2/dialogs/config.ui index 9afcac8914..ac432c52c0 100644 --- a/src/calibre/gui2/dialogs/config.ui +++ b/src/calibre/gui2/dialogs/config.ui @@ -1,9 +1,8 @@ - - + Kovid Goyal Dialog - - + + 0 0 @@ -11,108 +10,111 @@ 557 - + Configuration - - + + :/images/config.svg:/images/config.svg - - - + + + - - - + + + 1 0 - + 75 true - + + QAbstractItemView::NoEditTriggers + + true - + false - + 48 48 - + QAbstractItemView::ScrollPerItem - + QAbstractItemView::ScrollPerPixel - + QListView::TopToBottom - + 20 - + QListView::ListMode - - - + + + 100 0 - + 0 - - + + - + - - + + 16777215 70 - + &Location of ebooks (The ebooks are stored in folders sorted by author and metadata is stored in the file metadata.db) - + true - + location - + - + - - + + Browse for the new database location - + ... - - + + :/images/mimetypes/dir.svg:/images/mimetypes/dir.svg @@ -122,107 +124,107 @@ - - + + Show notification when &new version is available - - + + If you disable this setting, metadata is guessed from the filename instead. This can be configured in the Advanced section. 
- + Read &metadata from files - + true - - - - + + + + Format for &single file save: - + single_format - - + + - - - + + + Default network &timeout: - + timeout - - - + + + Set the default timeout for network fetches (i.e. anytime we go out to the internet to get information) - + seconds - + 2 - + 120 - + 5 - - + + - - - + + + Choose &language (requires restart): - + language - - + + - + Normal - + High - + Low - - - + + + Job &priority: - + priority @@ -230,19 +232,19 @@ - - + + Frequently used directories - - - + + + - - + + true - + 22 22 @@ -251,13 +253,13 @@ - + - + Qt::Vertical - + 20 40 @@ -266,25 +268,25 @@ - - + + Add a directory to the frequently used directories list - + ... - - + + :/images/plus.svg:/images/plus.svg - + Qt::Vertical - + 20 40 @@ -293,25 +295,25 @@ - - + + Remove a directory from the frequently used directories list - + ... - - + + :/images/list_remove.svg:/images/list_remove.svg - + Qt::Vertical - + 20 40 @@ -328,111 +330,111 @@ - - + + - - + + Use &Roman numerals for series number - + true - - + + Enable system &tray icon (needs restart) - - + + Show &notifications in system tray - - + + Show cover &browser in a separate window (needs restart) - - + + Automatically send downloaded &news to ebook reader - - + + &Delete news from library when it is automatically sent to reader - + - - + + &Number of covers to show in browse mode (needs restart): - + cover_browse - + - - + + Toolbar - - - + + + - + Large - + Medium - + Small - - - + + + &Button size in toolbar - + toolbar_button_size - - - + + + Show &text in toolbar buttons - + true @@ -441,44 +443,44 @@ - + - - + + Select visible &columns in library view - + - + - - + + true - + QAbstractItemView::SelectRows - + - - + + ... - - + + :/images/arrow-up.svg:/images/arrow-up.svg - - + + Qt::Vertical - + 20 40 @@ -487,12 +489,12 @@ - - + + ... 
- - + + :/images/arrow-down.svg:/images/arrow-down.svg @@ -505,17 +507,17 @@ - - + + Use internal &viewer for: - - - - + + + + true - + QAbstractItemView::NoSelection @@ -536,99 +538,99 @@ - - - - - + + + + + calibre can send your books to you (or your reader) by email - + true - - + + - - + + Send email &from: - + email_from - - - <p>This is what will be present in the From: field of emails sent by calibre.<br> Set it to your email address + + + <p>This is what will be present in the From: field of emails sent by calibre.<br> Set it to your email address - - + + - - + + QAbstractItemView::SingleSelection - + QAbstractItemView::SelectRows - + - - + + Add an email address to which to send books - + &Add email - - + + :/images/plus.svg:/images/plus.svg - + 24 24 - + Qt::ToolButtonTextUnderIcon - - + + Make &default - - + + &Remove email - - + + :/images/minus.svg:/images/minus.svg - + 24 24 - + Qt::ToolButtonTextUnderIcon @@ -637,155 +639,155 @@ - - - - <p>A mail server is useful if the service you are sending mail to only accepts email from well know mail services. + + + + <p>A mail server is useful if the service you are sending mail to only accepts email from well know mail services. - + Mail &Server - - - - - calibre can <b>optionally</b> use a server to send mail + + + + + calibre can <b>optionally</b> use a server to send mail - + true - - - + + + &Hostname: - + relay_host - - - + + + The hostname of your mail server. For e.g. smtp.gmail.com - - + + - - + + &Port: - + relay_port - - + + The port your mail server listens for connections on. The default is 25 - + 1 - + 65555 - + 25 - - - + + + &Username: - + relay_username - - - + + + Your username on the mail server - - - + + + &Password: - + relay_password - - - + + + Your password on the mail server - + QLineEdit::Password - - - + + + &Show - - - + + + &Encryption: - + relay_tls - - - + + + Use TLS encryption when connecting to the mail server. This is the most common. 
- + &TLS - + true - - - + + + Use SSL encryption when connecting to the mail server. - + &SSL - - - + + + Qt::Horizontal - + 40 20 @@ -796,31 +798,31 @@ - - + + - - + + Use Gmail - - + + :/images/gmail_logo.png:/images/gmail_logo.png - + 48 48 - + Qt::ToolButtonTextUnderIcon - - + + &Test email @@ -829,16 +831,16 @@ - - + + - + - + Qt::Horizontal - + 40 20 @@ -847,21 +849,21 @@ - - + + Free unused diskspace from the database - + &Compact database - + Qt::Horizontal - + 40 20 @@ -872,17 +874,17 @@ - - + + &Metadata from file name - + - + Qt::Vertical - + 20 40 @@ -895,96 +897,96 @@ - - + + - - + + calibre contains a network server that allows you to access your book collection using a browser from anywhere in the world. Any changes to the settings will only take effect after a server restart. - + true - - - - + + + + Server &port: - + port - - - + + + 1025 - + 16000 - + 8080 - - - + + + &Username: - + username - - + + - - - + + + &Password: - + password - - - + + + If you leave the password blank, anyone will be able to access your book collection using the web interface. - - - + + + &Show password - - - + + + The maximum size (widthxheight) for displayed covers. Larger covers are resized. - + - - - + + + Max. &cover size: - + max_cover_size @@ -992,27 +994,27 @@ - + - - + + &Start Server - - + + St&op Server - - + + Qt::Horizontal - + 40 20 @@ -1021,8 +1023,8 @@ - - + + &Test Server @@ -1030,25 +1032,25 @@ - - + + Run server &automatically on startup - - + + View &server logs - - + + Qt::Vertical - + 20 40 @@ -1057,21 +1059,21 @@ - - + + If you want to use the content server to access your ebook collection on your iphone with Stanza, you will need to add the URL http://myhostname:8080/stanza as a new catalog in the stanza reader on your iphone. Here myhostname should be the fully qualified hostname or the IP address of this computer. 
- + true - - + + Qt::Vertical - + 20 40 @@ -1081,53 +1083,53 @@ - - + + - - + + Here you can customize the behavior of Calibre by controlling what plugins it uses. - + true - - + + 32 32 - + true - + true - + - - + + Enable/&Disable plugin - - + + &Customize plugin - - + + &Remove plugin @@ -1135,33 +1137,33 @@ - - + + Add new plugin - + - + - - + + Plugin &file: - + plugin_path - + - - + + ... - - + + :/images/document_open.svg:/images/document_open.svg @@ -1169,13 +1171,13 @@ - + - - + + Qt::Horizontal - + 40 20 @@ -1184,8 +1186,8 @@ - - + + &Add @@ -1201,12 +1203,12 @@ - - - + + + Qt::Horizontal - + QDialogButtonBox::Cancel|QDialogButtonBox::Ok @@ -1214,7 +1216,7 @@ - + @@ -1223,11 +1225,11 @@ Dialog accept() - + 239 558 - + 157 274 @@ -1239,11 +1241,11 @@ Dialog reject() - + 307 558 - + 286 274 From a423691dd52475d8de2b0227b357fcf3fdbb68f1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 12 Apr 2009 12:09:38 -0700 Subject: [PATCH 094/319] Initial (untested) port of splitting code to OEBBook --- src/calibre/ebooks/oeb/base.py | 32 +- src/calibre/ebooks/oeb/iterator.py | 1 - src/calibre/ebooks/oeb/output.py | 12 +- .../ebooks/{epub => oeb/transforms}/split.py | 556 ++++++++---------- 4 files changed, 280 insertions(+), 321 deletions(-) rename src/calibre/ebooks/{epub => oeb/transforms}/split.py (51%) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 76a6648e8d..ed7981df4f 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -272,11 +272,26 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def xml2str(root, pretty_print=False): - return etree.tostring(root, encoding='utf-8', xml_declaration=True, +def _prepare_xml_for_serialization(root): + root.set('xmlns', XHTML_NS) + root.set('{%s}xlink'%XHTML_NS, XLINK_NS) + for x in root.iter(): + if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg': + x.set('xmlns', SVG_NS) + +def xml2str(root, 
pretty_print=False, strip_comments=False): + _prepare_xml_for_serialization(root) + ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, pretty_print=pretty_print) + if strip_comments: + ans = re.compile(r'', re.DOTALL).sub('', ans) + + return ans + + def xml2unicode(root, pretty_print=False): + _prepare_xml_for_serialization(root) return etree.tostring(root, pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) @@ -826,6 +841,11 @@ class Manifest(object): return xml2str(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data.encode('utf-8') + if hasattr(data, 'cssText'): + data = data.cssText + if isinstance(data, unicode): + data = data.encode('utf-8') + return data return str(data) def __unicode__(self): @@ -834,6 +854,8 @@ class Manifest(object): return xml2unicode(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data + if hasattr(data, 'cssText'): + return data.cssText return unicode(data) def __eq__(self, other): @@ -1044,6 +1066,12 @@ class Spine(object): self.items[i].spine_position = i item.spine_position = None + def index(self, item): + for i, x in enumerate(self): + if item == x: + return i + return -1 + def __iter__(self): for item in self.items: yield item diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index ec0eda908a..8672d42e2b 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -162,7 +162,6 @@ class EbookIterator(object): s.pages = p start = 1 - for s in self.spine: s.start_page = start start += s.pages diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index ea986f49fa..480ca3776e 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -22,7 +22,6 @@ class OEBOutput(OutputFormatPlugin): if not os.path.exists(output_path): os.makedirs(output_path) from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME - 
from calibre.ebooks.html import tostring as html_tostring with CurrentDir(output_path): results = oeb_book.to_opf2(page_map=True) for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME): @@ -38,16 +37,7 @@ class OEBOutput(OutputFormatPlugin): dir = os.path.dirname(path) if not os.path.exists(dir): os.makedirs(dir) - raw = item.data - if not isinstance(raw, basestring): - if hasattr(raw, 'cssText'): - raw = raw.cssText - else: - raw = html_tostring(raw, - pretty_print=opts.pretty_print) - if isinstance(raw, unicode): - raw = raw.encode('utf-8') with open(path, 'wb') as f: - f.write(raw) + f.write(str(item)) diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/oeb/transforms/split.py similarity index 51% rename from src/calibre/ebooks/epub/split.py rename to src/calibre/ebooks/oeb/transforms/split.py index 8ff62a1c4b..20205e9c6d 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -4,21 +4,25 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' ''' -Split the flows in an epub file to conform to size limitations. +Splitting of the XHTML flows. Splitting can happen on page boundaries or can be +forces at "likely" locations to conform to size limitations. This transform +assumes a prior call to the flatcss transform. 
''' -import os, math, functools, collections, re, copy, sys +import os, math, functools, collections, re, copy from lxml.etree import XPath as _XPath from lxml import etree, html from lxml.cssselect import CSSSelector -from calibre.ebooks.metadata.opf2 import OPF +from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \ + rewrite_links from calibre.ebooks.epub import tostring, rules -from calibre import CurrentDir -XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'}) -content = functools.partial(os.path.join, 'content') +NAMESPACES = dict(XPNSMAP) +NAMESPACES['re'] = 'http://exslt.org/regular-expressions' + +XPath = functools.partial(_XPath, namespaces=NAMESPACES) SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' @@ -27,149 +31,166 @@ class SplitError(ValueError): def __init__(self, path, root): size = len(tostring(root))/1024. - ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% - (os.path.basename(path), size)) + ValueError.__init__(self, + _('Could not find reasonable point at which to split: ' + '%s Sub-tree size: %d KB')% + (path, size)) + +class Split(object): + + def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None, + max_flow_size=0): + self.split_on_page_breaks = split_on_page_breaks + self.page_breaks_xpath = page_breaks_xpath + self.max_flow_size = max_flow_size + if self.page_breaks_xpath is not None: + self.page_breaks_xpath = XPath(self.page_breaks_xpath) + + def __call__(self, oeb, context): + self.oeb = oeb + self.log = oeb.log + self.map = {} + self.page_break_selectors = None + for item in self.oeb.manifest.items: + if etree.iselement(item.data): + self.split_item(item) + + self.fix_links() + + def split_item(self, item): + if self.split_on_page_breaks: + if self.page_breaks_xpath is None: + page_breaks, page_break_ids = self.find_page_breaks(item) + else: + page_breaks, page_break_ids = self.page_breaks_xpath(item.data) + + 
splitter = FlowSplitter(item, page_breaks, page_break_ids, + self.max_flow_size, self.oeb) + if splitter.was_split: + self.map[item.href] = dict(splitter.anchor_map) + + def find_page_breaks(self, item): + if self.page_break_selectors is None: + self.page_break_selectors = set([]) + stylesheets = [x.data for x in self.oeb.manifest if x.media_type in + OEB_STYLES] + page_break_selectors = set([]) + for rule in rules(stylesheets): + before = getattr(rule.style.getPropertyCSSValue( + 'page-break-before'), 'cssText', '').strip().lower() + after = getattr(rule.style.getPropertyCSSValue( + 'page-break-after'), 'cssText', '').strip().lower() + try: + if before and before != 'avoid': + page_break_selectors.add((CSSSelector(rule.selectorText), + True)) + except: + pass + try: + if after and after != 'avoid': + page_break_selectors.add((CSSSelector(rule.selectorText), + False)) + except: + pass + + page_breaks = set([]) + for selector, before in page_break_selectors: + for elem in selector(item.data): + elem.pb_before = before + page_breaks.add(elem) + + for i, elem in enumerate(item.data.iter()): + elem.pb_order = i + + page_breaks = list(page_breaks) + page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order)) + page_break_ids, page_breaks_ = [], [] + for i, x in enumerate(page_breaks): + x.set('id', x.get('id', 'calibre_pb_%d'%i)) + id = x.get('id') + page_breaks_.append((XPath('//*[@id="%s"]'%id), x.pb_before)) + page_break_ids.append(id) + + return page_breaks_, page_break_ids + + def fix_links(self, opf): + ''' + Fix references to the split files in other content files. 
+ ''' + for item in self.oeb.manifest: + if etree.iselement(item.data): + self.current_item = item + rewrite_links(item.data, self.rewrite_links) + + def rewrite_links(self, url): + href, frag = urldefrag(url) + href = self.current_item.abshref(href) + if href in self.map: + anchor_map = self.map[href] + nhref = anchor_map[frag if frag else None] + if frag: + nhref = '#'.joinn(href, frag) + return nhref + return url -class Splitter(object): +class FlowSplitter(object): - def __init__(self, path, opts, stylesheet_map, opf): - self.setup_cli_handler(opts.verbose) - self.path = path - self.always_remove = not opts.preserve_tag_structure or \ - os.stat(content(path)).st_size > 5*opts.profile.flow_size - self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html') - self.opts = opts - self.orig_size = os.stat(content(path)).st_size - self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) - root = html.fromstring(open(content(path)).read()) + def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb): + self.item = item + self.oeb = oeb + self.log = oeb.log + self.page_breaks = page_breaks + self.page_break_ids = page_break_ids + self.max_flow_size = max_flow_size + self.base = item.abshref(item.href) - self.page_breaks, self.trees = [], [] - self.split_size = 0 + base, ext = os.path.splitext(self.base) + self.base = base.replace('%', '%%')+'_split_%d'+ext - # Split on page breaks + self.trees = [self.item.data] self.splitting_on_page_breaks = True - if not opts.dont_split_on_page_breaks: - self.log_info('\tSplitting on page breaks...') - if self.path in stylesheet_map: - self.find_page_breaks(stylesheet_map[self.path], root) - self.split_on_page_breaks(root.getroottree()) - trees = list(self.trees) - else: - self.trees = [root.getroottree()] - trees = list(self.trees) - - # Split any remaining over-sized trees + if self.page_breaks: + self.split_on_page_breaks(self.item.data) self.splitting_on_page_breaks = False - if 
self.opts.profile.flow_size < sys.maxint: + + if self.max_flow_size > 0: lt_found = False - self.log_info('\tLooking for large trees...') - for i, tree in enumerate(list(trees)): + self.log('\tLooking for large trees...') + trees = list(self.trees) + for i, tree in enumerate(list(self.trees)): self.trees = [] size = len(tostring(tree.getroot())) if size > self.opts.profile.flow_size: lt_found = True - try: - self.split_to_size(tree) - except (SplitError, RuntimeError): # Splitting fails - if not self.always_remove: - self.always_remove = True - self.split_to_size(tree) - else: - raise + self.split_to_size(tree) trees[i:i+1] = list(self.trees) if not lt_found: self.log_info('\tNo large trees found') + self.trees = trees - self.trees = trees self.was_split = len(self.trees) > 1 - if self.was_split: - self.commit() - self.log_info('\t\tSplit into %d parts.', len(self.trees)) - if self.opts.verbose: - for f in self.files: - self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.) - self.fix_opf(opf) + self.commit() - self.trees = None + def split_on_page_breaks(self, orig_tree): + ordered_ids = [] + for elem in orig_tree.xpath('//*[@id]'): + id = elem.get('id') + if id in self.page_break_ids: + ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)]) - - def split_text(self, text, root, size): - self.log_debug('\t\t\tSplitting text of length: %d'%len(text)) - rest = text.replace('\r', '') - parts = re.split('\n\n', rest) - self.log_debug('\t\t\t\tFound %d parts'%len(parts)) - if max(map(len, parts)) > size: - raise SplitError('Cannot split as file contains a

     tag with a very large paragraph', root)
    -        ans = []
    -        buf = ''
    -        for part in parts:
    -            if len(buf) + len(part) < size:
    -                buf += '\n\n'+part
    -            else:
    -                ans.append(buf)
    -                buf = part
    -        return ans
    -
    -
    -    def split_to_size(self, tree):
    -        self.log_debug('\t\tSplitting...')
    -        root = tree.getroot()
    -        # Split large <pre> tags
    -        for pre in list(root.xpath('//pre')):
    -            text = u''.join(pre.xpath('descendant::text()'))
    -            pre.text = text
    -            for child in list(pre.iterchildren()):
    -                pre.remove(child)
    -            if len(pre.text) > self.opts.profile.flow_size*0.5:
    -                frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
    -                new_pres = []
    -                for frag in frags:
    -                    pre2 = copy.copy(pre)
    -                    pre2.text = frag
    -                    pre2.tail = u''
    -                    new_pres.append(pre2)
    -                new_pres[-1].tail = pre.tail
    -                p = pre.getparent()
    -                i = p.index(pre)
    -                p[i:i+1] = new_pres
    -
    -        split_point, before = self.find_split_point(root)
    -        if split_point is None or self.split_size > 6*self.orig_size:
    -            if not self.always_remove:
    -                self.log_warn(_('\t\tToo much markup. Re-splitting without '
    -                                'structure preservation. This may cause '
    -                                'incorrect rendering.'))
    -            raise SplitError(self.path, root)
    -
    -        for t in self.do_split(tree, split_point, before):
    -            r = t.getroot()
    -            if self.is_page_empty(r):
    -                continue
    -            size = len(tostring(r))
    -            if size <= self.opts.profile.flow_size:
    -                self.trees.append(t)
    -                #print tostring(t.getroot(), pretty_print=True)
    -                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
    -                               len(self.trees), size/1024.)
    -                self.split_size += size
    -            else:
    -                self.split_to_size(t)
    -
    -    def is_page_empty(self, root):
    -        body = root.find('body')
    -        if body is None:
    -            return False
    -        txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
    -        if len(txt) > 4:
    -            #if len(txt) < 100:
    -            #    print 1111111, html.tostring(body, method='html', encoding=unicode)
    -            return False
    -        for img in root.xpath('//img'):
    -            if img.get('style', '') != 'display:none':
    -                return False
    -        return True
    +        self.trees = []
    +        tree = orig_tree
    +        for pattern, before in ordered_ids:
    +            self.log.debug('\t\tSplitting on page-break')
    +            elem = pattern(tree)
    +            if elem:
    +                before, after = self.do_split(tree, elem[0], before)
    +                self.trees.append(before)
    +                tree = after
    +        self.trees.append(tree)
    +        self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
     
         def do_split(self, tree, split_point, before):
             '''
    @@ -190,7 +211,7 @@ class Splitter(object):
             split_point2 = root2.xpath(path)[0]
     
             def nix_element(elem, top=True):
    -            if self.always_remove:
    +            if True:
                     parent = elem.getparent()
                     index = parent.index(elem)
                     if top:
    @@ -198,7 +219,6 @@ class Splitter(object):
                     else:
                         index = parent.index(elem)
                         parent[index:index+1] = list(elem.iterchildren())
    -
                 else:
                     elem.text = u''
                     elem.tail = u''
    @@ -241,67 +261,76 @@ class Splitter(object):
     
             return tree, tree2
     
    +    def is_page_empty(self, root):
    +        body = root.find('body')
    +        if body is None:
    +            return False
    +        txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
    +        if len(txt) > 4:
    +            return False
    +        for img in root.xpath('//img'):
    +            if img.get('style', '') != 'display:none':
    +                return False
    +        return True
     
    -    def split_on_page_breaks(self, orig_tree):
    -        ordered_ids = []
    -        for elem in orig_tree.xpath('//*[@id]'):
    -            id = elem.get('id')
    -            if id in self.page_break_ids:
    -                ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
    -
    -        self.trees = []
    -        tree = orig_tree
    -        for pattern, before in ordered_ids:
    -            self.log_info('\t\tSplitting on page-break')
    -            elem = pattern(tree)
    -            if elem:
    -                before, after = self.do_split(tree, elem[0], before)
    -                self.trees.append(before)
    -                tree = after
    -        self.trees.append(tree)
    -        self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
    +    def split_text(self, text, root, size):
    +        self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
    +        rest = text.replace('\r', '')
    +        parts = re.split('\n\n', rest)
    +        self.log.debug('\t\t\t\tFound %d parts'%len(parts))
    +        if max(map(len, parts)) > size:
    +            raise SplitError('Cannot split as file contains a <pre> tag '
    +                'with a very large paragraph', root)
    +        ans = []
    +        buf = ''
    +        for part in parts:
    +            if len(buf) + len(part) < size:
    +                buf += '\n\n'+part
    +            else:
    +                ans.append(buf)
    +                buf = part
    +        return ans
     
     
    +    def split_to_size(self, tree):
    +        self.log.debug('\t\tSplitting...')
    +        root = tree.getroot()
    +        # Split large <pre> tags
    +        for pre in list(root.xpath('//pre')):
    +            text = u''.join(pre.xpath('descendant::text()'))
    +            pre.text = text
    +            for child in list(pre.iterchildren()):
    +                pre.remove(child)
    +            if len(pre.text) > self.max_flow_size*0.5:
    +                frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
    +                new_pres = []
    +                for frag in frags:
    +                    pre2 = copy.copy(pre)
    +                    pre2.text = frag
    +                    pre2.tail = u''
    +                    new_pres.append(pre2)
    +                new_pres[-1].tail = pre.tail
    +                p = pre.getparent()
    +                i = p.index(pre)
    +                p[i:i+1] = new_pres
     
    -    def find_page_breaks(self, stylesheets, root):
    -        '''
    -        Find all elements that have either page-break-before or page-break-after set.
    -        Populates `self.page_breaks` with id based XPath selectors (for elements that don't
    -        have ids, an id is created).
    -        '''
    -        page_break_selectors = set([])
    -        for rule in rules(stylesheets):
    -            before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
    -            after  = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
    -            try:
    -                if before and before != 'avoid':
    -                    page_break_selectors.add((CSSSelector(rule.selectorText), True))
    -            except:
    -                pass
    -            try:
    -                if after and after != 'avoid':
    -                    page_break_selectors.add((CSSSelector(rule.selectorText), False))
    -            except:
    -                pass
    -
    -        page_breaks = set([])
    -        for selector, before in page_break_selectors:
    -            for elem in selector(root):
    -                elem.pb_before = before
    -                page_breaks.add(elem)
    -
    -        for i, elem in enumerate(root.iter()):
    -            elem.pb_order = i
    -
    -        page_breaks = list(page_breaks)
    -        page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
    -        self.page_break_ids = []
    -        for i, x in enumerate(page_breaks):
    -            x.set('id', x.get('id', 'calibre_pb_%d'%i))
    -            id = x.get('id')
    -            self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
    -            self.page_break_ids.append(id)
    +        split_point, before = self.find_split_point(root)
    +        if split_point is None:
    +            raise SplitError(self.item.href, root)
     
    +        for t in self.do_split(tree, split_point, before):
    +            r = t.getroot()
    +            if self.is_page_empty(r):
    +                continue
    +            size = len(tostring(r))
    +            if size <= self.max_flow_size:
    +                self.trees.append(t)
    +                #print tostring(t.getroot(), pretty_print=True)
    +                self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)',
    +                               len(self.trees), size/1024.)
    +                self.split_size += size
    +            else:
    +                self.split_to_size(t)
     
         def find_split_point(self, root):
             '''
    @@ -336,8 +365,7 @@ class Splitter(object):
                          '//br',
                          '//li',
                          ):
    -            elems = root.xpath(path,
    -                    namespaces={'re':'http://exslt.org/regular-expressions'})
    +            elems = root.xpath(path, namespaces=NAMESPACES)
                 elem = pick_elem(elems)
                 if elem is not None:
                     try:
    @@ -355,6 +383,8 @@ class Splitter(object):
             all anchors in the original tree. Internal links are re-directed. The
             original file is deleted and the split files are saved.
             '''
    +        if not self.was_split:
    +            return
             self.anchor_map = collections.defaultdict(lambda :self.base%0)
             self.files = []
     
    @@ -368,134 +398,46 @@ class Splitter(object):
                     elem.attrib.pop(SPLIT_ATTR, None)
                     elem.attrib.pop(SPLIT_POINT_ATTR, '0')
     
    -        for current, tree in zip(self.files, self.trees):
    -            for a in tree.getroot().xpath('//a[@href]'):
    +        spine_pos = self.item.spine_pos
    +        for current, tree in zip(map(reversed, (self.files, self.trees))):
    +            for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
                     href = a.get('href').strip()
                     if href.startswith('#'):
                         anchor = href[1:]
                         file = self.anchor_map[anchor]
                         if file != current:
                             a.set('href', file+href)
    -            open(content(current), 'wb').\
    -                write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
     
    -        os.remove(content(self.path))
    +            new_id = self.oeb.manifest.generate(id=self.item.id)[0]
    +            new_item = self.oeb.manifest.add(new_id, current,
    +                    self.item.media_type, data=tree.getroot())
    +            self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
    +
    +        if self.oeb.guide:
    +            for ref in self.oeb.guide:
    +                href, frag = urldefrag(ref.href)
    +                if href == self.item.href:
    +                    nhref = self.anchor_map[frag if frag else None]
    +                    if frag:
    +                        nhref = '#'.join(nhref, frag)
    +                    ref.href = nhref
    +
    +        def fix_toc_entry(toc):
    +            if toc.href:
    +                href, frag = urldefrag(toc.href)
    +                if href == self.item.href:
    +                    nhref = self.anchor_map[frag if frag else None]
    +                    if frag:
    +                        nhref = '#'.join(nhref, frag)
    +                    toc.href = nhref
    +            for x in toc:
    +                fix_toc_entry(x)
     
     
    -    def fix_opf(self, opf):
    -        '''
    -        Fix references to the split file in the OPF.
    -        '''
    -        items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
    -        new_items = [('content/'+f, None) for f in self.files]
    -        id_map = {}
    -        for item in items:
    -            id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
    +        if self.oeb.toc:
    +            fix_toc_entry(self.oeb.toc)
     
    -        for id in id_map.keys():
    -            opf.replace_spine_items_by_idref(id, id_map[id])
    -
    -        for ref in opf.iterguide():
    -            href = ref.get('href', '')
    -            if href.startswith('content/'+self.path):
    -                href = href.split('#')
    -                frag = None
    -                if len(href) > 1:
    -                    frag = href[1]
    -                if frag not in self.anchor_map:
    -                    self.log_warning('\t\tUnable to re-map OPF link', href)
    -                    continue
    -                new_file = self.anchor_map[frag]
    -                ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
    +        self.oeb.manifest.remove(self.item)
     
     
     
    -def fix_content_links(html_files, changes, opts):
    -    split_files = [f.path for f in changes]
    -    anchor_maps = [f.anchor_map for f in changes]
    -    files = list(html_files)
    -    for j, f in enumerate(split_files):
    -        try:
    -            i = files.index(f)
    -            files[i:i+1] = changes[j].files
    -        except ValueError:
    -            continue
    -
    -    for htmlfile in files:
    -        changed = False
    -        root = html.fromstring(open(content(htmlfile), 'rb').read())
    -        for a in root.xpath('//a[@href]'):
    -            href = a.get('href')
    -            if not href.startswith('#'):
    -                href = href.split('#')
    -                anchor = href[1] if len(href) > 1 else None
    -                href = href[0]
    -                if href in split_files:
    -                    try:
    -                        newf = anchor_maps[split_files.index(href)][anchor]
    -                    except:
    -                        print '\t\tUnable to remap HTML link:', href, anchor
    -                        continue
    -                    frag = ('#'+anchor) if anchor else ''
    -                    a.set('href', newf+frag)
    -                    changed = True
    -
    -        if changed:
    -            open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
    -
    -def fix_ncx(path, changes):
    -    split_files = [f.path for f in changes]
    -    anchor_maps = [f.anchor_map for f in changes]
    -    tree = etree.parse(path)
    -    changed = False
    -    for content in tree.getroot().xpath('//x:content[@src]',
    -                    namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
    -        href = content.get('src')
    -        if not href.startswith('#'):
    -            href = href.split('#')
    -            anchor = href[1] if len(href) > 1 else None
    -            href = href[0].split('/')[-1]
    -            if href in split_files:
    -                try:
    -                    newf = anchor_maps[split_files.index(href)][anchor]
    -                except:
    -                    print 'Unable to remap NCX link:', href, anchor
    -                frag = ('#'+anchor) if anchor else ''
    -                content.set('src', 'content/'+newf+frag)
    -                changed = True
    -    if changed:
    -        open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
    -
    -def find_html_files(opf):
    -    '''
    -    Find all HTML files referenced by `opf`.
    -    '''
    -    html_files = []
    -    for item in opf.itermanifest():
    -        if 'html' in item.get('media-type', '').lower():
    -            f = item.get('href').split('/')[-1]
    -            f2 = f.replace('&', '%26')
    -            if not os.path.exists(content(f)) and os.path.exists(content(f2)):
    -                f = f2
    -                item.set('href', item.get('href').replace('&', '%26'))
    -            if os.path.exists(content(f)):
    -                html_files.append(f)
    -    return html_files
    -
    -
    -def split(pathtoopf, opts, stylesheet_map):
    -    pathtoopf = os.path.abspath(pathtoopf)
    -    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
    -
    -    with CurrentDir(os.path.dirname(pathtoopf)):
    -        html_files = find_html_files(opf)
    -        changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
    -        changes = [c for c in changes if c.was_split]
    -
    -        fix_content_links(html_files, changes, opts)
    -        for item in opf.itermanifest():
    -            if item.get('media-type', '') == 'application/x-dtbncx+xml':
    -                fix_ncx(item.get('href'), changes)
    -                break
    -
    -        open(pathtoopf, 'wb').write(opf.render())
    
    From 959f35504323a72d0cc825e9ff9a531475232d01 Mon Sep 17 00:00:00 2001
    From: John Schember 
    Date: Tue, 14 Apr 2009 19:57:19 -0400
    Subject: [PATCH 095/319] GUI: 2 card support. PRS505/700 moved to usbms
     infrastructure.
    
    ---
     src/calibre/customize/builtins.py      |   1 +
     src/calibre/devices/cybookg3/driver.py |  32 +-
     src/calibre/devices/eb600/driver.py    |  10 +-
     src/calibre/devices/interface.py       |  38 ++-
     src/calibre/devices/kindle/driver.py   |   6 +-
     src/calibre/devices/prs500/driver.py   |   0
     src/calibre/devices/prs505/books.py    |  13 +-
     src/calibre/devices/prs505/driver.py   | 438 ++++---------------------
     src/calibre/devices/prs700/driver.py   |  12 +-
     src/calibre/devices/usbms/cli.py       |  82 +++++
     src/calibre/devices/usbms/device.py    | 103 ++++--
     src/calibre/devices/usbms/driver.py    | 117 ++-----
     src/calibre/gui2/device.py             |  52 +--
     src/calibre/gui2/main.py               |  40 ++-
     src/calibre/gui2/main.ui               |  44 ++-
     src/calibre/gui2/viewer/printing.py    |   2 +-
     src/calibre/gui2/widgets.py            |  22 +-
     17 files changed, 441 insertions(+), 571 deletions(-)
     mode change 100755 => 100644 src/calibre/devices/eb600/driver.py
     mode change 100755 => 100644 src/calibre/devices/kindle/driver.py
     mode change 100755 => 100644 src/calibre/devices/prs500/driver.py
     create mode 100644 src/calibre/devices/usbms/cli.py
    
    diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
    index a9fc342059..aa6c003114 100644
    --- a/src/calibre/customize/builtins.py
    +++ b/src/calibre/customize/builtins.py
    @@ -165,6 +165,7 @@ class TXTMetadataReader(MetadataReaderPlugin):
         name        = 'Read TXT metadata'
         file_types  = set(['txt'])
         description = _('Read metadata from %s files') % 'TXT'
    +    author      = 'John Schember'
     
         def get_metadata(self, stream, ftype):
             from calibre.ebooks.metadata.txt import get_metadata
    diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py
    index 9e7e9d5862..c3a4fa94b0 100644
    --- a/src/calibre/devices/cybookg3/driver.py
    +++ b/src/calibre/devices/cybookg3/driver.py
    @@ -8,7 +8,7 @@ import os, shutil
     from itertools import cycle
     
     from calibre.ebooks.metadata import authors_to_string
    -from calibre.devices.errors import FreeSpaceError
    +from calibre.devices.errors import DeviceError, FreeSpaceError
     from calibre.devices.usbms.driver import USBMS
     import calibre.devices.cybookg3.t2b as t2b
     
    @@ -23,28 +23,34 @@ class CYBOOKG3(USBMS):
     
         VENDOR_NAME = 'BOOKEEN'
         WINDOWS_MAIN_MEM = 'CYBOOK_GEN3__-FD'
    -    WINDOWS_CARD_MEM = 'CYBOOK_GEN3__-SD'
    +    WINDOWS_CARD_A_MEM = 'CYBOOK_GEN3__-SD'
     
         OSX_MAIN_MEM = 'Bookeen Cybook Gen3 -FD Media'
    -    OSX_CARD_MEM = 'Bookeen Cybook Gen3 -SD Media'
    +    OSX_CARD_A_MEM = 'Bookeen Cybook Gen3 -SD Media'
     
         MAIN_MEMORY_VOLUME_LABEL  = 'Cybook Gen 3 Main Memory'
         STORAGE_CARD_VOLUME_LABEL = 'Cybook Gen 3 Storage Card'
     
         EBOOK_DIR_MAIN = "eBooks"
    -    EBOOK_DIR_CARD = "eBooks"
    +    EBOOK_DIR_CARD_A = "eBooks"
         THUMBNAIL_HEIGHT = 144
         SUPPORTS_SUB_DIRS = True
     
    -    def upload_books(self, files, names, on_card=False, end_session=True,
    +    def upload_books(self, files, names, on_card=None, end_session=True,
                          metadata=None):
    -        if on_card and not self._card_prefix:
    -            raise ValueError(_('The reader has no storage card connected.'))
    +        if on_card == 'carda' and not self._card_a_prefix:
    +            raise ValueError(_('The reader has no storage card in this slot.'))
    +        elif on_card == 'cardb' and not self._card_b_prefix:
    +            raise ValueError(_('The reader has no storage card in this slot.'))
    +        elif on_card and on_card not in ('carda', 'cardb'):
    +            raise DeviceError(_('The reader has no storage card in this slot.'))
     
    -        if not on_card:
    -            path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN)
    +        if on_card == 'carda':
    +            path = os.path.join(self._card_a_prefix, self.EBOOK_DIR_CARD_A)
    +        if on_card == 'cardb':
    +            path = os.path.join(self._card_b_prefix, self.EBOOK_DIR_CARD_B)
             else:
    -            path = os.path.join(self._card_prefix, self.EBOOK_DIR_CARD)
    +            path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN)
     
             def get_size(obj):
                 if hasattr(obj, 'seek'):
    @@ -57,10 +63,12 @@ class CYBOOKG3(USBMS):
             sizes = [get_size(f) for f in files]
             size = sum(sizes)
     
    -        if on_card and size > self.free_space()[2] - 1024*1024:
    -            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
             if not on_card and size > self.free_space()[0] - 2*1024*1024:
                 raise FreeSpaceError(_("There is insufficient free space in main memory"))
    +        if on_card == 'carda' and size > self.free_space()[1] - 1024*1024:
    +            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
    +        if on_card == 'cardb' and size > self.free_space()[2] - 1024*1024:
    +            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
     
             paths = []
             names = iter(names)
    diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py
    old mode 100755
    new mode 100644
    index 44690655a0..cb2f25d2f9
    --- a/src/calibre/devices/eb600/driver.py
    +++ b/src/calibre/devices/eb600/driver.py
    @@ -17,24 +17,24 @@ class EB600(USBMS):
     
         VENDOR_NAME      = 'NETRONIX'
         WINDOWS_MAIN_MEM = 'EBOOK'
    -    WINDOWS_CARD_MEM = 'EBOOK'
    +    WINDOWS_CARD_A_MEM = 'EBOOK'
     
         OSX_MAIN_MEM = 'EB600 Internal Storage Media'
    -    OSX_CARD_MEM = 'EB600 Card Storage Media'
    +    OSX_CARD_A_MEM = 'EB600 Card Storage Media'
     
         MAIN_MEMORY_VOLUME_LABEL  = 'EB600 Main Memory'
         STORAGE_CARD_VOLUME_LABEL = 'EB600 Storage Card'
     
         EBOOK_DIR_MAIN = ''
    -    EBOOK_DIR_CARD = ''
    +    EBOOK_DIR_CARD_A = ''
         SUPPORTS_SUB_DIRS = True
     
         def windows_sort_drives(self, drives):
             main = drives['main']
    -        card = drives['card']
    +        card = drives['carda']
             if card and main and card < main:
                 drives['main'] = card
    -            drives['card'] = main
    +            drives['carda'] = main
     
             return drives
     
    diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py
    index 21790e3c46..0ad01e7493 100644
    --- a/src/calibre/devices/interface.py
    +++ b/src/calibre/devices/interface.py
    @@ -87,7 +87,13 @@ class Device(object):
         
         def card_prefix(self, end_session=True):
             '''
    -        Return prefix to paths on the card or '' if no cards present.
    +        Return a 2 element list of the prefix to paths on the cards.
    +        If no card is present None is set for the card's prefix.
    +        E.g.
    +        ('/place', '/place2')
    +        (None, '/place2')
    +        ('/place', None)
    +        (None, None)
             '''
             raise NotImplementedError()
         
    @@ -95,8 +101,8 @@ class Device(object):
             """ 
             Get total space available on the mountpoints:
                 1. Main memory
    -            2. Memory Stick
    -            3. SD Card
    +            2. Memory Card A
    +            3. Memory Card B
     
             @return: A 3 element list with total space in bytes of (1, 2, 3). If a
             particular device doesn't have any of these locations it should return 0.
    @@ -115,24 +121,25 @@ class Device(object):
             """    
             raise NotImplementedError()
         
    -    def books(self, oncard=False, end_session=True):
    +    def books(self, oncard=None, end_session=True):
             """ 
             Return a list of ebooks on the device.
    -        @param oncard:  If True return a list of ebooks on the storage card, 
    -                        otherwise return list of ebooks in main memory of device.
    -                        If True and no books on card return empty list. 
    +        @param oncard:  If 'carda' or 'cardb' return a list of ebooks on the
    +                        specific storage card, otherwise return list of ebooks
    +                        in main memory of device. If a card is specified and no
    +                        books are on the card return empty list. 
             @return: A BookList. 
             """    
             raise NotImplementedError()
         
    -    def upload_books(self, files, names, on_card=False, end_session=True,
    +    def upload_books(self, files, names, on_card=None, end_session=True,
                          metadata=None):
             '''
             Upload a list of books to the device. If a file already
             exists on the device, it should be replaced.
             This method should raise a L{FreeSpaceError} if there is not enough
             free space on the device. The text of the FreeSpaceError must contain the
    -        word "card" if C{on_card} is True otherwise it must contain the word "memory".
    +        word "card" if C{on_card} is not None otherwise it must contain the word "memory".
             @param files: A list of paths and/or file-like objects.
             @param names: A list of file names that the books should have 
             once uploaded to the device. len(names) == len(files)
    @@ -163,7 +170,8 @@ class Device(object):
             another dictionary that maps tag names to lists of book ids. The ids are
             ids from the book database.
             @param booklists: A tuple containing the result of calls to 
    -                                (L{books}(oncard=False), L{books}(oncard=True)).
    +                                (L{books}(oncard=None), L{books}(oncard='carda'),
    +                                L{books}(oncard='cardb')).
             '''
             raise NotImplementedError
         
    @@ -179,16 +187,18 @@ class Device(object):
             Remove books from the metadata list. This function must not communicate 
             with the device.
             @param paths: paths to books on the device.
    -        @param booklists:  A tuple containing the result of calls to 
    -                                (L{books}(oncard=False), L{books}(oncard=True)).
    +        @param booklists:  A tuple containing the result of calls to
    +                                (L{books}(oncard=None), L{books}(oncard='carda'),
    +                                L{books}(oncard='cardb')).
             '''
             raise NotImplementedError()
             
         def sync_booklists(self, booklists, end_session=True):
             '''
             Update metadata on device.
    -        @param booklists: A tuple containing the result of calls to 
    -                                (L{books}(oncard=False), L{books}(oncard=True)).
    +        @param booklists: A tuple containing the result of calls to
    +                                (L{books}(oncard=None), L{books}(oncard='carda'),
    +                                L{books}(oncard='cardb')).
             '''
             raise NotImplementedError()
         
    diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py
    old mode 100755
    new mode 100644
    index a5775dec8a..d598e2a503
    --- a/src/calibre/devices/kindle/driver.py
    +++ b/src/calibre/devices/kindle/driver.py
    @@ -18,16 +18,16 @@ class KINDLE(USBMS):
     
         VENDOR_NAME = 'KINDLE'
         WINDOWS_MAIN_MEM = 'INTERNAL_STORAGE'
    -    WINDOWS_CARD_MEM = 'CARD_STORAGE'
    +    WINDOWS_CARD_A_MEM = 'CARD_STORAGE'
     
         OSX_MAIN_MEM = 'Kindle Internal Storage Media'
    -    OSX_CARD_MEM = 'Kindle Card Storage Media'
    +    OSX_CARD_A_MEM = 'Kindle Card Storage Media'
     
         MAIN_MEMORY_VOLUME_LABEL  = 'Kindle Main Memory'
         STORAGE_CARD_VOLUME_LABEL = 'Kindle Storage Card'
     
         EBOOK_DIR_MAIN = "documents"
    -    EBOOK_DIR_CARD = "documents"
    +    EBOOK_DIR_CARD_A = "documents"
         SUPPORTS_SUB_DIRS = True
     
         WIRELESS_FILE_NAME_PATTERN = re.compile(
    diff --git a/src/calibre/devices/prs500/driver.py b/src/calibre/devices/prs500/driver.py
    old mode 100755
    new mode 100644
    diff --git a/src/calibre/devices/prs505/books.py b/src/calibre/devices/prs505/books.py
    index eb34bff0e7..3fdb8a8432 100644
    --- a/src/calibre/devices/prs505/books.py
    +++ b/src/calibre/devices/prs505/books.py
    @@ -380,14 +380,16 @@ class BookList(_BookList):
                     item.setAttribute('id', str(map[id]))
                     pl.appendChild(item)
             
    -def fix_ids(main, card):
    +def fix_ids(main, carda, cardb):
         '''
         Adjust ids the XML databases.
         '''
         if hasattr(main, 'purge_empty_playlists'):
             main.purge_empty_playlists()
    -    if hasattr(card, 'purge_empty_playlists'):
    -        card.purge_empty_playlists()
    +    if hasattr(carda, 'purge_empty_playlists'):
    +        carda.purge_empty_playlists()
    +    if hasattr(cardb, 'purge_empty_playlists'):
    +        cardb.purge_empty_playlists()
         
         def regen_ids(db):
             if not hasattr(db, 'root_element'):
    @@ -413,6 +415,7 @@ def fix_ids(main, card):
             db.reorder_playlists()
         
         regen_ids(main)
    -    regen_ids(card)
    +    regen_ids(carda)
    +    regen_ids(cardb)
             
    -    main.set_next_id(str(main.max_id()+1))
    \ No newline at end of file
    +    main.set_next_id(str(main.max_id()+1))
    diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py
    index 6e21c60d1b..efc48a2dff 100644
    --- a/src/calibre/devices/prs505/driver.py
    +++ b/src/calibre/devices/prs505/driver.py
    @@ -6,250 +6,43 @@ Device driver for the SONY PRS-505
     import sys, os, shutil, time, subprocess, re
     from itertools import cycle
     
    -from calibre.devices.interface import Device
    +from calibre.devices.usbms.cli import CLI
    +from calibre.devices.usbms.device import Device
     from calibre.devices.errors import DeviceError, FreeSpaceError
     from calibre.devices.prs505.books import BookList, fix_ids
     from calibre import iswindows, islinux, isosx, __appname__
     from calibre.devices.errors import PathError
     
    -class File(object):
    -    def __init__(self, path):
    -        stats = os.stat(path)
    -        self.is_dir = os.path.isdir(path)
    -        self.is_readonly = not os.access(path, os.W_OK)
    -        self.ctime = stats.st_ctime
    -        self.wtime = stats.st_mtime
    -        self.size  = stats.st_size
    -        if path.endswith(os.sep):
    -            path = path[:-1]
    -        self.path = path
    -        self.name = os.path.basename(path)
    -
    -
    -class PRS505(Device):
    -    VENDOR_ID    = 0x054c   #: SONY Vendor Id
    -    PRODUCT_ID   = 0x031e   #: Product Id for the PRS-505
    -    BCD          = [0x229]  #: Needed to disambiguate 505 and 700 on linux
    -    PRODUCT_NAME = 'PRS-505'
    -    VENDOR_NAME  = 'SONY'
    +class PRS505(CLI, Device):
         FORMATS      = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt']
    +    
    +    VENDOR_ID    = [0x054c]   #: SONY Vendor Id
    +    PRODUCT_ID   = [0x031e]   #: Product Id for the PRS-505
    +    BCD          = [0x229]  #: Needed to disambiguate 505 and 700 on linux
     
    -    MEDIA_XML    = 'database/cache/media.xml'
    -    CACHE_XML    = 'Sony Reader/database/cache.xml'
    +    VENDOR_NAME  = 'SONY'
    +    WINDOWS_MAIN_MEM = 'PRS-505'
    +    WINDOWS_CARD_A_MEM = 'PRS-505/UC:MS'
    +    WINDOWS_CARD_B_MEM = 'PRS-505/UC:SD'
    +
    +    OSX_MAIN_MEM = 'Sony PRS-505/UC Media'
    +    OSX_CARD_A_MEM = 'Sony PRS-505/UC:MS Media'
    +    OSX_CARD_B_MEM = 'Sony PRS-505/UC:SD'
     
         MAIN_MEMORY_VOLUME_LABEL  = 'Sony Reader Main Memory'
         STORAGE_CARD_VOLUME_LABEL = 'Sony Reader Storage Card'
     
    -    OSX_NAME                  = 'Sony PRS-505'
    +    MEDIA_XML    = 'database/cache/media.xml'
    +    CACHE_XML    = 'Sony Reader/database/cache.xml'
     
         CARD_PATH_PREFIX          = __appname__
     
    -    FDI_TEMPLATE = \
    -'''
    -  
    -      
    -          
    -              
    -                  
    -                      
    -                          %(main_memory)s
    -                          %(deviceclass)s
    -                      
    -                  
    -              
    -          
    -      
    -  
    -  
    -      
    -          
    -              
    -                  
    -                      
    -                          %(storage_card)s
    -                          %(deviceclass)s
    -                      
    -                  
    -              
    -          
    -      
    -  
    -'''.replace('%(app)s', __appname__)
    -
    -
    -    def __init__(self, log_packets=False):
    -        self._main_prefix = self._card_prefix = None
    -
    -    @classmethod
    -    def get_fdi(cls):
    -        return cls.FDI_TEMPLATE%dict(
    -                                     deviceclass=cls.__name__,
    -                                     vendor_id=hex(cls.VENDOR_ID),
    -                                     product_id=hex(cls.PRODUCT_ID),
    -                                     bcd=hex(cls.BCD[0]),
    -                                     main_memory=cls.MAIN_MEMORY_VOLUME_LABEL,
    -                                     storage_card=cls.STORAGE_CARD_VOLUME_LABEL,
    -                                     )
    -
    -    @classmethod
    -    def is_device(cls, device_id):
    -        device_id = device_id.upper()
    -        if 'VEN_'+cls.VENDOR_NAME in device_id and \
    -               'PROD_'+cls.PRODUCT_NAME in device_id:
    -            return True
    -        vid, pid = hex(cls.VENDOR_ID)[2:], hex(cls.PRODUCT_ID)[2:]
    -        if len(vid) < 4: vid = '0'+vid
    -        if len(pid) < 4: pid = '0'+pid
    -        if 'VID_'+vid in device_id and \
    -               'PID_'+pid in device_id:
    -            return True
    -        return False
    -
    -    @classmethod
    -    def get_osx_mountpoints(cls, raw=None):
    -        if raw is None:
    -            ioreg = '/usr/sbin/ioreg'
    -            if not os.access(ioreg, os.X_OK):
    -                ioreg = 'ioreg'
    -            raw = subprocess.Popen((ioreg+' -w 0 -S -c IOMedia').split(),
    -                                   stdout=subprocess.PIPE).communicate()[0]
    -        lines = raw.splitlines()
    -        names = {}
    -        for i, line in enumerate(lines):
    -            if line.strip().endswith('') and cls.OSX_NAME in line:
    -                loc = 'stick' if ':MS' in line else 'card' if ':SD' in line else 'main'
    -                for line in lines[i+1:]:
    -                    line = line.strip()
    -                    if line.endswith('}'):
    -                        break
    -                    match = re.search(r'"BSD Name"\s+=\s+"(.*?)"', line)
    -                    if match is not None:
    -                        names[loc] = match.group(1)
    -                        break
    -            if len(names.keys()) == 3:
    -                break
    -        return names
    -
    -
    -    def open_osx(self):
    -        mount = subprocess.Popen('mount', shell=True,
    -                                 stdout=subprocess.PIPE).stdout.read()
    -        names = self.get_osx_mountpoints()
    -        dev_pat = r'/dev/%s(\w*)\s+on\s+([^\(]+)\s+'
    -        if 'main' not in names.keys():
    -            raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
    -        main_pat = dev_pat%names['main']
    -        self._main_prefix = re.search(main_pat, mount).group(2) + os.sep
    -        card_pat = names['stick'] if 'stick' in names.keys() else names['card'] if 'card' in names.keys() else None
    -        if card_pat is not None:
    -            card_pat = dev_pat%card_pat
    -            self._card_prefix = re.search(card_pat, mount).group(2) + os.sep
    -
    -
    -    def open_windows(self):
    -        time.sleep(6)
    -        drives = []
    -        wmi = __import__('wmi', globals(), locals(), [], -1)
    -        c = wmi.WMI()
    -        for drive in c.Win32_DiskDrive():
    -            if self.__class__.is_device(str(drive.PNPDeviceID)):
    -                if drive.Partitions == 0:
    -                    continue
    -                try:
    -                    partition = drive.associators("Win32_DiskDriveToDiskPartition")[0]
    -                    logical_disk = partition.associators('Win32_LogicalDiskToPartition')[0]
    -                    prefix = logical_disk.DeviceID+os.sep
    -                    drives.append((drive.Index, prefix))
    -                except IndexError:
    -                    continue
    -
    -
    -        if not drives:
    -            raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
    -
    -        drives.sort(cmp=lambda a, b: cmp(a[0], b[0]))
    -        self._main_prefix = drives[0][1]
    -        if len(drives) > 1:
    -            self._card_prefix = drives[1][1]
    -
    -
    -    def open_linux(self):
    -        import dbus
    -        bus = dbus.SystemBus()
    -        hm  = dbus.Interface(bus.get_object("org.freedesktop.Hal", "/org/freedesktop/Hal/Manager"), "org.freedesktop.Hal.Manager")
    -
    -        def conditional_mount(dev, main_mem=True):
    -            mmo = bus.get_object("org.freedesktop.Hal", dev)
    -            label = mmo.GetPropertyString('volume.label', dbus_interface='org.freedesktop.Hal.Device')
    -            is_mounted = mmo.GetPropertyString('volume.is_mounted', dbus_interface='org.freedesktop.Hal.Device')
    -            mount_point = mmo.GetPropertyString('volume.mount_point', dbus_interface='org.freedesktop.Hal.Device')
    -            fstype = mmo.GetPropertyString('volume.fstype', dbus_interface='org.freedesktop.Hal.Device')
    -            if is_mounted:
    -                return str(mount_point)
    -            mmo.Mount(label, fstype, ['umask=077', 'uid='+str(os.getuid()), 'sync'],
    -                          dbus_interface='org.freedesktop.Hal.Device.Volume')
    -            return os.path.normpath('/media/'+label)+'/'
    -
    -
    -        mm = hm.FindDeviceStringMatch(__appname__+'.mainvolume', self.__class__.__name__)
    -        if not mm:
    -            raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%(self.__class__.__name__,))
    -        self._main_prefix = None
    -        for dev in mm:
    -            try:
    -                self._main_prefix = conditional_mount(dev)+os.sep
    -                break
    -            except dbus.exceptions.DBusException:
    -                continue
    -
    -
    -        if not self._main_prefix:
    -            raise DeviceError('Could not open device for reading. Try a reboot.')
    -
    -        self._card_prefix = None
    -        cards = hm.FindDeviceStringMatch(__appname__+'.cardvolume', self.__class__.__name__)
    -        keys = []
    -        for card in cards:
    -            keys.append(int('UC_SD' in bus.get_object("org.freedesktop.Hal", card).GetPropertyString('info.parent', dbus_interface='org.freedesktop.Hal.Device')))
    -
    -        cards = zip(cards, keys)
    -        cards.sort(cmp=lambda x, y: cmp(x[1], y[1]))
    -        cards = [i[0] for i in cards]
    -
    -        for dev in cards:
    -            try:
    -                self._card_prefix = conditional_mount(dev, False)+os.sep
    -                break
    -            except:
    -                import traceback
    -                print traceback
    -                continue
    -
    -
         def open(self):
    -        time.sleep(5)
    -        self._main_prefix = self._card_prefix = None
    -        if islinux:
    +        Device.open(self)
    +        
    +        def write_cache(prefix):
                 try:
    -                self.open_linux()
    -            except DeviceError:
    -                time.sleep(3)
    -                self.open_linux()
    -        if iswindows:
    -            try:
    -                self.open_windows()
    -            except DeviceError:
    -                time.sleep(3)
    -                self.open_windows()
    -        if isosx:
    -            try:
    -                self.open_osx()
    -            except DeviceError:
    -                time.sleep(3)
    -                self.open_osx()
    -        if self._card_prefix is not None:
    -            try:
    -                cachep = os.path.join(self._card_prefix, self.CACHE_XML)
    +                cachep = os.path.join(prefix, self.CACHE_XML)
                     if not os.path.exists(cachep):
                         os.makedirs(os.path.dirname(cachep), mode=0777)
                         f = open(cachep, 'wb')
    @@ -263,133 +56,47 @@ class PRS505(Device):
                     import traceback
                     traceback.print_exc()
     
    -    def set_progress_reporter(self, pr):
    -        self.report_progress = pr
    +        if self._card_a_prefix is not None:
    +            write_cache(self._card_a_prefix)
    +        if self._card_b_prefix is not None:
    +            write_cache(self._card_b_prefix)
     
         def get_device_information(self, end_session=True):
             return (self.__class__.__name__, '', '', '')
     
    -    def card_prefix(self, end_session=True):
    -        return self._card_prefix
    -
    -    @classmethod
    -    def _windows_space(cls, prefix):
    -        if prefix is None:
    -            return 0, 0
    -        win32file = __import__('win32file', globals(), locals(), [], -1)
    -        try:
    -            sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \
    -                win32file.GetDiskFreeSpace(prefix[:-1])
    -        except Exception, err:
    -            if getattr(err, 'args', [None])[0] == 21: # Disk not ready
    -                time.sleep(3)
    -                sectors_per_cluster, bytes_per_sector, free_clusters, total_clusters = \
    -                    win32file.GetDiskFreeSpace(prefix[:-1])
    -            else: raise
    -        mult = sectors_per_cluster * bytes_per_sector
    -        return total_clusters * mult, free_clusters * mult
    -
    -    def total_space(self, end_session=True):
    -        msz = csz = 0
    -        if not iswindows:
    -            if self._main_prefix is not None:
    -                stats = os.statvfs(self._main_prefix)
    -                msz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
    -            if self._card_prefix is not None:
    -                stats = os.statvfs(self._card_prefix)
    -                csz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
    -        else:
    -            msz = self._windows_space(self._main_prefix)[0]
    -            csz = self._windows_space(self._card_prefix)[0]
    -
    -        return (msz, 0, csz)
    -
    -    def free_space(self, end_session=True):
    -        msz = csz = 0
    -        if not iswindows:
    -            if self._main_prefix is not None:
    -                stats = os.statvfs(self._main_prefix)
    -                msz = stats.f_frsize * stats.f_bavail
    -            if self._card_prefix is not None:
    -                stats = os.statvfs(self._card_prefix)
    -                csz = stats.f_frsize * stats.f_bavail
    -        else:
    -            msz = self._windows_space(self._main_prefix)[1]
    -            csz = self._windows_space(self._card_prefix)[1]
    -
    -        return (msz, 0, csz)
    -
    -    def books(self, oncard=False, end_session=True):
    -        if oncard and self._card_prefix is None:
    +    def books(self, oncard=None, end_session=True):
    +        if oncard == 'carda' and not self._card_a_prefix:
                 return []
    +        elif oncard == 'cardb' and not self._card_b_prefix:
    +            return []
    +        elif oncard and oncard != 'carda' and oncard != 'cardb':
    +            return []
    +
             db = self.__class__.CACHE_XML if oncard else self.__class__.MEDIA_XML
    -        prefix = self._card_prefix if oncard else self._main_prefix
    +        prefix = self._card_a_prefix if oncard == 'carda' else self._card_b_prefix if oncard == 'cardb' else self._main_prefix
             bl = BookList(open(prefix + db, 'rb'), prefix)
             paths = bl.purge_corrupted_files()
             for path in paths:
    -            path = os.path.join(self._card_prefix if oncard else self._main_prefix, path)
    +            path = os.path.join(prefix, path)
                 if os.path.exists(path):
                     os.unlink(path)
             return bl
     
    -    def munge_path(self, path):
    -        if path.startswith('/') and not (path.startswith(self._main_prefix) or \
    -            (self._card_prefix and path.startswith(self._card_prefix))):
    -            path = self._main_prefix + path[1:]
    -        elif path.startswith('card:'):
    -            path = path.replace('card:', self._card_prefix[:-1])
    -        return path
    -
    -    def mkdir(self, path, end_session=True):
    -        """ Make directory """
    -        path = self.munge_path(path)
    -        os.mkdir(path)
    -
    -    def list(self, path, recurse=False, end_session=True, munge=True):
    -        if munge:
    -            path = self.munge_path(path)
    -        if os.path.isfile(path):
    -            return [(os.path.dirname(path), [File(path)])]
    -        entries = [File(os.path.join(path, f)) for f in os.listdir(path)]
    -        dirs = [(path, entries)]
    -        for _file in entries:
    -            if recurse and _file.is_dir:
    -                dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False)
    -        return dirs
    -
    -    def get_file(self, path, outfile, end_session=True):
    -        path = self.munge_path(path)
    -        src = open(path, 'rb')
    -        shutil.copyfileobj(src, outfile, 10*1024*1024)
    -
    -    def put_file(self, infile, path, replace_file=False, end_session=True):
    -        path = self.munge_path(path)
    -        if os.path.isdir(path):
    -            path = os.path.join(path, infile.name)
    -        if not replace_file and os.path.exists(path):
    -            raise PathError('File already exists: '+path)
    -        dest = open(path, 'wb')
    -        shutil.copyfileobj(infile, dest, 10*1024*1024)
    -        dest.flush()
    -        dest.close()
    -
    -    def rm(self, path, end_session=True):
    -        path = self.munge_path(path)
    -        os.unlink(path)
    -
    -    def touch(self, path, end_session=True):
    -        path = self.munge_path(path)
    -        if not os.path.exists(path):
    -            open(path, 'w').close()
    -        if not os.path.isdir(path):
    -            os.utime(path, None)
    -
    -    def upload_books(self, files, names, on_card=False, end_session=True,
    +    def upload_books(self, files, names, on_card=None, end_session=True,
                          metadata=None):
    -        if on_card and not self._card_prefix:
    -            raise ValueError(_('The reader has no storage card connected.'))
    -        path = os.path.join(self._card_prefix, self.CARD_PATH_PREFIX) if on_card \
    -               else os.path.join(self._main_prefix, 'database', 'media', 'books')
    +        if on_card == 'carda' and not self._card_a_prefix:
    +            raise ValueError(_('The reader has no storage card in this slot.'))
    +        elif on_card == 'cardb' and not self._card_b_prefix:
    +            raise ValueError(_('The reader has no storage card in this slot.'))
    +        elif on_card and on_card not in ('carda', 'cardb'):
    +            raise DeviceError(_('The reader has no storage card in this slot.'))
    +
    +        if on_card == 'carda':
    +            path = os.path.join(self._card_a_prefix, self.CARD_PATH_PREFIX)
    +        elif on_card == 'cardb':
    +            path = os.path.join(self._card_b_prefix, self.CARD_PATH_PREFIX)
    +        else:
    +            path = os.path.join(self._main_prefix, 'database', 'media', 'books')
     
             def get_size(obj):
                 if hasattr(obj, 'seek'):
    @@ -399,17 +106,15 @@ class PRS505(Device):
                     return size
                 return os.path.getsize(obj)
     
    -        sizes = map(get_size, files)
    +        sizes = [get_size(f) for f in files]
             size = sum(sizes)
    -        space = self.free_space()
    -        mspace = space[0]
    -        cspace = space[2]
    -        if on_card and size > cspace - 1024*1024:
    -            raise FreeSpaceError("There is insufficient free space "+\
    -                                          "on the storage card")
    -        if not on_card and size > mspace - 2*1024*1024:
    -            raise FreeSpaceError("There is insufficient free space " +\
    -                                         "in main memory")
    +
    +        if not on_card and size > self.free_space()[0] - 2*1024*1024:
    +            raise FreeSpaceError(_("There is insufficient free space in main memory"))
    +        if on_card == 'carda' and size > self.free_space()[1] - 1024*1024:
    +            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
    +        if on_card == 'cardb' and size > self.free_space()[2] - 1024*1024:
    +            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
     
             paths, ctimes = [], []
     
    @@ -435,11 +140,11 @@ class PRS505(Device):
             for location in locations:
                 info = metadata.next()
                 path = location[0]
    -            on_card = 1 if location[3] else 0
    +            blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
                 name = path.rpartition(os.sep)[2]
    -            name = (cls.CARD_PATH_PREFIX+'/' if on_card else 'database/media/books/') + name
    +            name = (cls.CARD_PATH_PREFIX+'/' if blist else 'database/media/books/') + name
                 name = name.replace('//', '/')
    -            booklists[on_card].add_book(info, name, *location[1:-1])
    +            booklists[blist].add_book(info, name, *location[1:-1])
             fix_ids(*booklists)
     
         def delete_books(self, paths, end_session=True):
    @@ -462,18 +167,13 @@ class PRS505(Device):
             f = open(self._main_prefix + self.__class__.MEDIA_XML, 'wb')
             booklists[0].write(f)
             f.close()
    -        if self._card_prefix is not None and hasattr(booklists[1], 'write'):
    -            if not os.path.exists(self._card_prefix):
    -                os.makedirs(self._card_prefix)
    -            f = open(self._card_prefix + self.__class__.CACHE_XML, 'wb')
    -            booklists[1].write(f)
    -            f.close()
    -
    -
    -
    -
    -def main(args=sys.argv):
    -    return 0
    -
    -if __name__ == '__main__':
    -    sys.exit(main())
    +        
    +        def write_card_prefix(prefix, listid):
    +            if prefix is not None and hasattr(booklists[listid], 'write'):
    +                if not os.path.exists(prefix):
    +                    os.makedirs(prefix)
    +                f = open(prefix + self.__class__.CACHE_XML, 'wb')
    +                booklists[listid].write(f)
    +                f.close()
    +        write_card_prefix(self._card_a_prefix, 1)
    +        write_card_prefix(self._card_b_prefix, 2)
    diff --git a/src/calibre/devices/prs700/driver.py b/src/calibre/devices/prs700/driver.py
    index 5db60ef506..2b82eb3e34 100644
    --- a/src/calibre/devices/prs700/driver.py
    +++ b/src/calibre/devices/prs700/driver.py
    @@ -10,6 +10,12 @@ from calibre.devices.prs505.driver import PRS505
     class PRS700(PRS505):
         
         BCD          = [0x31a]
    -    PRODUCT_NAME = 'PRS-700'
    -    OSX_NAME     = 'Sony PRS-700'
    -     
    +    
    +    WINDOWS_MAIN_MEM = 'PRS-700'
    +    WINDOWS_CARD_A_MEM = 'PRS-700/UC:MS'
    +    WINDOWS_CARD_B_MEM = 'PRS-700/UC:SD'
    +
    +    OSX_MAIN_MEM = 'Sony PRS-700/UC Media'
    +    OSX_CARD_A_MEM = 'Sony PRS-700/UC:MS Media'
    +    OSX_CARD_B_MEM = 'Sony PRS-700/UC:SD'
    +
    diff --git a/src/calibre/devices/usbms/cli.py b/src/calibre/devices/usbms/cli.py
    new file mode 100644
    index 0000000000..40e2225486
    --- /dev/null
    +++ b/src/calibre/devices/usbms/cli.py
    @@ -0,0 +1,82 @@
    +# -*- coding: utf-8 -*-
    +from __future__ import with_statement
    +
    +__license__ = 'GPL 3'
    +__copyright__ = '2009, John Schember '
    +__docformat__ = 'restructuredtext en'
    +
    +import os, shutil
    +
    +from calibre.devices.errors import PathError
    +
    +class File(object):
    +
    +    def __init__(self, path):
    +        stats = os.stat(path)
    +        self.is_dir = os.path.isdir(path)
    +        self.is_readonly = not os.access(path, os.W_OK)
    +        self.ctime = stats.st_ctime
    +        self.wtime = stats.st_mtime
    +        self.size  = stats.st_size
    +        if path.endswith(os.sep):
    +            path = path[:-1]
    +        self.path = path
    +        self.name = os.path.basename(path)
    +
    +
    +class CLI(object):
    +
    +    def get_file(self, path, outfile, end_session=True):
    +        path = self.munge_path(path)
    +        with open(path, 'rb') as src:
    +            shutil.copyfileobj(src, outfile, 10*1024*1024)
    +
    +    def put_file(self, infile, path, replace_file=False, end_session=True):
    +        path = self.munge_path(path)
    +        if os.path.isdir(path):
    +            path = os.path.join(path, infile.name)
    +        if not replace_file and os.path.exists(path):
    +            raise PathError('File already exists: ' + path)
    +        # Use a context manager so dest is closed even if the copy fails.
    +        with open(path, 'wb') as dest:
    +            shutil.copyfileobj(infile, dest, 10*1024*1024)
    +            dest.flush()
    +
    +    def munge_path(self, path):  # Map a device-style path onto the mounted filesystem
    +        if path.startswith('/') and not (path.startswith(self._main_prefix) or \
    +            (self._card_a_prefix and path.startswith(self._card_a_prefix)) or \
    +            (self._card_b_prefix and path.startswith(self._card_b_prefix))):
    +            path = self._main_prefix + path[1:]  # bare absolute path -> main memory
    +        elif path.startswith('carda:'):
    +            path = path.replace('carda:', self._card_a_prefix[:-1])  # [:-1] drops the prefix's last char (presumably its trailing separator)
    +        elif path.startswith('cardb:'):
    +            path = path.replace('cardb:', self._card_b_prefix[:-1])  # card B has its own prefix
    +        return path
    +
    +    def list(self, path, recurse=False, end_session=True, munge=True):
    +        if munge:
    +            path = self.munge_path(path)
    +        if os.path.isfile(path):
    +            return [(os.path.dirname(path), [File(path)])]
    +        entries = [File(os.path.join(path, f)) for f in os.listdir(path)]
    +        dirs = [(path, entries)]
    +        for _file in entries:
    +            if recurse and _file.is_dir:
    +                dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False)
    +        return dirs
    +
    +    def mkdir(self, path, end_session=True):
    +        if self.SUPPORTS_SUB_DIRS:
    +            path = self.munge_path(path)
    +            os.mkdir(path)
    +
    +    def rm(self, path, end_session=True):
    +        path = self.munge_path(path)
    +        self.delete_books([path])
    +
    +    def touch(self, path, end_session=True):
    +        path = self.munge_path(path)
    +        if not os.path.exists(path):
    +            open(path, 'w').close()
    +        if not os.path.isdir(path):
    +            os.utime(path, None)
    diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py
    index 5a1b5ef40d..63dabe001a 100644
    --- a/src/calibre/devices/usbms/device.py
    +++ b/src/calibre/devices/usbms/device.py
    @@ -25,10 +25,12 @@ class Device(_Device):
     
         VENDOR_NAME = None
         WINDOWS_MAIN_MEM = None
    -    WINDOWS_CARD_MEM = None
    +    WINDOWS_CARD_A_MEM = None
    +    WINDOWS_CARD_B_MEM = None
     
         OSX_MAIN_MEM = None
    -    OSX_CARD_MEM = None
    +    OSX_CARD_A_MEM = None
    +    OSX_CARD_B_MEM = None
     
         MAIN_MEMORY_VOLUME_LABEL  = ''
         STORAGE_CARD_VOLUME_LABEL = ''
    @@ -63,12 +65,26 @@ class Device(_Device):
               
           
       
    +  
    +      
    +          
    +              
    +                %(BCD_start)s
    +                  
    +                          %(storage_card)s
    +                          %(deviceclass)s
    +                  
    +                %(BCD_end)s
    +              
    +          
    +      
    +  
     '''
         FDI_BCD_TEMPLATE = ''
     
     
         def __init__(self, key='-1', log_packets=False, report_progress=None) :
    -        self._main_prefix = self._card_prefix = None
    +        self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
     
         @classmethod
         def get_fdi(cls):
    @@ -102,7 +118,7 @@ class Device(_Device):
             self.report_progress = report_progress
     
         def card_prefix(self, end_session=True):
    -        return self._card_prefix
    +        return (self._card_a_prefix, self._card_b_prefix)
     
         @classmethod
         def _windows_space(cls, prefix):
    @@ -122,34 +138,41 @@ class Device(_Device):
             return total_clusters * mult, free_clusters * mult
     
         def total_space(self, end_session=True):
    -        msz = csz = 0
    +        msz = casz = cbsz = 0
             if not iswindows:
                 if self._main_prefix is not None:
                     stats = os.statvfs(self._main_prefix)
                     msz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
    -            if self._card_prefix is not None:
    -                stats = os.statvfs(self._card_prefix)
    -                csz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
    +            if self._card_a_prefix is not None:
    +                stats = os.statvfs(self._card_a_prefix)
    +                casz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
    +            if self._card_b_prefix is not None:
    +                stats = os.statvfs(self._card_b_prefix)
    +                cbsz = stats.f_frsize * (stats.f_blocks + stats.f_bavail - stats.f_bfree)
             else:
                 msz = self._windows_space(self._main_prefix)[0]
    -            csz = self._windows_space(self._card_prefix)[0]
    +            casz = self._windows_space(self._card_a_prefix)[0]
    +            cbsz = self._windows_space(self._card_b_prefix)[0]
     
    -        return (msz, 0, csz)
    +        return (msz, casz, cbsz)
     
         def free_space(self, end_session=True):
    -        msz = csz = 0
    +        msz = casz = cbsz = 0
             if not iswindows:
                 if self._main_prefix is not None:
                     stats = os.statvfs(self._main_prefix)
                     msz = stats.f_frsize * stats.f_bavail
    -            if self._card_prefix is not None:
    -                stats = os.statvfs(self._card_prefix)
    -                csz = stats.f_frsize * stats.f_bavail
    +            if self._card_a_prefix is not None:
    +                stats = os.statvfs(self._card_a_prefix)
    +                casz = stats.f_frsize * stats.f_bavail
    +            if self._card_b_prefix is not None:
    +                stats = os.statvfs(self._card_b_prefix)
    +                cbsz = stats.f_frsize * stats.f_bavail
             else:
                 msz = self._windows_space(self._main_prefix)[1]
    -            csz = self._windows_space(self._card_prefix)[1]
    +            casz, cbsz = self._windows_space(self._card_a_prefix)[1], self._windows_space(self._card_b_prefix)[1]  # free bytes per card slot
     
    -        return (msz, 0, csz)
    +        return (msz, casz, cbsz)
     
         def windows_match_device(self, pnp_id, device_id):
             pnp_id = pnp_id.upper()
    @@ -190,15 +213,18 @@ class Device(_Device):
             for drive in c.Win32_DiskDrive():
                 if self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_MAIN_MEM):
                     drives['main'] = self.windows_get_drive_prefix(drive)
    -            elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_MEM):
    -                drives['card'] = self.windows_get_drive_prefix(drive)
    +            elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_A_MEM):
    +                drives['carda'] = self.windows_get_drive_prefix(drive)
    +            elif self.windows_match_device(str(drive.PNPDeviceID), self.WINDOWS_CARD_B_MEM):
    +                drives['cardb'] = self.windows_get_drive_prefix(drive)
     
    -            if 'main' in drives.keys() and 'card' in drives.keys():
    +            if 'main' in drives.keys() and 'carda' in drives.keys() and 'cardb' in drives.keys():
                     break
     
             drives = self.windows_sort_drives(drives)
             self._main_prefix = drives.get('main')
    -        self._card_prefix = drives.get('card')
    +        self._card_a_prefix = drives.get('carda')
    +        self._card_b_prefix = drives.get('cardb')
     
             if not self._main_prefix:
                 raise DeviceError(
    @@ -228,9 +254,11 @@ class Device(_Device):
             for i, line in enumerate(lines):
                 if self.OSX_MAIN_MEM is not None and line.strip().endswith('') and self.OSX_MAIN_MEM in line:
                     get_dev_node(lines[i+1:], 'main')
    -            if self.OSX_CARD_MEM is not None and line.strip().endswith('') and self.OSX_CARD_MEM in line:
    -                get_dev_node(lines[i+1:], 'card')
    -            if len(names.keys()) == 2:
    +            if self.OSX_CARD_A_MEM is not None and line.strip().endswith('') and self.OSX_CARD_A_MEM in line:
    +                get_dev_node(lines[i+1:], 'carda')
    +            if self.OSX_CARD_B_MEM is not None and line.strip().endswith('') and self.OSX_CARD_B_MEM in line:
    +                get_dev_node(lines[i+1:], 'cardb')
    +            if len(names.keys()) == 3:
                     break
             return names
     
    @@ -242,10 +270,18 @@ class Device(_Device):
                 raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
             main_pat = dev_pat % names['main']
             self._main_prefix = re.search(main_pat, mount).group(2) + os.sep
    -        card_pat = names['card'] if 'card' in names.keys() else None
    -        if card_pat is not None:
    -            card_pat = dev_pat % card_pat
    -            self._card_prefix = re.search(card_pat, mount).group(2) + os.sep
    +        card_a_pat = names['carda'] if 'carda' in names.keys() else None
    +        card_b_pat = names['cardb'] if 'cardb' in names.keys() else None
    +        
    +        def get_card_prefix(pat):
    +            if pat is not None:
    +                pat = dev_pat % pat
    +                return re.search(pat, mount).group(2) + os.sep
    +            else:
    +                return None
    +                
    +        self._card_a_prefix = get_card_prefix(card_a_pat)
    +        self._card_b_prefix = get_card_prefix(card_b_pat) 
     
         def open_linux(self):
             import dbus
    @@ -278,21 +314,24 @@ class Device(_Device):
             if not self._main_prefix:
                 raise DeviceError('Could not open device for reading. Try a reboot.')
     
    -        self._card_prefix = None
    +        self._card_a_prefix = self._card_b_prefix = None
             cards = hm.FindDeviceStringMatch(__appname__+'.cardvolume', self.__class__.__name__)
     
    -        for dev in cards:
    +        def mount_card(dev):
                 try:
    -                self._card_prefix = conditional_mount(dev)+os.sep
    -                break
    +                return conditional_mount(dev)+os.sep
                 except:
                     import traceback
    -                print traceback
    +                traceback.print_exc()
    -                continue
    +
    +        if len(cards) >= 1:
    +            self._card_a_prefix = mount_card(cards[0])
    +        if len(cards) >=2:
    +            self._card_b_prefix = mount_card(cards[1])
     
         def open(self):
             time.sleep(5)
    -        self._main_prefix = self._card_prefix = None
    +        self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
             if islinux:
                 try:
                     self.open_linux()
    diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py
    index 0a66b78014..bb7a104fa4 100644
    --- a/src/calibre/devices/usbms/driver.py
    +++ b/src/calibre/devices/usbms/driver.py
    @@ -12,28 +12,19 @@ from itertools import cycle
     
     from calibre.ebooks.metadata.meta import metadata_from_formats, path_to_ext
     from calibre.ebooks.metadata import authors_to_string
    +from calibre.devices.usbms.cli import CLI
     from calibre.devices.usbms.device import Device
     from calibre.devices.usbms.books import BookList, Book
    -from calibre.devices.errors import FreeSpaceError, PathError
    +from calibre.devices.errors import DeviceError, FreeSpaceError
     from calibre.devices.mime import mime_type_ext
     
    -class File(object):
    -    def __init__(self, path):
    -        stats = os.stat(path)
    -        self.is_dir = os.path.isdir(path)
    -        self.is_readonly = not os.access(path, os.W_OK)
    -        self.ctime = stats.st_ctime
    -        self.wtime = stats.st_mtime
    -        self.size  = stats.st_size
    -        if path.endswith(os.sep):
    -            path = path[:-1]
    -        self.path = path
    -        self.name = os.path.basename(path)
    -
    -class USBMS(Device):
    +# CLI must come before Device as it implements the CLI functions that
    +# are inherited from the device interface in Device. 
    +class USBMS(CLI, Device):
         FORMATS = []
         EBOOK_DIR_MAIN = ''
    -    EBOOK_DIR_CARD = ''
    +    EBOOK_DIR_CARD_A = ''
    +    EBOOK_DIR_CARD_B = ''
         SUPPORTS_SUB_DIRS = False
         CAN_SET_METADATA = False
     
    @@ -48,14 +39,18 @@ class USBMS(Device):
             """
             return (self.__class__.__name__, '', '', '')
     
    -    def books(self, oncard=False, end_session=True):
    +    def books(self, oncard=None, end_session=True):
             bl = BookList()
     
    -        if oncard and self._card_prefix is None:
    +        if oncard == 'carda' and not self._card_a_prefix:
    +            return bl
    +        elif oncard == 'cardb' and not self._card_b_prefix:
    +            return bl
    +        elif oncard and oncard != 'carda' and oncard != 'cardb':
                 return bl
     
    -        prefix = self._card_prefix if oncard else self._main_prefix
    -        ebook_dir = self.EBOOK_DIR_CARD if oncard else self.EBOOK_DIR_MAIN
    +        prefix = self._card_a_prefix if oncard == 'carda' else self._card_b_prefix if oncard == 'cardb' else self._main_prefix
    +        ebook_dir = self.EBOOK_DIR_CARD_A if oncard == 'carda' else self.EBOOK_DIR_CARD_B if oncard == 'cardb' else self.EBOOK_DIR_MAIN
     
             # Get all books in the ebook_dir directory
             if self.SUPPORTS_SUB_DIRS:
    @@ -71,15 +66,21 @@ class USBMS(Device):
                         bl.append(self.__class__.book_from_path(os.path.join(path, filename)))
             return bl
     
    -    def upload_books(self, files, names, on_card=False, end_session=True,
    +    def upload_books(self, files, names, on_card=None, end_session=True,
                          metadata=None):
    -        if on_card and not self._card_prefix:
    -            raise ValueError(_('The reader has no storage card connected.'))
    +        if on_card == 'carda' and not self._card_a_prefix:
    +            raise ValueError(_('The reader has no storage card in this slot.'))
    +        elif on_card == 'cardb' and not self._card_b_prefix:
    +            raise ValueError(_('The reader has no storage card in this slot.'))
    +        elif on_card and on_card not in ('carda', 'cardb'):
    +            raise DeviceError(_('The reader has no storage card in this slot.'))
     
    -        if not on_card:
    -            path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN)
    +        if on_card == 'carda':
    +            path = os.path.join(self._card_a_prefix, self.EBOOK_DIR_CARD_A)
    +        elif on_card == 'cardb':  # elif: a plain if lets the else below overwrite the card A path
    +            path = os.path.join(self._card_b_prefix, self.EBOOK_DIR_CARD_B)
             else:
    -            path = os.path.join(self._card_prefix, self.EBOOK_DIR_CARD)
    +            path = os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN)
     
             def get_size(obj):
                 if hasattr(obj, 'seek'):
    @@ -92,10 +93,12 @@ class USBMS(Device):
             sizes = [get_size(f) for f in files]
             size = sum(sizes)
     
    -        if on_card and size > self.free_space()[2] - 1024*1024:
    -            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
             if not on_card and size > self.free_space()[0] - 2*1024*1024:
                 raise FreeSpaceError(_("There is insufficient free space in main memory"))
    +        if on_card == 'carda' and size > self.free_space()[1] - 1024*1024:
    +            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
    +        if on_card == 'cardb' and size > self.free_space()[2] - 1024*1024:
    +            raise FreeSpaceError(_("There is insufficient free space on the storage card"))
     
             paths = []
             names = iter(names)
    @@ -147,12 +150,12 @@ class USBMS(Device):
         def add_books_to_metadata(cls, locations, metadata, booklists):
             for location in locations:
                 path = location[0]
    -            on_card = 1 if location[1] else 0
    +            blist = 2 if location[1] == 'cardb' else 1 if location[1] == 'carda' else 0
     
                 book = cls.book_from_path(path)
     
    -            if not book in booklists[on_card]:
    -                booklists[on_card].append(book)
    +            if not book in booklists[blist]:
    +                booklists[blist].append(book)
     
     
         def delete_books(self, paths, end_session=True):
    @@ -180,58 +183,6 @@ class USBMS(Device):
             # the Sony Readers.
             pass
     
    -    def get_file(self, path, outfile, end_session=True):
    -        path = self.munge_path(path)
    -        with open(path, 'rb') as src:
    -            shutil.copyfileobj(src, outfile, 10*1024*1024)
    -
    -    def put_file(self, infile, path, replace_file=False, end_session=True):
    -        path = self.munge_path(path)
    -        if os.path.isdir(path):
    -            path = os.path.join(path, infile.name)
    -        if not replace_file and os.path.exists(path):
    -            raise PathError('File already exists: ' + path)
    -        dest = open(path, 'wb')
    -        shutil.copyfileobj(infile, dest, 10*1024*1024)
    -        dest.flush()
    -        dest.close()
    -
    -    def munge_path(self, path):
    -        if path.startswith('/') and not (path.startswith(self._main_prefix) or \
    -            (self._card_prefix and path.startswith(self._card_prefix))):
    -            path = self._main_prefix + path[1:]
    -        elif path.startswith('card:'):
    -            path = path.replace('card:', self._card_prefix[:-1])
    -        return path
    -
    -    def list(self, path, recurse=False, end_session=True, munge=True):
    -        if munge:
    -            path = self.munge_path(path)
    -        if os.path.isfile(path):
    -            return [(os.path.dirname(path), [File(path)])]
    -        entries = [File(os.path.join(path, f)) for f in os.listdir(path)]
    -        dirs = [(path, entries)]
    -        for _file in entries:
    -            if recurse and _file.is_dir:
    -                dirs[len(dirs):] = self.list(_file.path, recurse=True, munge=False)
    -        return dirs
    -
    -    def mkdir(self, path, end_session=True):
    -        if self.SUPPORTS_SUB_DIRS:
    -            path = self.munge_path(path)
    -            os.mkdir(path)
    -
    -    def rm(self, path, end_session=True):
    -        path = self.munge_path(path)
    -        self.delete_books([path])
    -
    -    def touch(self, path, end_session=True):
    -        path = self.munge_path(path)
    -        if not os.path.exists(path):
    -            open(path, 'w').close()
    -        if not os.path.isdir(path):
    -            os.utime(path, None)
    -
         @classmethod
         def metadata_from_path(cls, path):
             return metadata_from_formats([path])
    diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
    index 46cf9895d4..d21a249395 100644
    --- a/src/calibre/gui2/device.py
    +++ b/src/calibre/gui2/device.py
    @@ -139,9 +139,10 @@ class DeviceManager(Thread):
     
         def _books(self):
             '''Get metadata from device'''
    -        mainlist = self.device.books(oncard=False, end_session=False)
    -        cardlist = self.device.books(oncard=True)
    -        return (mainlist, cardlist)
    +        mainlist = self.device.books(oncard=None, end_session=False)
    +        cardalist = self.device.books(oncard='carda')
    +        cardblist = self.device.books(oncard='cardb')
    +        return (mainlist, cardalist, cardblist)
     
         def books(self, done):
             '''Return callable that returns the list of books on device as two booklists'''
    @@ -156,12 +157,12 @@ class DeviceManager(Thread):
             return self.create_job(self._sync_booklists, done, args=[booklists],
                             description=_('Send metadata to device'))
     
    -    def _upload_books(self, files, names, on_card=False, metadata=None):
    +    def _upload_books(self, files, names, on_card=None, metadata=None):
             '''Upload books to device: '''
             return self.device.upload_books(files, names, on_card,
                                             metadata=metadata, end_session=False)
     
    -    def upload_books(self, done, files, names, on_card=False, titles=None,
    +    def upload_books(self, done, files, names, on_card=None, titles=None,
                          metadata=None):
             desc = _('Upload %d books to device')%len(names)
             if titles:
    @@ -197,6 +198,7 @@ class DeviceManager(Thread):
     
         def _view_book(self, path, target):
             f = open(target, 'wb')
    +        # Copy the file at path on the device into the local target file.
             self.device.get_file(path, f)
             f.close()
             return target
    @@ -256,24 +258,27 @@ class DeviceMenu(QMenu):
                     self.connect(action2, SIGNAL('a_s(QAction)'),
                                 self.action_triggered)
     
    -
    -
    -
             _actions = [
                     ('main:', False, False,  ':/images/reader.svg',
                         _('Send to main memory')),
    -                ('card:0', False, False, ':/images/sd.svg',
    -                    _('Send to storage card')),
    +                ('carda:0', False, False, ':/images/sd.svg',
    +                    _('Send to storage card A')),
    +                ('cardb:0', False, False, ':/images/sd.svg',
    +                    _('Send to storage card B')),
                     '-----',
                     ('main:', True, False,   ':/images/reader.svg',
                         _('Send to main memory')),
    -                ('card:0', True, False,  ':/images/sd.svg',
    -                    _('Send to storage card')),
    +                ('carda:0', True, False,  ':/images/sd.svg',
    +                    _('Send to storage card A')),
    +                ('cardb:0', True, False,  ':/images/sd.svg',
    +                    _('Send to storage card B')),
                     '-----',
                     ('main:', False, True,  ':/images/reader.svg',
                         _('Send specific format to main memory')),
    -                ('card:0', False, True, ':/images/sd.svg',
    -                    _('Send specific format to storage card')),
    +                ('carda:0', False, True, ':/images/sd.svg',
    +                    _('Send specific format to storage card A')),
    +                ('cardb:0', False, True, ':/images/sd.svg',
    +                    _('Send specific format to storage card B')),
     
                     ]
             if default_account is not None:
    @@ -335,7 +340,7 @@ class DeviceMenu(QMenu):
     
         def enable_device_actions(self, enable):
             for action in self.actions:
    -            if action.dest[:4] in ('main', 'card'):
    +            if action.dest in ('main:', 'carda:0', 'cardb:0'):
                     action.setEnabled(enable)
     
     class Emailer(Thread):
    @@ -412,16 +417,23 @@ class DeviceGUI(object):
                 d.exec_()
                 fmt = d.format().lower()
             dest, sub_dest = dest.split(':')
    -        if dest in ('main', 'card'):
    +        if dest in ('main', 'carda', 'cardb'):
                 if not self.device_connected or not self.device_manager:
                     error_dialog(self, _('No device'),
                             _('Cannot send: No device is connected')).exec_()
                     return
    -            on_card = dest == 'card'
    -            if on_card and not self.device_manager.has_card():
    +            if dest == 'carda' and not self.device_manager.has_card():
                     error_dialog(self, _('No card'),
                             _('Cannot send: Device has no storage card')).exec_()
                     return
    +            if dest == 'cardb' and not self.device_manager.has_card():
    +                error_dialog(self, _('No card'),
    +                        _('Cannot send: Device has no storage card')).exec_()
    +                return
    +            if dest == 'main':
    +                on_card = None
    +            else:
    +                on_card = dest
                 self.sync_to_device(on_card, delete, fmt)
             elif dest == 'mail':
                 to, fmts = sub_dest.split(';')
    @@ -678,7 +690,7 @@ class DeviceGUI(object):
             cp, fs = job.result
             self.location_view.model().update_devices(cp, fs)
     
    -    def upload_books(self, files, names, metadata, on_card=False, memory=None):
    +    def upload_books(self, files, names, metadata, on_card=None, memory=None):
             '''
             Upload books to device.
             :param files: List of either paths to files or file like objects
    @@ -717,7 +729,7 @@ class DeviceGUI(object):
     
             self.upload_booklists()
     
    -        view = self.card_view if on_card else self.memory_view
    +        view = self.card_a_view if on_card == 'carda' else self.card_b_view if on_card == 'cardb' else self.memory_view
             view.model().resort(reset=False)
             view.model().research()
             for f in files:
    diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py
    index f90f98b73d..f7362e556c 100644
    --- a/src/calibre/gui2/main.py
    +++ b/src/calibre/gui2/main.py
    @@ -303,7 +303,9 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
                                             similar_menu=similar_menu)
             self.memory_view.set_context_menu(None, None, None,
                     self.action_view, self.action_save, None, None)
    -        self.card_view.set_context_menu(None, None, None,
    +        self.card_a_view.set_context_menu(None, None, None,
    +                self.action_view, self.action_save, None, None)
    +        self.card_b_view.set_context_menu(None, None, None,
                     self.action_view, self.action_save, None, None)
             QObject.connect(self.library_view,
                     SIGNAL('files_dropped(PyQt_PyObject)'),
    @@ -313,11 +315,12 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
                                  ('connect_to_book_display',
                                      self.status_bar.book_info.show_data),
                                  ]:
    -            for view in (self.library_view, self.memory_view, self.card_view):
    +            for view in (self.library_view, self.memory_view, self.card_a_view, self.card_b_view):
                     getattr(view, func)(target)
     
             self.memory_view.connect_dirtied_signal(self.upload_booklists)
    -        self.card_view.connect_dirtied_signal(self.upload_booklists)
    +        self.card_a_view.connect_dirtied_signal(self.upload_booklists)
    +        self.card_b_view.connect_dirtied_signal(self.upload_booklists)
     
             self.show()
             if self.system_tray_icon.isVisible() and opts.start_in_tray:
    @@ -580,10 +583,12 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
             if idx == 1:
                 return self.memory_view
             if idx == 2:
    -            return self.card_view
    +            return self.card_a_view
    +        if idx == 3:
    +            return self.card_b_view
     
         def booklists(self):
    -        return self.memory_view.model().db, self.card_view.model().db
    +        return self.memory_view.model().db, self.card_a_view.model().db, self.card_b_view.model().db
     
     
     
    @@ -645,12 +650,14 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
                 else:
                     self.device_job_exception(job)
                 return
    -        mainlist, cardlist = job.result
    +        mainlist, cardalist, cardblist = job.result
             self.memory_view.set_database(mainlist)
             self.memory_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA)
    -        self.card_view.set_database(cardlist)
    -        self.card_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA)
    -        for view in (self.memory_view, self.card_view):
    +        self.card_a_view.set_database(cardalist)
    +        self.card_a_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA)
    +        self.card_b_view.set_database(cardblist)
    +        self.card_b_view.set_editable(self.device_manager.device_class.CAN_SET_METADATA)
    +        for view in (self.memory_view, self.card_a_view, self.card_b_view):
                 view.sortByColumn(3, Qt.DescendingOrder)
                 if not view.restore_column_widths():
                     view.resizeColumnsToContents()
    @@ -791,8 +798,12 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
                     return
                 view.model().delete_books(rows)
             else:
    -            view = self.memory_view if self.stack.currentIndex() == 1 \
    -                    else self.card_view
    +            if self.stack.currentIndex() == 1:
    +                view = self.memory_view
    +            elif self.stack.currentIndex() == 2:
    +                view = self.card_a_view
    +            else:
    +                view = self.card_b_view
                 paths = view.model().paths(rows)
                 job = self.remove_paths(paths)
                 self.delete_memory[job] = (paths, view.model())
    @@ -807,7 +818,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
             '''
             Called once deletion is done on the device
             '''
    -        for view in (self.memory_view, self.card_view):
    +        for view in (self.memory_view, self.card_a_view, self.card_b_view):
                 view.model().deletion_done(job, bool(job.exception))
             if job.exception is not None:
                 self.device_job_exception(job)
    @@ -1316,10 +1327,11 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
             '''
             Called when a location icon is clicked (e.g. Library)
             '''
    -        page = 0 if location == 'library' else 1 if location == 'main' else 2
    +        page = 0 if location == 'library' else 1 if location == 'main' else 2 if location == 'carda' else 3
             self.stack.setCurrentIndex(page)
             view = self.memory_view if page == 1 else \
    -                self.card_view if page == 2 else None
    +                self.card_a_view if page == 2 else \
    +                self.card_b_view if page == 3 else None
             if view:
                 if view.resize_on_select:
                     view.resizeRowsToContents()
    diff --git a/src/calibre/gui2/main.ui b/src/calibre/gui2/main.ui
    index fbae01d3e6..24ba2a1c7a 100644
    --- a/src/calibre/gui2/main.ui
    +++ b/src/calibre/gui2/main.ui
    @@ -288,7 +288,7 @@
            
           
           
    -       0
    +       3
           
           
            
    @@ -417,10 +417,48 @@
             
            
           
    -      
    +      
            
             
    -         
    +         
    +          
    +           
    +            10
    +            10
    +           
    +          
    +          
    +           true
    +          
    +          
    +           true
    +          
    +          
    +           false
    +          
    +          
    +           QAbstractItemView::DragDrop
    +          
    +          
    +           true
    +          
    +          
    +           QAbstractItemView::SelectRows
    +          
    +          
    +           false
    +          
    +          
    +           false
    +          
    +         
    +        
    +       
    +      
    +      
    +       
    +        
    +         
               
                
                 10
    diff --git a/src/calibre/gui2/viewer/printing.py b/src/calibre/gui2/viewer/printing.py
    index e948360338..8d9801e306 100644
    --- a/src/calibre/gui2/viewer/printing.py
    +++ b/src/calibre/gui2/viewer/printing.py
    @@ -8,7 +8,7 @@ import os, sys, traceback, urlparse
     
     from BeautifulSoup import BeautifulSoup, Tag
     
    -from calibre.ebooks.epub.iterator import EbookIterator
    +from calibre.ebooks.oeb.iterator import EbookIterator
     from calibre.ptempfile import TemporaryDirectory
     
     from PyQt4 import QtCore
    diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
    index 535ec4251f..886320aedb 100644
    --- a/src/calibre/gui2/widgets.py
    +++ b/src/calibre/gui2/widgets.py
    @@ -171,17 +171,20 @@ class LocationModel(QAbstractListModel):
             QAbstractListModel.__init__(self, parent)
             self.icons = [QVariant(QIcon(':/library')),
                           QVariant(QIcon(':/images/reader.svg')),
    +                      QVariant(QIcon(':/images/sd.svg')),
                           QVariant(QIcon(':/images/sd.svg'))]
             self.text = [_('Library\n%d\nbooks'),
                          _('Reader\n%s\navailable'),
    -                     _('Card\n%s\navailable')]
    -        self.free = [-1, -1]
    +                     _('Card A\n%s\navailable'),
    +                     _('Card B\n%s\navailable')]
    +        self.free = [-1, -1, -1]
             self.count = 0
             self.highlight_row = 0
             self.tooltips = [
                              _('Click to see the list of books available on your computer'),
                              _('Click to see the list of books in the main memory of your reader'),
    -                         _('Click to see the list of books on the storage card in your reader')
    +                         _('Click to see the list of books on storage card A in your reader'),
    +                         _('Click to see the list of books on storage card B in your reader')
                              ]
     
         def rowCount(self, parent):
    @@ -218,9 +221,14 @@ class LocationModel(QAbstractListModel):
     
         def update_devices(self, cp=None, fs=[-1, -1, -1]):
             self.free[0] = fs[0]
    -        self.free[1] = max(fs[1:])
    -        if cp == None:
    +        self.free[1] = fs[1]
    +        self.free[2] = fs[2]
    +        if cp != None:
    +            self.free[1] = fs[1] if fs[1] else -1
    +            self.free[2] = fs[2] if fs[2] else -1
    +        else:
                 self.free[1] = -1
    +            self.free[2] = -1
             self.reset()
     
         def location_changed(self, row):
    @@ -244,12 +252,12 @@ class LocationView(QListView):
         def current_changed(self, current, previous):
             if current.isValid():
                 i = current.row()
    -            location = 'library' if i == 0 else 'main' if i == 1 else 'card'
    +            location = 'library' if i == 0 else 'main' if i == 1 else 'carda' if i == 2 else 'cardb'
                 self.emit(SIGNAL('location_selected(PyQt_PyObject)'), location)
                 self.model().location_changed(i)
     
         def location_changed(self, row):
    -        if 0 <= row and row <= 2:
    +        if 0 <= row and row <= 3:
                 self.model().location_changed(row)
     
     class JobsView(TableView):
    
    From 35e8e347fea42005c861675d58b82e3559984066 Mon Sep 17 00:00:00 2001
    From: Kovid Goyal 
    Date: Wed, 15 Apr 2009 14:38:45 -0700
    Subject: [PATCH 096/319] Implement the --linearize-tables transform.
    
    ---
     src/calibre/customize/profiles.py             |  2 +-
     src/calibre/ebooks/conversion/plumber.py      | 29 +++++++++++++++++--
     src/calibre/ebooks/mobi/input.py              |  2 +-
     .../ebooks/oeb/transforms/linearize_tables.py | 21 ++++++++++++++
     4 files changed, 50 insertions(+), 4 deletions(-)
     create mode 100644 src/calibre/ebooks/oeb/transforms/linearize_tables.py
    
    diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
    index 8623a94ddd..c11529f025 100644
    --- a/src/calibre/customize/profiles.py
    +++ b/src/calibre/customize/profiles.py
    @@ -143,7 +143,7 @@ class OutputProfile(Plugin):
     
         # ADE dies an agonizing, long drawn out death if HTML files have more
         # bytes than this.
    -    flow_size                 = sys.maxint
    +    flow_size                 = -1
         # ADE runs screaming when it sees these characters
         remove_special_chars      = re.compile(u'[\u200b\u00ad]')
         # ADE falls to the ground in a dead faint when it sees an 
    diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
    index ab30e71ba1..119ae4d63e 100644
    --- a/src/calibre/ebooks/conversion/plumber.py
    +++ b/src/calibre/ebooks/conversion/plumber.py
    @@ -94,7 +94,8 @@ OptionRecommendation(name='font_size_mapping',
     OptionRecommendation(name='line_height',
                 recommended_value=None, level=OptionRecommendation.LOW,
                 help=_('The line height in pts. Controls spacing between consecutive '
    -                   'lines of text. By default ??'
    +                   'lines of text. By default no line height manipulation is '
    +                   'performed.'
                        )
             ),
     
    @@ -102,12 +103,25 @@ OptionRecommendation(name='linearize_tables',
                 recommended_value=False, level=OptionRecommendation.LOW,
                 help=_('Some badly designed documents use tables to control the '
                     'layout of text on the page. When converted these documents '
    -                'often have text that runs of the page and other artifacts. '
    +                'often have text that runs off the page and other artifacts. '
                     'This option will extract the content from the tables and '
                     'present it in a linear fashion.'
                     )
             ),
     
    +OptionRecommendation(name='dont_split_on_page_breaks',
    +            recommended_value=False, level=OptionRecommendation.LOW,
    +            help=_('Turn off splitting at page breaks. Normally, input '
    +                    'files are automatically split at every page break into '
    +                    'two files. This gives an output ebook that can be '
    +                    'parsed faster and with less resources. However, '
    +                    'splitting is slow and if your source file contains a '
    +                    'very large number of page breaks, you should turn off '
    +                    'splitting on page breaks.'
    +                )
    +        ),
    +
    +
     OptionRecommendation(name='read_metadata_from_opf',
                 recommended_value=None, level=OptionRecommendation.LOW,
                 short_switch='m',
    @@ -330,6 +344,17 @@ OptionRecommendation(name='language',
                     untable=self.opts.linearize_tables)
             flattener(self.oeb, self.opts)
     
    +        if self.opts.linearize_tables:
    +            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
    +            LinearizeTables()(self.oeb, self.opts)
    +
    +        from calibre.ebooks.oeb.transforms.split import Split
    +        pbx = accelerators.get('pagebreaks', None)
    +        split = Split(not self.opts.dont_split_on_page_breaks,
    +                max_flow_size=self.opts.output_profile.flow_size,
    +                page_breaks_xpath=pbx)
    +        split(self.oeb, self.opts)
    +
             from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer
     
             self.log.info('Cleaning up manifest...')
    diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
    index 2eb45c9161..97d94a0e33 100644
    --- a/src/calibre/ebooks/mobi/input.py
    +++ b/src/calibre/ebooks/mobi/input.py
    @@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
                 with open(f, 'wb') as q:
                     q.write(html.tostring(root, encoding='utf-8', method='xml',
                         include_meta_content_type=False))
    -            accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
    +                accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
             return mr.created_opf_path
    diff --git a/src/calibre/ebooks/oeb/transforms/linearize_tables.py b/src/calibre/ebooks/oeb/transforms/linearize_tables.py
    new file mode 100644
    index 0000000000..a0c11f848c
    --- /dev/null
    +++ b/src/calibre/ebooks/oeb/transforms/linearize_tables.py
    @@ -0,0 +1,21 @@
    +#!/usr/bin/env python
    +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
    +from __future__ import with_statement
    +
    +__license__   = 'GPL v3'
    +__copyright__ = '2009, Kovid Goyal '
    +__docformat__ = 'restructuredtext en'
    +
    +from calibre.ebooks.oeb.base import OEB_DOCS, XPNSMAP
    +
    +class LinearizeTables(object):
    +
    +    def linearize(self, root):
    +        for x in root.xpath('//h:table|//h:td|//h:tr|//h:th',
    +                namespaces=XPNSMAP):
    +            x.tag = 'div'
    +
    +    def __call__(self, oeb, context):
    +        for x in oeb.manifest.items:
    +            if x.media_type in OEB_DOCS:
    +                self.linearize(x.data)
    
    From aafc6d97649de2a72e303507953872205f8fbc5b Mon Sep 17 00:00:00 2001
    From: John Schember 
    Date: Wed, 15 Apr 2009 19:57:42 -0400
    Subject: [PATCH 097/319] Fix text output regex
    
    ---
     src/calibre/ebooks/txt/writer.py | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
    index 0f84c32804..ea613010ef 100644
    --- a/src/calibre/ebooks/txt/writer.py
    +++ b/src/calibre/ebooks/txt/writer.py
    @@ -76,7 +76,7 @@ class TxtWriter(object):
                     text = re.sub('(?imu)' % tag, '\n\n', text)
                 
                 for tag in ['hr', 'br']:
    -                text = re.sub('(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text)
    +                text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
                 
                 # Remove any tags that do not need special processing.
                 text = re.sub('<.*?>', '', text)
    
    From 575b021f48ea9cab351648999bc69737ea2aafa0 Mon Sep 17 00:00:00 2001
    From: John Schember 
    Date: Wed, 15 Apr 2009 20:11:00 -0400
    Subject: [PATCH 098/319] pdftohtml preprocess rules work
    
    ---
     src/calibre/ebooks/conversion/preprocess.py | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
    index 6b58d2d18d..632a7a3291 100644
    --- a/src/calibre/ebooks/conversion/preprocess.py
    +++ b/src/calibre/ebooks/conversion/preprocess.py
    @@ -73,7 +73,7 @@ class HTMLPreProcessor(object):
                       (re.compile(r''), lambda match : '

    '), # Un wrap lines - (re.compile(r'(?<=\w)\s*\s*\s*\s*(?=\w)'), lambda match: ' '), + (re.compile(r'(?<=\w)\s*\s*\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '), (re.compile(r'(?<=\w)\s*\s*(?=\w)', re.UNICODE), lambda match: ' '), # Clean up spaces (re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), From 7814dda6d8a531dd37fa7ce56c63aaa948a364a5 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 16 Apr 2009 19:01:25 -0400 Subject: [PATCH 099/319] Fix splitting of authors --- src/calibre/devices/cybookg3/driver.py | 5 ++--- src/calibre/devices/usbms/driver.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index c3a4fa94b0..5458fbbffb 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -7,7 +7,6 @@ Device driver for Bookeen's Cybook Gen 3 import os, shutil from itertools import cycle -from calibre.ebooks.metadata import authors_to_string from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.usbms.driver import USBMS import calibre.devices.cybookg3.t2b as t2b @@ -92,8 +91,8 @@ class CYBOOKG3(USBMS): break if newpath == path: - newpath = os.path.join(newpath, authors_to_string(mdata.get('authors', ''))) - newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) if not os.path.exists(newpath): os.makedirs(newpath) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index bb7a104fa4..aa40f90c25 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -124,8 +124,8 @@ class USBMS(CLI, Device): break if newpath == path: - newpath = os.path.join(newpath, authors_to_string(mdata.get('authors', ''))) - newpath = os.path.join(newpath, mdata.get('title', '')) + 
newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) if not os.path.exists(newpath): os.makedirs(newpath) From 4c6599fd45b2b3f188d6cc09bdec9b2c209ec5c3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 16 Apr 2009 19:10:38 -0400 Subject: [PATCH 100/319] PRS505/700: Put books in author/title dir structure and use USBMS style / tag paths. --- src/calibre/devices/cybookg3/driver.py | 29 +++++++++++----------- src/calibre/devices/prs505/driver.py | 33 ++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 5458fbbffb..1cdf9863b4 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -77,22 +77,21 @@ class CYBOOKG3(USBMS): newpath = path mdata = metadata.next() - if self.SUPPORTS_SUB_DIRS: - if 'tags' in mdata.keys(): - for tag in mdata['tags']: - if tag.startswith(_('News')): - newpath = os.path.join(newpath, 'news') - newpath = os.path.join(newpath, mdata.get('title', '')) - newpath = os.path.join(newpath, mdata.get('timestamp', '')) - elif tag.startswith('/'): - newpath = path - newpath += tag - newpath = os.path.normpath(newpath) - break + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + elif tag.startswith('/'): + newpath = path + newpath += tag + newpath = os.path.normpath(newpath) + break - if newpath == path: - newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) - newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) if not 
os.path.exists(newpath): os.makedirs(newpath) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index efc48a2dff..a704eb1ec3 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -119,19 +119,44 @@ class PRS505(CLI, Device): paths, ctimes = [], [] names = iter(names) + metadata = iter(metadata) for infile in files: close = False if not hasattr(infile, 'read'): infile, close = open(infile, 'rb'), True infile.seek(0) - name = names.next() - paths.append(os.path.join(path, name)) - if not os.path.exists(os.path.dirname(paths[-1])): - os.makedirs(os.path.dirname(paths[-1])) + + newpath = path + mdata = metadata.next() + + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + elif tag.startswith('/'): + newpath = path + newpath += tag + newpath = os.path.normpath(newpath) + break + + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) + + if not os.path.exists(newpath): + os.makedirs(newpath) + + filepath = os.path.join(newpath, names.next()) + paths.append(filepath) + self.put_file(infile, paths[-1], replace_file=True) + if close: infile.close() ctimes.append(os.path.getctime(paths[-1])) + return zip(paths, sizes, ctimes, cycle([on_card])) @classmethod From b9f80aa2292cdd527233c2363f5ee07502ffea26 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Apr 2009 18:15:49 -0700 Subject: [PATCH 101/319] Implement --pretty-print --- src/calibre/ebooks/conversion/cli.py | 3 ++ src/calibre/ebooks/conversion/plumber.py | 7 +-- src/calibre/ebooks/html/input.py | 2 +- src/calibre/ebooks/oeb/base.py | 6 +-- src/calibre/ebooks/oeb/transforms/package.py | 1 + src/calibre/ebooks/oeb/transforms/split.py | 
45 +++++++++++++------- 6 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index e8f4aa68e2..a3d57be191 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -74,6 +74,9 @@ def option_recommendation_to_cli_option(add_option, rec): switches.append('--'+opt.long_switch) attrs = dict(dest=opt.name, help=opt.help, choices=opt.choices, default=rec.recommended_value) + if isinstance(rec.recommended_value, type(True)): + attrs['action'] = 'store_false' if rec.recommended_value else \ + 'store_true' add_option(Option(*switches, **attrs)) def add_input_output_options(parser, plumber): diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 119ae4d63e..93fc376bea 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -324,7 +324,7 @@ OptionRecommendation(name='language', self.input_fmt, self.log, accelerators, tdir) if not hasattr(self.oeb, 'manifest'): - self.oeb = create_oebbook(self.log, self.oeb) + self.oeb = create_oebbook(self.log, self.oeb, self.opts) self.opts.source = self.opts.input_profile self.opts.dest = self.opts.output_profile @@ -365,7 +365,7 @@ OptionRecommendation(name='language', self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) -def create_oebbook(log, opfpath): +def create_oebbook(log, opfpath, opts): ''' Create an OEBBook from an OPF file. 
''' @@ -373,7 +373,8 @@ def create_oebbook(log, opfpath): from calibre.ebooks.oeb.base import OEBBook html_preprocessor = HTMLPreProcessor() reader = OEBReader() - oeb = OEBBook(log, html_preprocessor=html_preprocessor) + oeb = OEBBook(log, html_preprocessor=html_preprocessor, + pretty_print=opts.pretty_print) # Read OEB Book into OEBBook log.info('Parsing all content...') reader(oeb, opfpath) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 951b0824a5..5b9a085b1d 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -277,7 +277,7 @@ class HTMLInput(InputFormatPlugin): opfpath = os.path.abspath('metadata.opf') from calibre.ebooks.conversion.plumber import create_oebbook - oeb = create_oebbook(log, opfpath) + oeb = create_oebbook(log, opfpath, opts) from calibre.ebooks.oeb.transforms.package import Package Package(os.getcwdu())(oeb, opts) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index ed7981df4f..5d2c51c4ba 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -273,11 +273,7 @@ def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) def _prepare_xml_for_serialization(root): - root.set('xmlns', XHTML_NS) - root.set('{%s}xlink'%XHTML_NS, XLINK_NS) - for x in root.iter(): - if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg': - x.set('xmlns', SVG_NS) + pass def xml2str(root, pretty_print=False, strip_comments=False): _prepare_xml_for_serialization(root) diff --git a/src/calibre/ebooks/oeb/transforms/package.py b/src/calibre/ebooks/oeb/transforms/package.py index faf5486475..20fe6e2650 100644 --- a/src/calibre/ebooks/oeb/transforms/package.py +++ b/src/calibre/ebooks/oeb/transforms/package.py @@ -128,6 +128,7 @@ class Package(object): self.log = oeb.log self.oeb = oeb self.old_base_path = os.path.abspath(oeb.container.rootdir) + self.log.info('Packaging HTML files...') hrefs = set([]) for item in 
self.oeb.manifest: diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 20205e9c6d..1bb5b50d06 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -12,7 +12,7 @@ assumes a prior call to the flatcss transform. import os, math, functools, collections, re, copy from lxml.etree import XPath as _XPath -from lxml import etree, html +from lxml import etree from lxml.cssselect import CSSSelector from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \ @@ -96,24 +96,32 @@ class Split(object): page_breaks = set([]) for selector, before in page_break_selectors: for elem in selector(item.data): - elem.pb_before = before + if before: + elem.set('pb_before', '1') page_breaks.add(elem) for i, elem in enumerate(item.data.iter()): - elem.pb_order = i + elem.set('pb_order', str(i)) page_breaks = list(page_breaks) - page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order)) + page_breaks.sort(cmp= + lambda x,y : cmp(int(x.get('pb_order')), int(y.get('pb_order')))) page_break_ids, page_breaks_ = [], [] for i, x in enumerate(page_breaks): x.set('id', x.get('id', 'calibre_pb_%d'%i)) id = x.get('id') - page_breaks_.append((XPath('//*[@id="%s"]'%id), x.pb_before)) + page_breaks_.append((XPath('//*[@id="%s"]'%id), + x.get('pb_before', False))) page_break_ids.append(id) + for elem in item.data.iter(): + elem.attrib.pop('pb_order') + if elem.get('pb_before', False): + elem.attrib.pop('pb_before') + return page_breaks_, page_break_ids - def fix_links(self, opf): + def fix_links(self): ''' Fix references to the split files in other content files. 
''' @@ -129,13 +137,14 @@ class Split(object): anchor_map = self.map[href] nhref = anchor_map[frag if frag else None] if frag: - nhref = '#'.joinn(href, frag) + nhref = '#'.join(href, frag) return nhref return url class FlowSplitter(object): + 'The actual splitting logic' def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb): self.item = item @@ -149,10 +158,10 @@ class FlowSplitter(object): base, ext = os.path.splitext(self.base) self.base = base.replace('%', '%%')+'_split_%d'+ext - self.trees = [self.item.data] + self.trees = [self.item.data.getroottree()] self.splitting_on_page_breaks = True if self.page_breaks: - self.split_on_page_breaks(self.item.data) + self.split_on_page_breaks(self.trees[0]) self.splitting_on_page_breaks = False if self.max_flow_size > 0: @@ -192,6 +201,12 @@ class FlowSplitter(object): self.trees.append(tree) self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())] + def get_body(self, root): + body = root.xpath('//h:body', namespaces=NAMESPACES) + if not body: + return None + return body[0] + def do_split(self, tree, split_point, before): ''' Split ``tree`` into a *before* and *after* tree at ``split_point``, @@ -206,7 +221,7 @@ class FlowSplitter(object): tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree) root = tree.getroot() root2 = tree2.getroot() - body, body2 = root.body, root2.body + body, body2 = map(self.get_body, (root, root2)) split_point = root.xpath(path)[0] split_point2 = root2.xpath(path)[0] @@ -262,13 +277,14 @@ class FlowSplitter(object): return tree, tree2 def is_page_empty(self, root): - body = root.find('body') + body = self.get_body(root) if body is None: return False - txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode)) + txt = re.sub(r'\s+', '', + etree.tostring(body, method='text', encoding=unicode)) if len(txt) > 4: return False - for img in root.xpath('//img'): + for img in root.xpath('//h:img', namespaces=NAMESPACES): if img.get('style', '') 
!= 'display:none': return False return True @@ -438,6 +454,3 @@ class FlowSplitter(object): fix_toc_entry(self.oeb.toc) self.oeb.manifest.remove(self.item) - - - From a66fb31027465d3b79196abc9ac95811c5f1f82f Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 17 Apr 2009 08:06:35 -0400 Subject: [PATCH 102/319] Clean up command line options display. Use opf2 instead of opf. --- src/calibre/ebooks/pdf/input.py | 2 +- src/calibre/ebooks/txt/input.py | 2 +- src/calibre/ebooks/txt/output.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 6733d3aadc..e8c3889e41 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -9,7 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdf.pdftohtml import pdftohtml -from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata.opf2 import OPFCreator class PDFInput(InputFormatPlugin): diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index aafc36989e..34fafc91fc 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -9,7 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.markdown import markdown -from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata.opf2 import OPFCreator class TXTInput(InputFormatPlugin): diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 423e668a56..dd87394507 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -18,14 +18,14 @@ class TXTOutput(OutputFormatPlugin): options = set([ OptionRecommendation(name='newline', recommended_value='system', - level=OptionRecommendation.LOW, long_switch='newline', + level=OptionRecommendation.LOW, short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(), help=_('Type of newline to use. 
Options are %s. Default is \'system\'. ' 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))), OptionRecommendation(name='prepend_metadata', recommended_value='false', - level=OptionRecommendation.LOW, long_switch='prepend_metadata', + level=OptionRecommendation.LOW, choices=['true', 'false'], help=_('Write the title and author to the beginning of the file. ' 'Default is \'true\'. Use \'false\' to disable.')), From a2064499e815b37c0dbef55e2b3f251cb6a1366e Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 17 Apr 2009 21:42:03 -0400 Subject: [PATCH 103/319] Fix bug 2112: Stop metadata reader from holding pdf files open after reading. --- src/calibre/ebooks/metadata/pdf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 4476eb0847..9946d831af 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -21,6 +21,7 @@ def get_metadata(stream, extract_cover=True): """ Return metadata as a L{MetaInfo} object """ mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) + stream = StringIO.StringIO(stream.read()) if extract_cover and _imagemagick_loaded: try: @@ -70,6 +71,10 @@ def set_metadata(stream, mi): stream.seek(0) def get_cover(stream): + stream.seek(0) + if not isinstance(stream, StringIO.StringIO): + stream = StringIO.StringIO(stream.read()) + data = StringIO.StringIO() try: From 70e1336a90b4bdf7f7a388734d6a58710f1f8b62 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 17 Apr 2009 22:29:00 -0400 Subject: [PATCH 104/319] Use FileWrapper instead of StringIO for bug 2112 fix. 
--- src/calibre/__init__.py | 17 +++++++ src/calibre/ebooks/metadata/pdf.py | 82 +++++++++++++++--------------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 807ce1def5..6299bb8782 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -246,6 +246,23 @@ class CurrentDir(object): os.chdir(self.cwd) +class FileWrapper(object): + ''' + Used primarily with pyPdf to ensure the stream is properly closed. + ''' + + def __init__(self, stream): + for x in ('read', 'seek', 'tell'): + setattr(self, x, getattr(stream, x)) + + def __exit__(self, *args): + for x in ('read', 'seek', 'tell'): + setattr(self, x, None) + + def __enter__(self): + return self + + def detect_ncpus(): """Detects the number of effective CPUs in the system""" try: diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 9946d831af..4dc98509e2 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -4,8 +4,9 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' '''Read meta information from PDF files''' -import sys, os, StringIO +import sys, os, cStringIO +from calibre import FileWrapper from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory from pyPdf import PdfFileReader, PdfFileWriter @@ -21,7 +22,6 @@ def get_metadata(stream, extract_cover=True): """ Return metadata as a L{MetaInfo} object """ mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) - stream = StringIO.StringIO(stream.read()) if extract_cover and _imagemagick_loaded: try: @@ -33,18 +33,19 @@ def get_metadata(stream, extract_cover=True): traceback.print_exc() try: - info = PdfFileReader(stream).getDocumentInfo() - if info.title: - mi.title = info.title - if info.author: - src = info.author.split('&') - authors = [] - for au in src: - authors += au.split(',') - mi.authors = authors - mi.author = 
info.author - if info.subject: - mi.category = info.subject + with FileWrapper(stream) as stream: + info = PdfFileReader(stream).getDocumentInfo() + if info.title: + mi.title = info.title + if info.author: + src = info.author.split('&') + authors = [] + for au in src: + authors += au.split(',') + mi.authors = authors + mi.author = info.author + if info.subject: + mi.category = info.subject except Exception, err: msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err)) print >>sys.stderr, msg.encode('utf8') @@ -52,17 +53,17 @@ def get_metadata(stream, extract_cover=True): def set_metadata(stream, mi): stream.seek(0) - # Use a StringIO object for the pdf because we will want to over + # Use a cStringIO object for the pdf because we will want to over # write it later and if we are working on the stream directly it # could cause some issues. - raw = StringIO.StringIO(stream.read()) + raw = cStringIO.StringIO(stream.read()) orig_pdf = PdfFileReader(raw) title = mi.title if mi.title else orig_pdf.documentInfo.title author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author out_pdf = PdfFileWriter(title=title, author=author) for page in orig_pdf.pages: out_pdf.addPage(page) - out_str = StringIO.StringIO() + out_str = cStringIO.StringIO() out_pdf.write(out_str) stream.seek(0) stream.truncate() @@ -72,33 +73,32 @@ def set_metadata(stream, mi): def get_cover(stream): stream.seek(0) - if not isinstance(stream, StringIO.StringIO): - stream = StringIO.StringIO(stream.read()) - data = StringIO.StringIO() + data = cStringIO.StringIO() try: - pdf = PdfFileReader(stream) - output = PdfFileWriter() - - if len(pdf.pages) >= 1: - output.addPage(pdf.getPage(0)) - - with TemporaryDirectory('_pdfmeta') as tdir: - cover_path = os.path.join(tdir, 'cover.pdf') - - outputStream = file(cover_path, "wb") - output.write(outputStream) - outputStream.close() - - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - 
MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) - - img = Image.open('%s.jpg' % cover_path) - - img.save(data, 'JPEG') + with FileWrapper(stream) as stream: + pdf = PdfFileReader(stream) + output = PdfFileWriter() + + if len(pdf.pages) >= 1: + output.addPage(pdf.getPage(0)) + + with TemporaryDirectory('_pdfmeta') as tdir: + cover_path = os.path.join(tdir, 'cover.pdf') + + outputStream = file(cover_path, "wb") + output.write(outputStream) + outputStream.close() + + wand = NewMagickWand() + MagickReadImage(wand, cover_path) + MagickSetImageFormat(wand, 'JPEG') + MagickWriteImage(wand, '%s.jpg' % cover_path) + + img = Image.open('%s.jpg' % cover_path) + + img.save(data, 'JPEG') except: import traceback traceback.print_exc() From 3e29dfbe5682eff555c0d9bf3126f398aacb6ec2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Apr 2009 01:01:18 -0700 Subject: [PATCH 105/319] Added LIT input plugin. Ported splitting code now works (at least on the handful of files I've tested) --- src/calibre/customize/builtins.py | 7 +- src/calibre/customize/conversion.py | 19 ++++- src/calibre/ebooks/conversion/cli.py | 5 +- src/calibre/ebooks/conversion/plumber.py | 27 ++++--- src/calibre/ebooks/html/input.py | 11 +++ src/calibre/ebooks/lit/input.py | 24 +++++++ src/calibre/ebooks/lit/reader.py | 83 +++++++++++----------- src/calibre/ebooks/oeb/base.py | 30 ++++++-- src/calibre/ebooks/oeb/iterator.py | 54 ++++++-------- src/calibre/ebooks/oeb/output.py | 1 - src/calibre/ebooks/oeb/reader.py | 11 ++- src/calibre/ebooks/oeb/transforms/split.py | 72 ++++++++++--------- src/calibre/ebooks/oeb/writer.py | 2 +- 13 files changed, 209 insertions(+), 137 deletions(-) create mode 100644 src/calibre/ebooks/lit/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index aa6c003114..08824a3591 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -263,14 +263,14 @@ class 
MOBIMetadataWriter(MetadataWriterPlugin): def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.mobi import set_metadata set_metadata(stream, mi) - + class PDFMetadataWriter(MetadataWriterPlugin): name = 'Set PDF metadata' file_types = set(['pdf']) description = _('Set metadata in %s files') % 'PDF' author = 'John Schember' - + def set_metadata(self, stream, mi, type): from calibre.ebooks.metadata.pdf import set_metadata set_metadata(stream, mi) @@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput +from calibre.ebooks.lit.input import LITInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput @@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, - TXTInput, OEBOutput, TXTOutput, PDFOutput] + TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 77cdb0b7da..b334816adf 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -41,6 +41,11 @@ class ConversionOption(object): def __eq__(self, other): return hash(self) == hash(other) + def clone(self): + return ConversionOption(name=self.name, help=self.help, + long_switch=self.long_switch, short_switch=self.short_switch, + choices=self.choices) + class OptionRecommendation(object): LOW = 1 MED = 2 @@ -59,6 +64,10 @@ class OptionRecommendation(object): self.validate_parameters() + def clone(self): + 
return OptionRecommendation(recommended_value=self.recommended_value, + level=self.level, option=self.option.clone()) + def validate_parameters(self): if self.option.choices and self.recommended_value not in \ self.option.choices: @@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin): options.debug_input = os.path.abspath(options.debug_input) if not os.path.exists(options.debug_input): os.makedirs(options.debug_input) - shutil.rmtree(options.debug_input) - shutil.copytree(output_dir, options.debug_input) + if isinstance(ret, basestring): + shutil.rmtree(options.debug_input) + shutil.copytree(output_dir, options.debug_input) + else: + from calibre.ebooks.oeb.writer import OEBWriter + w = OEBWriter(pretty_print=options.pretty_print) + w(ret, options.debug_input) + log.info('Input debug saved to:', options.debug_input) return ret diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index a3d57be191..b7336ab30a 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log): raise SystemExit(1) output = args[2] - if output.startswith('.'): + if output.startswith('.') and output != '.': output = os.path.splitext(os.path.basename(input))[0]+output output = os.path.abspath(output) @@ -171,7 +171,8 @@ def main(args=sys.argv): plumber.run() - log(_('Output saved to'), ' ', plumber.output) + if plumber.opts.debug_input is None: + log(_('Output saved to'), ' ', plumber.output) return 0 diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 93fc376bea..1edeed8d9c 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -32,8 +32,8 @@ class Plumber(object): :param input: Path to input file. 
:param output: Path to output file/directory ''' - self.input = input - self.output = output + self.input = os.path.abspath(input) + self.output = os.path.abspath(output) self.log = log # Initialize the conversion options that are independent of input and @@ -188,15 +188,15 @@ OptionRecommendation(name='language', ] - input_fmt = os.path.splitext(input)[1] + input_fmt = os.path.splitext(self.input)[1] if not input_fmt: raise ValueError('Input file must have an extension') input_fmt = input_fmt[1:].lower() - if os.path.exists(output) and os.path.isdir(output): + if os.path.exists(self.output) and os.path.isdir(self.output): output_fmt = 'oeb' else: - output_fmt = os.path.splitext(output)[1] + output_fmt = os.path.splitext(self.output)[1] if not output_fmt: output_fmt = '.oeb' output_fmt = output_fmt[1:].lower() @@ -323,6 +323,9 @@ OptionRecommendation(name='language', self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts, self.input_fmt, self.log, accelerators, tdir) + if self.opts.debug_input is not None: + self.log('Debug input called, aborting the rest of the pipeline.') + return if not hasattr(self.oeb, 'manifest'): self.oeb = create_oebbook(self.log, self.oeb, self.opts) @@ -365,18 +368,20 @@ OptionRecommendation(name='language', self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) -def create_oebbook(log, opfpath, opts): +def create_oebbook(log, path_or_stream, opts, reader=None): ''' - Create an OEBBook from an OPF file. + Create an OEBBook. 
''' - from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.base import OEBBook html_preprocessor = HTMLPreProcessor() - reader = OEBReader() oeb = OEBBook(log, html_preprocessor=html_preprocessor, pretty_print=opts.pretty_print) # Read OEB Book into OEBBook - log.info('Parsing all content...') - reader(oeb, opfpath) + log('Parsing all content...') + if reader is None: + from calibre.ebooks.oeb.reader import OEBReader + reader = OEBReader + + reader()(oeb, path_or_stream) return oeb diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 5b9a085b1d..252032a23d 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin): ) ), + OptionRecommendation(name='dont_package', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Normally this input plugin re-arranges all the input ' + 'files into a standard folder hierarchy. Only use this option ' + 'if you know what you are doing as it can result in various ' + 'nasty side effects in the rest of of the conversion pipeline.' 
+ ) + ), ]) def convert(self, stream, opts, file_ext, log, @@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin): mi.render(open('metadata.opf', 'wb')) opfpath = os.path.abspath('metadata.opf') + if opts.dont_package: + return opfpath + from calibre.ebooks.conversion.plumber import create_oebbook oeb = create_oebbook(log, opfpath, opts) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py new file mode 100644 index 0000000000..2d726f7eeb --- /dev/null +++ b/src/calibre/ebooks/lit/input.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.customize.conversion import InputFormatPlugin + +class LITInput(InputFormatPlugin): + + name = 'LIT Input' + author = 'Marshall T. Vandegrift' + description = 'Convert LIT files to HTML' + file_types = set(['lit']) + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.ebooks.lit.reader import LitReader + from calibre.ebooks.conversion.plumber import create_oebbook + return create_oebbook(log, stream, options, reader=LitReader) + + diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index f32a65e010..79249fe7c3 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -7,13 +7,12 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' \ 'and Marshall T. 
Vandegrift ' -import sys, struct, os +import struct, os import functools import re from urlparse import urldefrag from cStringIO import StringIO from urllib import unquote as urlunquote -from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 @@ -29,12 +28,12 @@ __all__ = ["LitReader"] XML_DECL = """ """ OPF_DECL = """ - """ HTML_DECL = """ - """ @@ -73,7 +72,7 @@ def encint(bytes, remaining): val <<= 7 val |= (b & 0x7f) if b & 0x80 == 0: break - return val, bytes[pos:], remaining + return val, bytes[pos:], remaining def msguid(bytes): values = struct.unpack(">(?=>>|[^>])') DOUBLE_ANGLE_RE = re.compile(r'([<>])\1') EMPTY_ATOMS = ({},{}) - + def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map @@ -143,7 +142,7 @@ class UnBinary(object): raw = self.CLOSE_ANGLE_RE.sub(r'>', raw) raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw) self.raw = raw - + def item_path(self, internal_id): try: target = self.manifest[internal_id].path @@ -159,7 +158,7 @@ class UnBinary(object): index += 1 relpath = (['..'] * (len(base) - index)) + target[index:] return '/'.join(relpath) - + def __unicode__(self): return self.raw.decode('utf-8') @@ -172,11 +171,11 @@ class UnBinary(object): in_censorship = is_goingdown = False state = 'text' flags = 0 - + while index < len(bin): c, index = read_utf8_char(bin, index) oc = ord(c) - + if state == 'text': if oc == 0: state = 'get flags' @@ -188,14 +187,14 @@ class UnBinary(object): elif c == '<': c = '<<' buf.write(encode(c)) - + elif state == 'get flags': if oc == 0: state = 'text' continue flags = oc state = 'get tag' - + elif state == 'get tag': state = 'text' if oc == 0 else 'get attr' if flags & FLAG_OPENING: @@ -226,7 +225,7 @@ class UnBinary(object): if depth == 0: raise LitError('Extra closing tag') return index - + elif state == 'get attr': 
in_censorship = False if oc == 0: @@ -265,7 +264,7 @@ class UnBinary(object): state = 'get href length' else: state = 'get value length' - + elif state == 'get value length': if not in_censorship: buf.write('"') @@ -281,7 +280,7 @@ class UnBinary(object): continue if count < 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) - + elif state == 'get value': if count == 0xfffe: if not in_censorship: @@ -301,7 +300,7 @@ class UnBinary(object): buf.write('"') in_censorship = False state = 'get attr' - + elif state == 'get custom length': count = oc - 1 if count <= 0 or count > len(bin)-index: @@ -309,21 +308,21 @@ class UnBinary(object): dynamic_tag += 1 state = 'get custom' tag_name = '' - + elif state == 'get custom': tag_name += c count -= 1 if count == 0: buf.write(encode(tag_name)) state = 'get attr' - + elif state == 'get attr length': count = oc - 1 if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) buf.write(' ') state = 'get custom attr' - + elif state == 'get custom attr': buf.write(encode(c)) count -= 1 @@ -337,7 +336,7 @@ class UnBinary(object): raise LitError('Invalid character count %d' % count) href = '' state = 'get href' - + elif state == 'get href': href += c count -= 1 @@ -350,7 +349,7 @@ class UnBinary(object): buf.write(encode(u'"%s"' % path)) state = 'get attr' return index - + class DirectoryEntry(object): def __init__(self, name, section, offset, size): @@ -358,11 +357,11 @@ class DirectoryEntry(object): self.section = section self.offset = offset self.size = size - + def __repr__(self): return "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)" \ % (repr(self.name), self.section, self.offset, self.size) - + def __str__(self): return repr(self) @@ -382,12 +381,12 @@ class ManifestItem(object): path = os.path.normpath(path).replace('\\', '/') while path.startswith('../'): path = path[3:] self.path = path - + def __eq__(self, other): if hasattr(other, 
'internal'): return self.internal == other.internal return self.internal == other - + def __repr__(self): return "ManifestItem(internal=%r, path=%r, mime_type=%r, " \ "offset=%d, root=%r, state=%r)" \ @@ -404,7 +403,7 @@ def preserve(function): self.stream.seek(opos) functools.update_wrapper(wrapper, function) return wrapper - + class LitFile(object): PIECE_SIZE = 16 @@ -438,14 +437,14 @@ class LitFile(object): return self.stream.read(8) return property(fget=fget) magic = magic() - + def version(): def fget(self): self.stream.seek(8) return u32(self.stream.read(4)) return property(fget=fget) version = version() - + def hdr_len(): @preserve def fget(self): @@ -453,7 +452,7 @@ class LitFile(object): return int32(self.stream.read(4)) return property(fget=fget) hdr_len = hdr_len() - + def num_pieces(): @preserve def fget(self): @@ -461,7 +460,7 @@ class LitFile(object): return int32(self.stream.read(4)) return property(fget=fget) num_pieces = num_pieces() - + def sec_hdr_len(): @preserve def fget(self): @@ -469,7 +468,7 @@ class LitFile(object): return int32(self.stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() - + def guid(): @preserve def fget(self): @@ -477,7 +476,7 @@ class LitFile(object): return self.stream.read(16) return property(fget=fget) guid = guid() - + def header(): @preserve def fget(self): @@ -488,7 +487,7 @@ class LitFile(object): return self.stream.read(size) return property(fget=fget) header = header() - + @preserve def __len__(self): self.stream.seek(0, 2) @@ -501,7 +500,7 @@ class LitFile(object): def read_content(self, offset, size): return self.read_raw(self.content_offset + offset, size) - + def read_secondary_header(self): offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) bytes = self.read_raw(offset, self.sec_hdr_len) @@ -526,12 +525,12 @@ class LitFile(object): if u32(bytes[offset+4+16:]): raise LitError('This file has a 64bit content offset') self.content_offset = u32(bytes[offset+16:]) - self.timestamp = 
u32(bytes[offset+24:]) + self.timestamp = u32(bytes[offset+24:]) self.language_id = u32(bytes[offset+28:]) offset += 48 if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - + def read_header_pieces(self): src = self.header[self.hdr_len:] for i in xrange(self.num_pieces): @@ -556,7 +555,7 @@ class LitFile(object): self.piece3_guid = piece elif i == 4: self.piece4_guid = piece - + def read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') @@ -760,9 +759,9 @@ class LitFile(object): raise LitError("Reset table is too short") if u32(reset_table[RESET_UCLENGTH + 4:]) != 0: raise LitError("Reset table has 64bit value for UCLENGTH") - + result = [] - + window_size = 14 u = u32(control[CONTROL_WINDOW_SIZE:]) while u > 0: @@ -847,13 +846,13 @@ class LitContainer(object): def __init__(self, filename_or_stream): self._litfile = LitFile(filename_or_stream) - + def namelist(self): return self._litfile.paths.keys() def exists(self, name): return urlunquote(name) in self._litfile.paths - + def read(self, name): entry = self._litfile.paths[urlunquote(name)] if name else None if entry is None: @@ -869,7 +868,7 @@ class LitContainer(object): internal = '/'.join(('/data', entry.internal)) content = self._litfile.get_file(internal) return content - + def _read_meta(self): path = 'content.opf' raw = self._litfile.get_file('/meta') diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 5d2c51c4ba..dda36a7500 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -272,11 +272,7 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def _prepare_xml_for_serialization(root): - pass - def xml2str(root, pretty_print=False, strip_comments=False): - _prepare_xml_for_serialization(root) ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, pretty_print=pretty_print) @@ -287,7 +283,6 @@ 
def xml2str(root, pretty_print=False, strip_comments=False): def xml2unicode(root, pretty_print=False): - _prepare_xml_for_serialization(root) return etree.tostring(root, pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) @@ -321,6 +316,25 @@ def urlnormalize(href): parts = (urlquote(part) for part in parts) return urlunparse(parts) +class DummyHandler(logging.Handler): + + def __init__(self): + logging.Handler.__init__(self, logging.WARNING) + self.setFormatter(logging.Formatter('%(message)s')) + self.log = None + + def emit(self, record): + if self.log is not None: + msg = self.format(record) + f = self.log.error if record.levelno >= logging.ERROR \ + else self.log.warn + f(msg) + + +_css_logger = logging.getLogger('calibre.css') +_css_logger.setLevel(logging.WARNING) +_css_log_handler = DummyHandler() +_css_logger.addHandler(_css_log_handler) class OEBError(Exception): """Generic OEB-processing error.""" @@ -778,7 +792,8 @@ class Manifest(object): data = self.oeb.css_preprocessor(data) data = XHTML_CSS_NAMESPACE + data parser = CSSParser(loglevel=logging.WARNING, - fetcher=self._fetch_css) + fetcher=self._fetch_css, + log=_css_logger) data = parser.parseString(data, href=self.href) data.namespaces['h'] = XHTML_NS return data @@ -1435,7 +1450,7 @@ class OEBBook(object): :attr:`pages`: List of "pages," such as indexed to a print edition of the same text. 
""" - + _css_log_handler.log = logger self.encoding = encoding self.html_preprocessor = html_preprocessor self.css_preprocessor = css_preprocessor @@ -1450,6 +1465,7 @@ class OEBBook(object): self.guide = Guide(self) self.toc = TOC() self.pages = PageList() + self.auto_generated_toc = True @classmethod def generate(cls, opts): diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index df4f3b88f1..81e1f89029 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase from calibre.customize.ui import available_input_formats from calibre.ebooks.epub.from_html import TITLEPAGE -from calibre.ebooks.metadata.opf2 import OPF, OPFCreator +from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.config import DynamicConfig from calibre.utils.logging import Log -from calibre import CurrentDir def character_count(html): ''' @@ -57,31 +56,21 @@ class FakeOpts(object): max_levels = 5 input_encoding = None -def html2opf(path, tdir, log): - from calibre.ebooks.html.input import get_filelist - from calibre.ebooks.metadata.meta import get_metadata - with CurrentDir(tdir): - fl = get_filelist(path, tdir, FakeOpts(), log) - mi = get_metadata(open(path, 'rb'), 'html') - mi = OPFCreator(os.getcwdu(), mi) - mi.guide = None - entries = [(f.path, 'application/xhtml+xml') for f in fl] - mi.create_manifest(entries) - mi.create_spine([f.path for f in fl]) - - mi.render(open('metadata.opf', 'wb')) - opfpath = os.path.abspath('metadata.opf') - - return opfpath - -def opf2opf(path, tdir, opts): - return path - def is_supported(path): ext = os.path.splitext(path)[1].replace('.', '').lower() ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) return ext in available_input_formats() + +def write_oebbook(oeb, path): + from 
calibre.ebooks.oeb.writer import OEBWriter + from calibre import walk + w = OEBWriter() + w(oeb, path) + for f in walk(path): + if f.endswith('.opf'): + return f + class EbookIterator(object): CHARACTERS_PER_PAGE = 1000 @@ -131,17 +120,16 @@ class EbookIterator(object): def __enter__(self): self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() - if self.ebook_ext == 'opf': - self.pathtoopf = self.pathtoebook - elif self.ebook_ext == 'html': - self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log) - else: - from calibre.ebooks.conversion.plumber import Plumber - plumber = Plumber(self.pathtoebook, self.base, self.log) - plumber.setup_options() - self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), - plumber.opts, plumber.input_fmt, self.log, - {}, self.base) + from calibre.ebooks.conversion.plumber import Plumber + plumber = Plumber(self.pathtoebook, self.base, self.log) + plumber.setup_options() + if hasattr(plumber.opts, 'dont_package'): + plumber.opts.dont_package = True + self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), + plumber.opts, plumber.input_fmt, self.log, + {}, self.base) + if hasattr(self.pathtoopf, 'manifest'): + self.pathtoopf = write_oebbook(self.pathtoebook, self._tdir) self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf)) diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index 480ca3776e..ba62897215 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin): author = 'Kovid Goyal' file_type = 'oeb' - def convert(self, oeb_book, output_path, input_plugin, opts, log): self.log, self.opts = log, opts if not os.path.exists(output_path): diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index faeff4b825..6f0ff44bc9 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -349,6 +349,7 @@ class 
OEBReader(object): def _toc_from_ncx(self, item): if item is None: return False + self.log.debug('Reading TOC from NCX...') ncx = item.data title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) @@ -364,6 +365,7 @@ class OEBReader(object): result = xpath(opf, 'o2:tours/o2:tour') if not result: return False + self.log.debug('Reading TOC from tour...') tour = result[0] toc = self.oeb.toc toc.title = tour.get('title') @@ -384,6 +386,7 @@ class OEBReader(object): def _toc_from_html(self, opf): if 'toc' not in self.oeb.guide: return False + self.log.debug('Reading TOC from HTML...') itempath, frag = urldefrag(self.oeb.guide['toc'].href) item = self.oeb.manifest.hrefs[itempath] html = item.data @@ -414,6 +417,7 @@ class OEBReader(object): return True def _toc_from_spine(self, opf): + self.log.warn('Generating default TOC from spine...') toc = self.oeb.toc titles = [] headers = [] @@ -441,11 +445,14 @@ class OEBReader(object): return True def _toc_from_opf(self, opf, item): + self.oeb.auto_generated_toc = False if self._toc_from_ncx(item): return - if self._toc_from_tour(opf): return - self.logger.warn('No metadata table of contents found') + # Prefer HTML to tour based TOC, since several LIT files + # have good HTML TOCs but bad tour based TOCs if self._toc_from_html(opf): return + if self._toc_from_tour(opf): return self._toc_from_spine(opf) + self.oeb.auto_generated_toc = True def _pages_from_ncx(self, opf, item): if item is None: diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 1bb5b50d06..33ab14b73d 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -51,8 +51,8 @@ class Split(object): self.log = oeb.log self.map = {} self.page_break_selectors = None - for item in self.oeb.manifest.items: - if etree.iselement(item.data): + for item in list(self.oeb.manifest.items): + if item.spine_position is not None and 
etree.iselement(item.data): self.split_item(item) self.fix_links() @@ -74,31 +74,34 @@ class Split(object): self.page_break_selectors = set([]) stylesheets = [x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES] - page_break_selectors = set([]) - for rule in rules(stylesheets): - before = getattr(rule.style.getPropertyCSSValue( - 'page-break-before'), 'cssText', '').strip().lower() - after = getattr(rule.style.getPropertyCSSValue( - 'page-break-after'), 'cssText', '').strip().lower() - try: - if before and before != 'avoid': - page_break_selectors.add((CSSSelector(rule.selectorText), - True)) - except: - pass - try: - if after and after != 'avoid': - page_break_selectors.add((CSSSelector(rule.selectorText), - False)) - except: - pass + for rule in rules(stylesheets): + before = getattr(rule.style.getPropertyCSSValue( + 'page-break-before'), 'cssText', '').strip().lower() + after = getattr(rule.style.getPropertyCSSValue( + 'page-break-after'), 'cssText', '').strip().lower() + try: + if before and before != 'avoid': + self.page_break_selectors.add((CSSSelector(rule.selectorText), + True)) + except: + pass + try: + if after and after != 'avoid': + self.page_break_selectors.add((CSSSelector(rule.selectorText), + False)) + except: + pass page_breaks = set([]) - for selector, before in page_break_selectors: - for elem in selector(item.data): - if before: - elem.set('pb_before', '1') - page_breaks.add(elem) + for selector, before in self.page_break_selectors: + body = item.data.xpath('//h:body', namespaces=NAMESPACES) + if not body: + continue + for elem in selector(body[0]): + if elem not in body: + if before: + elem.set('pb_before', '1') + page_breaks.add(elem) for i, elem in enumerate(item.data.iter()): elem.set('pb_order', str(i)) @@ -136,8 +139,10 @@ class Split(object): if href in self.map: anchor_map = self.map[href] nhref = anchor_map[frag if frag else None] + nhref = self.current_item.relhref(nhref) if frag: - nhref = '#'.join(href, frag) + nhref = 
'#'.join((nhref, frag)) + return nhref return url @@ -153,7 +158,7 @@ class FlowSplitter(object): self.page_breaks = page_breaks self.page_break_ids = page_break_ids self.max_flow_size = max_flow_size - self.base = item.abshref(item.href) + self.base = item.href base, ext = os.path.splitext(self.base) self.base = base.replace('%', '%%')+'_split_%d'+ext @@ -192,9 +197,9 @@ class FlowSplitter(object): self.trees = [] tree = orig_tree for pattern, before in ordered_ids: - self.log.debug('\t\tSplitting on page-break') elem = pattern(tree) if elem: + self.log.debug('\t\tSplitting on page-break') before, after = self.do_split(tree, elem[0], before) self.trees.append(before) tree = after @@ -414,13 +419,14 @@ class FlowSplitter(object): elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_POINT_ATTR, '0') - spine_pos = self.item.spine_pos - for current, tree in zip(map(reversed, (self.files, self.trees))): + spine_pos = self.item.spine_position + for current, tree in zip(*map(reversed, (self.files, self.trees))): for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): href = a.get('href').strip() if href.startswith('#'): anchor = href[1:] file = self.anchor_map[anchor] + file = self.item.relhref(file) if file != current: a.set('href', file+href) @@ -430,12 +436,12 @@ class FlowSplitter(object): self.oeb.spine.insert(spine_pos, new_item, self.item.linear) if self.oeb.guide: - for ref in self.oeb.guide: + for ref in self.oeb.guide.values(): href, frag = urldefrag(ref.href) if href == self.item.href: nhref = self.anchor_map[frag if frag else None] if frag: - nhref = '#'.join(nhref, frag) + nhref = '#'.join((nhref, frag)) ref.href = nhref def fix_toc_entry(toc): @@ -444,7 +450,7 @@ class FlowSplitter(object): if href == self.item.href: nhref = self.anchor_map[frag if frag else None] if frag: - nhref = '#'.join(nhref, frag) + nhref = '#'.join((nhref, frag)) toc.href = nhref for x in toc: fix_toc_entry(x) diff --git a/src/calibre/ebooks/oeb/writer.py 
b/src/calibre/ebooks/oeb/writer.py index ef72414f5a..f71eb88ea5 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -49,7 +49,7 @@ class OEBWriter(object): def __call__(self, oeb, path): """ - Read the book in the :class:`OEBBook` object :param:`oeb` to a file + Write the book in the :class:`OEBBook` object :param:`oeb` to a folder at :param:`path`. """ version = int(self.version[0]) From 927be72be5d20d19b177dc7dfa5c7844bcd7ca9e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Apr 2009 01:12:56 -0700 Subject: [PATCH 106/319] Cleanup PDF cover extraction --- src/calibre/ebooks/metadata/pdf.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 1a664e638d..a2b18e21ac 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -10,10 +10,11 @@ from threading import Thread from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory from pyPdf import PdfFileReader, PdfFileWriter -import Image +#import Image try: from calibre.utils.PythonMagickWand import \ - NewMagickWand, MagickReadImage, MagickSetImageFormat, MagickWriteImage + NewMagickWand, MagickReadImage, MagickSetImageFormat, \ + MagickWriteImage, ImageMagick _imagemagick_loaded = True except: _imagemagick_loaded = False @@ -108,15 +109,15 @@ def get_cover(stream): outputStream = file(cover_path, "wb") output.write(outputStream) outputStream.close() + with ImageMagick(): + wand = NewMagickWand() + MagickReadImage(wand, cover_path) + MagickSetImageFormat(wand, 'JPEG') + MagickWriteImage(wand, '%s.jpg' % cover_path) - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) + #img = Image.open('%s.jpg' % cover_path) - img = Image.open('%s.jpg' % cover_path) - - img.save(data, 'JPEG') + 
#img.save(data, 'JPEG') except: import traceback traceback.print_exc() From 37b820b0468267b21cc65d5470578c09655e9755 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 18 Apr 2009 07:51:00 -0400 Subject: [PATCH 107/319] Switch from file to open because the use of file is not proper. --- src/calibre/ebooks/pdf/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 0f8cbf50c0..f91dae44fd 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -126,7 +126,7 @@ class PDFWriter(QObject): try: outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: - inputPDF = PdfFileReader(file(item, 'rb')) + inputPDF = PdfFileReader(open(item, 'rb')) for page in inputPDF.pages: outPDF.addPage(page) outPDF.write(self.out_stream) From fe3d1f5bc74171c78c437efa1cdaf0120b68e72b Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 18 Apr 2009 08:08:48 -0400 Subject: [PATCH 108/319] Rename FileWrapper with a more fitting name. Comment pdf get_cover to remove ambiguity of what and why. --- src/calibre/__init__.py | 2 +- src/calibre/ebooks/metadata/pdf.py | 44 ++++++++++++++++++------------ 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 6299bb8782..9e18af3cf9 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -246,7 +246,7 @@ class CurrentDir(object): os.chdir(self.cwd) -class FileWrapper(object): +class StreamReadWrapper(object): ''' Used primarily with pyPdf to ensure the stream is properly closed. 
''' diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index ec713b5adf..4d8516f6c3 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -6,7 +6,7 @@ __copyright__ = '2008, Kovid Goyal ' import sys, os, cStringIO from threading import Thread -from calibre import FileWrapper +from calibre import StreamReadWrapper from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory from pyPdf import PdfFileReader, PdfFileWriter @@ -34,7 +34,7 @@ def get_metadata(stream, extract_cover=True): traceback.print_exc() try: - with FileWrapper(stream) as stream: + with StreamReadWrapper(stream) as stream: info = PdfFileReader(stream).getDocumentInfo() if info.title: mi.title = info.title @@ -98,29 +98,39 @@ def get_cover(stream): data = cStringIO.StringIO() try: - pdf = PdfFileReader(stream) - output = PdfFileWriter() + StreamReadWrapper(stream) as stream: + pdf = PdfFileReader(stream) + output = PdfFileWriter() - if len(pdf.pages) >= 1: - output.addPage(pdf.getPage(0)) + # We only need the first page of the pdf file as that will + # be used as the cover. Saving the first page into a new + # pdf will speed up processing with ImageMagick as it will + # try to create an image for every page in the document. + if len(pdf.pages) >= 1: + output.addPage(pdf.getPage(0)) - with TemporaryDirectory('_pdfmeta') as tdir: - cover_path = os.path.join(tdir, 'cover.pdf') + # ImageMagick will only take a file path and save the + # image to a file path. 
+ with TemporaryDirectory('_pdfmeta') as tdir: + cover_path = os.path.join(tdir, 'cover.pdf') - with open(cover_path, "wb") as outputStream: - output.write(outputStream) + with open(cover_path, "wb") as outputStream: + output.write(outputStream) - with ImageMagick(): - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) - + # Use ImageMagick to turn the pdf into a Jpg image. + with ImageMagick(): + wand = NewMagickWand() + MagickReadImage(wand, cover_path) + MagickSetImageFormat(wand, 'JPEG') + MagickWriteImage(wand, '%s.jpg' % cover_path) + + # We need the image as a stream so we can return the + # image as a string for use in a MetaInformation object. img = Image.open('%s.jpg' % cover_path) img.save(data, 'JPEG') except: import traceback traceback.print_exc() + # Return the string in the cStringIO object. return data.getvalue() - From 02cfaac01416a973edda93cb9b6683ae557d7cc9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Apr 2009 09:29:16 -0700 Subject: [PATCH 109/319] Fix PDF cover extraction --- src/calibre/ebooks/metadata/pdf.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 3a5ecd1f0c..73d621661a 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -10,7 +10,6 @@ from calibre import FileWrapper from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory from pyPdf import PdfFileReader, PdfFileWriter -#import Image try: from calibre.utils.PythonMagickWand import \ NewMagickWand, MagickReadImage, MagickSetImageFormat, \ @@ -95,7 +94,6 @@ def set_metadata(stream, mi): stream.seek(0) def get_cover(stream): - data = cStringIO.StringIO() try: pdf = PdfFileReader(stream) @@ -104,24 +102,21 @@ def get_cover(stream): if len(pdf.pages) >= 1: 
output.addPage(pdf.getPage(0)) - with TemporaryDirectory('_pdfmeta') as tdir: - cover_path = os.path.join(tdir, 'cover.pdf') + with TemporaryDirectory('_pdfmeta') as tdir: + cover_path = os.path.join(tdir, 'cover.pdf') - outputStream = file(cover_path, "wb") - output.write(outputStream) - outputStream.close() - with ImageMagick(): - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) + with open(cover_path, "wb") as outputStream: + output.write(outputStream) + with ImageMagick(): + wand = NewMagickWand() + MagickReadImage(wand, cover_path) + MagickSetImageFormat(wand, 'JPEG') + MagickWriteImage(wand, '%s.jpg' % cover_path) + return open('%s.jpg' % cover_path, 'rb').read() - #img = Image.open('%s.jpg' % cover_path) - - #img.save(data, 'JPEG') except: import traceback traceback.print_exc() - return data.getvalue() + return '' From 3fe2c7a2ed4a82a8bfb99d0864ea50e308add82e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 18 Apr 2009 13:40:24 -0400 Subject: [PATCH 110/319] Better pdftohtml processing rules based on ldolse from mobileread's work. --- src/calibre/ebooks/conversion/preprocess.py | 26 +++++++++++++++++---- src/calibre/ebooks/metadata/pdf.py | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 632a7a3291..b105a6c042 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -23,6 +23,14 @@ def sanitize_head(match): x = _span_pat.sub('', x) return '\n'+x+'\n' +def chap_head(match): + chap = match.group('chap') + title = match.group('title') + if not title: + return '

    '+chap+'


    ' + else: + return '

    '+chap+'
    '+title+'


    ' + class CSSPreProcessor(object): @@ -54,8 +62,9 @@ class HTMLPreProcessor(object): (re.compile(r'', re.IGNORECASE), lambda match: '
    '), # Remove page numbers (re.compile(r'\d+
    ', re.IGNORECASE), lambda match: ''), - # Remove
    and replace

    with

    + # Replace

    with

    (re.compile(r'\s*', re.IGNORECASE), lambda match: '

    '), + # Remove
    (re.compile(r'(.*)', re.IGNORECASE), lambda match: match.group() if \ re.match('<', match.group(1).lstrip()) or \ @@ -69,15 +78,22 @@ class HTMLPreProcessor(object): # Remove non breaking spaces (re.compile(ur'\u00a0'), lambda match : ' '), + # Detect Chapters to match default XPATH in GUI + (re.compile(r'(]*>)?(]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Un wrap lines - (re.compile(r'(?<=\w)\s*</(i|b|u)>\s*<p.*?>\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '), - (re.compile(r'(?<=\w)\s*<p.*?>\s*(?=\w)', re.UNICODE), lambda match: ' '), + (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), + # Clean up spaces - (re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), - ] + (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), + # Add space before and after italics + (re.compile(r'(?<!“)<i>'), lambda match: ' <i>'), + (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + ] # Fix Book Designer markup BOOK_DESIGNER = [ diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 4d8516f6c3..a5ee619937 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -98,7 +98,7 @@ def get_cover(stream): data = cStringIO.StringIO() try: - StreamReadWrapper(stream) as stream: + with StreamReadWrapper(stream) as stream: pdf = PdfFileReader(stream) output = PdfFileWriter() From ac9f766a8d133048fcf46115b60ba3df0e85cc34 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: 
Sat, 18 Apr 2009 19:33:59 -0400 Subject: [PATCH 111/319] GUI: Do not enable send to card x when card x is not present. --- src/calibre/gui2/device.py | 19 +++++++++++++++++-- src/calibre/gui2/main.py | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 8d1b7a1b3a..239fd4d37d 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -346,10 +346,25 @@ class DeviceMenu(QMenu): self.action_triggered(action) break - def enable_device_actions(self, enable): + def enable_device_actions(self, enable, card_prefix=(None, None)): for action in self.actions: if action.dest in ('main:', 'carda:0', 'cardb:0'): - action.setEnabled(enable) + if not enable: + action.setEnabled(False) + else: + if action.dest == 'main:': + action.setEnabled(True) + elif action.dest == 'carda:0': + if card_prefix[0] != None: + action.setEnabled(True) + else: + action.setEnabled(False) + elif action.dest == 'cardb:0': + if card_prefix[1] != None: + action.setEnabled(True) + else: + action.setEnabled(False) + class Emailer(Thread): diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 86d1b013e3..21d873db60 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -608,7 +608,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): self.device_manager.device.__class__.__name__+\ _(' detected.'), 3000) self.device_connected = True - self._sync_menu.enable_device_actions(True) + self._sync_menu.enable_device_actions(True, self.device_manager.device.card_prefix()) else: self.device_connected = False self._sync_menu.enable_device_actions(False) From 1770f7bf74cf1c9330327b76778206b89ac4e7e1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 19 Apr 2009 14:44:37 -0700 Subject: [PATCH 112/319] Ported structure detection code and added plugin for FB2 input. 
--- src/calibre/customize/builtins.py | 4 +- src/calibre/ebooks/conversion/cli.py | 25 ++- src/calibre/ebooks/conversion/plumber.py | 91 ++++++++++- src/calibre/ebooks/epub/from_any.py | 109 ++----------- src/calibre/ebooks/{lrf => }/fb2/__init__.py | 0 src/calibre/ebooks/{lrf => }/fb2/fb2.xsl | 0 src/calibre/ebooks/fb2/input.py | 74 +++++++++ src/calibre/ebooks/lrf/fb2/convert_from.py | 125 --------------- src/calibre/ebooks/oeb/base.py | 61 ++++++- src/calibre/ebooks/oeb/iterator.py | 8 +- src/calibre/ebooks/oeb/reader.py | 3 +- src/calibre/ebooks/oeb/transforms/split.py | 6 +- .../ebooks/oeb/transforms/structure.py | 151 ++++++++++++++++++ src/calibre/linux.py | 10 -- upload.py | 2 +- 15 files changed, 422 insertions(+), 247 deletions(-) rename src/calibre/ebooks/{lrf => }/fb2/__init__.py (100%) rename src/calibre/ebooks/{lrf => }/fb2/fb2.xsl (100%) create mode 100644 src/calibre/ebooks/fb2/input.py delete mode 100644 src/calibre/ebooks/lrf/fb2/convert_from.py create mode 100644 src/calibre/ebooks/oeb/transforms/structure.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 08824a3591..a56d13fd60 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -281,6 +281,7 @@ from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lit.input import LITInput +from calibre.ebooks.fb2.input import FB2Input from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput @@ -288,7 +289,8 @@ from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, - TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput] + TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, + FB2Input] plugins += [x for x in 
list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index b7336ab30a..6d5401278a 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -119,6 +119,24 @@ def add_pipeline_options(parser, plumber): ] ), + 'STRUCTURE DETECTION' : ( + _('Control auto-detection of document structure.'), + [ + 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', + ] + ), + + 'TABLE OF CONTENTS' : ( + _('Control the automatic generation of a Table of Contents. By ' + 'default, if the source file has a Table of Contents, it will ' + 'be used in preference to the automatically generated one.'), + [ + 'level1_toc', 'level2_toc', 'level3_toc', + 'toc_threshold', 'max_toc_links', 'no_chapters_in_toc', + 'use_auto_toc', + ] + ), + 'METADATA' : (_('Options to set metadata in the output'), plumber.metadata_option_names, ), @@ -130,7 +148,8 @@ def add_pipeline_options(parser, plumber): } - group_order = ['', 'LOOK AND FEEL', 'METADATA', 'DEBUG'] + group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION', + 'TABLE OF CONTENTS', 'METADATA', 'DEBUG'] for group in group_order: desc, options = groups[group] @@ -163,6 +182,10 @@ def main(args=sys.argv): add_pipeline_options(parser, plumber) opts = parser.parse_args(args)[0] + y = lambda q : os.path.abspath(os.path.expanduser(q)) + for x in ('read_metadata_from_opf', 'cover'): + if getattr(opts, x, None) is not None: + setattr(opts, x, y(getattr(opts, x))) recommendations = [(n.dest, getattr(opts, n.dest), OptionRecommendation.HIGH) \ for n in parser.options_iter() diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 1edeed8d9c..453591e433 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -121,6 +121,88 @@ 
OptionRecommendation(name='dont_split_on_page_breaks', ) ), +OptionRecommendation(name='level1_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that ' + 'should be added to the Table of Contents at level one. If ' + 'this is specified, it takes precedence over other forms ' + 'of auto-detection.' + ) + ), + +OptionRecommendation(name='level2_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that should be ' + 'added to the Table of Contents at level two. Each entry is added ' + 'under the previous level one entry.' + ) + ), + +OptionRecommendation(name='level3_toc', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('XPath expression that specifies all tags that should be ' + 'added to the Table of Contents at level three. Each entry ' + 'is added under the previous level two entry.' + ) + ), + +OptionRecommendation(name='use_auto_toc', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Normally, if the source file already has a Table of ' + 'Contents, it is used in preference to the auto-generated one. ' + 'With this option, the auto-generated one is always used.' + ) + ), + +OptionRecommendation(name='no_chapters_in_toc', + recommended_value=False, level=OptionRecommendation.LOW, + help=_("Don't add auto-detected chapters to the Table of " + 'Contents.' + ) + ), + +OptionRecommendation(name='toc_threshold', + recommended_value=6, level=OptionRecommendation.LOW, + help=_( + 'If fewer than this number of chapters is detected, then links ' + 'are added to the Table of Contents. Default: %default') + ), + +OptionRecommendation(name='max_toc_links', + recommended_value=50, level=OptionRecommendation.LOW, + help=_('Maximum number of links to insert into the TOC. Set to 0 ' + 'to disable. Default is: %default. Links are only added to the ' + 'TOC if less than the threshold number of chapters were detected.' 
+ ) + ), + +OptionRecommendation(name='chapter', + recommended_value="//*[((name()='h1' or name()='h2') and " + "re:test(., 'chapter|book|section|part', 'i')) or @class " + "= 'chapter']", level=OptionRecommendation.LOW, + help=_('An XPath expression to detect chapter titles. The default ' + 'is to consider <h1> or <h2> tags that contain the words ' + '"chapter","book","section" or "part" as chapter titles as ' + 'well as any tags that have class="chapter". The expression ' + 'used must evaluate to a list of elements. To disable chapter ' + 'detection, use the expression "/". See the XPath Tutorial ' + 'in the calibre User Manual for further help on using this ' + 'feature.' + ) + ), + +OptionRecommendation(name='chapter_mark', + recommended_value='pagebreak', level=OptionRecommendation.LOW, + choices=['pagebreak', 'rule', 'both', 'none'], + help=_('Specify how to mark detected chapters. A value of ' + '"pagebreak" will insert page breaks before chapters. ' + 'A value of "rule" will insert a line before chapters. 
' + 'A value of "none" will disable chapter marking and a ' + 'value of "both" will use both page breaks and lines ' + 'to mark chapters.') + ), + + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, @@ -130,6 +212,7 @@ OptionRecommendation(name='read_metadata_from_opf', 'file.') ), + OptionRecommendation(name='title', recommended_value=None, level=OptionRecommendation.LOW, help=_('Set the title.')), @@ -237,6 +320,7 @@ OptionRecommendation(name='language', rec = self.get_option_by_name(name) if rec is not None and rec.level <= level: rec.recommended_value = val + rec.level = level def merge_ui_recommendations(self, recommendations): ''' @@ -248,6 +332,7 @@ OptionRecommendation(name='language', rec = self.get_option_by_name(name) if rec is not None and rec.level <= level and rec.level < rec.HIGH: rec.recommended_value = val + rec.level = level def read_user_metadata(self): ''' @@ -332,6 +417,9 @@ OptionRecommendation(name='language', self.opts.source = self.opts.input_profile self.opts.dest = self.opts.output_profile + from calibre.ebooks.oeb.transforms.structure import DetectStructure + DetectStructure()(self.oeb, self.opts) + from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener fbase = self.opts.base_font_size if fbase == 0: @@ -364,6 +452,8 @@ OptionRecommendation(name='language', trimmer = ManifestTrimmer() trimmer(self.oeb, self.opts) + self.oeb.toc.rationalize_play_orders() + self.log.info('Creating %s...'%self.output_plugin.name) self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) @@ -384,4 +474,3 @@ def create_oebbook(log, path_or_stream, opts, reader=None): reader()(oeb, path_or_stream) return oeb - diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index b3e5281525..196ed59646 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -15,88 +15,15 @@ from calibre.ebooks import 
DRMError from calibre.ebooks.epub import config as common_config from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.utils.zipfile import ZipFile from calibre.customize.ui import run_plugins_on_preprocess -def lit2opf(path, tdir, opts): - from calibre.ebooks.lit.reader import LitReader - print 'Exploding LIT file:', path - reader = LitReader(path) - reader.extract_content(tdir, False) - opf = None - for opf in walk(tdir): - if opf.lower().endswith('.opf'): - break - if not opf.endswith('.opf'): - opf = None - if opf is not None: # Check for url-quoted filenames - _opf = OPF(opf, os.path.dirname(opf)) - replacements = [] - for item in _opf.itermanifest(): - href = item.get('href', '') - path = os.path.join(os.path.dirname(opf), *(href.split('/'))) - if not os.path.exists(path) and os.path.exists(path.replace('&', '%26')): - npath = path - path = path.replace('&', '%26') - replacements.append((path, npath)) - if replacements: - print 'Fixing quoted filenames...' 
- for path, npath in replacements: - if os.path.exists(path): - os.rename(path, npath) - for f in walk(tdir): - with open(f, 'r+b') as f: - raw = f.read() - for path, npath in replacements: - raw = raw.replace(os.path.basename(path), os.path.basename(npath)) - f.seek(0) - f.truncate() - f.write(raw) - return opf -def mobi2opf(path, tdir, opts): - from calibre.ebooks.mobi.reader import MobiReader - print 'Exploding MOBI file:', path.encode('utf-8') if isinstance(path, unicode) else path - reader = MobiReader(path) - reader.extract_content(tdir) - files = list(walk(tdir)) - opts.encoding = 'utf-8' - for f in files: - if f.lower().endswith('.opf'): - return f - html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE) - hf = [f for f in files if html_pat.match(os.path.splitext(f)[1]) is not None] - mi = MetaInformation(os.path.splitext(os.path.basename(path))[0], [_('Unknown')]) - opf = OPFCreator(tdir, mi) - opf.create_manifest([(hf[0], None)]) - opf.create_spine([hf[0]]) - ans = os.path.join(tdir, 'metadata.opf') - opf.render(open(ans, 'wb')) - return ans - -def fb22opf(path, tdir, opts): - from calibre.ebooks.lrf.fb2.convert_from import to_html - print 'Converting FB2 to HTML...' 
- return to_html(path, tdir) - def rtf2opf(path, tdir, opts): from calibre.ebooks.lrf.rtf.convert_from import generate_html generate_html(path, tdir) return os.path.join(tdir, 'metadata.opf') -def txt2opf(path, tdir, opts): - from calibre.ebooks.lrf.txt.convert_from import generate_html - generate_html(path, opts.encoding, tdir) - return os.path.join(tdir, 'metadata.opf') - -def pdf2opf(path, tdir, opts): - from calibre.ebooks.lrf.pdf.convert_from import generate_html - generate_html(path, tdir) - opts.dont_split_on_page_breaks = True - return os.path.join(tdir, 'metadata.opf') - def epub2opf(path, tdir, opts): zf = ZipFile(path) zf.extractall(tdir) @@ -110,35 +37,23 @@ def epub2opf(path, tdir, opts): if opf and os.path.exists(encfile): if not process_encryption(encfile, opf): raise DRMError(os.path.basename(path)) - + if opf is None: raise ValueError('%s is not a valid EPUB file'%path) return opf - + def odt2epub(path, tdir, opts): from calibre.ebooks.odt.to_oeb import Extract opts.encoding = 'utf-8' return Extract()(path, tdir) -MAP = { - 'lit' : lit2opf, - 'mobi' : mobi2opf, - 'prc' : mobi2opf, - 'azw' : mobi2opf, - 'fb2' : fb22opf, - 'rtf' : rtf2opf, - 'txt' : txt2opf, - 'pdf' : pdf2opf, - 'epub' : epub2opf, - 'odt' : odt2epub, - } -SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', +SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] def unarchive(path, tdir): extract(path, tdir) files = list(walk(tdir)) - + for ext in ['opf'] + list(MAP.keys()): for f in files: if f.lower().endswith('.'+ext): @@ -147,32 +62,32 @@ def unarchive(path, tdir): return f, ext return find_html_index(files) -def any2epub(opts, path, notification=None, create_epub=True, +def any2epub(opts, path, notification=None, create_epub=True, oeb_cover=False, extract_to=None): path = run_plugins_on_preprocess(path) ext = os.path.splitext(path)[1] if not ext: raise ValueError('Unknown file type: '+path) 
ext = ext.lower()[1:] - + if opts.output is None: opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub' - + with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2): if ext in ['rar', 'zip', 'oebzip']: path, ext = unarchive(path, tdir1) print 'Found %s file in archive'%(ext.upper()) - + if ext in MAP.keys(): path = MAP[ext](path, tdir2, opts) ext = 'opf' - - + + if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None: raise ValueError('Conversion from %s is not supported'%ext.upper()) - + print 'Creating EPUB file...' - html2epub(path, opts, notification=notification, + html2epub(path, opts, notification=notification, create_epub=create_epub, oeb_cover=oeb_cover, extract_to=extract_to) diff --git a/src/calibre/ebooks/lrf/fb2/__init__.py b/src/calibre/ebooks/fb2/__init__.py similarity index 100% rename from src/calibre/ebooks/lrf/fb2/__init__.py rename to src/calibre/ebooks/fb2/__init__.py diff --git a/src/calibre/ebooks/lrf/fb2/fb2.xsl b/src/calibre/ebooks/fb2/fb2.xsl similarity index 100% rename from src/calibre/ebooks/lrf/fb2/fb2.xsl rename to src/calibre/ebooks/fb2/fb2.xsl diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py new file mode 100644 index 0000000000..d96758a4bd --- /dev/null +++ b/src/calibre/ebooks/fb2/input.py @@ -0,0 +1,74 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>' +""" +Convert .fb2 files to .lrf +""" +import os +from base64 import b64decode +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre import guess_type + +FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' + +class FB2Input(InputFormatPlugin): + + name = 'FB2 Input' + author = 'Anatoly Shipitsin' + description = 'Convert FB2 files to HTML' + file_types = set(['fb2']) + + recommendations = set([ + ('level1_toc', '//h:h1', OptionRecommendation.MED), + 
('level2_toc', '//h:h2', OptionRecommendation.MED), + ('level3_toc', '//h:h3', OptionRecommendation.MED), + ]) + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.resources import fb2_xsl + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.oeb.base import XLINK_NS + NAMESPACES = {'f':FB2NS, 'l':XLINK_NS} + + log.debug('Parsing XML...') + parser = etree.XMLParser(recover=True, no_network=True) + doc = etree.parse(stream, parser) + self.extract_embedded_content(doc) + log.debug('Converting XML to HTML...') + styledoc = etree.fromstring(fb2_xsl) + + transform = etree.XSLT(styledoc) + result = transform(doc) + open('index.xhtml', 'wb').write(transform.tostring(result)) + stream.seek(0) + mi = get_metadata(stream, 'fb2') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = OPFCreator(os.getcwdu(), mi) + entries = [(f, guess_type(f)[0]) for f in os.listdir('.')] + opf.create_manifest(entries) + opf.create_spine(['index.xhtml']) + + for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): + href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) + if href is not None: + if href.startswith('#'): + href = href[1:] + opf.guide.set_cover(os.path.abspath(href)) + + opf.render(open('metadata.opf', 'wb')) + return os.path.join(os.getcwd(), 'metadata.opf') + + def extract_embedded_content(self, doc): + for elem in doc.xpath('./*'): + if 'binary' in elem.tag and elem.attrib.has_key('id'): + fname = elem.attrib['id'] + data = b64decode(elem.text.strip()) + open(fname, 'wb').write(data) + diff --git a/src/calibre/ebooks/lrf/fb2/convert_from.py b/src/calibre/ebooks/lrf/fb2/convert_from.py deleted file mode 100644 index 24562e708c..0000000000 --- a/src/calibre/ebooks/lrf/fb2/convert_from.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, 
Anatoly Shipitsin <norguhtar at gmail.com>' -""" -Convert .fb2 files to .lrf -""" -import os, sys, shutil, logging -from base64 import b64decode -from lxml import etree - -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre import setup_cli_handlers -from calibre.resources import fb2_xsl -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.metadata import MetaInformation - - -def option_parser(): - parser = lrf_option_parser( -_('''%prog [options] mybook.fb2 - - -%prog converts mybook.fb2 to mybook.lrf''')) - parser.add_option('--debug-html-generation', action='store_true', default=False, - dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.')) - parser.add_option('--keep-intermediate-files', action='store_true', default=False, - help=_('Keep generated HTML files after completing conversion to LRF.')) - return parser - -def extract_embedded_content(doc): - for elem in doc.xpath('./*'): - if 'binary' in elem.tag and elem.attrib.has_key('id'): - fname = elem.attrib['id'] - data = b64decode(elem.text.strip()) - open(fname, 'wb').write(data) - -def to_html(fb2file, tdir): - fb2file = os.path.abspath(fb2file) - cwd = os.getcwd() - try: - os.chdir(tdir) - print 'Parsing XML...' - parser = etree.XMLParser(recover=True, no_network=True) - doc = etree.parse(fb2file, parser) - extract_embedded_content(doc) - print 'Converting XML to HTML...' 
- styledoc = etree.fromstring(fb2_xsl) - - transform = etree.XSLT(styledoc) - result = transform(doc) - open('index.html', 'wb').write(transform.tostring(result)) - try: - mi = get_metadata(open(fb2file, 'rb'), 'fb2') - except: - mi = MetaInformation(None, None) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(fb2file))[0] - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(tdir, mi) - opf.create_manifest([('index.html', None)]) - opf.create_spine(['index.html']) - opf.render(open('metadata.opf', 'wb')) - return os.path.join(tdir, 'metadata.opf') - finally: - os.chdir(cwd) - - -def generate_html(fb2file, encoding, logger): - tdir = PersistentTemporaryDirectory('_fb22lrf') - to_html(fb2file, tdir) - return os.path.join(tdir, 'index.html') - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('fb22lrf') - setup_cli_handlers(logger, level) - fb2 = os.path.abspath(os.path.expanduser(path)) - f = open(fb2, 'rb') - mi = get_metadata(f, 'fb2') - f.close() - htmlfile = generate_html(fb2, options.encoding, logger) - tdir = os.path.dirname(htmlfile) - cwd = os.getcwdu() - try: - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(fb2))[0] - if (not options.title or options.title == _('Unknown')): - options.title = mi.title - if (not options.author or options.author == _('Unknown')) and mi.authors: - options.author = mi.authors.pop() - if (not options.category or options.category == _('Unknown')) and mi.category: - options.category = mi.category - if (not options.freetext or options.freetext == _('Unknown')) and mi.comments: - options.freetext = mi.comments - os.chdir(tdir) - html_process_file(htmlfile, 
options, logger) - finally: - os.chdir(cwd) - if getattr(options, 'keep_intermediate_files', False): - logger.debug('Intermediate files in '+ tdir) - else: - shutil.rmtree(tdir) - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No fb2 file specified' - return 1 - process_file(args[1], options, logger) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index dda36a7500..85510e2127 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -41,10 +41,12 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' +RE_NS = 'http://exslt.org/regular-expressions' + XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, - 'svg': SVG_NS, 'xl' : XLINK_NS} + 'svg': SVG_NS, 'xl' : XLINK_NS, 're': RE_NS} OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, 'xsi': XSI_NS, 'calibre': CALIBRE_NS} @@ -1256,16 +1258,21 @@ class TOC(object): :attr:`klass`: Optional semantic class referenced by this node. :attr:`id`: Option unique identifier for this node. 
""" - def __init__(self, title=None, href=None, klass=None, id=None): + def __init__(self, title=None, href=None, klass=None, id=None, + play_order=None): self.title = title self.href = urlnormalize(href) if href else href self.klass = klass self.id = id self.nodes = [] + self.play_order = 0 + if play_order is None: + play_order = self.next_play_order() + self.play_order = play_order - def add(self, title, href, klass=None, id=None): + def add(self, title, href, klass=None, id=None, play_order=0): """Create and return a new sub-node of this node.""" - node = TOC(title, href, klass, id) + node = TOC(title, href, klass, id, play_order) self.nodes.append(node) return node @@ -1276,6 +1283,18 @@ class TOC(object): for node in child.iter(): yield node + def count(self): + return len(list(self.iter())) - 1 + + def next_play_order(self): + return max([x.play_order for x in self.iter()])+1 + + def has_href(self, href): + for x in self.iter(): + if x.href == href: + return True + return False + def iterdescendants(self): """Iterate over all descendant nodes in depth-first order.""" for child in self.nodes: @@ -1309,6 +1328,10 @@ class TOC(object): except ValueError: return 1 + def __str__(self): + return 'TOC: %s --> %s'%(self.title, self.href) + + def to_opf1(self, tour): for node in self.nodes: element(tour, 'site', attrib={ @@ -1319,7 +1342,7 @@ class TOC(object): def to_ncx(self, parent): for node in self.nodes: id = node.id or unicode(uuid.uuid4()) - attrib = {'id': id, 'playOrder': '0'} + attrib = {'id': id, 'playOrder': str(node.play_order)} if node.klass: attrib['class'] = node.klass point = element(parent, NCX('navPoint'), attrib=attrib) @@ -1329,6 +1352,34 @@ class TOC(object): node.to_ncx(point) return parent + def rationalize_play_orders(self): + ''' + Ensure that all nodes with the same play_order have the same href and + with different play_orders have different hrefs. 
+ ''' + def po_node(n): + for x in self.iter(): + if x is n: + return + if x.play_order == n.play_order: + return x + + def href_node(n): + for x in self.iter(): + if x is n: + return + if x.href == n.href: + return x + + for x in self.iter(): + y = po_node(x) + if y is not None: + if x.href != y.href: + x.play_order = getattr(href_node(x), 'play_order', + self.next_play_order()) + y = href_node(x) + if y is not None: + x.play_order = y.play_order class PageList(object): """Collection of named "pages" to mapped positions within an OEB data model diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 81e1f89029..ab3e90083d 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -118,6 +118,7 @@ class EbookIterator(object): print 'Loaded embedded font:', repr(family) def __enter__(self): + self.delete_on_exit = [] self._tdir = TemporaryDirectory('_ebook_iter') self.base = self._tdir.__enter__() from calibre.ebooks.conversion.plumber import Plumber @@ -137,9 +138,11 @@ class EbookIterator(object): cover = self.opf.cover if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover: - cfile = os.path.join(os.path.dirname(self.spine[0]), 'calibre_ei_cover.html') + cfile = os.path.join(os.path.dirname(self.spine[0]), + 'calibre_iterator_cover.html') open(cfile, 'wb').write(TITLEPAGE%cover) self.spine[0:0] = [SpineItem(cfile)] + self.delete_on_exit.append(cfile) if self.opf.path_to_html_toc is not None and \ self.opf.path_to_html_toc not in self.spine: @@ -221,3 +224,6 @@ class EbookIterator(object): def __exit__(self, *args): self._tdir.__exit__(*args) + for x in self.delete_on_exit: + if os.path.exists(x): + os.remove(x) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 6f0ff44bc9..02b3b92b01 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -343,7 +343,8 @@ class OEBReader(object): continue id = child.get('id') klass = 
child.get('class') - node = toc.add(title, href, id=id, klass=klass) + po = int(child.get('playOrder', self.oeb.toc.next_play_order())) + node = toc.add(title, href, id=id, klass=klass, play_order=po) self._toc_from_navpoint(item, node, child) def _toc_from_ncx(self, item): diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 33ab14b73d..bc7e4e195d 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -15,12 +15,10 @@ from lxml.etree import XPath as _XPath from lxml import etree from lxml.cssselect import CSSSelector -from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \ - rewrite_links +from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ + urldefrag, rewrite_links from calibre.ebooks.epub import tostring, rules -NAMESPACES = dict(XPNSMAP) -NAMESPACES['re'] = 'http://exslt.org/regular-expressions' XPath = functools.partial(_XPath, namespaces=NAMESPACES) diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py new file mode 100644 index 0000000000..0f1502ef03 --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +from lxml import etree +from urlparse import urlparse + +from calibre.ebooks.oeb.base import XPNSMAP, TOC +XPath = lambda x: etree.XPath(x, namespaces=XPNSMAP) + +class DetectStructure(object): + + def __call__(self, oeb, opts): + self.log = oeb.log + self.oeb = oeb + self.opts = opts + self.log('Detecting structure...') + + self.detect_chapters() + if self.oeb.auto_generated_toc or opts.use_auto_toc: + orig_toc = self.oeb.toc + self.oeb.toc = TOC() + self.create_level_based_toc() + if 
self.oeb.toc.count() < 1: + if not opts.no_chapters_in_toc and self.detected_chapters: + self.create_toc_from_chapters() + if self.oeb.toc.count() < opts.toc_threshold: + self.create_toc_from_links() + if self.oeb.toc.count() < 2 and orig_toc.count() > 2: + self.oeb.toc = orig_toc + else: + self.oeb.auto_generated_toc = True + self.log('Auto generated TOC with %d entries.' % + self.oeb.toc.count()) + + + def detect_chapters(self): + self.detected_chapters = [] + if self.opts.chapter: + chapter_xpath = XPath(self.opts.chapter) + for item in self.oeb.spine: + for x in chapter_xpath(item.data): + self.detected_chapters.append((item, x)) + + chapter_mark = self.opts.chapter_mark + page_break_before = 'display: block; page-break-before: always' + page_break_after = 'display: block; page-break-after: always' + for item, elem in self.detected_chapters: + text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) + self.log('\tDetected chapter:', text[:50]) + if chapter_mark == 'none': + continue + elif chapter_mark == 'rule': + mark = etree.Element('hr') + elif chapter_mark == 'pagebreak': + mark = etree.Element('div', style=page_break_after) + else: # chapter_mark == 'both': + mark = etree.Element('hr', style=page_break_before) + elem.addprevious(mark) + + def create_level_based_toc(self): + if self.opts.level1_toc is None: + return + for item in self.oeb.spine: + self.add_leveled_toc_items(item) + + def create_toc_from_chapters(self): + counter = self.oeb.toc.next_play_order() + for item, elem in self.detected_chapters: + text, href = self.elem_to_link(item, elem, counter) + self.oeb.toc.add(text, href, play_order=counter) + counter += 1 + + def create_toc_from_links(self): + for item in self.oeb.spine: + for a in item.data.xpath('//h:a[@href]'): + href = a.get('href') + purl = urlparse(href) + if not purl[0] or purl[0] == 'file': + href, frag = purl.path, purl.fragment + href = item.abshref(href) + if frag: + href = '#'.join((href, frag)) + if not 
self.oeb.toc.has_href(href): + text = u' '.join([t.strip() for t in \ + a.xpath('descendant::text()')]) + text = text[:100].strip() + if not self.oeb.toc.has_text(text): + self.oeb.toc.add(text, href, + play_order=self.oeb.toc.next_play_order()) + + + def elem_to_link(self, item, elem, counter): + text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) + text = text[:100].strip() + id = elem.get('id', 'calibre_toc_%d'%counter) + elem.set('id', id) + href = '#'.join((item.href, id)) + return text, href + + + def add_leveled_toc_items(self, item): + level1 = XPath(self.opts.level1_toc)(item.data) + level1_order = [] + + counter = 1 + if level1: + added = {} + for elem in level1: + text, _href = self.elem_to_link(item, elem, counter) + counter += 1 + if text: + node = self.oeb.toc.add(text, _href, + play_order=self.oeb.toc.next_play_order()) + level1_order.append(node) + added[elem] = node + #node.add(_('Top'), _href) + if self.opts.level2_toc is not None: + added2 = {} + level2 = list(XPath(self.opts.level2_toc)(item.data)) + for elem in level2: + level1 = None + for item in item.data.iterdescendants(): + if item in added.keys(): + level1 = added[item] + elif item == elem and level1 is not None: + text, _href = self.elem_to_link(item, elem, counter) + counter += 1 + if text: + added2[elem] = level1.add(text, _href, + play_order=self.oeb.toc.next_play_order()) + if self.opts.level3_toc is not None: + level3 = list(XPath(self.opts.level3_toc)(item.data)) + for elem in level3: + level2 = None + for item in item.data.iterdescendants(): + if item in added2.keys(): + level2 = added2[item] + elif item == elem and level2 is not None: + text, _href = \ + self.elem_to_link(item, elem, counter) + counter += 1 + if text: + level2.add(text, _href, + play_order=self.oeb.toc.next_play_order()) + + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index ee51370b61..2d13ea2730 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -27,10 +27,6 @@ 
entry_points = { 'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main', 'isbndb = calibre.ebooks.metadata.isbndb:main', 'librarything = calibre.ebooks.metadata.library_thing:main', - 'comic2lrf = calibre.ebooks.lrf.comic.convert_from:main', - 'comic2epub = calibre.ebooks.epub.from_comic:main', - 'comic2mobi = calibre.ebooks.mobi.from_comic:main', - 'comic2pdf = calibre.ebooks.pdf.from_comic:main', 'calibre-debug = calibre.debug:main', 'calibredb = calibre.library.cli:main', 'calibre-fontconfig = calibre.utils.fontconfig:main', @@ -151,8 +147,6 @@ def setup_completion(fatal_errors): from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop from calibre.web.feeds.main import option_parser as feeds2disk from calibre.web.feeds.recipes import titles as feed_titles - from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop - from calibre.ebooks.epub.from_comic import option_parser as comic2epub from calibre.ebooks.metadata.fetch import option_parser as fem_op from calibre.gui2.main import option_parser as guiop from calibre.utils.smtp import option_parser as smtp_op @@ -181,10 +175,6 @@ def setup_completion(fatal_errors): f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes()))) f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf'])) f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf'])) - f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr'])) - f.write(opts_and_exts('comic2epub', comic2epub, ['cbz', 'cbr'])) - f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr'])) - f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr'])) f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles)) f.write(opts_and_words('fetch-ebook-metadata', fem_op, [])) f.write(opts_and_words('calibre-smtp', smtp_op, [])) diff --git a/upload.py b/upload.py index 6bc90aada2..a29e5b097c 100644 --- a/upload.py +++ b/upload.py @@ -139,7 +139,7 @@ class resources(OptionlessCommand): RESOURCES = dict( opf_template = 
'ebooks/metadata/opf.xml', ncx_template = 'ebooks/metadata/ncx.xml', - fb2_xsl = 'ebooks/lrf/fb2/fb2.xsl', + fb2_xsl = 'ebooks/fb2/fb2.xsl', metadata_sqlite = 'library/metadata_sqlite.sql', jquery = 'gui2/viewer/jquery.js', jquery_scrollTo = 'gui2/viewer/jquery_scrollTo.js', From a0d9e40869bc5c1cd4572c850c5b1000e5a3d125 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 19 Apr 2009 15:06:19 -0700 Subject: [PATCH 113/319] Add --filter-toc option --- src/calibre/ebooks/conversion/plumber.py | 9 +++++++++ src/calibre/ebooks/oeb/base.py | 10 ++++++++++ src/calibre/ebooks/oeb/transforms/structure.py | 8 ++++++++ 3 files changed, 27 insertions(+) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 453591e433..3a2d39c314 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -176,6 +176,15 @@ OptionRecommendation(name='max_toc_links', ) ), +OptionRecommendation(name='toc_filter', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Remove entries from the Table of Contents whose titles ' + 'match the specified regular expression. Matching entries and all ' + 'their children are removed.' 
+ ) + ), + + OptionRecommendation(name='chapter', recommended_value="//*[((name()='h1' or name()='h2') and " "re:test(., 'chapter|book|section|part', 'i')) or @class " diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 85510e2127..70303470d7 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1276,6 +1276,16 @@ class TOC(object): self.nodes.append(node) return node + def remove(self, node): + for child in self.nodes: + if child is node: + self.nodes.remove(child) + return True + else: + if child.remove(node): + return True + return False + def iter(self): """Iterate over this node and all descendants in depth-first order.""" yield self diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 0f1502ef03..6499a5e9c4 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -6,6 +6,8 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import re + from lxml import etree from urlparse import urlparse @@ -37,6 +39,12 @@ class DetectStructure(object): self.log('Auto generated TOC with %d entries.' 
% self.oeb.toc.count()) + if opts.toc_filter is not None: + regexp = re.compile(opts.toc_filter) + for node in self.oeb.toc.iter(): + if not node.title or regexp.search(node.title) is not None: + self.oeb.toc.remove(node) + def detect_chapters(self): self.detected_chapters = [] From 8be1892c4d40e40067a4ab450808d2414f421a7a Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 19 Apr 2009 16:32:48 -0700 Subject: [PATCH 114/319] ODT Input plugin --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/odt/input.py | 67 ++++++++++++++++++++++++++++ src/calibre/ebooks/odt/to_oeb.py | 72 ------------------------------- 3 files changed, 69 insertions(+), 73 deletions(-) create mode 100644 src/calibre/ebooks/odt/input.py delete mode 100644 src/calibre/ebooks/odt/to_oeb.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index a56d13fd60..a67224872b 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -282,6 +282,7 @@ from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lit.input import LITInput from calibre.ebooks.fb2.input import FB2Input +from calibre.ebooks.odt.input import ODTInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput @@ -290,7 +291,7 @@ from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, - FB2Input] + FB2Input, ODTInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/odt/input.py b/src/calibre/ebooks/odt/input.py new file mode 100644 index 0000000000..7d6498ab81 --- /dev/null +++ 
b/src/calibre/ebooks/odt/input.py @@ -0,0 +1,67 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Convert an ODT file into a Open Ebook +''' +import os +from odf.odf2xhtml import ODF2XHTML + +from calibre import CurrentDir, walk +from calibre.customize.conversion import InputFormatPlugin + +class Extract(ODF2XHTML): + + def extract_pictures(self, zf): + if not os.path.exists('Pictures'): + os.makedirs('Pictures') + for name in zf.namelist(): + if name.startswith('Pictures'): + data = zf.read(name) + with open(name, 'wb') as f: + f.write(data) + + def __call__(self, stream, odir): + from calibre.utils.zipfile import ZipFile + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.opf2 import OPFCreator + + + if not os.path.exists(odir): + os.makedirs(odir) + with CurrentDir(odir): + print 'Extracting ODT file...' + html = self.odf2xhtml(stream) + with open('index.xhtml', 'wb') as f: + f.write(html.encode('utf-8')) + zf = ZipFile(stream, 'r') + self.extract_pictures(zf) + stream.seek(0) + mi = get_metadata(stream, 'odt') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = OPFCreator(os.path.abspath(os.getcwdu()), mi) + opf.create_manifest([(os.path.abspath(f), None) for f in walk(os.getcwd())]) + opf.create_spine([os.path.abspath('index.xhtml')]) + with open('metadata.opf', 'wb') as f: + opf.render(f) + return os.path.abspath('metadata.opf') + + +class ODTInput(InputFormatPlugin): + + name = 'ODT Input' + author = 'Kovid Goyal' + description = 'Convert ODT (OpenOffice) files to HTML' + file_types = set(['odt']) + + + def convert(self, stream, options, file_ext, log, + accelerators): + return Extract()(stream, '.') + + diff --git a/src/calibre/ebooks/odt/to_oeb.py b/src/calibre/ebooks/odt/to_oeb.py deleted file mode 100644 index 7cb354884e..0000000000 --- 
a/src/calibre/ebooks/odt/to_oeb.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Convert an ODT file into a Open Ebook -''' -import os, sys -from odf.odf2xhtml import ODF2XHTML - -from calibre import CurrentDir, walk -from calibre.utils.zipfile import ZipFile -from calibre.utils.config import OptionParser -from calibre.ebooks.metadata.odt import get_metadata -from calibre.ebooks.metadata.opf2 import OPFCreator - -class Extract(ODF2XHTML): - - def extract_pictures(self, zf): - if not os.path.exists('Pictures'): - os.makedirs('Pictures') - for name in zf.namelist(): - if name.startswith('Pictures'): - data = zf.read(name) - with open(name, 'wb') as f: - f.write(data) - - def __call__(self, path, odir): - if not os.path.exists(odir): - os.makedirs(odir) - path = os.path.abspath(path) - with CurrentDir(odir): - print 'Extracting ODT file...' - html = self.odf2xhtml(path) - with open('index.html', 'wb') as f: - f.write(html.encode('utf-8')) - with open(path, 'rb') as f: - zf = ZipFile(f, 'r') - self.extract_pictures(zf) - f.seek(0) - mi = get_metadata(f) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(path)) - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(os.path.abspath(os.getcwdu()), mi) - opf.create_manifest([(os.path.abspath(f), None) for f in walk(os.getcwd())]) - opf.create_spine([os.path.abspath('index.html')]) - with open('metadata.opf', 'wb') as f: - opf.render(f) - return os.path.abspath('metadata.opf') - -def option_parser(): - parser = OptionParser('%prog [options] file.odt') - parser.add_option('-o', '--output-dir', default='.', - help=_('The output directory. 
Defaults to the current directory.')) - return parser - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No ODT file specified' - return 1 - Extract()(args[1], os.path.abspath(opts.output_dir)) - print 'Extracted to', os.path.abspath(opts.output_dir) - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file From 5c5a4d867662e088c42fc75a8e54b397479215f7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 19 Apr 2009 18:20:26 -0700 Subject: [PATCH 115/319] Plugin for RTF input --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/epub/from_any.py | 10 - src/calibre/ebooks/lrf/rtf/convert_from.py | 190 ------------------- src/calibre/ebooks/{lrf => }/rtf/__init__.py | 0 src/calibre/ebooks/rtf/input.py | 101 ++++++++++ src/calibre/ebooks/{lrf => }/rtf/xsl.py | 0 src/calibre/ebooks/rtf2xml/ParseRtf.py | 6 +- src/calibre/ebooks/rtf2xml/pict.py | 6 +- 8 files changed, 110 insertions(+), 206 deletions(-) delete mode 100644 src/calibre/ebooks/lrf/rtf/convert_from.py rename src/calibre/ebooks/{lrf => }/rtf/__init__.py (100%) create mode 100644 src/calibre/ebooks/rtf/input.py rename src/calibre/ebooks/{lrf => }/rtf/xsl.py (100%) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index a67224872b..51a0e4c75f 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -283,6 +283,7 @@ from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lit.input import LITInput from calibre.ebooks.fb2.input import FB2Input from calibre.ebooks.odt.input import ODTInput +from calibre.ebooks.rtf.input import RTFInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput @@ -291,7 +292,7 @@ from calibre.customize.profiles import input_profiles, output_profiles plugins = 
[HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, - FB2Input, ODTInput] + FB2Input, ODTInput, RTFInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index 196ed59646..68112592d2 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -19,11 +19,6 @@ from calibre.utils.zipfile import ZipFile from calibre.customize.ui import run_plugins_on_preprocess -def rtf2opf(path, tdir, opts): - from calibre.ebooks.lrf.rtf.convert_from import generate_html - generate_html(path, tdir) - return os.path.join(tdir, 'metadata.opf') - def epub2opf(path, tdir, opts): zf = ZipFile(path) zf.extractall(tdir) @@ -42,11 +37,6 @@ def epub2opf(path, tdir, opts): raise ValueError('%s is not a valid EPUB file'%path) return opf -def odt2epub(path, tdir, opts): - from calibre.ebooks.odt.to_oeb import Extract - opts.encoding = 'utf-8' - return Extract()(path, tdir) - SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] diff --git a/src/calibre/ebooks/lrf/rtf/convert_from.py b/src/calibre/ebooks/lrf/rtf/convert_from.py deleted file mode 100644 index e4dd153d2a..0000000000 --- a/src/calibre/ebooks/lrf/rtf/convert_from.py +++ /dev/null @@ -1,190 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' -import os, sys, shutil, logging, glob - -from lxml import etree - -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.metadata.meta import get_metadata -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre import setup_cli_handlers -from calibre.libwand import convert, WandException -from 
calibre.ebooks.BeautifulSoup import BeautifulStoneSoup -from calibre.ebooks.lrf.rtf.xsl import xhtml -from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf import OPFCreator - -def option_parser(): - parser = lrf_option_parser( -_('''%prog [options] mybook.rtf - - -%prog converts mybook.rtf to mybook.lrf''') - ) - parser.add_option('--keep-intermediate-files', action='store_true', default=False) - return parser - -def convert_images(html, logger): - wmfs = glob.glob('*.wmf') + glob.glob('*.WMF') - for wmf in wmfs: - target = os.path.join(os.path.dirname(wmf), os.path.splitext(os.path.basename(wmf))[0]+'.jpg') - try: - convert(wmf, target) - html = html.replace(os.path.basename(wmf), os.path.basename(target)) - except WandException, err: - logger.warning(u'Unable to convert image %s with error: %s'%(wmf, unicode(err))) - continue - return html - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('rtf2lrf') - setup_cli_handlers(logger, level) - rtf = os.path.abspath(os.path.expanduser(path)) - f = open(rtf, 'rb') - mi = get_metadata(f, 'rtf') - f.close() - tdir = PersistentTemporaryDirectory('_rtf2lrf') - html = generate_html(rtf, tdir) - cwd = os.getcwdu() - try: - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(rtf))[0] - if (not options.title or options.title == 'Unknown'): - options.title = mi.title - if (not options.author or options.author == 'Unknown') and mi.author: - options.author = mi.author - if (not options.category or options.category == 'Unknown') and 
mi.category: - options.category = mi.category - if (not options.freetext or options.freetext == 'Unknown') and mi.comments: - options.freetext = mi.comments - os.chdir(tdir) - html_process_file(html, options, logger) - finally: - os.chdir(cwd) - if hasattr(options, 'keep_intermediate_files') and options.keep_intermediate_files: - logger.debug('Intermediate files in '+ tdir) - else: - shutil.rmtree(tdir) - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No rtf file specified' - return 1 - process_file(args[1], options, logger) - return 0 - - -def generate_xml(rtfpath, tdir): - from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf - ofile = os.path.join(tdir, 'index.xml') - cwd = os.getcwdu() - os.chdir(tdir) - rtfpath = os.path.abspath(rtfpath) - try: - parser = ParseRtf( - in_file = rtfpath, - out_file = ofile, - # Convert symbol fonts to unicode equivelents. Default - # is 1 - convert_symbol = 1, - - # Convert Zapf fonts to unicode equivelents. Default - # is 1. - convert_zapf = 1, - - # Convert Wingding fonts to unicode equivelents. - # Default is 1. - convert_wingdings = 1, - - # Convert RTF caps to real caps. - # Default is 1. - convert_caps = 1, - - # Indent resulting XML. - # Default is 0 (no indent). - indent = 1, - - # Form lists from RTF. Default is 1. - form_lists = 1, - - # Convert headings to sections. Default is 0. - headings_to_sections = 1, - - # Group paragraphs with the same style name. Default is 1. - group_styles = 1, - - # Group borders. Default is 1. - group_borders = 1, - - # Write or do not write paragraphs. Default is 0. - empty_paragraphs = 0, - ) - parser.parse_rtf() - finally: - os.chdir(cwd) - return ofile - - -def generate_html(rtfpath, tdir): - print 'Converting RTF to XML...' 
- rtfpath = os.path.abspath(rtfpath) - try: - xml = generate_xml(rtfpath, tdir) - except RtfInvalidCodeException: - raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.')) - tdir = os.path.dirname(xml) - cwd = os.getcwdu() - os.chdir(tdir) - try: - print 'Parsing XML...' - parser = etree.XMLParser(recover=True, no_network=True) - try: - doc = etree.parse(xml, parser) - except: - raise - print 'Parsing failed. Trying to clean up XML...' - soup = BeautifulStoneSoup(open(xml, 'rb').read()) - doc = etree.fromstring(str(soup)) - print 'Converting XML to HTML...' - styledoc = etree.fromstring(xhtml) - - transform = etree.XSLT(styledoc) - result = transform(doc) - tdir = os.path.dirname(xml) - html = os.path.join(tdir, 'index.html') - f = open(html, 'wb') - res = transform.tostring(result) - res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] - f.write(res) - f.close() - try: - mi = get_metadata(open(rtfpath, 'rb'), 'rtf') - except: - mi = MetaInformation(None, None) - if not mi.title: - mi.title = os.path.splitext(os.path.basename(rtfpath))[0] - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(tdir, mi) - opf.create_manifest([('index.html', None)]) - opf.create_spine(['index.html']) - opf.render(open('metadata.opf', 'wb')) - finally: - os.chdir(cwd) - return html - -if __name__ == '__main__': - sys.exit(main()) - \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/rtf/__init__.py b/src/calibre/ebooks/rtf/__init__.py similarity index 100% rename from src/calibre/ebooks/lrf/rtf/__init__.py rename to src/calibre/ebooks/rtf/__init__.py diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py new file mode 100644 index 0000000000..764d47ff41 --- /dev/null +++ b/src/calibre/ebooks/rtf/input.py @@ -0,0 +1,101 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' + +import os + +from 
lxml import etree + +from calibre.customize.conversion import InputFormatPlugin + +class RTFInput(InputFormatPlugin): + + name = 'RTF Input' + author = 'Kovid Goyal' + description = 'Convert RTF files to HTML' + file_types = set(['rtf']) + + def generate_xml(self, stream): + from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf + ofile = 'out.xml' + parser = ParseRtf( + in_file = stream, + out_file = ofile, + # Convert symbol fonts to unicode equivelents. Default + # is 1 + convert_symbol = 1, + + # Convert Zapf fonts to unicode equivelents. Default + # is 1. + convert_zapf = 1, + + # Convert Wingding fonts to unicode equivelents. + # Default is 1. + convert_wingdings = 1, + + # Convert RTF caps to real caps. + # Default is 1. + convert_caps = 1, + + # Indent resulting XML. + # Default is 0 (no indent). + indent = 1, + + # Form lists from RTF. Default is 1. + form_lists = 1, + + # Convert headings to sections. Default is 0. + headings_to_sections = 1, + + # Group paragraphs with the same style name. Default is 1. + group_styles = 1, + + # Group borders. Default is 1. + group_borders = 1, + + # Write or do not write paragraphs. Default is 0. + empty_paragraphs = 0, + ) + parser.parse_rtf() + ans = open('out.xml').read() + os.remove('out.xml') + return ans + + def convert(self, stream, options, file_ext, log, + accelerators): + from calibre.ebooks.rtf.xsl import xhtml + from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.opf import OPFCreator + from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException + self.log = log + self.log('Converting RTF to XML...') + try: + xml = self.generate_xml(stream) + except RtfInvalidCodeException: + raise ValueError(_('This RTF file has a feature calibre does not ' + 'support. 
Convert it to HTML first and then try it.')) + self.log('Parsing XML...') + parser = etree.XMLParser(recover=True, no_network=True) + doc = etree.fromstring(xml, parser=parser) + self.log('Converting XML to HTML...') + styledoc = etree.fromstring(xhtml) + + transform = etree.XSLT(styledoc) + result = transform(doc) + html = 'index.xhtml' + with open(html, 'wb') as f: + res = transform.tostring(result) + res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] + f.write(res) + stream.seek(0) + mi = get_metadata(stream, 'rtf') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest([('index.xhtml', None)]) + opf.create_spine(['index.xhtml']) + opf.render(open('metadata.opf', 'wb')) + return os.path.abspath('metadata.opf') + diff --git a/src/calibre/ebooks/lrf/rtf/xsl.py b/src/calibre/ebooks/rtf/xsl.py similarity index 100% rename from src/calibre/ebooks/lrf/rtf/xsl.py rename to src/calibre/ebooks/rtf/xsl.py diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 5b008df615..cba0f900db 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -149,9 +149,10 @@ class ParseRtf: self.__group_borders = group_borders self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd - + def __check_file(self, the_file, type): """Check to see if files exist""" + if hasattr(the_file, 'read'): return if the_file == None: if type == "file_to_parse": message = "You must provide a file for the script to work" @@ -545,13 +546,12 @@ class ParseRtf: def __make_temp_file(self,file): """Make a temporary file to parse""" write_file="rtf_write_file" - read_obj = open(file,'r') + read_obj = file if hasattr(file, 'read') else open(file,'r') write_obj = open(write_file, 'w') line = "dummy" while line: line = read_obj.read(1000) write_obj.write(line ) - read_obj.close() write_obj.close() return write_file """ 
diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py index b1931b8c2e..6c88dd54e4 100755 --- a/src/calibre/ebooks/rtf2xml/pict.py +++ b/src/calibre/ebooks/rtf2xml/pict.py @@ -58,10 +58,12 @@ class Pict: return line[18:] def __make_dir(self): """ Make a dirctory to put the image data in""" - base_name = os.path.basename(self.__orig_file) + base_name = os.path.basename(getattr(self.__orig_file, 'name', + self.__orig_file)) base_name = os.path.splitext(base_name)[0] if self.__out_file: - dir_name = os.path.dirname(self.__out_file) + dir_name = os.path.dirname(getattr(self.__out_file, 'name', + self.__out_file)) else: dir_name = os.path.dirname(self.__orig_file) # self.__output_to_file_func() From 142463228611fad78f79080b766ef77a22c762b5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 19 Apr 2009 19:27:57 -0700 Subject: [PATCH 116/319] EPUB input plugin now move OPF to top of tree as required by the rest of the pipeline --- src/calibre/ebooks/epub/from_any.py | 18 ----------------- src/calibre/ebooks/epub/input.py | 30 +++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index 68112592d2..2f3f81124f 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -19,24 +19,6 @@ from calibre.utils.zipfile import ZipFile from calibre.customize.ui import run_plugins_on_preprocess -def epub2opf(path, tdir, opts): - zf = ZipFile(path) - zf.extractall(tdir) - opts.chapter_mark = 'none' - encfile = os.path.join(tdir, 'META-INF', 'encryption.xml') - opf = None - for f in walk(tdir): - if f.lower().endswith('.opf'): - opf = f - break - if opf and os.path.exists(encfile): - if not process_encryption(encfile, opf): - raise DRMError(os.path.basename(path)) - - if opf is None: - raise ValueError('%s is not a valid EPUB file'%path) - return opf - SOURCE_FORMATS = ['lit', 'mobi', 
'prc', 'azw', 'fb2', 'odt', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 5c8a5c9d89..10bb321a11 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -11,12 +11,12 @@ from lxml import etree from calibre.customize.conversion import InputFormatPlugin class EPUBInput(InputFormatPlugin): - + name = 'EPUB Input' author = 'Kovid Goyal' description = 'Convert EPUB files (.epub) to HTML' file_types = set(['epub']) - + @classmethod def decrypt_font(cls, key, path): raw = open(path, 'rb').read() @@ -26,7 +26,7 @@ class EPUBInput(InputFormatPlugin): with open(path, 'wb') as f: f.write(decrypt) f.write(raw[1024:]) - + @classmethod def process_ecryption(cls, encfile, opf, log): key = None @@ -55,21 +55,35 @@ class EPUBInput(InputFormatPlugin): from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError + from calibre.ebooks.metadata.opf2 import OPF zf = ZipFile(stream) zf.extractall(os.getcwd()) encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) opf = None - for f in walk('.'): + for f in walk(u'.'): if f.lower().endswith('.opf'): - opf = f + opf = os.path.abspath(f) break path = getattr(stream, 'name', 'stream') - + if opf is None: raise ValueError('%s is not a valid EPUB file'%path) - + if os.path.exists(encfile): if not self.process_encryption(encfile, opf, log): raise DRMError(os.path.basename(path)) - return os.path.join(os.getcwd(), opf) + opf = os.path.relpath(opf, os.getcwdu()) + parts = os.path.split(opf) + if len(parts) > 1: + delta = '/'.join(parts[:-1])+'/' + opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) + for elem in opf.itermanifest(): + elem.set('href', delta+elem.get('href')) + for elem in opf.iterguide(): + elem.set('href', delta+elem.get('href')) + + with open('content.opf', 'wb') as nopf: + nopf.write(opf.render()) + + return 
os.path.abspath('content.opf') From c37c5436b0870d91a6df3358e30c9509a2e63726 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 19 Apr 2009 20:12:25 -0700 Subject: [PATCH 117/319] EPUB Input plugin now rasterizes cover and sets a type='titlepage' guide element to point to the HTML cover. The HTML cover is removed from the spine. --- src/calibre/ebooks/__init__.py | 5 ++++- src/calibre/ebooks/epub/input.py | 38 +++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index 26d2394818..79f4f7631e 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -59,7 +59,10 @@ class HTMLRenderer(object): def render_html(path_to_html, width=590, height=750): from PyQt4.QtWebKit import QWebPage - from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize + from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize, \ + QApplication + if QApplication.instance() is None: + QApplication([]) path_to_html = os.path.abspath(path_to_html) with CurrentDir(os.path.dirname(path_to_html)): page = QWebPage() diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 10bb321a11..919416ffdc 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -51,6 +51,39 @@ class EPUBInput(InputFormatPlugin): traceback.print_exc() return False + @classmethod + def rationalize_cover(self, opf): + guide_cover, guide_elem = None, None + for guide_elem in opf.iterguide(): + if guide_elem.get('type', '').lower() == 'cover': + guide_cover = guide_elem.get('href', '') + break + if not guide_cover: + return + spine = list(opf.iterspine()) + if not spine: + return + idref = spine[0].get('idref', '') + manifest = list(opf.itermanifest()) + if not manifest: + return + if manifest[0].get('id', False) != idref: + return + spine[0].getparent().remove(spine[0]) + guide_elem.set('href', 
'calibre_raster_cover.jpg') + for elem in list(opf.iterguide()): + if elem.get('type', '').lower() == 'titlepage': + elem.getparent().remove(elem) + from calibre.ebooks.oeb.base import OPF + t = etree.SubElement(guide_elem.getparent(), OPF('reference')) + t.set('type', 'titlepage') + t.set('href', guide_cover) + t.set('title', 'Title Page') + from calibre.ebooks import render_html + open('calibre_raster_cover.jpg', 'wb').write( + render_html(guide_cover).data) + + def convert(self, stream, options, file_ext, log, accelerators): from calibre.utils.zipfile import ZipFile from calibre import walk @@ -75,14 +108,17 @@ class EPUBInput(InputFormatPlugin): opf = os.path.relpath(opf, os.getcwdu()) parts = os.path.split(opf) + opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) + if len(parts) > 1: delta = '/'.join(parts[:-1])+'/' - opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) for elem in opf.itermanifest(): elem.set('href', delta+elem.get('href')) for elem in opf.iterguide(): elem.set('href', delta+elem.get('href')) + self.rationalize_cover(opf) + with open('content.opf', 'wb') as nopf: nopf.write(opf.render()) From 6ec37ff715039c71883a7d79450770317e5edbaf Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 20 Apr 2009 11:47:33 -0700 Subject: [PATCH 118/319] Add support for zip, rar and oebzip archives as input formats --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 54 ++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 6d5401278a..fd99e1e346 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -133,7 +133,7 @@ def add_pipeline_options(parser, plumber): [ 'level1_toc', 'level2_toc', 'level3_toc', 'toc_threshold', 'max_toc_links', 'no_chapters_in_toc', - 'use_auto_toc', + 'use_auto_toc', 'toc_filter', ] ), diff --git 
a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 3a2d39c314..2b78aca822 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -3,13 +3,21 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import os +import os, re from calibre.customize.conversion import OptionRecommendation from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format from calibre.ebooks.conversion.preprocess import HTMLPreProcessor from calibre.ptempfile import PersistentTemporaryDirectory +from calibre import extract, walk + +def supported_input_formats(): + from calibre.customize.ui import available_input_formats + fmts = available_input_formats() + for x in ('zip', 'rar', 'oebzip'): + fmts.add(x) + return fmts class OptionValues(object): pass @@ -279,11 +287,14 @@ OptionRecommendation(name='language', help=_('Set the language.')), ] - input_fmt = os.path.splitext(self.input)[1] if not input_fmt: raise ValueError('Input file must have an extension') input_fmt = input_fmt[1:].lower() + if input_fmt in ('zip', 'rar', 'oebzip'): + self.log('Processing archive...') + tdir = PersistentTemporaryDirectory('_plumber') + self.input, input_fmt = self.unarchive(self.input, tdir) if os.path.exists(self.output) and os.path.isdir(self.output): output_fmt = 'oeb' @@ -293,7 +304,7 @@ OptionRecommendation(name='language', output_fmt = '.oeb' output_fmt = output_fmt[1:].lower() - self.input_plugin = plugin_for_input_format(input_fmt) + self.input_plugin = plugin_for_input_format(input_fmt) self.output_plugin = plugin_for_output_format(output_fmt) if self.input_plugin is None: @@ -316,6 +327,43 @@ OptionRecommendation(name='language', # plugins. 
self.merge_plugin_recommendations() + @classmethod + def unarchive(self, path, tdir): + extract(path, tdir) + files = list(walk(tdir)) + from calibre.customize.ui import available_input_formats + fmts = available_input_formats() + for x in ('htm', 'html', 'xhtm', 'xhtml'): fmts.remove(x) + + for ext in fmts: + for f in files: + if f.lower().endswith('.'+ext): + if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: + continue + return f, ext + return self.find_html_index(files) + + @classmethod + def find_html_index(self, files): + ''' + Given a list of files, find the most likely root HTML file in the + list. + ''' + html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) + html_files = [f for f in files if html_pat.search(f) is not None] + if not html_files: + raise ValueError(_('Could not find an ebook inside the archive')) + html_files = [(f, os.stat(f).st_size) for f in html_files] + html_files.sort(cmp = lambda x, y: cmp(x[1], y[1])) + html_files = [f[0] for f in html_files] + for q in ('toc', 'index'): + for f in html_files: + if os.path.splitext(os.path.basename(f))[0].lower() == q: + return f, os.path.splitext(f)[1].lower()[1:] + return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] + + + def get_option_by_name(self, name): for group in (self.input_options, self.pipeline_options, self.output_options): From b34854b6e4682724b4f51924f851ac1693a9b87d Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 20 Apr 2009 13:55:21 -0700 Subject: [PATCH 119/319] Implement --extra-css --- src/calibre/ebooks/conversion/cli.py | 1 + src/calibre/ebooks/conversion/plumber.py | 11 +++++ src/calibre/ebooks/oeb/stylizer.py | 43 +++++++++++--------- src/calibre/ebooks/oeb/transforms/flatcss.py | 3 +- 4 files changed, 38 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index fd99e1e346..a30549cbc3 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ 
b/src/calibre/ebooks/conversion/cli.py @@ -116,6 +116,7 @@ def add_pipeline_options(parser, plumber): 'font_size_mapping', 'line_height', 'linearize_tables', + 'extra_css', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 2b78aca822..22c11303ad 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -219,6 +219,14 @@ OptionRecommendation(name='chapter_mark', 'to mark chapters.') ), +OptionRecommendation(name='extra_css', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Either the path to a CSS stylesheet or raw CSS. ' + 'This CSS will be appended to the style rules from ' + 'the source file, so it can be used to override those ' + 'rules.') + ), + OptionRecommendation(name='read_metadata_from_opf', @@ -487,6 +495,9 @@ OptionRecommendation(name='language', else: fkey = map(float, fkey.split(',')) + if self.opts.extra_css and os.path.exists(self.opts.extra_css): + self.opts.extra_css = open(self.opts.extra_css, 'rb').read() + flattener = CSSFlattener(fbase=fbase, fkey=fkey, lineh=self.opts.line_height, untable=self.opts.linearize_tables) diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 8bc82883e3..34abea32f5 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -88,7 +88,7 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', class CSSSelector(etree.XPath): MIN_SPACE_RE = re.compile(r' *([>~+]) *') LOCAL_NAME_RE = re.compile(r"(?<!local-)name[(][)] *= *'[^:]+:") - + def __init__(self, css, namespaces=XPNSMAP): css = self.MIN_SPACE_RE.sub(r'\1', css) path = css_to_xpath(css) @@ -103,10 +103,10 @@ class CSSSelector(etree.XPath): self.css) -class Stylizer(object): +class Stylizer(object): STYLESHEETS = WeakKeyDictionary() - - def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']): + + def __init__(self, tree, path, oeb, 
profile=PROFILES['PRS505'], extra_css=''): self.oeb = oeb self.profile = profile self.logger = oeb.logger @@ -135,6 +135,11 @@ class Stylizer(object): (path, item.href)) continue stylesheets.append(sitem.data) + if extra_css: + text = XHTML_CSS_NAMESPACE + extra_css + stylesheet = parser.parseString(text, href=cssname) + stylesheet.namespaces['h'] = XHTML_NS + stylesheets.append(stylesheet) rules = [] index = 0 self.stylesheets = set() @@ -159,7 +164,7 @@ class Stylizer(object): self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr() - + def _fetch_css_file(self, path): hrefs = self.oeb.manifest.hrefs if path not in hrefs: @@ -171,7 +176,7 @@ class Stylizer(object): return (None, None) data = item.data.cssText return ('utf-8', data) - + def flatten_rule(self, rule, href, index): results = [] if isinstance(rule, CSSStyleRule): @@ -185,7 +190,7 @@ class Stylizer(object): style = self.flatten_style(rule.style) self.page_rule.update(style) return results - + def flatten_style(self, cssstyle): style = {} for prop in cssstyle: @@ -202,7 +207,7 @@ class Stylizer(object): if size in FONT_SIZE_NAMES: style['font-size'] = "%dpt" % self.profile.fnames[size] return style - + def _normalize_edge(self, cssvalue, name): style = {} if isinstance(cssvalue, CSSValueList): @@ -224,7 +229,7 @@ class Stylizer(object): for edge, value in itertools.izip(edges, values): style["%s-%s" % (name, edge)] = value return style - + def _normalize_font(self, cssvalue): composition = ('font-style', 'font-variant', 'font-weight', 'font-size', 'line-height', 'font-family') @@ -271,7 +276,7 @@ class Stylizer(object): class Style(object): UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)$') - + def __init__(self, element, stylizer): self._element = element self._profile = stylizer.profile @@ -285,7 +290,7 @@ class Style(object): def _update_cssdict(self, cssdict): self._style.update(cssdict) - + def 
_apply_style_attr(self): attrib = self._element.attrib if 'style' not in attrib: @@ -297,7 +302,7 @@ class Style(object): except CSSSyntaxError: return self._style.update(self._stylizer.flatten_style(style)) - + def _has_parent(self): return (self._element.getparent() is not None) @@ -346,7 +351,7 @@ class Style(object): elif unit == 'in': result = value * 72.0 elif unit == 'pt': - result = value + result = value elif unit == 'em': font = font or self.fontSize result = value * font @@ -421,7 +426,7 @@ class Style(object): result = self._unit_convert(width, base=base) self._width = result return self._width - + @property def height(self): if self._height is None: @@ -463,27 +468,27 @@ class Style(object): result = 1.2 * self.fontSize self._lineHeight = result return self._lineHeight - + @property def marginTop(self): return self._unit_convert( self._get('margin-top'), base=self.height) - + @property def marginBottom(self): return self._unit_convert( self._get('margin-bottom'), base=self.height) - + @property def paddingTop(self): return self._unit_convert( self._get('padding-top'), base=self.height) - + @property def paddingBottom(self): return self._unit_convert( self._get('padding-bottom'), base=self.height) - + def __str__(self): items = self._style.items() items.sort() diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index b33042e10b..ca96d28a8d 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -116,7 +116,8 @@ class CSSFlattener(object): profile = self.context.source for item in self.oeb.spine: html = item.data - stylizer = Stylizer(html, item.href, self.oeb, profile) + stylizer = Stylizer(html, item.href, self.oeb, profile, + extra_css=self.context.extra_css) self.stylizers[item] = stylizer def baseline_node(self, node, stylizer, sizes, csize): From e63f8a2cb7a07cd821d12b4f8f4d7521de8a2572 Mon Sep 17 00:00:00 2001 From: Kovid Goyal 
<kovid@kovidgoyal.net> Date: Tue, 21 Apr 2009 14:20:36 -0700 Subject: [PATCH 120/319] Unquote hrefs when creating OPF 2.0 --- src/calibre/ebooks/oeb/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 70303470d7..a36ad8f676 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1026,7 +1026,7 @@ class Manifest(object): media_type = XHTML_MIME elif media_type in OEB_STYLES: media_type = CSS_MIME - attrib = {'id': item.id, 'href': item.href, + attrib = {'id': item.id, 'href': urlunquote(item.href), 'media-type': media_type} if item.fallback: attrib['fallback'] = item.fallback @@ -1238,7 +1238,7 @@ class Guide(object): def to_opf2(self, parent=None): elem = element(parent, OPF('guide')) for ref in self.refs.values(): - attrib = {'type': ref.type, 'href': ref.href} + attrib = {'type': ref.type, 'href': urlunquote(ref.href)} if ref.title: attrib['title'] = ref.title element(elem, OPF('reference'), attrib=attrib) From 14636efa240237ab7873c95c8ee8a3b59ed1a014 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 21 Apr 2009 15:21:07 -0700 Subject: [PATCH 121/319] Fix splitting code --- src/calibre/ebooks/oeb/transforms/split.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index bc7e4e195d..bee74c54a9 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -102,7 +102,10 @@ class Split(object): page_breaks.add(elem) for i, elem in enumerate(item.data.iter()): - elem.set('pb_order', str(i)) + try: + elem.set('pb_order', str(i)) + except TypeError: # Cant set attributes on comment nodes etc. 
+ continue page_breaks = list(page_breaks) page_breaks.sort(cmp= @@ -116,7 +119,7 @@ class Split(object): page_break_ids.append(id) for elem in item.data.iter(): - elem.attrib.pop('pb_order') + elem.attrib.pop('pb_order', False) if elem.get('pb_before', False): elem.attrib.pop('pb_before') From 68e7e1b1122b7b461dfe90f56062948084d7ee55 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Tue, 21 Apr 2009 19:09:03 -0400 Subject: [PATCH 122/319] initial ereader input --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/pdb/__init__.py | 26 +++ src/calibre/ebooks/pdb/ereader/__init__.py | 12 ++ .../ebooks/pdb/ereader/pmlconverter.py | 98 +++++++++ src/calibre/ebooks/pdb/ereader/reader.py | 199 ++++++++++++++++++ src/calibre/ebooks/pdb/header.py | 60 ++++++ src/calibre/ebooks/pdb/input.py | 32 +++ 7 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdb/__init__.py create mode 100644 src/calibre/ebooks/pdb/ereader/__init__.py create mode 100644 src/calibre/ebooks/pdb/ereader/pmlconverter.py create mode 100644 src/calibre/ebooks/pdb/ereader/reader.py create mode 100644 src/calibre/ebooks/pdb/header.py create mode 100644 src/calibre/ebooks/pdb/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 08824a3591..ade60fcc9f 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -278,6 +278,7 @@ class PDFMetadataWriter(MetadataWriterPlugin): from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.mobi.input import MOBIInput +from calibre.ebooks.pdb.input import PDBInput from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.lit.input import LITInput @@ -287,7 +288,7 @@ from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles -plugins = [HTML2ZIP, EPUBInput, 
MOBIInput, PDFInput, HTMLInput, +plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py new file mode 100644 index 0000000000..5e51a807e9 --- /dev/null +++ b/src/calibre/ebooks/pdb/__init__.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +from calibre.ebooks.pdb.ereader.reader import Reader as eReader + +FORMATS = { + 'PNPdPPrs' : eReader, + 'PNRdPPrs' : eReader, +} + +class PDBError(Exception): + pass + + +def get_reader(identity): + ''' + Returns None if no reader is found for the identity. + ''' + if identity in FORMATS.keys(): + return FORMATS[identity] + else: + return None diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py new file mode 100644 index 0000000000..f2f1761cad --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Write content to TXT. 
+''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +class EreaderError(Exception): + pass diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py new file mode 100644 index 0000000000..a85f1c84ac --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Convert pml markup to html +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import re + +from calibre.ebooks.htmlsymbols import HTML_SYMBOLS + +PML_HTML_RULES = [ + (re.compile('\\\\p'), lambda match: '<br /><br style="page-break-after: always;" />'), + (re.compile('\\\\x(?P<text>.+?)\\\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text')), + (re.compile('\\\\X(?P<val>[0-4])(?P<text>.+?)\\\\X[0-4]', re.DOTALL), lambda match: '<h%i style="page-break-before: always;">%i</h%i>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile('\\\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry + (re.compile('\\\\c(?P<text>.+?)\\\\c', re.DOTALL), lambda match: '<div style="text-align: center; display: block; margin: auto;">%s</div>' % match.group('text')), + (re.compile('\\\\r(?P<text>.+?)\\\\r', re.DOTALL), lambda match: '<div style="text-align: right; display: block;">%s</div>' % match.group('text')), + (re.compile('\\\\i(?P<text>.+?)\\\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text')), + (re.compile('\\\\u(?P<text>.+?)\\\\u', re.DOTALL), lambda match: '<div style="text-decoration: underline;">%s</div>' % match.group('text')), + (re.compile('\\\\o(?P<text>.+?)\\\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text')), + (re.compile('\\\\v(?P<text>.+?)\\\\v', re.DOTALL), 
lambda match: '<!-- %s -->' % match.group('text')), + (re.compile('\\\\t(?P<text>.+?)\\\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%">%s</div>' % match.group('text')), + (re.compile('\\\\T="(?P<val>\d+%*)"(?P<text>.+?)$', re.MULTILINE), lambda match: '<div style="margin-left: %i%">%s</div>' % (match.group('val'), match.group('text'))), + (re.compile('\\\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')), + (re.compile('\\\\n'), lambda match: ''), + (re.compile('\\\\s'), lambda match: ''), + (re.compile('\\\\b(?P<text>.+?)\\\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), # \b is deprecated; \B should be used instead. + (re.compile('\\\\l(?P<text>.+?)\\\\l', re.DOTALL), lambda match: '<big>%s</big>' % match.group('text')), + (re.compile('\\\\B(?P<text>.+?)\\\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text')), + (re.compile('\\\\Sp(?P<text>.+?)\\\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text')), + (re.compile('\\\\Sb(?P<text>.+?)\\\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text')), + (re.compile('\\\\k(?P<text>.+?)\\\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')), + (re.compile('\\\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')), + (re.compile('\\\\U(?P<num>\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))), + (re.compile('\\\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')), + (re.compile('\\\\q="(?P<target>#.+?)"(?P<text>)\\\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))), + (re.compile('\\\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')), + (re.compile('\\\\-'), lambda match: ''), + # Todo: Footnotes need link. 
+ (re.compile('\\\\Fn="(?P<target>.+?)"(?P<text>.+?)\\\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))), + (re.compile('\\\\Sd="(?P<target>.+?)"(?P<text>.+?)\\\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))), + (re.compile('\\\\I'), lambda match: ''), + + # eReader files are one paragraph per line. + # This forces the lines to wrap properly. + (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')), + + # Remove unmatched plm codes. + (re.compile('(?<=[^\\\\])\\\\[pxcriouvtblBk]'), lambda match: ''), + (re.compile('(?<=[^\\\\])\\\\X[0-4]'), lambda match: ''), + (re.compile('(?<=[^\\\\])\\\\Sp'), lambda match: ''), + (re.compile('(?<=[^\\\\])\\\\Sb'), lambda match: ''), + + # Replace \\ with \. + (re.compile('\\\\\\\\'), lambda match: '\\'), +] + +FOOTNOTE_HTML_RULES = [ + (re.compile('<footnote id="(?P<id>.+?)">(?P<text>.+?)</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>') +] + +SIDEBAR_HTML_RULES = [ + (re.compile('<sidebar id="(?P<id>.+?)">(?P<text>.+?)</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>') +] + + +def pml_to_html(pml): + html = pml + for rule in PML_HTML_RULES: + html = rule[0].sub(rule[1], html) + + for symbol in HTML_SYMBOLS.keys(): + if ord(symbol) > 128: + html = html.replace(symbol, HTML_SYMBOLS[symbol][len(HTML_SYMBOLS[symbol]) - 1]) + + return html + +def footnote_to_html(footnotes): + html = footnotes + for rule in FOOTNOTE_HTML_RULES: + html = rule[0].sub(rule[1], html) + + html = pml_to_html(html) + + return html + +def sidebar_to_html(sidebars): + html = sidebars + for rule in FOOTNOTE_HTML_RULES: + html = rule[0].sub(rule[1], html) + + html = pml_to_html(html) + + return html diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py new file mode 100644 index 0000000000..6883649921 --- /dev/null +++ 
b/src/calibre/ebooks/pdb/ereader/reader.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Read content from ereader pdb file. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os, sys, struct, zlib + +from calibre import CurrentDir +from calibre.ebooks import DRMError +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.pdb.ereader import EreaderError +from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ + footnote_to_html, sidebar_to_html +from calibre.ebooks.mobi.palmdoc import decompress_doc +from calibre.ebooks.metadata.opf2 import OPFCreator + +class HeaderRecord(object): + ''' + The first record in the file is always the header record. It holds + information related to the location of text, images, and so on + in the file. This is used in conjunction with the sections + defined in the file header. + ''' + + def __init__(self, raw): + self.version, = struct.unpack('>H', raw[0:2]) + self.non_text_offset, = struct.unpack('>H', raw[12:14]) + self.footnote_rec, = struct.unpack('>H', raw[28:30]) + self.sidebar_rec, = struct.unpack('>H', raw[30:32]) + self.bookmark_offset, = struct.unpack('>H', raw[32:34]) + self.image_data_offset, = struct.unpack('>H', raw[40:42]) + self.metadata_offset, = struct.unpack('>H', raw[44:46]) + self.footnote_offset, = struct.unpack('>H', raw[48:50]) + self.sidebar_offset, = struct.unpack('>H', raw[50:52]) + self.last_data_offset, = struct.unpack('>H', raw[52:54]) + + self.num_text_pages = self.non_text_offset -1 + self.num_image_pages = self.metadata_offset - self.image_data_offset + + # Can't tell which is sidebar and footnote if they have same offset. + # They don't exist if offset is larget than last_record. 
+ self.num_footnote_pages = self.sidebar_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 + self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 + + +class Reader(object): + + def __init__(self, header, stream): + raw = stream.read() + + self.sections = [] + for i in range(header.num_sections): + self.sections.append(header.section_data(i)) + + self.header_record = HeaderRecord(self.section_data(0)) + + if self.header_record.version not in (2, 10): + if self.header_record.version in (260, 272): + raise DRMError('eReader DRM is not supported.') + else: + raise EreaderError('Unknown book version %i.' % self.header_record.version) + + def section_data(self, number): + return self.sections[number] + + def decompress_text(self, number): + if self.header_record.version == 2: + return decompress_doc(self.section_data(number)).decode('cp1252') + if self.header_record.version == 10: + return zlib.decompress(self.section_data(number)).decode('cp1252') + + + def get_image(self, number): + if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: + return 'empty', '' + data = self.section_data(number) + name = data[4:4+32].strip('\0') + img = data[62:] + return name, img + + def get_text_page(self, number): + ''' + Only palmdoc and zlib compressed are supported. The text is + assumed to be encoded as Windows-1252. The encoding is part of + the eReader file spec and should always be this encoding. 
+ ''' + if number < 1 or number > self.header_record.num_text_pages: + return '' + + return self.decompress_text(number) + + def get_footnote_page(self, number): + if number < self.header_record.footnote_offset or number > self.header_record.footnote_offset + self.header_record.num_footnote_pages - 1: + return '' + + return self.decompress_text(number) + + def get_sidebar_page(self, number): + if number < self.header_record.sidebar_offset or number > self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1: + return '' + + return self.decompress_text(number) + + def has_footnotes(self): + if self.header_record.num_footnote_pages > 1: + try: + content = self.decompress_text(self.header_record.footnote_offset) + + if content.contains('</footnote>'): + return True + except: + pass + return False + + def has_sidebar(self): + if self.header_record.num_sidebar_pages > 1: + try: + content = self.decompress_text(self.header_record.sidebar_offset) + + if content.contains('</sidebar>'): + return True + except: + pass + return False + + def extract_content(self, output_dir): + output_dir = os.path.abspath(output_dir) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + html = '<html><head><title>' + + for i in range(1, self.header_record.num_text_pages + 1): + html += pml_to_html(self.get_text_page(i)) + + # Untested: The num_.._pages variable may not be correct! + # Possibly use .._rec instead? + ''' + if has_footnotes(): + html += '

    %s

    ' % _('Footnotes') + for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): + html += footnote_to_html(self.get_footnote_page(i)) + + if has_sidebar(): + html += '

    %s

    ' % _('Sidebar') + for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): + html += sidebar_to_html(self.get_sidebar_page(i)) + ''' + + html += '' + + with CurrentDir(output_dir): + with open('index.html', 'wb') as index: + index.write(html.encode('utf-8')) + + if not os.path.exists(os.path.join(output_dir, 'images/')): + os.makedirs(os.path.join(output_dir, 'images/')) + images = [] + with CurrentDir(os.path.join(output_dir, 'images/')): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + images.append(name) + with open(name, 'wb') as imgf: + imgf.write(img) + + self.create_opf(output_dir, images) + + return os.path.join(output_dir, 'metadata.opf') + + def create_opf(self, output_dir, images): + mi = MetaInformation(None, None) + + with CurrentDir(output_dir): + opf = OPFCreator(output_dir, mi) + + manifest = [('index.html', None)] + + for i in images: + manifest.append((os.path.join('images/', i), None)) + + opf.create_manifest(manifest) + opf.create_spine(['index.html']) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + def dump_pml(self): + pml = '' + + for i in range(1, self.header_record.num_text_pages + 1): + pml += self.get_text_page(i) + + return pml + diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py new file mode 100644 index 0000000000..a3aa56a718 --- /dev/null +++ b/src/calibre/ebooks/pdb/header.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Read the header data from a pdb file. 
+''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os, struct + +class PdbHeader(object): + + def __init__(self, stream): + self.stream = stream + self.ident = self.identity() + self.num_sections = self.section_count() + self.title = self.name() + + def identity(self): + self.stream.seek(60) + ident = self.stream.read(8) + return ident + + def section_count(self): + self.stream.seek(76) + return struct.unpack('>H', self.stream.read(2))[0] + + def name(self): + self.stream.seek(0) + return self.stream.read(32).replace('\x00', '') + + def full_section_info(self, number): + if number > self.num_sections: + raise ValueError('Not a valid section number %i' % number) + + self.stream.seek(78+number*8) + offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0] + flags, val = a1, a2<<16 | a3<<8 | a4 + return (offset, flags, val) + + def section_offset(self, number): + if number > self.num_sections: + raise ValueError('Not a valid section number %i' % number) + + self.stream.seek(78+number*8) + return struct.unpack('>LBBBB', self.stream.read(8))[0] + + def section_data(self, number): + if number > self.num_sections: + raise ValueError('Not a valid section number %i' % number) + + start = self.section_offset(number) + if number == self.num_sections -1: + end = os.stat(self.stream.name).st_size + else: + end = self.section_offset(number + 1) + self.stream.seek(start) + return self.stream.read(end - start) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py new file mode 100644 index 0000000000..47125f28ab --- /dev/null +++ b/src/calibre/ebooks/pdb/input.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.pdb.header import PdbHeader +from 
calibre.ebooks.pdb import PDBError, get_reader + +class PDBInput(InputFormatPlugin): + + name = 'PDB Input' + author = 'John Schember' + description = 'Convert PDB to HTML' + file_types = set(['pdb']) + + def convert(self, stream, options, file_ext, log, + accelerators): + header = PdbHeader(stream) + Reader = get_reader(header.ident) + + if Reader is None: + raise PDBError('Unknown format identity is %s' % header.identity) + + reader = Reader(header, stream) + opf = reader.extract_content(os.getcwd()) + + return opf From e968f529dab1949ef65c840107c77bf36b8aeec1 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 21 Apr 2009 19:37:37 -0400 Subject: [PATCH 123/319] Working eReader input. --- src/calibre/ebooks/pdb/__init__.py | 5 +++++ src/calibre/ebooks/pdb/ereader/pmlconverter.py | 1 - src/calibre/ebooks/pdb/ereader/reader.py | 15 ++++++++++++--- src/calibre/ebooks/pdb/input.py | 8 +++++--- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 5e51a807e9..8c4f45337f 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -12,6 +12,11 @@ FORMATS = { 'PNRdPPrs' : eReader, } +IDENTITY_TO_NAME = { + 'PNPdPPrs' : 'eReader', + 'PNRdPPrs' : 'eReader', +} + class PDBError(Exception): pass diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index a85f1c84ac..454510f699 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -40,7 +40,6 @@ PML_HTML_RULES = [ (re.compile('\\\\q="(?P#.+?)"(?P)\\\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile('\\\\Q="(?P.+?)"'), lambda match: '
    ' % match.group('target')), (re.compile('\\\\-'), lambda match: ''), - # Todo: Footnotes need link. (re.compile('\\\\Fn="(?P.+?)"(?P.+?)\\\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile('\\\\Sd="(?P.+?)"(?P.+?)\\\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile('\\\\I'), lambda match: ''), diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 6883649921..9354787447 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -44,13 +44,15 @@ class HeaderRecord(object): # Can't tell which is sidebar and footnote if they have same offset. # They don't exist if offset is larget than last_record. + # Todo: Determine if the subtraction is necessary and find out + # what _rec means. self.num_footnote_pages = self.sidebar_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 class Reader(object): - def __init__(self, header, stream): + def __init__(self, header, stream, log): raw = stream.read() self.sections = [] @@ -169,9 +171,9 @@ class Reader(object): with open(name, 'wb') as imgf: imgf.write(img) - self.create_opf(output_dir, images) + opf_path = self.create_opf(output_dir, images) - return os.path.join(output_dir, 'metadata.opf') + return opf_path def create_opf(self, output_dir, images): mi = MetaInformation(None, None) @@ -188,6 +190,8 @@ class Reader(object): opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: opf.render(opffile) + + return os.path.join(output_dir, 'metadata.opf') def dump_pml(self): pml = '' @@ -197,3 +201,8 @@ class Reader(object): return pml + +class EreaderMetadata(object): + + def __init__(self, record): + pass diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 
47125f28ab..d64e2aa51b 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -9,7 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdb.header import PdbHeader -from calibre.ebooks.pdb import PDBError, get_reader +from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader class PDBInput(InputFormatPlugin): @@ -24,9 +24,11 @@ class PDBInput(InputFormatPlugin): Reader = get_reader(header.ident) if Reader is None: - raise PDBError('Unknown format identity is %s' % header.identity) + raise PDBError('Unknown format in pdb file. Identity is %s' % header.identity) + + log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) - reader = Reader(header, stream) + reader = Reader(header, stream, log) opf = reader.extract_content(os.getcwd()) return opf From 3bbd277d2b95f2b539a11362a1be128bbb818de9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 22 Apr 2009 07:30:22 -0400 Subject: [PATCH 124/319] ereader reader debug output --- src/calibre/ebooks/pdb/ereader/__init__.py | 3 --- src/calibre/ebooks/pdb/ereader/pmlconverter.py | 2 +- src/calibre/ebooks/pdb/ereader/reader.py | 18 ++++++++++++------ src/calibre/ebooks/pdb/header.py | 6 +++--- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index f2f1761cad..89d9dfdd35 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -1,8 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import with_statement -''' -Write content to TXT. 
-''' __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 454510f699..250b74eb56 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import with_statement ''' -Convert pml markup to html +Convert pml markup to and from html ''' __license__ = 'GPL v3' diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 9354787447..f6bbc3d23f 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -46,15 +46,16 @@ class HeaderRecord(object): # They don't exist if offset is larget than last_record. # Todo: Determine if the subtraction is necessary and find out # what _rec means. - self.num_footnote_pages = self.sidebar_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 + end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset + self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 class Reader(object): def __init__(self, header, stream, log): - raw = stream.read() - + self.log = log + self.sections = [] for i in range(header.num_sections): self.sections.append(header.section_data(i)) @@ -91,19 +92,19 @@ class Reader(object): assumed to be encoded as Windows-1252. The encoding is part of the eReader file spec and should always be this encoding. 
''' - if number < 1 or number > self.header_record.num_text_pages: + if number not in range(1, self.header_record.num_text_pages): return '' return self.decompress_text(number) def get_footnote_page(self, number): - if number < self.header_record.footnote_offset or number > self.header_record.footnote_offset + self.header_record.num_footnote_pages - 1: + if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages): return '' return self.decompress_text(number) def get_sidebar_page(self, number): - if number < self.header_record.sidebar_offset or number > self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1: + if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1): return '' return self.decompress_text(number) @@ -139,6 +140,7 @@ class Reader(object): html = '' for i in range(1, self.header_record.num_text_pages + 1): + self.log.debug('Extracting text page %i' % i) html += pml_to_html(self.get_text_page(i)) # Untested: The num_.._pages variable may not be correct! @@ -147,11 +149,13 @@ class Reader(object): if has_footnotes(): html += '

    %s

    ' % _('Footnotes') for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): + self.log.debug('Extracting footnote page %i' % i) html += footnote_to_html(self.get_footnote_page(i)) if has_sidebar(): html += '

    %s

    ' % _('Sidebar') for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): + self.log.debug('Extracting sidebar page %i' % i) html += sidebar_to_html(self.get_sidebar_page(i)) ''' @@ -159,6 +163,7 @@ class Reader(object): with CurrentDir(output_dir): with open('index.html', 'wb') as index: + self.log.debug('Writing text to index.html') index.write(html.encode('utf-8')) if not os.path.exists(os.path.join(output_dir, 'images/')): @@ -169,6 +174,7 @@ class Reader(object): name, img = self.get_image(self.header_record.image_data_offset + i) images.append(name) with open(name, 'wb') as imgf: + self.log.debug('Writing image %s to images/' % name) imgf.write(img) opf_path = self.create_opf(output_dir, images) diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index a3aa56a718..efa727dac9 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -32,7 +32,7 @@ class PdbHeader(object): return self.stream.read(32).replace('\x00', '') def full_section_info(self, number): - if number > self.num_sections: + if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) self.stream.seek(78+number*8) @@ -41,14 +41,14 @@ class PdbHeader(object): return (offset, flags, val) def section_offset(self, number): - if number > self.num_sections: + if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) self.stream.seek(78+number*8) return struct.unpack('>LBBBB', self.stream.read(8))[0] def section_data(self, number): - if number > self.num_sections: + if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) start = self.section_offset(number) From f158c9c6430821568cdfd6a58ac7a08e948c8a93 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 22 Apr 2009 08:04:19 -0400 Subject: [PATCH 125/319] Interface for pdb format readers. 
PDB: support user input encodings --- src/calibre/ebooks/pdb/ereader/reader.py | 10 ++++++---- src/calibre/ebooks/pdb/formatreader.py | 18 ++++++++++++++++++ src/calibre/ebooks/pdb/input.py | 2 +- 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 src/calibre/ebooks/pdb/formatreader.py diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index f6bbc3d23f..b696005e85 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -13,6 +13,7 @@ import os, sys, struct, zlib from calibre import CurrentDir from calibre.ebooks import DRMError from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ footnote_to_html, sidebar_to_html @@ -51,10 +52,11 @@ class HeaderRecord(object): self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 -class Reader(object): +class Reader(FormatReader): - def __init__(self, header, stream, log): + def __init__(self, header, stream, log, encoding=None): self.log = log + self.encoding = encoding self.sections = [] for i in range(header.num_sections): @@ -73,9 +75,9 @@ class Reader(object): def decompress_text(self, number): if self.header_record.version == 2: - return decompress_doc(self.section_data(number)).decode('cp1252') + return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) if self.header_record.version == 10: - return zlib.decompress(self.section_data(number)).decode('cp1252') + return zlib.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding) def get_image(self, number): diff --git a/src/calibre/ebooks/pdb/formatreader.py b/src/calibre/ebooks/pdb/formatreader.py new file mode 100644 index 
0000000000..25abb462cf --- /dev/null +++ b/src/calibre/ebooks/pdb/formatreader.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Interface defining the necessary public functions for a pdb format reader. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + + +class FormatReader(object): + + def __init__(self, header, stream, log, encoding=None): + raise NotImplementedError() + + def extract_content(self, output_dir): + raise NotImplementedError() diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index d64e2aa51b..9d848b1c24 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -28,7 +28,7 @@ class PDBInput(InputFormatPlugin): log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) - reader = Reader(header, stream, log) + reader = Reader(header, stream, log, options.input_encoding) opf = reader.extract_content(os.getcwd()) return opf From 0b6dc7f8ed784e4a9df6bb59a13f5cb331a6c107 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 22 Apr 2009 14:35:32 -0700 Subject: [PATCH 126/319] Conversion pipeline is now a superset of any2epub :) --- src/calibre/ebooks/conversion/cli.py | 5 ++ src/calibre/ebooks/conversion/plumber.py | 74 +++++++++++++++- src/calibre/ebooks/epub/output.py | 22 +++++ src/calibre/ebooks/metadata/__init__.py | 3 + src/calibre/ebooks/oeb/base.py | 27 ++++-- src/calibre/ebooks/oeb/output.py | 3 +- src/calibre/ebooks/oeb/stylizer.py | 21 +++-- src/calibre/ebooks/oeb/transforms/flatcss.py | 43 ++++++++-- src/calibre/ebooks/oeb/transforms/guide.py | 47 +++++++++++ src/calibre/ebooks/oeb/transforms/jacket.py | 66 +++++++++++++++ src/calibre/ebooks/oeb/transforms/metadata.py | 84 +++++++++++++++++++ src/calibre/ebooks/oeb/transforms/split.py | 4 +- 12 files changed, 374 insertions(+), 25 deletions(-) create mode 100644 
src/calibre/ebooks/epub/output.py create mode 100644 src/calibre/ebooks/oeb/transforms/guide.py create mode 100644 src/calibre/ebooks/oeb/transforms/jacket.py create mode 100644 src/calibre/ebooks/oeb/transforms/metadata.py diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index a30549cbc3..ae0af532ab 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -117,6 +117,9 @@ def add_pipeline_options(parser, plumber): 'line_height', 'linearize_tables', 'extra_css', + 'margin_top', 'margin_left', 'margin_right', + 'margin_bottom', 'dont_justify', + 'insert_blank_line', 'remove_paragraph_spacing', ] ), @@ -124,6 +127,8 @@ def add_pipeline_options(parser, plumber): _('Control auto-detection of document structure.'), [ 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', + 'prefer_metadata_cover', 'remove_first_image', + 'insert_comments', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 22c11303ad..f55d677d08 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -195,7 +195,7 @@ OptionRecommendation(name='toc_filter', OptionRecommendation(name='chapter', recommended_value="//*[((name()='h1' or name()='h2') and " - "re:test(., 'chapter|book|section|part', 'i')) or @class " + r"re:test(., 'chapter|book|section|part\s+', 'i')) or @class " "= 'chapter']", level=OptionRecommendation.LOW, help=_('An XPath expression to detect chapter titles. The default ' 'is to consider

    or

    tags that contain the words ' @@ -227,6 +227,64 @@ OptionRecommendation(name='extra_css', 'rules.') ), +OptionRecommendation(name='margin_top', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the top margin in pts. Default is %default')), + +OptionRecommendation(name='margin_bottom', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the bottom margin in pts. Default is %default')), + +OptionRecommendation(name='margin_left', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the left margin in pts. Default is %default')), + +OptionRecommendation(name='margin_right', + recommended_value=5.0, level=OptionRecommendation.LOW, + help=_('Set the right margin in pts. Default is %default')), + +OptionRecommendation(name='dont_justify', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not force text to be justified in output. Whether text ' + 'is actually displayed justified or not depends on whether ' + 'the ebook format and reading device support justification.') + ), + +OptionRecommendation(name='remove_paragraph_spacing', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Remove spacing between paragraphs. Also sets an indent on ' + 'paragraphs of 1.5em. Spacing removal will not work ' + 'if the source file does not use paragraphs (

    or

    tags).') + ), + +OptionRecommendation(name='prefer_metadata_cover', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Use the cover detected from the source file in preference ' + 'to the specified cover.') + ), + +OptionRecommendation(name='insert_blank_line', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Insert a blank line between paragraphs. Will not work ' + 'if the source file does not use paragraphs (

    or

    tags).' + ) + ), + +OptionRecommendation(name='remove_first_image', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Remove the first image from the input ebook. Useful if the ' + 'first image in the source file is a cover and you are specifying ' + 'an external cover.' + ) + ), + +OptionRecommendation(name='insert_comments', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Insert the comments/summary from the book metadata at the start of ' + 'the book. This is useful if your ebook reader does not support ' + 'displaying the comments from the metadata.' + ) + ), OptionRecommendation(name='read_metadata_from_opf', @@ -244,7 +302,8 @@ OptionRecommendation(name='title', OptionRecommendation(name='authors', recommended_value=None, level=OptionRecommendation.LOW, - help=_('Set the authors. Multiple authors should be separated ')), + help=_('Set the authors. Multiple authors should be separated by ' + 'ampersands.')), OptionRecommendation(name='title_sort', recommended_value=None, level=OptionRecommendation.LOW, @@ -428,7 +487,6 @@ OptionRecommendation(name='language', mi.cover = None self.user_metadata = mi - def setup_options(self): ''' Setup the `self.opts` object. 
@@ -479,9 +537,16 @@ OptionRecommendation(name='language', if not hasattr(self.oeb, 'manifest'): self.oeb = create_oebbook(self.log, self.oeb, self.opts) + from calibre.ebooks.oeb.transforms.guide import Clean + Clean()(self.oeb, self.opts) + self.opts.source = self.opts.input_profile self.opts.dest = self.opts.output_profile + from calibre.ebooks.oeb.transforms.metadata import MergeMetadata + MergeMetadata()(self.oeb, self.user_metadata, + self.opts.prefer_metadata_cover) + from calibre.ebooks.oeb.transforms.structure import DetectStructure DetectStructure()(self.oeb, self.opts) @@ -495,6 +560,9 @@ OptionRecommendation(name='language', else: fkey = map(float, fkey.split(',')) + from calibre.ebooks.oeb.transforms.jacket import Jacket + Jacket()(self.oeb, self.opts) + if self.opts.extra_css and os.path.exists(self.opts.extra_css): self.opts.extra_css = open(self.opts.extra_css, 'rb').read() diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py new file mode 100644 index 0000000000..4ce13720e0 --- /dev/null +++ b/src/calibre/ebooks/epub/output.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.customize.conversion import OutputFormatPlugin +from calibre import CurrentDir + +class EPUBOutput(OutputFormatPlugin): + + name = 'EPUB Output' + author = 'Kovid Goyal' + file_type = 'epub' + + def convert(self, oeb, output_path, input_plugin, opts, log): + self.log, self.opts = log, opts + + diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index a14950a064..793c607527 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -260,6 +260,9 @@ class MetaInformation(object): x = 1.0 return '%d'%x if int(x) == x else '%.2f'%x + def authors_from_string(self, raw): 
+ self.authors = string_to_authors(raw) + def __unicode__(self): ans = [] def fmt(x, y): diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index a36ad8f676..81120aaf2e 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -514,7 +514,8 @@ class Metadata(object): scheme = Attribute(lambda term: 'scheme' if \ term == OPF('meta') else OPF('scheme'), [DC('identifier'), OPF('meta')]) - file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) + file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'), + DC('title')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) id = Attribute('id') @@ -593,6 +594,19 @@ class Metadata(object): yield key __iter__ = iterkeys + def clear(self, key): + l = self.items[key] + for x in list(l): + l.remove(x) + + def filter(self, key, predicate): + l = self.items[key] + for x in list(l): + if predicate(x): + l.remove(x) + + + def __getitem__(self, key): return self.items[key] @@ -1011,7 +1025,7 @@ class Manifest(object): media_type = OEB_DOC_MIME elif media_type in OEB_STYLES: media_type = OEB_CSS_MIME - attrib = {'id': item.id, 'href': item.href, + attrib = {'id': item.id, 'href': urlunquote(item.href), 'media-type': media_type} if item.fallback: attrib['fallback'] = item.fallback @@ -1202,6 +1216,9 @@ class Guide(object): self.refs[type] = ref return ref + def remove(self, type): + return self.refs.pop(type, None) + def iterkeys(self): for type in self.refs: yield type @@ -1229,7 +1246,7 @@ class Guide(object): def to_opf1(self, parent=None): elem = element(parent, 'guide') for ref in self.refs.values(): - attrib = {'type': ref.type, 'href': ref.href} + attrib = {'type': ref.type, 'href': urlunquote(ref.href)} if ref.title: attrib['title'] = ref.title element(elem, 'reference', attrib=attrib) @@ -1345,7 +1362,7 @@ class TOC(object): def to_opf1(self, tour): for node in self.nodes: element(tour, 
'site', attrib={ - 'title': node.title, 'href': node.href}) + 'title': node.title, 'href': urlunquote(node.href)}) node.to_opf1(tour) return tour @@ -1358,7 +1375,7 @@ class TOC(object): point = element(parent, NCX('navPoint'), attrib=attrib) label = etree.SubElement(point, NCX('navLabel')) element(label, NCX('text')).text = node.title - element(point, NCX('content'), src=node.href) + element(point, NCX('content'), src=urlunquote(node.href)) node.to_ncx(point) return parent diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index ba62897215..6f141f7e5e 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -9,6 +9,7 @@ from lxml import etree from calibre.customize.conversion import OutputFormatPlugin from calibre import CurrentDir +from urllib import unquote class OEBOutput(OutputFormatPlugin): @@ -32,7 +33,7 @@ class OEBOutput(OutputFormatPlugin): f.write(raw) for item in oeb_book.manifest: - path = os.path.abspath(item.href) + path = os.path.abspath(unquote(item.href)) dir = os.path.dirname(path) if not os.path.exists(dir): os.makedirs(dir) diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 34abea32f5..752a135db3 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -11,6 +11,7 @@ __copyright__ = '2008, Marshall T. 
Vandegrift ' import os import itertools import re +import logging import copy from weakref import WeakKeyDictionary from xml.dom import SyntaxErr as CSSSyntaxError @@ -106,7 +107,8 @@ class CSSSelector(etree.XPath): class Stylizer(object): STYLESHEETS = WeakKeyDictionary() - def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'], extra_css=''): + def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'], + extra_css='', user_css=''): self.oeb = oeb self.profile = profile self.logger = oeb.logger @@ -115,7 +117,8 @@ class Stylizer(object): cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [HTML_CSS_STYLESHEET] head = xpath(tree, '/h:html/h:head')[0] - parser = cssutils.CSSParser(fetcher=self._fetch_css_file) + parser = cssutils.CSSParser(fetcher=self._fetch_css_file, + log=logging.getLogger('calibre.css')) for elem in head: if elem.tag == XHTML('style') and elem.text \ and elem.get('type', CSS_MIME) in OEB_STYLES: @@ -135,11 +138,12 @@ class Stylizer(object): (path, item.href)) continue stylesheets.append(sitem.data) - if extra_css: - text = XHTML_CSS_NAMESPACE + extra_css - stylesheet = parser.parseString(text, href=cssname) - stylesheet.namespaces['h'] = XHTML_NS - stylesheets.append(stylesheet) + for x in (extra_css, user_css): + if x: + text = XHTML_CSS_NAMESPACE + x + stylesheet = parser.parseString(text, href=cssname) + stylesheet.namespaces['h'] = XHTML_NS + stylesheets.append(stylesheet) rules = [] index = 0 self.stylesheets = set() @@ -288,6 +292,9 @@ class Style(object): self._lineHeight = None stylizer._styles[element] = self + def set(self, prop, val): + self._style[prop] = val + def _update_cssdict(self, cssdict): self._style.update(cssdict) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index ca96d28a8d..216697ae53 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -114,12 +114,27 @@ class 
CSSFlattener(object): def stylize_spine(self): self.stylizers = {} profile = self.context.source + css = '' for item in self.oeb.spine: html = item.data + body = html.find(XHTML('body')) + bs = body.get('style', '').split(';') + bs.append('margin-top: 0pt') + bs.append('margin-bottom: 0pt') + bs.append('margin-left : %fpt'%\ + float(self.context.margin_left)) + bs.append('margin-right : %fpt'%\ + float(self.context.margin_right)) + bs.append('text-align: '+ \ + ('left' if self.context.dont_justify else 'justify')) + body.set('style', '; '.join(bs)) + stylizer = Stylizer(html, item.href, self.oeb, profile, - extra_css=self.context.extra_css) + user_css=self.context.extra_css, + extra_css=css) self.stylizers[item] = stylizer + def baseline_node(self, node, stylizer, sizes, csize): csize = stylizer.style(node)['font-size'] if node.text: @@ -219,6 +234,15 @@ class CSSFlattener(object): if self.lineh and 'line-height' not in cssdict: lineh = self.lineh / psize cssdict['line-height'] = "%0.5fem" % lineh + if (self.context.remove_paragraph_spacing or + self.context.insert_blank_line) and tag in ('p', 'div'): + for prop in ('margin', 'padding', 'border'): + for edge in ('top', 'bottom'): + cssdict['%s-%s'%(prop, edge)] = '0pt' + if self.context.insert_blank_line: + cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em' + if self.context.remove_paragraph_spacing: + cssdict['text-indent'] = '1.5em' if cssdict: items = cssdict.items() items.sort() @@ -253,12 +277,16 @@ class CSSFlattener(object): href = item.relhref(href) etree.SubElement(head, XHTML('link'), rel='stylesheet', type=CSS_MIME, href=href) - if stylizer.page_rule: - items = stylizer.page_rule.items() - items.sort() - css = '; '.join("%s: %s" % (key, val) for key, val in items) - style = etree.SubElement(head, XHTML('style'), type=CSS_MIME) - style.text = "@page { %s; }" % css + stylizer.page_rule['margin-top'] = '%fpt'%\ + float(self.context.margin_top) + stylizer.page_rule['margin-bottom'] = '%fpt'%\ + 
float(self.context.margin_bottom) + + items = stylizer.page_rule.items() + items.sort() + css = '; '.join("%s: %s" % (key, val) for key, val in items) + style = etree.SubElement(head, XHTML('style'), type=CSS_MIME) + style.text = "@page { %s; }" % css def replace_css(self, css): manifest = self.oeb.manifest @@ -285,3 +313,4 @@ class CSSFlattener(object): for item in self.oeb.spine: stylizer = self.stylizers[item] self.flatten_head(item, stylizer, href) + diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py new file mode 100644 index 0000000000..b20eddc6fe --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +class Clean(object): + '''Clean up guide, leaving only a pointer to the cover''' + + def __call__(self, oeb, opts): + from calibre.ebooks.oeb.base import urldefrag + self.oeb, self.log, self.opts = oeb, oeb.log, opts + + cover_href = '' + if 'cover' not in self.oeb.guide: + covers = [] + for x in ('other.ms-coverimage-standard', + 'other.ms-titleimage-standard', 'other.ms-titleimage', + 'other.ms-coverimage', 'other.ms-thumbimage-standard', + 'other.ms-thumbimage'): + if x in self.oeb.guide: + href = self.oeb.guide[x].href + item = self.oeb.manifest.hrefs[href] + covers.append([self.oeb.guide[x], len(item.data)]) + covers.sort(cmp=lambda x,y:cmp(x[1], y[1]), reverse=True) + if covers: + ref = covers[0][0] + if len(covers) > 1: + self.log('Choosing %s:%s as the cover'%(ref.type, ref.href)) + ref.type = 'cover' + self.oeb.guide.refs['cover'] = ref + cover_href = urldefrag(ref.href)[0] + + for x in list(self.oeb.guide): + href = urldefrag(self.oeb.guide[x].href)[0] + if x.lower() != 'cover': + try: + if href != cover_href: + 
self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) + except KeyError: + pass + self.oeb.guide.remove(x) + + diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py new file mode 100644 index 0000000000..c182faedfa --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import textwrap + +from lxml import etree + +from calibre.ebooks.oeb.base import XPNSMAP +from calibre import guess_type + +class Jacket(object): + ''' + Book jacket manipulation. Remove first image and insert comments at start of + book. + ''' + + JACKET_TEMPLATE = textwrap.dedent(u'''\ + + + %(title)s + + +

    %(title)s

    +

    %(jacket)s

    +
    + %(comments)s +
    + + + ''') + + def remove_first_image(self): + for i, item in enumerate(self.oeb.spine): + if i > 2: break + for img in item.data.xpath('//h:img[@src]', namespace=XPNSMAP): + href = item.abshref(img.get('src')) + image = self.oeb.manifest.hrefs.get(href, None) + if image is not None: + self.log('Removing first image', img.get('src')) + self.oeb.manifest.remove(image) + img.getparent().remove(img) + return + + def insert_comments(self, comments): + self.log('Inserting metadata comments into book...') + comments = comments.replace('\r\n', '\n').replace('\n\n', '

    ') + html = self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'], + title=self.opts.title, comments=comments, + jacket=_('Book Jacket')) + id, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml') + root = etree.fromstring(html) + item = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root) + self.oeb.spine.insert(0, item, True) + + + def __call__(self, oeb, opts): + self.oeb, self.opts, self.log = oeb, opts, oeb.log + if opts.remove_first_image: + self.remove_fisrt_image() + if opts.insert_comments and opts.comments: + self.insert_comments(opts.comments) diff --git a/src/calibre/ebooks/oeb/transforms/metadata.py b/src/calibre/ebooks/oeb/transforms/metadata.py new file mode 100644 index 0000000000..d2c4dd6309 --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/metadata.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os + +class MergeMetadata(object): + 'Merge in user metadata, including cover' + + def __call__(self, oeb, mi, prefer_metadata_cover=False): + from calibre.ebooks.oeb.base import DC + self.oeb, self.log = oeb, oeb.log + m = self.oeb.metadata + self.log('Merging user specified metadata...') + if mi.title: + m.clear('title') + m.add('title', mi.title) + if mi.title_sort: + if not m.title: + m.add(DC('title'), mi.title_sort) + m.title[0].file_as = mi.title_sort + if mi.authors: + m.filter('creator', lambda x : x.role.lower() == 'aut') + for a in mi.authors: + attrib = {'role':'aut'} + if mi.author_sort: + attrib['file_as'] = mi.author_sort + m.add('creator', a, attrib=attrib) + if mi.comments: + m.clear('description') + m.add('description', mi.comments) + if mi.publisher: + m.clear('publisher') + m.add('publisher', mi.publisher) + if mi.series: + m.clear('series') + m.add('series', mi.series) + if mi.isbn: + has = False + for x in m.identifier: + 
if x.scheme.lower() == 'isbn': + x.content = mi.isbn + has = True + if not has: + m.add('identifier', mi.isbn, scheme='ISBN') + if mi.language: + m.clear('language') + m.add('language', mi.language) + if mi.book_producer: + m.filter('creator', lambda x : x.role.lower() == 'bkp') + m.add('creator', mi.book_producer, role='bkp') + if mi.series_index is not None: + m.clear('series_index') + m.add('series_index', '%.2f'%mi.series_index) + if mi.rating is not None: + m.clear('rating') + m.add('rating', '%.2f'%mi.rating) + if mi.tags: + m.clear('subject') + for t in mi.tags: + m.add('subject', t) + + self.set_cover(mi, prefer_metadata_cover) + + def set_cover(self, mi, prefer_metadata_cover): + cdata = '' + if mi.cover and os.access(mi.cover, os.R_OK): + cdata = open(mi.cover, 'rb').read() + elif mi.cover_data and mi.cover_data[-1]: + cdata = mi.cover_data[1] + if not cdata: return + if 'cover' in self.oeb.guide: + if not prefer_metadata_cover: + href = self.oeb.guide['cover'].href + self.oeb.manifest.hrefs[href]._data = cdata + else: + id, href = self.oeb.manifest.generate('cover', 'cover.jpg') + self.oeb.manifest.add(id, href, 'image/jpeg', data=cdata) + self.oeb.guide.add('cover', 'Cover', href) + diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index bee74c54a9..b54b0ebce0 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -16,7 +16,7 @@ from lxml import etree from lxml.cssselect import CSSSelector from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ - urldefrag, rewrite_links + urldefrag, rewrite_links, urlunquote from calibre.ebooks.epub import tostring, rules @@ -142,7 +142,7 @@ class Split(object): nhref = anchor_map[frag if frag else None] nhref = self.current_item.relhref(nhref) if frag: - nhref = '#'.join((nhref, frag)) + nhref = '#'.join((urlunquote(nhref), frag)) return nhref return url From 2905b9aedb357827d988c8f8fc7bf5a2c33e5198 
Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 22 Apr 2009 14:37:34 -0700 Subject: [PATCH 127/319] IGN:... --- src/calibre/ebooks/oeb/transforms/guide.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index b20eddc6fe..06153c5a48 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -33,6 +33,8 @@ class Clean(object): ref.type = 'cover' self.oeb.guide.refs['cover'] = ref cover_href = urldefrag(ref.href)[0] + else: + cover_href = urldefrag(self.oeb.guide.refs['cover'].href)[0] for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] From f96cd13f623cdc029242dc73aadcf6a7abd93bdb Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 22 Apr 2009 19:11:55 -0400 Subject: [PATCH 128/319] EreaderReader: dump images function. --- src/calibre/ebooks/pdb/ereader/reader.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index b696005e85..f9b58633a6 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -202,6 +202,10 @@ class Reader(FormatReader): return os.path.join(output_dir, 'metadata.opf') def dump_pml(self): + ''' + This is primarily used for debugging and 3rd party tools to + get the plm markup that comprises the text in the file. + ''' pml = '' for i in range(1, self.header_record.num_text_pages + 1): @@ -209,7 +213,21 @@ class Reader(FormatReader): return pml - + def dump_images(self, output_dir): + ''' + This is primarily used for debugging and 3rd party tools to + get the images in the file. 
+ ''' + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with CurrentDir(output_dir): + for i in range(0, self.header_record.num_image_pages): + name, img = self.get_image(self.header_record.image_data_offset + i) + with open(name, 'wb') as imgf: + imgf.write(img) + + class EreaderMetadata(object): def __init__(self, record): From d871313ff0acfd1c7a56a097fbd21253fc3498d4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 23 Apr 2009 19:09:13 -0400 Subject: [PATCH 129/319] Unfinished ereader writer work. --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/pdb/ereader/output.py | 23 ++++ .../ebooks/pdb/ereader/pmlconverter.py | 110 ++++++++++++------ src/calibre/ebooks/pdb/ereader/reader.py | 5 - src/calibre/ebooks/pdb/ereader/writer.py | 20 ++++ src/calibre/ebooks/pdb/header.py | 19 ++- src/calibre/ebooks/pdb/input.py | 2 +- 7 files changed, 139 insertions(+), 43 deletions(-) create mode 100644 src/calibre/ebooks/pdb/ereader/output.py create mode 100644 src/calibre/ebooks/pdb/ereader/writer.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e0e9158f0e..945616a0ba 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -289,11 +289,12 @@ from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput +from calibre.ebooks.pdb.ereader.output import EREADEROutput from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, - FB2Input, ODTInput, RTFInput] + FB2Input, ODTInput, RTFInput, EREADEROutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git 
a/src/calibre/ebooks/pdb/ereader/output.py b/src/calibre/ebooks/pdb/ereader/output.py new file mode 100644 index 0000000000..034508b0da --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/output.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import OutputFormatPlugin, \ + OptionRecommendation +from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines, TxtMetadata +from calibre.ebooks.metadata import authors_to_string + +class EREADEROutput(OutputFormatPlugin): + + name = 'eReader PDB Output' + author = 'John Schember' + file_type = 'erpdb' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml + +# print html_to_pml('

    “A hundred kisses from the Princess,” said he, “or else let everyone keep his own!”

    ') + print html_to_pml(str(oeb_book.spine[3])) diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 250b74eb56..8ff30e9349 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -13,49 +13,49 @@ import re from calibre.ebooks.htmlsymbols import HTML_SYMBOLS PML_HTML_RULES = [ - (re.compile('\\\\p'), lambda match: '

    '), - (re.compile('\\\\x(?P.+?)\\\\x', re.DOTALL), lambda match: '

    %s

    ' % match.group('text')), - (re.compile('\\\\X(?P[0-4])(?P.+?)\\\\X[0-4]', re.DOTALL), lambda match: '%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), - (re.compile('\\\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile('\\\\c(?P.+?)\\\\c', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), - (re.compile('\\\\r(?P.+?)\\\\r', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), - (re.compile('\\\\i(?P.+?)\\\\i', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile('\\\\u(?P.+?)\\\\u', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), - (re.compile('\\\\o(?P.+?)\\\\o', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile('\\\\v(?P.+?)\\\\v', re.DOTALL), lambda match: '' % match.group('text')), - (re.compile('\\\\t(?P.+?)\\\\t', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), - (re.compile('\\\\T="(?P\d+%*)"(?P.+?)$', re.MULTILINE), lambda match: '
    %s
    ' % (match.group('val'), match.group('text'))), - (re.compile('\\\\w="(?P\d+)%"'), lambda match: '
    ' % match.group('val')), - (re.compile('\\\\n'), lambda match: ''), - (re.compile('\\\\s'), lambda match: ''), - (re.compile('\\\\b(?P.+?)\\\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead. - (re.compile('\\\\l(?P.+?)\\\\l', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile('\\\\B(?P.+?)\\\\B', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile('\\\\Sp(?P.+?)\\\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile('\\\\Sb(?P.+?)\\\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile('\\\\k(?P.+?)\\\\k', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile('\\\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), - (re.compile('\\\\U(?P\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))), - (re.compile('\\\\m="(?P.+?)"'), lambda match: '' % match.group('name')), - (re.compile('\\\\q="(?P#.+?)"(?P)\\\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), - (re.compile('\\\\Q="(?P.+?)"'), lambda match: '
    ' % match.group('target')), - (re.compile('\\\\-'), lambda match: ''), - (re.compile('\\\\Fn="(?P.+?)"(?P.+?)\\\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), - (re.compile('\\\\Sd="(?P.+?)"(?P.+?)\\\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), - (re.compile('\\\\I'), lambda match: ''), + (re.compile(r'\\p'), lambda match: '

    '), + (re.compile(r'\\x(?P.+?)\\x', re.DOTALL), lambda match: '

    %s

    ' % match.group('text')), + (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry + (re.compile(r'\\c(?P.+?)\\c', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), + (re.compile(r'\\r(?P.+?)\\r', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), + (re.compile(r'\\i(?P.+?)\\i', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\u(?P.+?)\\u', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), + (re.compile(r'\\o(?P.+?)\\o', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\v(?P.+?)\\v', re.DOTALL), lambda match: '' % match.group('text')), + (re.compile(r'\\t(?P.+?)\\t', re.DOTALL), lambda match: '
    %s
    ' % match.group('text')), + (re.compile(r'\\T="(?P\d+)%%*"(?P.+?)$', re.MULTILINE), lambda match: '
    %s
    ' % (match.group('val'), match.group('text'))), + (re.compile(r'\\w="(?P\d+)%%"'), lambda match: '
    ' % match.group('val')), + (re.compile(r'\\n'), lambda match: ''), + (re.compile(r'\\s'), lambda match: ''), + (re.compile(r'\\b(?P.+?)\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead. + (re.compile(r'\\l(?P.+?)\\l', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\B(?P.+?)\\B', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\Sp(?P.+?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\Sb(?P.+?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')), + (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%i;' % match.group('num')), + (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))), + (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % match.group('name')), + (re.compile(r'\\q="(?P#.+?)"(?P)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
    ' % match.group('target')), + (re.compile(r'\\-'), lambda match: ''), + (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\Sd="(?P.+?)"(?P.+?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\I'), lambda match: ''), # eReader files are one paragraph per line. # This forces the lines to wrap properly. (re.compile('^(?P.+)$', re.MULTILINE), lambda match: '

    %s

    ' % match.group('text')), # Remove unmatched plm codes. - (re.compile('(?<=[^\\\\])\\\\[pxcriouvtblBk]'), lambda match: ''), - (re.compile('(?<=[^\\\\])\\\\X[0-4]'), lambda match: ''), - (re.compile('(?<=[^\\\\])\\\\Sp'), lambda match: ''), - (re.compile('(?<=[^\\\\])\\\\Sb'), lambda match: ''), + (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''), + (re.compile(r'(?<=[^\\])\\X[0-4]'), lambda match: ''), + (re.compile(r'(?<=[^\\])\\Sp'), lambda match: ''), + (re.compile(r'(?<=[^\\])\\Sb'), lambda match: ''), # Replace \\ with \. - (re.compile('\\\\\\\\'), lambda match: '\\'), + (re.compile(r'\\\\'), lambda match: '\\'), ] FOOTNOTE_HTML_RULES = [ @@ -66,6 +66,37 @@ SIDEBAR_HTML_RULES = [ (re.compile('(?P.+?)', re.DOTALL), lambda match: '') ] +HTML_PML_RULES = [ + (re.compile(r'\\'), lambda match: '\\\\'), + (re.compile('(?<=[^\n])[ ]*'), lambda match: '\n

    '), + (re.compile('

    (^\n|\r\n)'), lambda match: '\n'), + (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))), + (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))), + (re.compile('.+?).*?">
    '), lambda match: '\\\\Q="%s"' % match.group('target')), + (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), + (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), + (re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%i' % int(match.group('num'))), + (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%i' % match.group('num')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), + (re.compile('\d+)%%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), + (re.compile('\d+)%%*;.*?>(?P.+?)
    ', re.MULTILINE), lambda match: '\\T="%i%%"%s$' % (match.group('val'), match.group('text'))), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), + (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), + (re.compile('(?P.+?)
    ', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), + (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%i%s\\X%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile('(?P.+?)

    ', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), + (re.compile(''), lambda match: '\\p'), + (re.compile('<.*?>'), lambda match: ''), + (re.compile(r'(\\p){2,}'), lambda match: r'\p'), +] def pml_to_html(pml): html = pml @@ -95,3 +126,12 @@ def sidebar_to_html(sidebars): html = pml_to_html(html) return html + +def html_to_pml(html): + pml = html + for rule in HTML_PML_RULES: + pml = rule[0].sub(rule[1], pml) + + # Replace symbols outside of cp1512 wtih \Uxxxx + + return pml diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index f9b58633a6..8a0abb970e 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -227,8 +227,3 @@ class Reader(FormatReader): with open(name, 'wb') as imgf: imgf.write(img) - -class EreaderMetadata(object): - - def __init__(self, record): - pass diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py new file mode 100644 index 0000000000..c9493d2915 --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +from __future__ import with_statement +''' +Write content to ereader pdb file. 
+''' + +from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml + +class Writer(object): + + def __init__(self, log): + self.oeb_book = oeb_book + + def dump(oeb_book): + pml_pages = [] + for page in oeb_book.spine: + pml_pages.append(html_to_pml(page)) + + + \ No newline at end of file diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index efa727dac9..5b47e48a16 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en' import os, struct -class PdbHeader(object): +class PdbHeaderReader(object): def __init__(self, stream): self.stream = stream @@ -58,3 +58,20 @@ class PdbHeader(object): end = self.section_offset(number + 1) self.stream.seek(start) return self.stream.read(end - start) + + +class PdbHeaderWriter(object): + + def __init__(self, identity, title): + self.identity = identity[:8] + self.title = title.ljust(32, '\x00')[:32] + + def build_header(self, sections) + ''' + Sections is a list of section offsets + ''' + + + + + return header diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 9d848b1c24..180e0814a6 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -20,7 +20,7 @@ class PDBInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - header = PdbHeader(stream) + header = PdbHeaderReader(stream) Reader = get_reader(header.ident) if Reader is None: From 4cd285859b6721c48eefd9b23fe47b0bfc5ab871 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 23 Apr 2009 22:31:11 -0700 Subject: [PATCH 130/319] Initial implementation of EPUB Output plugin --- src/calibre/customize/builtins.py | 3 +- src/calibre/customize/profiles.py | 4 +- src/calibre/ebooks/epub/__init__.py | 173 ------ src/calibre/ebooks/epub/fonts.py | 300 ---------- src/calibre/ebooks/epub/from_any.py | 93 --- src/calibre/ebooks/epub/from_feeds.py | 71 --- 
src/calibre/ebooks/epub/from_html.py | 547 ------------------ src/calibre/ebooks/epub/output.py | 221 ++++++- src/calibre/ebooks/oeb/iterator.py | 4 +- src/calibre/ebooks/oeb/transforms/guide.py | 13 +- src/calibre/ebooks/oeb/transforms/rescale.py | 37 ++ src/calibre/ebooks/oeb/transforms/split.py | 5 +- .../ebooks/oeb/transforms/structure.py | 19 +- 13 files changed, 285 insertions(+), 1205 deletions(-) delete mode 100644 src/calibre/ebooks/epub/fonts.py delete mode 100644 src/calibre/ebooks/epub/from_any.py delete mode 100644 src/calibre/ebooks/epub/from_feeds.py delete mode 100644 src/calibre/ebooks/epub/from_html.py create mode 100644 src/calibre/ebooks/oeb/transforms/rescale.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e0e9158f0e..c726a19b2a 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -287,13 +287,14 @@ from calibre.ebooks.odt.input import ODTInput from calibre.ebooks.rtf.input import RTFInput from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.oeb.output import OEBOutput +from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, - FB2Input, ODTInput, RTFInput] + FB2Input, ODTInput, RTFInput, EPUBOutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index c11529f025..67dd920135 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' 
__docformat__ = 'restructuredtext en' -import sys, re +import re from itertools import izip from calibre.customize import Plugin as _Plugin @@ -22,7 +22,7 @@ class Plugin(_Plugin): fbase = 12 fsizes = [5, 7, 9, 12, 13.5, 17, 20, 22, 24] - screen_size = (800, 600) + screen_size = (1600, 1200) dpi = 100 def __init__(self, *args, **kwargs): diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 2bc076a8ad..f5de8421e0 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -6,32 +6,7 @@ __docformat__ = 'restructuredtext en' ''' Conversion to EPUB. ''' -import sys, textwrap, re, os, uuid -from itertools import cycle -from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED -from calibre.ebooks.html import tostring -from lxml import etree - -class DefaultProfile(object): - - flow_size = sys.maxint - screen_size = None - remove_special_chars = False - remove_object_tags = False - -class PRS505(DefaultProfile): - - flow_size = 270000 - screen_size = (590, 765) - remove_special_chars = re.compile(u'[\u200b\u00ad]') - remove_object_tags = True - - -PROFILES = { - 'PRS505' : PRS505, - 'None' : DefaultProfile, - } def rules(stylesheets): for s in stylesheets: @@ -58,152 +33,4 @@ def initialize_container(path_to_container, opf_name='metadata.opf'): zf.writestr('META-INF/container.xml', CONTAINER) return zf -def config(defaults=None, name='epub'): - desc = _('Options to control the conversion to EPUB') - if defaults is None: - c = Config(name, desc) - else: - c = StringConfig(defaults, desc) - c.update(common_config()) - c.remove_opt('output') - c.remove_opt('zip') - - c.add_opt('output', ['-o', '--output'], default=None, - help=_('The output EPUB file. 
If not specified, it is ' - 'derived from the input file name.')) - c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()), - help=_('Profile of the target device this EPUB is meant for. ' - 'Set to None to create a device independent EPUB. ' - 'The profile is used for device specific restrictions ' - 'on the EPUB. Choices are: ')+str(list(PROFILES.keys()))) - c.add_opt('override_css', ['--override-css'], default=None, - help=_('Either the path to a CSS stylesheet or raw CSS. ' - 'This CSS will override any existing CSS ' - 'declarations in the source files.')) - structure = c.add_group('structure detection', - _('Control auto-detection of document structure.')) - structure('chapter', ['--chapter'], - default="//*[re:match(name(), 'h[1-2]') and " - "re:test(., 'chapter|book|section|part', 'i')] | " - "//*[@class = 'chapter']", - help=_('''\ -An XPath expression to detect chapter titles. The default is to consider

    <h1> or -<h2>

    tags that contain the words "chapter","book","section" or "part" as chapter titles as -well as any tags that have class="chapter". -The expression used must evaluate to a list of elements. To disable chapter detection, -use the expression "/". See the XPath Tutorial in the calibre User Manual for further -help on using this feature. -''').replace('\n', ' ')) - structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'], - default='pagebreak', - help=_('Specify how to mark detected chapters. A value of ' - '"pagebreak" will insert page breaks before chapters. ' - 'A value of "rule" will insert a line before chapters. ' - 'A value of "none" will disable chapter marking and a ' - 'value of "both" will use both page breaks and lines ' - 'to mark chapters.')) - structure('cover', ['--cover'], default=None, - help=_('Path to the cover to be used for this book')) - structure('prefer_metadata_cover', ['--prefer-metadata-cover'], default=False, - action='store_true', - help=_('Use the cover detected from the source file in preference ' - 'to the specified cover.')) - structure('remove_first_image', ['--remove-first-image'], default=False, - help=_('Remove the first image from the input ebook. Useful if ' - 'the first image in the source file is a cover and you ' - 'are specifying an external cover.')) - structure('dont_split_on_page_breaks', ['--dont-split-on-page-breaks'], default=False, - help=_('Turn off splitting at page breaks. Normally, input files ' - 'are automatically split at every page break into ' - 'two files. This gives an output ebook that can be parsed ' - 'faster and with less resources. However, splitting is ' - 'slow and if your source file contains a very large ' - 'number of page breaks, you should turn off splitting ' - 'on page breaks.')) - structure('page', ['--page'], default=None, - help=_('XPath expression to detect page boundaries for building ' - 'a custom pagination map, as used by AdobeDE. 
Default is ' - 'not to build an explicit pagination map.')) - structure('page_names', ['--page-names'], default=None, - help=_('XPath expression to find the name of each page in the ' - 'pagination map relative to its boundary element. ' - 'Default is to number all pages staring with 1.')) - toc = c.add_group('toc', - _('''\ -Control the automatic generation of a Table of Contents. If an OPF file is detected -and it specifies a Table of Contents, then that will be used rather than trying -to auto-generate a Table of Contents. -''').replace('\n', ' ')) - toc('max_toc_links', ['--max-toc-links'], default=50, - help=_('Maximum number of links to insert into the TOC. Set to 0 ' - 'to disable. Default is: %default. Links are only added to the ' - 'TOC if less than the --toc-threshold number of chapters were detected.')) - toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, - help=_("Don't add auto-detected chapters to the Table of Contents.")) - toc('toc_threshold', ['--toc-threshold'], default=6, - help=_('If fewer than this number of chapters is detected, then links ' - 'are added to the Table of Contents. Default: %default')) - toc('level1_toc', ['--level1-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level one. If this is specified, ' - 'it takes precedence over other forms of auto-detection.')) - toc('level2_toc', ['--level2-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level two. Each entry is added ' - 'under the previous level one entry.')) - toc('level3_toc', ['--level3-toc'], default=None, - help=_('XPath expression that specifies all tags that should be added ' - 'to the Table of Contents at level three. 
Each entry is added ' - 'under the previous level two entry.')) - toc('from_ncx', ['--from-ncx'], default=None, - help=_('Path to a .ncx file that contains the table of contents to use ' - 'for this ebook. The NCX file should contain links relative to ' - 'the directory it is placed in. See ' - 'http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for ' - 'an overview of the NCX format.')) - toc('use_auto_toc', ['--use-auto-toc'], default=False, - help=_('Normally, if the source file already has a Table of Contents, ' - 'it is used in preference to the auto-generated one. ' - 'With this option, the auto-generated one is always used.')) - - layout = c.add_group('page layout', _('Control page layout')) - layout('margin_top', ['--margin-top'], default=5.0, - help=_('Set the top margin in pts. Default is %default')) - layout('margin_bottom', ['--margin-bottom'], default=5.0, - help=_('Set the bottom margin in pts. Default is %default')) - layout('margin_left', ['--margin-left'], default=5.0, - help=_('Set the left margin in pts. Default is %default')) - layout('margin_right', ['--margin-right'], default=5.0, - help=_('Set the right margin in pts. Default is %default')) - layout('base_font_size2', ['--base-font-size'], default=12.0, - help=_('The base font size in pts. Default is %defaultpt. ' - 'Set to 0 to disable rescaling of fonts.')) - layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=False, - help=_('Remove spacing between paragraphs. ' - 'Also sets a indent on paragraphs of 1.5em. ' - 'You can override this by adding p {text-indent: 0cm} to ' - '--override-css. Spacing removal will not work if the source ' - 'file forces inter-paragraph spacing.')) - layout('no_justification', ['--no-justification'], default=False, - help=_('Do not force text to be justified in output.')) - layout('linearize_tables', ['--linearize-tables'], default=False, - help=_('Remove table markup, converting it into paragraphs. 
' - 'This is useful if your source file uses a table to manage layout.')) - layout('preserve_tag_structure', ['--preserve-tag-structure'], default=False, - help=_('Preserve the HTML tag structure while splitting large HTML files. ' - 'This is only neccessary if the HTML files contain CSS that ' - 'uses sibling selectors. Enabling this greatly slows down ' - 'processing of large HTML files.')) - - c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', - help=_('Print generated OPF file to stdout')) - c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', - help=_('Print generated NCX file to stdout')) - c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', - default=False, - help=_('Keep intermediate files during processing by html2epub')) - c.add_opt('extract_to', ['--extract-to'], group='debug', default=None, - help=_('Extract the contents of the produced EPUB file to the ' - 'specified directory.')) - return c diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py deleted file mode 100644 index 67e6066ed1..0000000000 --- a/src/calibre/ebooks/epub/fonts.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Font size rationalization. See :function:`relativize`. 
-''' - -import logging, re, operator, functools, collections, unittest, copy, sys -from xml.dom import SyntaxErr - -from lxml.cssselect import CSSSelector -from lxml import etree -from lxml.html import HtmlElement - -from calibre.ebooks.html_old import fromstring -from calibre.ebooks.epub import rules -from cssutils import CSSParser - -num = r'[-]?\d+|[-]?\d*\.\d+' -length = r'(?P0)|(?P{num})(?P%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num) -absolute_size = r'(?P(x?x-)?(small|large)|medium)' -relative_size = r'(?Psmaller|larger)' - -font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I) -line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) - -PTU = { - 'in' : 72., - 'cm' : 72/2.54, - 'mm' : 72/25.4, - 'pt' : 1.0, - 'pc' : 1/12., - } - -DEFAULT_FONT_SIZE = 12 - -class Rationalizer(object): - - @classmethod - def specificity(cls, s): - '''Map CSS specificity tuple to a single integer''' - return sum([10**(4-i) + x for i,x in enumerate(s)]) - - @classmethod - def compute_font_size(cls, elem): - ''' - Calculate the effective font size of an element traversing its ancestors as far as - neccessary. - ''' - cfs = elem.computed_font_size - if cfs is not None: - return - sfs = elem.specified_font_size - if callable(sfs): - parent = elem.getparent() - cls.compute_font_size(parent) - elem.computed_font_size = sfs(parent.computed_font_size) - else: - elem.computed_font_size = sfs - - @classmethod - def calculate_font_size(cls, style): - 'Return font size in pts from style object. 
For relative units returns a callable' - match = font_size_pat.search(style.font) - fs = '' - if match: - fs = match.group() - if style.fontSize: - fs = style.fontSize - - match = font_size_pat.search(fs) - if match is None: - return None - match = match.groupdict() - unit = match.get('unit', '') - if unit: unit = unit.lower() - if unit in PTU.keys(): - return PTU[unit] * float(match['num']) - if unit in ('em', 'ex'): - return functools.partial(operator.mul, float(match['num'])) - if unit == '%': - return functools.partial(operator.mul, float(match['num'])/100.) - abs = match.get('abs', '') - if abs: abs = abs.lower() - if abs: - x = (1.2)**(abs.count('x') * (-1 if 'small' in abs else 1)) - return 12 * x - if match.get('zero', False): - return 0. - return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) - - @classmethod - def resolve_rules(cls, stylesheets): - for sheet in stylesheets: - if hasattr(sheet, 'fs_rules'): - continue - sheet.fs_rules = [] - sheet.lh_rules = [] - for r in sheet: - if r.type == r.STYLE_RULE: - font_size = cls.calculate_font_size(r.style) - if font_size is not None: - for s in r.selectorList: - sheet.fs_rules.append([CSSSelector(s.selectorText), font_size]) - orig = line_height_pat.search(r.style.lineHeight) - if orig is not None: - for s in r.selectorList: - sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]]) - - - @classmethod - def apply_font_size_rules(cls, stylesheets, root): - 'Add a ``specified_font_size`` attribute to every element that has a specified font size' - cls.resolve_rules(stylesheets) - for sheet in stylesheets: - for selector, font_size in sheet.fs_rules: - elems = selector(root) - for elem in elems: - elem.specified_font_size = font_size - - @classmethod - def remove_font_size_information(cls, stylesheets): - for r in rules(stylesheets): - r.style.removeProperty('font-size') - try: - new = font_size_pat.sub('', 
r.style.font).strip() - if new: - r.style.font = new - else: - r.style.removeProperty('font') - except SyntaxErr: - r.style.removeProperty('font') - if line_height_pat.search(r.style.lineHeight) is not None: - r.style.removeProperty('line-height') - - @classmethod - def compute_font_sizes(cls, root, stylesheets, base=12): - stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')] - cls.apply_font_size_rules(stylesheets, root) - - # Compute the effective font size of all tags - root.computed_font_size = DEFAULT_FONT_SIZE - for elem in root.iter(etree.Element): - cls.compute_font_size(elem) - - extra_css = {} - if base > 0: - # Calculate the "base" (i.e. most common) font size - font_sizes = collections.defaultdict(lambda : 0) - body = root.xpath('//body')[0] - IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6') - for elem in body.iter(etree.Element): - if elem.tag not in IGNORE: - t = getattr(elem, 'text', '') - if t: t = t.strip() - if t: - font_sizes[elem.computed_font_size] += len(t) - - t = getattr(elem, 'tail', '') - if t: t = t.strip() - if t: - parent = elem.getparent() - if parent.tag not in IGNORE: - font_sizes[parent.computed_font_size] += len(t) - - try: - most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0] - scale = base/most_common if most_common > 0 else 1. - except ValueError: - scale = 1. 
- - # rescale absolute line-heights - counter = 0 - for sheet in stylesheets: - for selector, lh in sheet.lh_rules: - for elem in selector(root): - elem.set('id', elem.get('id', 'cfs_%d'%counter)) - counter += 1 - if not extra_css.has_key(elem.get('id')): - extra_css[elem.get('id')] = [] - extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale)) - - - - # Rescale all computed font sizes - for elem in body.iter(etree.Element): - if isinstance(elem, HtmlElement): - elem.computed_font_size *= scale - - # Remove all font size specifications from the last stylesheet - cls.remove_font_size_information(stylesheets[-1:]) - - # Create the CSS to implement the rescaled font sizes - for elem in body.iter(etree.Element): - cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent())) - if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.: - elem.set('id', elem.get('id', 'cfs_%d'%counter)) - counter += 1 - if not extra_css.has_key(elem.get('id')): - extra_css[elem.get('id')] = [] - extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs))) - - css = CSSParser(loglevel=logging.ERROR).parseString('') - for id, r in extra_css.items(): - css.add('#%s {%s}'%(id, ';'.join(r))) - return css - - @classmethod - def rationalize(cls, stylesheets, root, opts): - logger = logging.getLogger('html2epub') - logger.info('\t\tRationalizing fonts...') - extra_css = None - if opts.base_font_size2 > 0: - try: - extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2) - except: - logger.warning('Failed to rationalize font sizes.') - if opts.verbose > 1: - logger.exception('') - finally: - root.remove_font_size_information() - logger.debug('\t\tDone rationalizing') - return extra_css - -################################################################################ -############## Testing -################################################################################ - -class FontTest(unittest.TestCase): - - def setUp(self): - from 
calibre.ebooks.epub import config - self.opts = config(defaults='').parse() - self.html = ''' - - - Test document - - -
    - -

    Some text

    -
    -

    Some other text.

    -

    The longest piece of single font size text in this entire file. Used to test resizing.

    - - - ''' - self.root = fromstring(self.html) - - def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): - root1 = copy.deepcopy(self.root) - root1.computed_font_size = DEFAULT_FONT_SIZE - stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css) - stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base) - root2 = copy.deepcopy(root1) - root2.remove_font_size_information() - root2.computed_font_size = DEFAULT_FONT_SIZE - Rationalizer.apply_font_size_rules([stylesheet2], root2) - for elem in root2.iter(etree.Element): - Rationalizer.compute_font_size(elem) - for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): - self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, - msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\ - (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size)) - return stylesheet2.cssText - - def testStripping(self): - 'Test that any original entries are removed from the CSS' - css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }' - css = CSSParser(loglevel=logging.ERROR).parseString(css) - Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css]) - self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), - 'p{font:bolditalic}') - - def testIdentity(self): - 'Test that no unnecessary font size changes are made' - extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}') - self.assertEqual(extra_css.strip(), '') - - def testRelativization(self): - 'Test conversion of absolute to relative sizes' - self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}') - - def testResizing(self): - 'Test resizing of fonts' - self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}') - - -def suite(): - return unittest.TestLoader().loadTestsFromTestCase(FontTest) - -def test(): - unittest.TextTestRunner(verbosity=2).run(suite()) 
- -if __name__ == '__main__': - sys.exit(test()) - diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py deleted file mode 100644 index 2f3f81124f..0000000000 --- a/src/calibre/ebooks/epub/from_any.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Convert any ebook format to epub. -''' - -import sys, os, re -from contextlib import nested - -from calibre import extract, walk -from calibre.ebooks import DRMError -from calibre.ebooks.epub import config as common_config -from calibre.ebooks.epub.from_html import convert as html2epub, find_html_index -from calibre.ptempfile import TemporaryDirectory -from calibre.utils.zipfile import ZipFile -from calibre.customize.ui import run_plugins_on_preprocess - - -SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', - 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] - -def unarchive(path, tdir): - extract(path, tdir) - files = list(walk(tdir)) - - for ext in ['opf'] + list(MAP.keys()): - for f in files: - if f.lower().endswith('.'+ext): - if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: - continue - return f, ext - return find_html_index(files) - -def any2epub(opts, path, notification=None, create_epub=True, - oeb_cover=False, extract_to=None): - path = run_plugins_on_preprocess(path) - ext = os.path.splitext(path)[1] - if not ext: - raise ValueError('Unknown file type: '+path) - ext = ext.lower()[1:] - - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub' - - with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2): - if ext in ['rar', 'zip', 'oebzip']: - path, ext = unarchive(path, tdir1) - print 'Found %s file in archive'%(ext.upper()) - - if ext in MAP.keys(): - path = MAP[ext](path, tdir2, opts) - ext = 'opf' - - - if 
re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None: - raise ValueError('Conversion from %s is not supported'%ext.upper()) - - print 'Creating EPUB file...' - html2epub(path, opts, notification=notification, - create_epub=create_epub, oeb_cover=oeb_cover, - extract_to=extract_to) - -def config(defaults=None): - return common_config(defaults=defaults) - - -def formats(): - return ['html', 'rar', 'zip', 'oebzip']+list(MAP.keys()) - -USAGE = _('''\ -%%prog [options] filename - -Convert any of a large number of ebook formats to a %s file. Supported formats are: %s -''') - -def option_parser(usage=USAGE): - return config().option_parser(usage=usage%('EPUB', formats())) - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print 'No input file specified.' - return 1 - any2epub(opts, args[1]) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/epub/from_feeds.py b/src/calibre/ebooks/epub/from_feeds.py deleted file mode 100644 index 6a12353f50..0000000000 --- a/src/calibre/ebooks/epub/from_feeds.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Convert periodical content into EPUB ebooks. 
-''' -import sys, glob, os -from calibre.web.feeds.main import config as feeds2disk_config, USAGE, run_recipe -from calibre.ebooks.epub.from_html import config as html2epub_config -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.epub.from_html import convert as html2epub -from calibre import strftime, sanitize_file_name - -def config(defaults=None): - c = feeds2disk_config(defaults=defaults) - c.remove('lrf') - c.remove('epub') - c.remove('output_dir') - c.update(html2epub_config(defaults=defaults)) - c.remove('chapter_mark') - return c - -def option_parser(): - c = config() - return c.option_parser(usage=USAGE) - -def convert(opts, recipe_arg, notification=None): - opts.lrf = False - opts.epub = True - if opts.debug: - opts.verbose = 2 - parser = option_parser() - with TemporaryDirectory('_feeds2epub') as tdir: - opts.output_dir = tdir - recipe = run_recipe(opts, recipe_arg, parser, notification=notification) - c = config() - recipe_opts = c.parse_string(recipe.html2epub_options) - c.smart_update(recipe_opts, opts) - opts = recipe_opts - opts.chapter_mark = 'none' - opts.dont_split_on_page_breaks = True - opf = glob.glob(os.path.join(tdir, '*.opf')) - if not opf: - raise Exception('Downloading of recipe: %s failed'%recipe_arg) - opf = opf[0] - - if opts.output is None: - fname = recipe.title + strftime(recipe.timefmt) + '.epub' - opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname)) - - print 'Generating epub...' 
- opts.encoding = 'utf-8' - opts.remove_paragraph_spacing = True - html2epub(opf, opts, notification=notification) - - -def main(args=sys.argv, notification=None, handler=None): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2 and opts.feeds is None: - parser.print_help() - return 1 - recipe_arg = args[1] if len(args) > 1 else None - convert(opts, recipe_arg, notification=notification) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py deleted file mode 100644 index 3e1ec4c811..0000000000 --- a/src/calibre/ebooks/epub/from_html.py +++ /dev/null @@ -1,547 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Conversion of HTML/OPF files follows several stages: - - * All links in the HTML files or in the OPF manifest are - followed to build up a list of HTML files to be converted. - This stage is implemented by - :function:`calibre.ebooks.html.traverse` and - :class:`calibre.ebooks.html.HTMLFile`. - - * The HTML is pre-processed to make it more semantic. - All links in the HTML files to other resources like images, - stylesheets, etc. are relativized. The resources are copied - into the `resources` sub directory. This is accomplished by - :class:`calibre.ebooks.html.PreProcessor` and - :class:`calibre.ebooks.html.Parser`. - - * The HTML is processed. Various operations are performed. - All style declarations are extracted and consolidated into - a single style sheet. Chapters are auto-detected and marked. - Various font related manipulations are performed. See - :class:`HTMLProcessor`. - - * The processed HTML is saved and the - :module:`calibre.ebooks.epub.split` module is used to split up - large HTML files into smaller chunks. - - * The EPUB container is created. 
-''' - -import os, sys, cStringIO, logging, re, functools, shutil - -from lxml.etree import XPath -from lxml import html, etree -from PyQt4.Qt import QApplication, QPixmap, Qt - -from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\ - opf_traverse, create_metadata, rebase_toc, Link, parser -from calibre.ebooks.epub import config as common_config, tostring -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata.opf2 import OPF -from calibre.ebooks.epub import initialize_container, PROFILES -from calibre.ebooks.epub.split import split -from calibre.ebooks.epub.pages import add_page_map -from calibre.ebooks.epub.fonts import Rationalizer -from calibre.constants import preferred_encoding -from calibre.customize.ui import run_plugins_on_postprocess -from calibre import walk, CurrentDir, to_unicode, fit_image - -content = functools.partial(os.path.join, u'content') - -def remove_bad_link(element, attribute, link, pos): - if attribute is not None: - if element.tag in ['link']: - element.getparent().remove(element) - else: - element.set(attribute, '') - del element.attrib[attribute] - -def check_links(opf_path, pretty_print): - ''' - Find and remove all invalid links in the HTML files - ''' - logger = logging.getLogger('html2epub') - logger.info('\tChecking files for bad links...') - pathtoopf = os.path.abspath(opf_path) - with CurrentDir(os.path.dirname(pathtoopf)): - opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) - html_files = [] - for item in opf.itermanifest(): - if 'html' in item.get('media-type', '').lower(): - f = item.get('href').split('/')[-1] - if isinstance(f, str): - f = f.decode('utf-8') - html_files.append(os.path.abspath(content(f))) - - for path in html_files: - if not os.access(path, os.R_OK): - continue - base = os.path.dirname(path) - root = html.fromstring(open(content(path), 'rb').read(), parser=parser) - for element, attribute, link, pos in 
list(root.iterlinks()): - link = to_unicode(link) - plink = Link(link, base) - bad = False - if plink.path is not None and not os.path.exists(plink.path): - bad = True - if bad: - remove_bad_link(element, attribute, link, pos) - open(content(path), 'wb').write(tostring(root, pretty_print)) - -def find_html_index(files): - ''' - Given a list of files, find the most likely root HTML file in the - list. - ''' - html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) - html_files = [f for f in files if html_pat.search(f) is not None] - if not html_files: - raise ValueError(_('Could not find an ebook inside the archive')) - html_files = [(f, os.stat(f).st_size) for f in html_files] - html_files.sort(cmp = lambda x, y: cmp(x[1], y[1])) - html_files = [f[0] for f in html_files] - for q in ('toc', 'index'): - for f in html_files: - if os.path.splitext(os.path.basename(f))[0].lower() == q: - return f, os.path.splitext(f)[1].lower()[1:] - return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] - -def rescale_images(imgdir, screen_size, log): - pwidth, pheight = screen_size - if QApplication.instance() is None: - QApplication([]) - for f in os.listdir(imgdir): - path = os.path.join(imgdir, f) - if os.path.splitext(f)[1] in ('.css', '.js'): - continue - - p = QPixmap() - p.load(path) - if p.isNull(): - continue - width, height = p.width(), p.height() - scaled, new_width, new_height = fit_image(width, height, pwidth, - pheight) - if scaled: - log.info('Rescaling image: '+f) - p.scaled(new_width, new_height, Qt.IgnoreAspectRatio, - Qt.SmoothTransformation).save(path, 'JPEG') - - - - - -class HTMLProcessor(Processor, Rationalizer): - - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets): - Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, - name='html2epub') - if opts.verbose > 2: - self.debug_tree('parsed') - self.detect_chapters() - - self.extract_css(stylesheets) - if self.opts.base_font_size2 > 0: - 
self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet], - self.root, self.opts) - if opts.verbose > 2: - self.debug_tree('nocss') - - if hasattr(self.body, 'xpath'): - for script in list(self.body.xpath('descendant::script')): - script.getparent().remove(script) - - self.fix_markup() - - def convert_image(self, img): - rpath = img.get('src', '') - path = os.path.join(os.path.dirname(self.save_path()), *rpath.split('/')) - if os.path.exists(path) and os.path.isfile(path): - if QApplication.instance() is None: - app = QApplication([]) - app - p = QPixmap() - p.load(path) - if not p.isNull(): - p.save(path + '_calibre_converted.jpg') - os.remove(path) - for key, val in self.resource_map.items(): - if val == rpath: - self.resource_map[key] = rpath+'_calibre_converted.jpg' - img.set('src', rpath+'_calibre_converted.jpg') - - def fix_markup(self): - ''' - Perform various markup transforms to get the output to render correctly - in the quirky ADE. - ''' - # Replace
    that are children of as ADE doesn't handle them - if hasattr(self.body, 'xpath'): - for br in self.body.xpath('./br'): - if br.getparent() is None: - continue - try: - sibling = br.itersiblings().next() - except: - sibling = None - br.tag = 'p' - br.text = u'\u00a0' - if (br.tail and br.tail.strip()) or sibling is None or \ - getattr(sibling, 'tag', '') != 'br': - style = br.get('style', '').split(';') - style = filter(None, map(lambda x: x.strip(), style)) - style.append('margin: 0pt; border:0pt; height:0pt') - br.set('style', '; '.join(style)) - else: - sibling.getparent().remove(sibling) - if sibling.tail: - if not br.tail: - br.tail = '' - br.tail += sibling.tail - - - if self.opts.profile.remove_object_tags: - for tag in self.root.xpath('//embed'): - tag.getparent().remove(tag) - for tag in self.root.xpath('//object'): - if tag.get('type', '').lower().strip() in ('image/svg+xml',): - continue - tag.getparent().remove(tag) - - - for tag in self.root.xpath('//title|//style'): - if not tag.text: - tag.getparent().remove(tag) - for tag in self.root.xpath('//script'): - if not tag.text and not tag.get('src', False): - tag.getparent().remove(tag) - - for tag in self.root.xpath('//form'): - tag.getparent().remove(tag) - - for tag in self.root.xpath('//center'): - tag.tag = 'div' - tag.set('style', 'text-align:center') - - if self.opts.linearize_tables: - for tag in self.root.xpath('//table | //tr | //th | //td'): - tag.tag = 'div' - - # ADE can't handle & in an img url - for tag in self.root.xpath('//img[@src]'): - tag.set('src', tag.get('src', '').replace('&', '')) - - - def save(self): - for meta in list(self.root.xpath('//meta')): - meta.getparent().remove(meta) - # Strip all comments since Adobe DE is petrified of them - Processor.save(self, strip_comments=True) - - def remove_first_image(self): - images = self.root.xpath('//img') - if images: - images[0].getparent().remove(images[0]) - return True - return False - - - - -def config(defaults=None): - return 
common_config(defaults=defaults) - -def option_parser(): - c = config() - return c.option_parser(usage=_('''\ -%prog [options] file.html|opf - -Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file. -If you specify an OPF file instead of an HTML file, the list of links is takes from -the element of the OPF file. -''')) - -def parse_content(filelist, opts, tdir): - os.makedirs(os.path.join(tdir, 'content', 'resources')) - resource_map, stylesheets = {}, {} - toc = TOC(base_path=tdir, type='root') - stylesheet_map = {} - first_image_removed = False - for htmlfile in filelist: - logging.getLogger('html2epub').debug('Processing %s...'%htmlfile) - hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), - resource_map, filelist, stylesheets) - if not first_image_removed and opts.remove_first_image: - first_image_removed = hp.remove_first_image() - hp.populate_toc(toc) - hp.save() - stylesheet_map[os.path.basename(hp.save_path())] = \ - [s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None] - - logging.getLogger('html2epub').debug('Saving stylesheets...') - if opts.base_font_size2 > 0: - Rationalizer.remove_font_size_information(stylesheets.values()) - for path, css in stylesheets.items(): - raw = getattr(css, 'cssText', css) - if isinstance(raw, unicode): - raw = raw.encode('utf-8') - open(path, 'wb').write(raw) - if toc.count('chapter') > opts.toc_threshold: - toc.purge(['file', 'link', 'unknown']) - if toc.count('chapter') + toc.count('file') > opts.toc_threshold: - toc.purge(['link', 'unknown']) - toc.purge(['link'], max=opts.max_toc_links) - - return resource_map, hp.htmlfile_map, toc, stylesheet_map - -TITLEPAGE = '''\ - - - Cover - - - -
    - cover -
    - - -''' - -def create_cover_image(src, dest, screen_size, rescale_cover=True): - try: - from PyQt4.Qt import QImage, Qt - if QApplication.instance() is None: - QApplication([]) - im = QImage() - im.load(src) - if im.isNull(): - raise ValueError('Invalid cover image') - if rescale_cover and screen_size is not None: - width, height = im.width(), im.height() - dw, dh = (screen_size[0]-width)/float(width), (screen_size[1]-height)/float(height) - delta = min(dw, dh) - if delta > 0: - nwidth = int(width + delta*(width)) - nheight = int(height + delta*(height)) - im = im.scaled(int(nwidth), int(nheight), Qt.IgnoreAspectRatio, Qt.SmoothTransformation) - im.save(dest) - except: - import traceback - traceback.print_exc() - return False - return True - -def process_title_page(mi, filelist, htmlfilemap, opts, tdir): - old_title_page = None - f = lambda x : os.path.normcase(os.path.normpath(x)) - if not isinstance(mi.cover, basestring): - mi.cover = None - if mi.cover: - if f(filelist[0].path) == f(mi.cover): - old_title_page = htmlfilemap[filelist[0].path] - #logger = logging.getLogger('html2epub') - metadata_cover = mi.cover - if metadata_cover and not os.path.exists(metadata_cover): - metadata_cover = None - - cpath = '/'.join(('resources', '_cover_.jpg')) - cover_dest = os.path.join(tdir, 'content', *cpath.split('/')) - if metadata_cover is not None: - if not create_cover_image(metadata_cover, cover_dest, - opts.profile.screen_size): - metadata_cover = None - specified_cover = opts.cover - if specified_cover and not os.path.exists(specified_cover): - specified_cover = None - if specified_cover is not None: - if not create_cover_image(specified_cover, cover_dest, - opts.profile.screen_size): - specified_cover = None - - cover = metadata_cover if specified_cover is None or (opts.prefer_metadata_cover and metadata_cover is not None) else specified_cover - - if cover is not None: - titlepage = TITLEPAGE%cpath - tp = 'calibre_title_page.html' if old_title_page is None else 
old_title_page - tppath = os.path.join(tdir, 'content', tp) - with open(tppath, 'wb') as f: - f.write(titlepage) - return tp if old_title_page is None else None, True - elif os.path.exists(cover_dest): - os.remove(cover_dest) - return None, old_title_page is not None - -def find_oeb_cover(htmlfile): - if os.stat(htmlfile).st_size > 2048: - return None - match = re.search(r'(?i)]+src\s*=\s*[\'"](.+?)[\'"]', open(htmlfile, 'rb').read()) - if match: - return match.group(1) - -def condense_ncx(ncx_path): - tree = etree.parse(ncx_path) - for tag in tree.getroot().iter(tag=etree.Element): - if tag.text: - tag.text = tag.text.strip() - if tag.tail: - tag.tail = tag.tail.strip() - compressed = etree.tostring(tree.getroot(), encoding='utf-8') - open(ncx_path, 'wb').write(compressed) - -def convert(htmlfile, opts, notification=None, create_epub=True, - oeb_cover=False, extract_to=None): - htmlfile = os.path.abspath(htmlfile) - if opts.output is None: - opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub' - opts.profile = PROFILES[opts.profile] - opts.output = os.path.abspath(opts.output) - if opts.override_css is not None: - try: - opts.override_css = open(opts.override_css, 'rb').read().decode(preferred_encoding, 'replace') - except: - opts.override_css = opts.override_css.decode(preferred_encoding, 'replace') - if opts.from_opf: - opts.from_opf = os.path.abspath(opts.from_opf) - if opts.from_ncx: - opts.from_ncx = os.path.abspath(opts.from_ncx) - if htmlfile.lower().endswith('.opf'): - opf = OPF(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) - filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) - if not filelist: - # Bad OPF look for a HTML file instead - htmlfile = find_html_index(walk(os.path.dirname(htmlfile)))[0] - if htmlfile is None: - raise ValueError('Could not find suitable file to convert.') - filelist = get_filelist(htmlfile, opts)[1] - mi = merge_metadata(None, opf, opts) - else: - opf, filelist = 
get_filelist(htmlfile, opts) - mi = merge_metadata(htmlfile, opf, opts) - opts.chapter = XPath(opts.chapter, - namespaces={'re':'http://exslt.org/regular-expressions'}) - for x in (1, 2, 3): - attr = 'level%d_toc'%x - if getattr(opts, attr): - setattr(opts, attr, XPath(getattr(opts, attr), - namespaces={'re':'http://exslt.org/regular-expressions'})) - else: - setattr(opts, attr, None) - - with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir: - if opts.keep_intermediate: - print 'Intermediate files in', tdir - resource_map, htmlfile_map, generated_toc, stylesheet_map = \ - parse_content(filelist, opts, tdir) - logger = logging.getLogger('html2epub') - resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] - - - title_page, has_title_page = process_title_page(mi, filelist, htmlfile_map, opts, tdir) - spine = [htmlfile_map[f.path] for f in filelist] - if not oeb_cover and title_page is not None: - spine = [title_page] + spine - mi.cover = None - mi.cover_data = (None, None) - - - mi = create_metadata(tdir, mi, spine, resources) - buf = cStringIO.StringIO() - if mi.toc: - rebase_toc(mi.toc, htmlfile_map, tdir) - if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2: - mi.toc = generated_toc - if opts.from_ncx: - toc = TOC() - toc.read_ncx_toc(opts.from_ncx) - mi.toc = toc - for item in mi.manifest: - if getattr(item, 'mime_type', None) == 'text/html': - item.mime_type = 'application/xhtml+xml' - opf_path = os.path.join(tdir, 'metadata.opf') - with open(opf_path, 'wb') as f: - mi.render(f, buf, 'toc.ncx') - toc = buf.getvalue() - if toc: - with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f: - f.write(toc) - if opts.show_ncx: - print toc - split(opf_path, opts, stylesheet_map) - if opts.page: - logger.info('\tBuilding page map...') - add_page_map(opf_path, opts) - check_links(opf_path, opts.pretty_print) - - opf = OPF(opf_path, tdir) - opf.remove_guide() - oeb_cover_file = None - if oeb_cover and 
title_page is not None: - oeb_cover_file = find_oeb_cover(os.path.join(tdir, 'content', title_page)) - if has_title_page or (oeb_cover and oeb_cover_file): - opf.create_guide_element() - if has_title_page and not oeb_cover: - opf.add_guide_item('cover', 'Cover', 'content/'+spine[0]) - if oeb_cover and oeb_cover_file: - opf.add_guide_item('cover', 'Cover', 'content/'+oeb_cover_file) - - cpath = os.path.join(tdir, 'content', 'resources', '_cover_.jpg') - if os.path.exists(cpath): - opf.add_path_to_manifest(cpath, 'image/jpeg') - with open(opf_path, 'wb') as f: - f.write(opf.render()) - ncx_path = os.path.join(os.path.dirname(opf_path), 'toc.ncx') - if os.path.exists(ncx_path) and os.stat(ncx_path).st_size > opts.profile.flow_size: - logger.info('Condensing NCX from %d bytes...'%os.stat(ncx_path).st_size) - condense_ncx(ncx_path) - if os.stat(ncx_path).st_size > opts.profile.flow_size: - logger.warn('NCX still larger than allowed size at %d bytes. Menu based Table of Contents may not work on device.'%os.stat(ncx_path).st_size) - - if opts.profile.screen_size is not None: - rescale_images(os.path.join(tdir, 'content', 'resources'), - opts.profile.screen_size, logger) - - if create_epub: - epub = initialize_container(opts.output) - epub.add_dir(tdir) - epub.close() - run_plugins_on_postprocess(opts.output, 'epub') - logger.info(_('Output written to ')+opts.output) - - if opts.show_opf: - print open(opf_path, 'rb').read() - - if opts.extract_to is not None: - if os.path.exists(opts.extract_to): - shutil.rmtree(opts.extract_to) - shutil.copytree(tdir, opts.extract_to) - - if extract_to is not None: - if os.path.exists(extract_to): - shutil.rmtree(extract_to) - shutil.copytree(tdir, extract_to) - - - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print _('You must specify an input HTML file') - return 1 - convert(args[1], opts) - return 0 - -if __name__ == '__main__': - 
sys.exit(main()) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 4ce13720e0..a43ca4e5e3 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -6,9 +6,15 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import os +from urllib import unquote from calibre.customize.conversion import OutputFormatPlugin -from calibre import CurrentDir +from calibre.ptempfile import TemporaryDirectory +from calibre.constants import __appname__, __version__ +from calibre import strftime, guess_type +from lxml import etree + class EPUBOutput(OutputFormatPlugin): @@ -16,7 +22,218 @@ class EPUBOutput(OutputFormatPlugin): author = 'Kovid Goyal' file_type = 'epub' + TITLEPAGE_COVER = '''\ + + + Cover + + + +
    + cover +
    + + +''' + + TITLEPAGE = '''\ + + + + + +

    %(title)s

    +

    +
    +
    + calibre +
    +
    +

    %(date)s

    +




    +

    %(author)s

    +








    +

    Produced by %(app)s

    +
    +
    + + +''' + def convert(self, oeb, output_path, input_plugin, opts, log): - self.log, self.opts = log, opts + self.log, self.opts, self.oeb = log, opts, oeb + + self.workaround_ade_quirks() + + from calibre.ebooks.oeb.transforms.rescale import RescaleImages + RescaleImages()(oeb, opts) + self.insert_cover() + + with TemporaryDirectory('_epub_output') as tdir: + from calibre.customize.ui import plugin_for_output_format + oeb_output = plugin_for_output_format('oeb') + oeb_output.convert(oeb, tdir, input_plugin, opts, log) + opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0] + self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\ + if x.endswith('.ncx')][0]) + + from calibre.epub import initialize_container + epub = initialize_container(output_path, os.path.basename(opf)) + epub.add_dir(tdir) + epub.close() + + def default_cover(self): + ''' + Create a generic cover for books that dont have a cover + ''' + try: + from calibre.gui2 import images_rc # Needed for access to logo + from PyQt4.Qt import QApplication, QFile, QIODevice + except: + return None + from calibre.ebooks.metadata import authors_to_string + images_rc + m = self.oeb.metadata + title = unicode(m.title[0]) + a = [unicode(x) for x in m.creators if m.role == 'aut'] + author = authors_to_string(a) + if QApplication.instance() is None: QApplication([]) + f = QFile(':/library') + f.open(QIODevice.ReadOnly) + img_data = str(f.readAll()) + id, href = self.oeb.manifest.generate('calibre-logo', + 'calibre-logo.png') + self.oeb.manifest.add(id, href, 'image/png', data=img_data) + html = self.TITLEPAGE%dict(title=title, author=author, + date=strftime('%d %b, %Y'), + app=__appname__ +' '+__version__, + img=href) + id, href = self.oeb.manifest.generate('calibre-titlepage', + 'calibre-titlepage.xhtml') + return self.oeb.manifest.add(id, href, guess_type('t.xhtml')[0], + data=etree.fromstring(html)) + + + def insert_cover(self): + from calibre.ebooks.oeb.base import urldefrag + from calibre 
import guess_type + g, m = self.oeb.guide, self.oeb.manifest + if 'titlepage' not in g: + if 'cover' in g: + tp = self.TITLEPAGE_COVER%unquote(g['cover'].href) + id, href = m.generate('titlepage', 'titlepage.xhtml') + item = m.add(id, href, guess_type('t.xhtml'), + data=etree.fromstring(tp)) + else: + item = self.default_cover() + else: + item = self.oeb.manifest.hrefs[ + urldefrag(self.oeb.guide['titlepage'].href)[0]] + if item is not None: + self.oeb.spine.insert(0, item, True) + self.oeb.guide.refs['cover'].href = item.href + self.oeb.guide.refs['titlepage'].href = item.href + + + + def condense_ncx(self, ncx_path): + if not self.opts.pretty_print: + tree = etree.parse(ncx_path) + for tag in tree.getroot().iter(tag=etree.Element): + if tag.text: + tag.text = tag.text.strip() + if tag.tail: + tag.tail = tag.tail.strip() + compressed = etree.tostring(tree.getroot(), encoding='utf-8') + open(ncx_path, 'wb').write(compressed) + + + + def workaround_ade_quirks(self): + ''' + Perform various markup transforms to get the output to render correctly + in the quirky ADE. + ''' + from calibre.ebooks.oeb.base import XPNSMAP, XHTML + from lxml.etree import XPath as _XPath + from functools import partial + XPath = partial(_XPath, namespaces=XPNSMAP) + + for x in self.oeb.spine: + root = x.data + body = XPath('//h:body')(root) + if body: + body = body[0] + # Replace
    that are children of as ADE doesn't handle them + if hasattr(body, 'xpath'): + for br in body.xpath('./h:br'): + if br.getparent() is None: + continue + try: + sibling = br.itersiblings().next() + except: + sibling = None + br.tag = XHTML('p') + br.text = u'\u00a0' + if (br.tail and br.tail.strip()) or sibling is None or \ + getattr(sibling, 'tag', '') != XHTML('br'): + style = br.get('style', '').split(';') + style = filter(None, map(lambda x: x.strip(), style)) + style.append('margin: 0pt; border:0pt; height:0pt') + br.set('style', '; '.join(style)) + else: + sibling.getparent().remove(sibling) + if sibling.tail: + if not br.tail: + br.tail = '' + br.tail += sibling.tail + + + if self.opts.output_profile.remove_object_tags: + for tag in root.xpath('//h:embed'): + tag.getparent().remove(tag) + for tag in root.xpath('//h:object'): + if tag.get('type', '').lower().strip() in ('image/svg+xml',): + continue + tag.getparent().remove(tag) + + for tag in root.xpath('//h:title|//h:style'): + if not tag.text: + tag.getparent().remove(tag) + for tag in root.xpath('//h:script'): + if not tag.text and not tag.get('src', False): + tag.getparent().remove(tag) + + for tag in root.xpath('//h:form'): + tag.getparent().remove(tag) + + for tag in root.xpath('//h:center'): + tag.tag = XHTML('div') + tag.set('style', 'text-align:center') + + # ADE can't handle & in an img url + for tag in self.root.xpath('//h:img[@src]'): + tag.set('src', tag.get('src', '').replace('&', '')) + + stylesheet = self.oeb.manifest.hrefs['stylesheet.css'] + stylesheet.data.add('a { color: inherit; text-decoration: inherit; ' + 'cursor: default; }') + stylesheet.data.add('a[href] { color: blue; ' + 'text-decoration: underline; cursor:pointer; }') + diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index ab3e90083d..ffafa6d1a2 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -12,13 +12,15 @@ from cStringIO import StringIO from 
PyQt4.Qt import QFontDatabase from calibre.customize.ui import available_input_formats -from calibre.ebooks.epub.from_html import TITLEPAGE from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.config import DynamicConfig from calibre.utils.logging import Log +from calibre.ebooks.epub.output import EPUBOutput + +TITLEPAGE = EPUBOutput.TITLEPAGE_COVER def character_count(html): ''' diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index 06153c5a48..00830b1a8c 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -14,7 +14,10 @@ class Clean(object): from calibre.ebooks.oeb.base import urldefrag self.oeb, self.log, self.opts = oeb, oeb.log, opts - cover_href = '' + protected_hrefs = set([]) + if 'titlepage' in self.oeb.guide: + protected_hrefs.add(urldefrag( + self.oeb.guide['titlepage'].href)[0]) if 'cover' not in self.oeb.guide: covers = [] for x in ('other.ms-coverimage-standard', @@ -32,15 +35,15 @@ class Clean(object): self.log('Choosing %s:%s as the cover'%(ref.type, ref.href)) ref.type = 'cover' self.oeb.guide.refs['cover'] = ref - cover_href = urldefrag(ref.href)[0] + protected_hrefs.add(urldefrag(ref.href)[0]) else: - cover_href = urldefrag(self.oeb.guide.refs['cover'].href)[0] + protected_hrefs.add(urldefrag(self.oeb.guide.refs['cover'].href)[0]) for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] - if x.lower() != 'cover': + if x.lower() != ('cover', 'titlepage'): try: - if href != cover_href: + if href not in protected_hrefs: self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) except KeyError: pass diff --git a/src/calibre/ebooks/oeb/transforms/rescale.py b/src/calibre/ebooks/oeb/transforms/rescale.py new file mode 100644 index 0000000000..5b62e5fda5 --- /dev/null 
+++ b/src/calibre/ebooks/oeb/transforms/rescale.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre import fit_image + +class RescaleImages(object): + 'Rescale all images to fit inside given screen size' + + def __call__(self, oeb, opts): + from PyQt4.Qt import QApplication, QImage, Qt + from calibre.gui2 import pixmap_to_data + self.oeb, self.opts, self.log = oeb, opts, oeb.log + page_width, page_height = opts.dest.width, opts.dest.height + for item in oeb.manifest: + if item.media_type.startswith('image'): + raw = item.data + if not raw: continue + if QApplication.instance() is None: + QApplication([]) + + img = QImage(10, 10, QImage.Format_ARGB32_Premultiplied) + if not img.loadFromData(raw): continue + width, height = img.width(), img.height() + scaled, new_width, new_height = fit_image(width, height, + page_width, page_height) + if scaled: + self.log('Rescaling image', item.href) + img = img.scaled(new_width, new_height, + Qt.IgnoreAspectRatio, Qt.SmoothTransformation) + item.data = pixmap_to_data(img) + + diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index b54b0ebce0..d3505a5fd9 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -17,7 +17,7 @@ from lxml.cssselect import CSSSelector from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ urldefrag, rewrite_links, urlunquote -from calibre.ebooks.epub import tostring, rules +from calibre.ebooks.epub import rules XPath = functools.partial(_XPath, namespaces=NAMESPACES) @@ -25,6 +25,9 @@ XPath = functools.partial(_XPath, namespaces=NAMESPACES) SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' +def tostring(root): + return etree.tostring(root, encoding='utf-8') + class SplitError(ValueError): def 
__init__(self, path, root): diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 6499a5e9c4..9240873346 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -11,7 +11,7 @@ import re from lxml import etree from urlparse import urlparse -from calibre.ebooks.oeb.base import XPNSMAP, TOC +from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML XPath = lambda x: etree.XPath(x, namespaces=XPNSMAP) class DetectStructure(object): @@ -63,11 +63,11 @@ class DetectStructure(object): if chapter_mark == 'none': continue elif chapter_mark == 'rule': - mark = etree.Element('hr') + mark = etree.Element(XHTML('hr')) elif chapter_mark == 'pagebreak': - mark = etree.Element('div', style=page_break_after) + mark = etree.Element(XHTML('div'), style=page_break_after) else: # chapter_mark == 'both': - mark = etree.Element('hr', style=page_break_before) + mark = etree.Element(XHTML('hr'), style=page_break_before) elem.addprevious(mark) def create_level_based_toc(self): @@ -114,12 +114,13 @@ class DetectStructure(object): def add_leveled_toc_items(self, item): level1 = XPath(self.opts.level1_toc)(item.data) level1_order = [] + document = item counter = 1 if level1: added = {} for elem in level1: - text, _href = self.elem_to_link(item, elem, counter) + text, _href = self.elem_to_link(document, elem, counter) counter += 1 if text: node = self.oeb.toc.add(text, _href, @@ -132,11 +133,11 @@ class DetectStructure(object): level2 = list(XPath(self.opts.level2_toc)(item.data)) for elem in level2: level1 = None - for item in item.data.iterdescendants(): + for item in document.data.iterdescendants(): if item in added.keys(): level1 = added[item] elif item == elem and level1 is not None: - text, _href = self.elem_to_link(item, elem, counter) + text, _href = self.elem_to_link(document, elem, counter) counter += 1 if text: added2[elem] = level1.add(text, _href, @@ -145,12 
+146,12 @@ class DetectStructure(object): level3 = list(XPath(self.opts.level3_toc)(item.data)) for elem in level3: level2 = None - for item in item.data.iterdescendants(): + for item in document.data.iterdescendants(): if item in added2.keys(): level2 = added2[item] elif item == elem and level2 is not None: text, _href = \ - self.elem_to_link(item, elem, counter) + self.elem_to_link(document, elem, counter) counter += 1 if text: level2.add(text, _href, From 19ba43153b6bec69f0df754a064e565399cea62a Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 24 Apr 2009 07:27:54 -0400 Subject: [PATCH 131/319] Line length for pdf processing --- src/calibre/ebooks/conversion/preprocess.py | 40 ++++++++++++++++++++- src/calibre/ebooks/pdb/header.py | 4 +-- src/calibre/ebooks/pdb/input.py | 2 +- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index b105a6c042..fb55ee74fb 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -32,6 +32,39 @@ def chap_head(match): return '

    '+chap+'
    '+title+'


    ' +def line_length(raw, percent): + ''' + raw is the raw text to find the line length to use for wrapping. + percentage is a decimal number, 0 - 1 which is used to determine + how far in the list of line lengths to use. + ''' + raw = raw.replace(' ', ' ') + linere = re.compile('(?<=
    ).*?(?=
    )', re.DOTALL) + lines = linere.findall(raw) + + lengths = [] + for line in lines: + if len(line) > 0: + lengths.append(len(line)) + total = sum(lengths) + avg = total / len(lengths) + max_line = avg * 2 + + lengths = sorted(lengths) + for i in range(len(lengths) - 1, -1, -1): + if lengths[i] > max_line: + del lengths[i] + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + index = int(len(lengths) * percent) - 1 + + return lengths[index] + + class CSSPreProcessor(object): PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') @@ -129,7 +162,12 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - rules = self.PDFTOHTML + # Add rules that require matching line length here + #line_length_rules = [ + # (re.compile('%i' % line_length(html, .85)), lambda match:) + #] + + rules = self.PDFTOHTML # + line_length_rules else: rules = [] for rule in self.PREPROCESS + rules: diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 5b47e48a16..60ce9f15b9 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -63,10 +63,10 @@ class PdbHeaderReader(object): class PdbHeaderWriter(object): def __init__(self, identity, title): - self.identity = identity[:8] + self.identity = identity.ljust(3, '\x00')[:8] self.title = title.ljust(32, '\x00')[:32] - def build_header(self, sections) + def build_header(self, offsets): ''' Sections is a list of section offsets ''' diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 180e0814a6..31808d27d5 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.pdb.header import PdbHeader +from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, 
get_reader class PDBInput(InputFormatPlugin): From 6fe1590813680a8d49513cf1d60cfc33d1964cce Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 24 Apr 2009 08:27:19 -0700 Subject: [PATCH 132/319] pluginize installs again --- src/calibre/ebooks/pdb/header.py | 18 +- src/calibre/ebooks/pdb/input.py | 9 +- src/calibre/gui2/dialogs/epub.py | 292 ------- src/calibre/gui2/dialogs/epub.ui | 1001 ---------------------- src/calibre/gui2/dialogs/lrf_single.py | 425 --------- src/calibre/gui2/dialogs/lrf_single.ui | 1091 ------------------------ src/calibre/gui2/dialogs/mobi.py | 22 - src/calibre/gui2/tools.py | 9 - 8 files changed, 13 insertions(+), 2854 deletions(-) delete mode 100644 src/calibre/gui2/dialogs/epub.py delete mode 100644 src/calibre/gui2/dialogs/epub.ui delete mode 100644 src/calibre/gui2/dialogs/lrf_single.py delete mode 100644 src/calibre/gui2/dialogs/lrf_single.ui delete mode 100644 src/calibre/gui2/dialogs/mobi.py diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index 5b47e48a16..d098a64f2b 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -34,7 +34,7 @@ class PdbHeaderReader(object): def full_section_info(self, number): if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) - + self.stream.seek(78+number*8) offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0] flags, val = a1, a2<<16 | a3<<8 | a4 @@ -43,14 +43,14 @@ class PdbHeaderReader(object): def section_offset(self, number): if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) - + self.stream.seek(78+number*8) return struct.unpack('>LBBBB', self.stream.read(8))[0] def section_data(self, number): if number not in range(0, self.num_sections): raise ValueError('Not a valid section number %i' % number) - + start = self.section_offset(number) if number == self.num_sections -1: end = 
os.stat(self.stream.name).st_size @@ -65,13 +65,13 @@ class PdbHeaderWriter(object): def __init__(self, identity, title): self.identity = identity[:8] self.title = title.ljust(32, '\x00')[:32] - - def build_header(self, sections) + + def build_header(self, sections): ''' Sections is a list of section offsets ''' - - - - + + + + return header diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 180e0814a6..31dd216ee1 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.pdb.header import PdbHeader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader class PDBInput(InputFormatPlugin): @@ -17,18 +16,18 @@ class PDBInput(InputFormatPlugin): author = 'John Schember' description = 'Convert PDB to HTML' file_types = set(['pdb']) - + def convert(self, stream, options, file_ext, log, accelerators): header = PdbHeaderReader(stream) Reader = get_reader(header.ident) - + if Reader is None: raise PDBError('Unknown format in pdb file. Identity is %s' % header.identity) log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) - + reader = Reader(header, stream, log, options.input_encoding) opf = reader.extract_content(os.getcwd()) - + return opf diff --git a/src/calibre/gui2/dialogs/epub.py b/src/calibre/gui2/dialogs/epub.py deleted file mode 100644 index e61d034642..0000000000 --- a/src/calibre/gui2/dialogs/epub.py +++ /dev/null @@ -1,292 +0,0 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -The GUI for conversion to EPUB. 
-''' -import os, uuid - -from PyQt4.Qt import QDialog, QSpinBox, QDoubleSpinBox, QComboBox, QLineEdit, \ - QTextEdit, QCheckBox, Qt, QPixmap, QIcon, QListWidgetItem, SIGNAL -from lxml.etree import XPath - -from calibre.gui2.dialogs.choose_format import ChooseFormatDialog -from calibre.gui2.dialogs.epub_ui import Ui_Dialog -from calibre.gui2 import error_dialog, choose_images, pixmap_to_data, ResizableDialog -from calibre.ebooks.epub.from_any import SOURCE_FORMATS, config as epubconfig -from calibre.ebooks.metadata import MetaInformation -from calibre.ptempfile import PersistentTemporaryFile -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata import authors_to_string, string_to_authors - - -class Config(ResizableDialog, Ui_Dialog): - - OUTPUT = 'EPUB' - - def __init__(self, parent, db, row=None, config=epubconfig): - ResizableDialog.__init__(self, parent) - self.hide_controls() - self.connect(self.category_list, SIGNAL('itemEntered(QListWidgetItem *)'), - self.show_category_help) - self.connect(self.cover_button, SIGNAL("clicked()"), self.select_cover) - - self.cover_changed = False - self.db = db - self.id = None - self.row = row - if row is not None: - self.id = db.id(row) - base = config().as_string() + '\n\n' - defaults = self.db.conversion_options(self.id, self.OUTPUT.lower()) - defaults = base + (defaults if defaults else '') - self.config = config(defaults=defaults) - else: - self.config = config() - self.initialize() - self.get_source_format() - self.category_list.setCurrentRow(0) - if self.row is None: - self.setWindowTitle(_('Bulk convert to ')+self.OUTPUT) - else: - self.setWindowTitle((_(u'Convert %s to ')%unicode(self.title.text()))+self.OUTPUT) - - def hide_controls(self): - self.source_profile_label.setVisible(False) - self.opt_source_profile.setVisible(False) - self.dest_profile_label.setVisible(False) - self.opt_dest_profile.setVisible(False) - self.opt_toc_title.setVisible(False) - 
self.toc_title_label.setVisible(False) - self.opt_rescale_images.setVisible(False) - self.opt_ignore_tables.setVisible(False) - self.opt_prefer_author_sort.setVisible(False) - - def initialize(self): - self.__w = [] - self.__w.append(QIcon(':/images/dialog_information.svg')) - self.item1 = QListWidgetItem(self.__w[-1], _('Metadata'), self.category_list) - self.__w.append(QIcon(':/images/lookfeel.svg')) - self.item2 = QListWidgetItem(self.__w[-1], _('Look & Feel').replace(' ','\n'), self.category_list) - self.__w.append(QIcon(':/images/page.svg')) - self.item3 = QListWidgetItem(self.__w[-1], _('Page Setup').replace(' ','\n'), self.category_list) - self.__w.append(QIcon(':/images/chapters.svg')) - self.item4 = QListWidgetItem(self.__w[-1], _('Chapter Detection').replace(' ','\n'), self.category_list) - self.setup_tooltips() - self.initialize_options() - - def set_help(self, msg): - if msg and getattr(msg, 'strip', lambda:True)(): - self.help_view.setPlainText(msg) - - def setup_tooltips(self): - for opt in self.config.option_set.preferences: - g = getattr(self, 'opt_'+opt.name, False) - if opt.help and g: - help = opt.help.replace('%default', str(opt.default)) - g._help = help - g.setToolTip(help.replace('<', '<').replace('>', '>')) - g.setWhatsThis(help.replace('<', '<').replace('>', '>')) - g.__class__.enterEvent = lambda obj, event: self.set_help(getattr(obj, '_help', obj.toolTip())) - - def show_category_help(self, item): - text = unicode(item.text()) - help = { - _('Metadata') : _('Specify metadata such as title and author for the book.\n\nMetadata will be updated in the database as well as the generated %s file.')%self.OUTPUT, - _('Look & Feel') : _('Adjust the look of the generated ebook by specifying things like font sizes.'), - _('Page Setup') : _('Specify the page layout settings like margins.'), - _('Chapter Detection') : _('Fine tune the detection of chapter and section headings.'), - } - self.set_help(help[text.replace('\n', ' ')]) - - def 
select_cover(self): - files = choose_images(self, 'change cover dialog', - _('Choose cover for ') + unicode(self.title.text())) - if not files: - return - _file = files[0] - if _file: - _file = os.path.abspath(_file) - if not os.access(_file, os.R_OK): - d = error_dialog(self.window, _('Cannot read'), - _('You do not have permission to read the file: ') + _file) - d.exec_() - return - cf, cover = None, None - try: - cf = open(_file, "rb") - cover = cf.read() - except IOError, e: - d = error_dialog(self.window, _('Error reading file'), - _("

    There was an error reading from file:
    ") + _file + "


    "+str(e)) - d.exec_() - if cover: - pix = QPixmap() - pix.loadFromData(cover) - if pix.isNull(): - d = error_dialog(self.window, _('Error reading file'), - _file + _(" is not a valid picture")) - d.exec_() - else: - self.cover_path.setText(_file) - self.cover.setPixmap(pix) - self.cover_changed = True - self.cpixmap = pix - - def initialize_metadata_options(self): - all_series = self.db.all_series() - all_series.sort(cmp=lambda x, y : cmp(x[1], y[1])) - for series in all_series: - self.series.addItem(series[1]) - self.series.setCurrentIndex(-1) - - if self.row is not None: - mi = self.db.get_metadata(self.id, index_is_id=True) - self.title.setText(mi.title) - if mi.authors: - self.author.setText(authors_to_string(mi.authors)) - else: - self.author.setText('') - self.publisher.setText(mi.publisher if mi.publisher else '') - self.author_sort.setText(mi.author_sort if mi.author_sort else '') - self.tags.setText(', '.join(mi.tags if mi.tags else [])) - self.comment.setText(mi.comments if mi.comments else '') - if mi.series: - self.series.setCurrentIndex(self.series.findText(mi.series)) - if mi.series_index is not None: - self.series_index.setValue(mi.series_index) - - cover = self.db.cover(self.id, index_is_id=True) - if cover: - pm = QPixmap() - pm.loadFromData(cover) - if not pm.isNull(): - self.cover.setPixmap(pm) - - def get_title_and_authors(self): - title = unicode(self.title.text()).strip() - if not title: - title = _('Unknown') - authors = unicode(self.author.text()).strip() - authors = string_to_authors(authors) if authors else [_('Unknown')] - return title, authors - - def get_metadata(self): - title, authors = self.get_title_and_authors() - mi = MetaInformation(title, authors) - publisher = unicode(self.publisher.text()).strip() - if publisher: - mi.publisher = publisher - author_sort = unicode(self.author_sort.text()).strip() - if author_sort: - mi.author_sort = author_sort - comments = unicode(self.comment.toPlainText()).strip() - if comments: - 
mi.comments = comments - mi.series_index = int(self.series_index.value()) - if self.series.currentIndex() > -1: - mi.series = unicode(self.series.currentText()).strip() - tags = [t.strip() for t in unicode(self.tags.text()).strip().split(',')] - if tags: - mi.tags = tags - - return mi - - def read_settings(self): - for pref in self.config.option_set.preferences: - g = getattr(self, 'opt_'+pref.name, False) - if g: - if isinstance(g, (QSpinBox, QDoubleSpinBox)): - self.config.set(pref.name, g.value()) - elif isinstance(g, (QLineEdit, QTextEdit)): - func = getattr(g, 'toPlainText', getattr(g, 'text', None))() - val = unicode(func) - self.config.set(pref.name, val if val else None) - elif isinstance(g, QComboBox): - self.config.set(pref.name, unicode(g.currentText())) - elif isinstance(g, QCheckBox): - self.config.set(pref.name, bool(g.isChecked())) - if self.row is not None: - self.db.set_conversion_options(self.id, self.OUTPUT.lower(), self.config.src) - - - def initialize_options(self): - self.initialize_metadata_options() - values = self.config.parse() - for pref in self.config.option_set.preferences: - g = getattr(self, 'opt_'+pref.name, False) - if g: - val = getattr(values, pref.name) - if val is None: - continue - if isinstance(g, (QSpinBox, QDoubleSpinBox)): - g.setValue(val) - elif isinstance(g, (QLineEdit, QTextEdit)): - getattr(g, 'setPlainText', g.setText)(val) - getattr(g, 'setCursorPosition', lambda x: x)(0) - elif isinstance(g, QComboBox): - for value in pref.choices: - g.addItem(value) - g.setCurrentIndex(g.findText(val)) - elif isinstance(g, QCheckBox): - g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked) - - - def get_source_format(self): - self.source_format = None - if self.row is not None: - temp = self.db.formats(self.id, index_is_id=True) - if not temp: - error_dialog(self.parent(), _('Cannot convert'), - _('This book has no available formats')).exec_() - - available_formats = [f.upper().strip() for f in temp.split(',')] - choices = 
[fmt.upper() for fmt in SOURCE_FORMATS if fmt.upper() in available_formats] - if not choices: - error_dialog(self.parent(), _('No available formats'), - _('Cannot convert %s as this book has no supported formats')%(self.title.text())).exec_() - elif len(choices) == 1: - self.source_format = choices[0] - else: - d = ChooseFormatDialog(self.parent(), _('Choose the format to convert to ')+self.OUTPUT, choices) - if d.exec_() == QDialog.Accepted: - self.source_format = d.format() - - def accept(self): - for opt in ('chapter', 'level1_toc', 'level2_toc', 'level3_toc', 'page', - 'page_names'): - text = unicode(getattr(self, 'opt_'+opt).text()) - if text: - try: - XPath(text,namespaces={'re':'http://exslt.org/regular-expressions'}) - except Exception, err: - error_dialog(self, _('Invalid XPath expression'), - _('The expression %s is invalid. Error: %s')%(text, err) - ).exec_() - return - mi = self.get_metadata() - self.user_mi = mi - self.read_settings() - self.cover_file = None - if self.row is not None: - self.db.set_metadata(self.id, mi) - self.mi = self.db.get_metadata(self.id, index_is_id=True) - self.mi.application_id = uuid.uuid4() - opf = OPFCreator(os.getcwdu(), self.mi) - self.opf_file = PersistentTemporaryFile('.opf') - opf.render(self.opf_file) - self.opf_file.close() - if self.cover_changed: - self.db.set_cover(self.id, pixmap_to_data(self.cover.pixmap())) - cover = self.db.cover(self.id, index_is_id=True) - if cover: - cf = PersistentTemporaryFile('.jpeg') - cf.write(cover) - cf.close() - self.cover_file = cf - self.opts = self.config.parse() - QDialog.accept(self) - - diff --git a/src/calibre/gui2/dialogs/epub.ui b/src/calibre/gui2/dialogs/epub.ui deleted file mode 100644 index b6e2299e1d..0000000000 --- a/src/calibre/gui2/dialogs/epub.ui +++ /dev/null @@ -1,1001 +0,0 @@ - - Dialog - - - - 0 - 0 - 965 - 698 - - - - Convert to EPUB - - - - :/images/convert.svg:/images/convert.svg - - - true - - - - - - - 0 - 0 - - - - - 75 - true - - - - true - - - 
Qt::ScrollBarAlwaysOff - - - true - - - - 48 - 48 - - - - 20 - - - true - - - - - - - - 10 - 0 - - - - QFrame::NoFrame - - - true - - - - - 0 - 0 - 697 - 554 - - - - - 680 - 520 - - - - - 0 - - - - - 0 - - - - - - - - - Book Cover - - - - - - 6 - - - 0 - - - - - Change &cover image: - - - cover_path - - - - - - - 6 - - - 0 - - - - - true - - - - - - - Browse for an image to use as the cover of this book. - - - ... - - - - :/images/document_open.svg:/images/document_open.svg - - - - - - - - - - - Use cover from &source file - - - true - - - - - - - - - - - - :/images/book.svg - - - true - - - Qt::AlignCenter - - - - - - - opt_prefer_metadata_cover - - - - - - - - - - - &Title: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - title - - - - - - - Change the title of this book - - - - - - - &Author(s): - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - author - - - - - - - - 1 - 0 - - - - Change the author(s) of this book. Multiple authors should be separated by an &. If the author name contains an &, use && to represent it. - - - - - - - Author So&rt: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - author_sort - - - - - - - - 0 - 0 - - - - Change the author(s) of this book. Multiple authors should be separated by a comma - - - - - - - &Publisher: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - publisher - - - - - - - Change the publisher of this book - - - - - - - Ta&gs: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - tags - - - - - - - Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. - - - - - - - &Series: - - - Qt::PlainText - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - series - - - - - - - - 10 - 0 - - - - List of known series. You can add new series. - - - List of known series. You can add new series. 
- - - true - - - QComboBox::InsertAlphabetically - - - QComboBox::AdjustToContents - - - - - - - true - - - Series index. - - - Series index. - - - Book - - - 1 - - - 10000 - - - - - - - - - - 0 - 0 - - - - - 16777215 - 200 - - - - Comments - - - - - - - 16777215 - 180 - - - - - - - - - - - - - - - - - - - - - Source en&coding: - - - opt_encoding - - - - - - - - - - Base &font size: - - - opt_base_font_size2 - - - - - - - pt - - - 0 - - - 0.000000000000000 - - - 30.000000000000000 - - - 1.000000000000000 - - - 30.000000000000000 - - - - - - - Remove &spacing between paragraphs - - - - - - - Preserve &tag structure when splitting - - - - - - - &Rescale images - - - - - - - &Ignore tables - - - - - - - &Use author sort to set author field in output - - - - - - - No text &justification - - - - - - - &Linearize tables - - - - - - - Remove &first image from source file - - - - - - - - - Override &CSS - - - - - - - - - - - - - - - - &Profile: - - - opt_profile - - - - - - - -1 - - - 1 - - - - - - - &Source profile: - - - opt_source_profile - - - - - - - - - - &Destination profile: - - - opt_dest_profile - - - - - - - - - - &Left Margin: - - - opt_margin_left - - - - - - - pt - - - 200 - - - 20 - - - - - - - &Right Margin: - - - opt_margin_right - - - - - - - pt - - - 200 - - - 20 - - - - - - - &Top Margin: - - - opt_margin_top - - - - - - - pt - - - 200 - - - 10 - - - - - - - &Bottom Margin: - - - opt_margin_bottom - - - - - - - pt - - - 200 - - - 0 - - - - - - - Do not &split on page breaks - - - - - - - &Page map - - - - - - - 0 - 0 - - - - <p>You can control how calibre detects page boundaries using a XPath expression. To learn how to use XPath expressions see the <a href="http://calibre.kovidgoyal.net/user_manual/xpath.html">XPath tutorial</a>. The page boundaries are useful only if you want a mapping from pages in a paper book, to locations in the e-book. 
This controls where Adobe Digital Editions displays the page numbers in the right margin.</p> - - - true - - - true - - - - - - - &Boundary XPath: - - - opt_page - - - - - - - - - - &Name XPath: - - - opt_page_names - - - - - - - - - - - - - Qt::Vertical - - - - 20 - 40 - - - - - - - - - - - - Automatic &chapter detection - - - - - - <p>You can control how calibre detects chapters using a XPath expression. To learn how to use XPath expressions see the <a href="http://calibre.kovidgoyal.net/user_manual/xpath.html">XPath tutorial</a></p> - - - Qt::RichText - - - true - - - true - - - - - - - - - &XPath: - - - opt_chapter - - - - - - - - - - Chapter &mark: - - - opt_chapter_mark - - - - - - - - - - - - - - - Automatic &Table of Contents - - - - - - - - - Number of &links to add to Table of Contents - - - opt_max_toc_links - - - - - - - Do not add &detected chapters to the Table of Contents - - - - - - - - - - Chapter &threshold - - - opt_toc_threshold - - - - - - - &Force use of auto-generated Table of Contents - - - - - - - - - - Level &1 TOC - - - opt_level1_toc - - - - - - - Level &2 TOC - - - opt_level2_toc - - - - - - - - - - - - - &Title for generated TOC - - - opt_toc_title - - - - - - - - - - Level &3 TOC - - - opt_level3_toc - - - - - - - - - - - - - - - - - - - 16777215 - 100 - - - - false - - - - - - - Qt::Horizontal - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - - - - - - ImageView - QLabel -
    widgets.h
    -
    -
    - - - - - - - buttonBox - accepted() - Dialog - accept() - - - 226 - 684 - - - 157 - 274 - - - - - buttonBox - rejected() - Dialog - reject() - - - 290 - 658 - - - 286 - 274 - - - - - category_list - currentRowChanged(int) - stack - setCurrentIndex(int) - - - 81 - 118 - - - 866 - 11 - - - - -
    diff --git a/src/calibre/gui2/dialogs/lrf_single.py b/src/calibre/gui2/dialogs/lrf_single.py deleted file mode 100644 index fdcf908d1d..0000000000 --- a/src/calibre/gui2/dialogs/lrf_single.py +++ /dev/null @@ -1,425 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -import os, codecs - -from PyQt4.QtCore import QObject, SIGNAL, Qt -from PyQt4.QtGui import QAbstractSpinBox, QLineEdit, QCheckBox, QDialog, \ - QPixmap, QTextEdit, QListWidgetItem, QIcon - -from calibre.gui2.dialogs.lrf_single_ui import Ui_LRFSingleDialog -from calibre.gui2.dialogs.choose_format import ChooseFormatDialog -from calibre.gui2 import qstring_to_unicode, error_dialog, \ - pixmap_to_data, choose_images, config -from calibre.gui2.widgets import FontFamilyModel -from calibre.ebooks.lrf import option_parser -from calibre.ptempfile import PersistentTemporaryFile -from calibre.constants import __appname__ -from calibre.ebooks.metadata import authors_to_string, string_to_authors, authors_to_sort_string - -font_family_model = None - -class LRFSingleDialog(QDialog, Ui_LRFSingleDialog): - - PARSER = option_parser('') - PREPROCESS_OPTIONS = [ o for o in PARSER.option_groups if o.title == 'PREPROCESSING OPTIONS'][0].option_list - - @classmethod - def options(cls): - options = cls.PARSER.option_list - for g in cls.PARSER.option_groups: - options.extend(g.option_list) - for opt in options: - yield opt - - @classmethod - def option_to_name(cls, opt): - src = opt.get_opt_string() - return 'gui_' + src[2:].replace('-', '_') - - def initialize_common(self): - self.output_format = 'LRF' - self.setup_tooltips() - self.initialize_options() - global font_family_model - if font_family_model is None: - font_family_model = FontFamilyModel() - self.font_family_model = font_family_model - self.gui_serif_family.setModel(self.font_family_model) - self.gui_sans_family.setModel(self.font_family_model) - self.gui_mono_family.setModel(self.font_family_model) - self.load_saved_global_defaults() - - 
def populate_list(self): - self.__w = [] - self.__w.append(QIcon(':/images/dialog_information.svg')) - self.item1 = QListWidgetItem(self.__w[-1], _("Metadata"), self.categoryList) - self.__w.append(QIcon(':/images/lookfeel.svg')) - self.item2 = QListWidgetItem(self.__w[-1], _('Look & Feel'), self.categoryList) - self.__w.append(QIcon(':/images/page.svg')) - self.item3 = QListWidgetItem(self.__w[-1], _('Page Setup'), self.categoryList) - self.__w.append(QIcon(':/images/chapters.svg')) - self.item4 = QListWidgetItem(self.__w[-1], _('Chapter Detection'), self.categoryList) - - def __init__(self, window, db, row): - QDialog.__init__(self, window) - Ui_LRFSingleDialog.__init__(self) - self.setupUi(self) - self.populate_list() - self.categoryList.setCurrentRow(0) - QObject.connect(self.categoryList, SIGNAL('itemEntered(QListWidgetItem *)'), - self.show_category_help) - QObject.connect(self.cover_button, SIGNAL("clicked(bool)"), self.select_cover) - #self.categoryList.leaveEvent = self.reset_help - self.reset_help() - self.selected_format = None - self.initialize_common() - self.db = db - self.row = row - self.cover_changed = False - self.cpixmap = None - self.changed = False - - if db: - self.id = self.db.id(self.row) - self.read_saved_options() - self.initialize_metadata() - formats = self.db.formats(self.row) - formats = [i.upper() for i in formats.split(',')] if formats else [] - try: - formats.remove(self.output_format) - except ValueError: - pass - if not formats: - d = error_dialog(window, _('No available formats'), - _('Cannot convert %s as this book has no supported formats')%(self.gui_title.text())) - d.exec_() - - if len(formats) > 1: - d = ChooseFormatDialog(window, _('Choose the format to convert into LRF'), formats) - d.exec_() - if d.result() == QDialog.Accepted: - self.selected_format = d.format() - elif len(formats) > 0: - self.selected_format = formats[0] - - if self.selected_format: - self.setWindowTitle(_('Convert %s to LRF')%(self.selected_format,)) - 
- else: - self.setWindowTitle(_('Set conversion defaults')) - - - def load_saved_global_defaults(self): - cmdline = config['LRF_conversion_defaults'] - if cmdline: - self.set_options_from_cmdline(cmdline) - - def set_options_from_cmdline(self, cmdline): - for opt in self.options(): - guiname = self.option_to_name(opt) - try: - obj = getattr(self, guiname) - except AttributeError: - continue - if isinstance(obj, QCheckBox): - if opt.get_opt_string() in cmdline: - obj.setCheckState(Qt.Checked) - else: - obj.setCheckState(Qt.Unchecked) - try: - i = cmdline.index(opt.get_opt_string()) - except ValueError: - continue - - if isinstance(obj, QAbstractSpinBox): - obj.setValue(cmdline[i+1]) - elif isinstance(obj, QLineEdit): - obj.setText(cmdline[i+1]) - elif isinstance(obj, QTextEdit): - obj.setPlainText(cmdline[i+1]) - profile = cmdline[cmdline.index('--profile')+1] - pindex = self.gui_profile.findText(profile) - if pindex >= 0: - self.gui_profile.setCurrentIndex(pindex) - for prepro in self.PREPROCESS_OPTIONS: - ops = prepro.get_opt_string() - if ops in cmdline: - self.preprocess.setCurrentIndex(self.preprocess.findText(ops[2:])) - break - - for opt in ('--serif-family', '--sans-family', '--mono-family'): - if opt in cmdline: - print 'in' - family = cmdline[cmdline.index(opt)+1].split(',')[-1].strip() - obj = getattr(self, 'gui_'+opt[2:].replace('-', '_')) - try: - obj.setCurrentIndex(self.font_family_model.index_of(family)) - except: - continue - - def read_saved_options(self): - cmdline = self.db.conversion_options(self.id, self.output_format.lower()) - if cmdline: - self.set_options_from_cmdline(cmdline) - - def select_cover(self, checked): - files = choose_images(self, 'change cover dialog', - _('Choose cover for ') + qstring_to_unicode(self.gui_title.text())) - if not files: - return - _file = files[0] - if _file: - _file = os.path.abspath(_file) - if not os.access(_file, os.R_OK): - d = error_dialog(self.window, _('Cannot read'), - _('You do not have permission to 
read the file: ') + _file) - d.exec_() - return - cf, cover = None, None - try: - cf = open(_file, "rb") - cover = cf.read() - except IOError, e: - d = error_dialog(self.window, _('Error reading file'), - _("

    There was an error reading from file:
    ") + _file + "


    "+str(e)) - d.exec_() - if cover: - pix = QPixmap() - pix.loadFromData(cover) - if pix.isNull(): - d = error_dialog(self.window, _file + _(" is not a valid picture")) - d.exec_() - else: - self.cover_path.setText(_file) - self.cover.setPixmap(pix) - self.cover_changed = True - self.cpixmap = pix - - def initialize_metadata(self): - db, row = self.db, self.row - self.id = self.db.id(row) - self.gui_title.setText(db.title(row)) - au = self.db.authors(row) - if au: - au = [a.strip().replace('|', ',') for a in au.split(',')] - self.gui_author.setText(authors_to_string(au)) - else: - self.gui_author.setText('') - aus = self.db.author_sort(row) - self.gui_author_sort.setText(aus if aus else '') - pub = self.db.publisher(row) - self.gui_publisher.setText(pub if pub else '') - tags = self.db.tags(row) - self.tags.setText(tags if tags else '') - comments = self.db.comments(row) - self.gui_comment.setPlainText(comments if comments else '') - - all_series = self.db.all_series() - all_series.sort(cmp=lambda x, y : cmp(x[1], y[1])) - series_id = self.db.series_id(row) - idx, c = None, 0 - for i in all_series: - id, name = i - if id == series_id: - idx = c - self.series.addItem(name) - c += 1 - - self.series.lineEdit().setText('') - if idx is not None: - self.series.setCurrentIndex(idx) - - self.series_index.setValue(self.db.series_index(row)) - - cover = self.db.cover(row) - if cover: - pm = QPixmap() - pm.loadFromData(cover) - if not pm.isNull(): - self.cover.setPixmap(pm) - - def initialize_options(self): - '''Initialize non metadata options from the defaults.''' - for name in self.option_map.keys(): - default = self.option_map[name].default - obj = getattr(self, name) - if isinstance(obj, QAbstractSpinBox): - obj.setValue(default) - elif isinstance(obj, QLineEdit) and default: - obj.setText(default) - elif isinstance(obj, QTextEdit) and default: - obj.setPlainText(default) - elif isinstance(obj, QCheckBox): - state = Qt.Checked if default else Qt.Unchecked - 
obj.setCheckState(state) - self.gui_headerformat.setDisabled(True) - self.gui_header_separation.setDisabled(True) - self.gui_use_metadata_cover.setCheckState(Qt.Checked) - self.preprocess.addItem(_('No preprocessing')) - for opt in self.PREPROCESS_OPTIONS: - self.preprocess.addItem(opt.get_opt_string()[2:]) - ph = _('Preprocess the file before converting to LRF. This is useful if you know that the file is from a specific source. Known sources:') - ph += _('
    1. baen - Books from BAEN Publishers
    2. ') - ph += _('
    3. pdftohtml - HTML files that are the output of the program pdftohtml
    4. ') - ph += _('
    5. book-designer - HTML0 files from Book Designer
    6. ') - self.preprocess.setToolTip(ph) - self.preprocess.setWhatsThis(ph) - for profile in self.PARSER.get_option('--profile').choices: - if self.gui_profile.findText(profile) < 0: - self.gui_profile.addItem(profile) - - def setup_tooltips(self): - def show_item_help(obj, event): - self.set_help(obj.toolTip()) - - self.option_map = {} - for opt in self.options(): - try: - help = opt.help.replace('%default', str(opt.default)) - except (ValueError, TypeError): - help = opt.help - - guiname = self.option_to_name(opt) - if hasattr(self, guiname): - obj = getattr(self, guiname) - obj.setToolTip(help) - obj.setWhatsThis(help) - self.option_map[guiname] = opt - obj.__class__.enterEvent = show_item_help - #obj.leaveEvent = self.reset_help - self.preprocess.__class__.enterEvent = show_item_help - #self.preprocess.leaveEvent = self.reset_help - - - def show_category_help(self, item): - text = qstring_to_unicode(item.text()) - help = { - _('Metadata') : _('Specify metadata such as title and author for the book.

      Metadata will be updated in the database as well as the generated LRF file.'), - _('Look & Feel') : _('Adjust the look of the generated LRF file by specifying things like font sizes and the spacing between words.'), - _('Page Setup') : _('Specify the page settings like margins and the screen size of the target device.'), - _('Chapter Detection') : _('Fine tune the detection of chapter and section headings.'), - } - self.set_help(help[text]) - - def set_help(self, msg): - if msg and getattr(msg, 'strip', lambda:True)(): - self.help_view.setHtml('%s'%(msg,)) - - def reset_help(self, *args): - self.set_help(_('No help available')) - if args: - args[0].accept() - - def build_commandline(self): - cmd = [__appname__] - for name in self.option_map.keys(): - opt = self.option_map[name].get_opt_string() - obj = getattr(self, name) - if isinstance(obj, QAbstractSpinBox): - cmd.extend([opt, obj.value()]) - elif isinstance(obj, QLineEdit): - val = qstring_to_unicode(obj.text()) - if val: - if opt == '--encoding': - try: - codecs.getdecoder(val) - except: - d = error_dialog(self, 'Unknown encoding', - '

      Unknown encoding: %s
      For a list of known encodings see http://docs.python.org/lib/standard-encodings.html'%val) - d.exec_() - return - cmd.extend([opt, val]) - elif isinstance(obj, QTextEdit): - val = qstring_to_unicode(obj.toPlainText()) - if val: - cmd.extend([opt, val]) - elif isinstance(obj, QCheckBox): - if obj.checkState() == Qt.Checked: - cmd.append(opt) - - text = qstring_to_unicode(self.preprocess.currentText()) - if text != _('No preprocessing'): - cmd.append(u'--'+text) - cmd.extend([u'--profile', qstring_to_unicode(self.gui_profile.currentText())]) - - for opt in ('--serif-family', '--sans-family', '--mono-family'): - obj = getattr(self, 'gui_'+opt[2:].replace('-', '_')) - family = qstring_to_unicode(obj.itemText(obj.currentIndex())).strip() - if family != 'None': - cmd.extend([opt, family]) - - return cmd - - def title(self): - return qstring_to_unicode(self.gui_title.text()) - - def write_metadata(self): - title = qstring_to_unicode(self.gui_title.text()) - self.db.set_title(self.id, title) - au = unicode(self.gui_author.text()) - if au: - self.db.set_authors(self.id, string_to_authors(au)) - aus = qstring_to_unicode(self.gui_author_sort.text()) - if not aus: - t = self.db.authors(self.id, index_is_id=True) - if not t: - t = _('Unknown') - aus = [a.strip().replace('|', ',') for a in t.split(',')] - aus = authors_to_sort_string(aus) - self.db.set_author_sort(self.id, aus) - self.db.set_publisher(self.id, qstring_to_unicode(self.gui_publisher.text())) - self.db.set_tags(self.id, qstring_to_unicode(self.tags.text()).split(',')) - self.db.set_series(self.id, qstring_to_unicode(self.series.currentText())) - self.db.set_series_index(self.id, self.series_index.value()) - if self.cover_changed: - self.db.set_cover(self.id, pixmap_to_data(self.cover.pixmap())) - - - def accept(self): - cmdline = self.build_commandline() - if cmdline is None: - return - if self.db: - self.cover_file = None - self.write_metadata() - cover = self.db.cover(self.row) - if cover: - self.cover_file 
= PersistentTemporaryFile(suffix='.jpeg') - self.cover_file.write(cover) - self.cover_file.close() - self.db.set_conversion_options(self.id, self.output_format.lower(), cmdline) - - if self.cover_file: - cmdline.extend([u'--cover', self.cover_file.name]) - self.cmdline = [unicode(i) for i in cmdline] - else: - config.set('LRF_conversion_defaults', cmdline) - QDialog.accept(self) - -class LRFBulkDialog(LRFSingleDialog): - - def __init__(self, window): - QDialog.__init__(self, window) - Ui_LRFSingleDialog.__init__(self) - self.setupUi(self) - self.populate_list() - - self.categoryList.takeItem(0) - self.stack.removeWidget(self.stack.widget(0)) - self.categoryList.setCurrentRow(0) - - self.initialize_common() - self.setWindowTitle(_('Bulk convert ebooks to LRF')) - - def accept(self): - self.cmdline = [unicode(i) for i in self.build_commandline()] - for meta in ('--title', '--author', '--publisher', '--comment'): - try: - index = self.cmdline.index(meta) - self.cmdline[index:index+2] = [] - except ValueError: - continue - - self.cover_file = None - QDialog.accept(self) - diff --git a/src/calibre/gui2/dialogs/lrf_single.ui b/src/calibre/gui2/dialogs/lrf_single.ui deleted file mode 100644 index 87c7382770..0000000000 --- a/src/calibre/gui2/dialogs/lrf_single.ui +++ /dev/null @@ -1,1091 +0,0 @@ - - LRFSingleDialog - - - - 0 - 0 - 866 - 679 - - - - Convert to LRF - - - - :/images/convert.svg:/images/convert.svg - - - - - - - 0 - 0 - - - - Category - - - - - - - 3 - 0 - - - - - 172 - 16777215 - - - - - 75 - true - - - - true - - - Qt::ScrollBarAlwaysOff - - - Qt::ScrollBarAlwaysOff - - - false - - - QAbstractItemView::NoDragDrop - - - false - - - QAbstractItemView::SelectRows - - - - 48 - 48 - - - - QListView::Static - - - 10 - - - QListView::IconMode - - - true - - - -1 - - - - - - - - - - QFrame::NoFrame - - - true - - - - - 0 - 0 - 664 - 515 - - - - - 0 - - - - - 0 - - - - - - - Book Cover - - - - - - - - - - - :/images/book.svg - - - true - - - Qt::AlignCenter - - - - 
- - - - - 6 - - - 0 - - - - - Change &cover image: - - - cover_path - - - - - - - 6 - - - 0 - - - - - true - - - - - - - Browse for an image to use as the cover of this book. - - - ... - - - - :/images/document_open.svg:/images/document_open.svg - - - - - - - - - - - Use cover from &source file - - - true - - - - - - - - - - - - - - &Title: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - gui_title - - - - - - - Change the title of this book - - - - - - - &Author(s): - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - gui_author - - - - - - - - 1 - 0 - - - - Change the author(s) of this book. Multiple authors should be separated by an &. If the author name contains an &, use && to represent it. - - - - - - - Author So&rt: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - gui_author_sort - - - - - - - - 0 - 0 - - - - Change the author(s) of this book. Multiple authors should be separated by a comma - - - - - - - &Publisher: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - gui_publisher - - - - - - - Change the publisher of this book - - - - - - - Ta&gs: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - tags - - - - - - - Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. - - - - - - - &Series: - - - Qt::PlainText - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - series - - - - - - - - 10 - 0 - - - - List of known series. You can add new series. - - - List of known series. You can add new series. - - - true - - - QComboBox::InsertAlphabetically - - - QComboBox::AdjustToContents - - - - - - - Series index. - - - Series index. 
- - - Book - - - 1 - - - 10000 - - - - - - - - - - 0 - 0 - - - - Comments - - - - - - - - - - - - - - - - - - Base &font size: - - - gui_base_font_size - - - - - - - QAbstractSpinBox::PlusMinus - - - pts - - - 1 - - - 2.000000000000000 - - - 20.000000000000000 - - - 0.100000000000000 - - - 10.000000000000000 - - - - - - - Embedded Fonts - - - - - - &Serif: - - - gui_serif_family - - - - - - - - 0 - 0 - - - - - - - - S&ans-serif: - - - gui_sans_family - - - - - - - - - - &Monospace: - - - gui_mono_family - - - - - - - - - - Source en&coding: - - - gui_encoding - - - - - - - - - - - - - Minimum &indent: - - - gui_minimum_indent - - - - - - - QAbstractSpinBox::PlusMinus - - - pts - - - 1 - - - - - - - &Word spacing: - - - Qt::PlainText - - - gui_wordspace - - - - - - - QAbstractSpinBox::PlusMinus - - - pts - - - 1 - - - 0.000000000000000 - - - 10.000000000000000 - - - 0.100000000000000 - - - 2.500000000000000 - - - - - - - - - Enable auto &rotation of images - - - - - - - Insert &blank lines between paragraphs - - - - - - - Ignore &tables - - - - - - - Ignore &colors - - - - - - - - - &Preprocess: - - - preprocess - - - - - - - - - - - 0 - 0 - - - - Header - - - - - - &Show header - - - - - - - &Header format: - - - gui_headerformat - - - - - - - - - - px - - - - - - - Header &separation: - - - gui_header_separation - - - - - - - - - - Override<br>CSS - - - - - - - - - - - - - - &Profile: - - - gui_profile - - - - - - - -1 - - - 1 - - - - - - - &Left Margin: - - - gui_left_margin - - - - - - - px - - - 250 - - - 20 - - - - - - - &Right Margin: - - - gui_right_margin - - - - - - - px - - - 250 - - - 20 - - - - - - - &Top Margin: - - - gui_top_margin - - - - - - - px - - - 250 - - - 10 - - - - - - - &Bottom Margin: - - - gui_bottom_margin - - - - - - - px - - - 250 - - - 0 - - - - - - - &Convert tables to images (good for large/complex tables) - - - - - - - &Multiplier for text size in rendered tables: - - - gui_text_size_multiplier_for_rendered_tables - - - - - - - 
false - - - 2 - - - 0.100000000000000 - - - 1.000000000000000 - - - - - - - - - - - Title based detection - - - - - - &Disable chapter detection - - - - - - - &Regular expression: - - - gui_chapter_regex - - - - - - - - - - Add &chapters to table of contents - - - - - - - Don't add &links to the table of contents - - - - - - - - - - Tag based detection - - - - - - &Page break before tag: - - - gui_page_break_before_tag - - - - - - - - - - &Force page break before tag: - - - gui_force_page_break_before_tag - - - - - - - - - - Force page break before &attribute: - - - gui_force_page_break_before_attr - - - - - - - - - - Detect chapter &at tag: - - - gui_chapter_attr - - - - - - - - - - - - - - - - - - - - - - 0 - 0 - - - - - 0 - 60 - - - - - 16777215 - 120 - - - - <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "http://www.w3.org/TR/REC-html40/strict.dtd"> -<html><head><meta name="qrichtext" content="1" /><style type="text/css"> -p, li { white-space: pre-wrap; } -</style></head><body style=" font-family:'DejaVu Sans'; font-size:10pt; font-weight:400; font-style:normal;"> -<p style="-qt-paragraph-type:empty; margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;"></p></body></html> - - - - - - - Qt::Horizontal - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - false - - - - - - - - ImageView - QLabel -

      widgets.h
      - - - - - - - - buttonBox - accepted() - LRFSingleDialog - accept() - - - 516 - 655 - - - 157 - 274 - - - - - buttonBox - rejected() - LRFSingleDialog - reject() - - - 794 - 660 - - - 286 - 274 - - - - - categoryList - currentRowChanged(int) - stack - setCurrentIndex(int) - - - 96 - 120 - - - 539 - 220 - - - - - gui_header - toggled(bool) - gui_header_separation - setEnabled(bool) - - - 235 - 298 - - - 361 - 321 - - - - - gui_header - toggled(bool) - gui_headerformat - setEnabled(bool) - - - 307 - 300 - - - 363 - 362 - - - - - diff --git a/src/calibre/gui2/dialogs/mobi.py b/src/calibre/gui2/dialogs/mobi.py deleted file mode 100644 index b9cff08200..0000000000 --- a/src/calibre/gui2/dialogs/mobi.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -from calibre.gui2.dialogs.epub import Config as _Config -from calibre.ebooks.mobi.from_any import config as mobiconfig - -class Config(_Config): - - OUTPUT = 'MOBI' - - def __init__(self, parent, db, row=None): - _Config.__init__(self, parent, db, row=row, config=mobiconfig) - - def hide_controls(self): - self.profile_label.setVisible(False) - self.opt_profile.setVisible(False) - self.opt_dont_split_on_page_breaks.setVisible(False) - self.opt_preserve_tag_structure.setVisible(False) - self.opt_linearize_tables.setVisible(False) - self.page_map_box.setVisible(False) \ No newline at end of file diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index a3002089a9..d164daff95 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -11,17 +11,8 @@ from PyQt4.Qt import QDialog from calibre.customize.ui import available_input_formats from calibre.utils.config import prefs -from calibre.gui2.dialogs.lrf_single import LRFSingleDialog, LRFBulkDialog -from calibre.gui2.dialogs.epub import Config as EPUBConvert -from calibre.gui2.dialogs.mobi import Config as MOBIConvert 
-import calibre.gui2.dialogs.comicconf as ComicConf from calibre.gui2 import warning_dialog from calibre.ptempfile import PersistentTemporaryFile -from calibre.ebooks.lrf import preferred_source_formats as LRF_PREFERRED_SOURCE_FORMATS -from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.epub.from_any import SOURCE_FORMATS as EPUB_PREFERRED_SOURCE_FORMATS, config as epubconfig -from calibre.ebooks.mobi.from_any import config as mobiconfig -from calibre.ebooks.lrf.comic.convert_from import config as comicconfig # Ordered list of source formats. Items closer to the beginning are # preferred for conversion over those toward the end. From 06aa8f83610ceacaef27e433ae1655fdf60d310f Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 24 Apr 2009 20:21:01 -0400 Subject: [PATCH 133/319] ereader reader work --- .../ebooks/pdb/ereader/pmlconverter.py | 32 ++------- src/calibre/ebooks/pdb/ereader/reader.py | 68 ++++--------------- 2 files changed, 17 insertions(+), 83 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 8ff30e9349..61a91febda 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -24,8 +24,8 @@ PML_HTML_RULES = [ (re.compile(r'\\o(?P.+?)\\o', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\v(?P.+?)\\v', re.DOTALL), lambda match: '' % match.group('text')), (re.compile(r'\\t(?P.+?)\\t', re.DOTALL), lambda match: '
      %s
      ' % match.group('text')), - (re.compile(r'\\T="(?P\d+)%%*"(?P.+?)$', re.MULTILINE), lambda match: '
      %s
      ' % (match.group('val'), match.group('text'))), - (re.compile(r'\\w="(?P\d+)%%"'), lambda match: '
      ' % match.group('val')), + (re.compile(r'\\T="(?P\d+)%*"(?P.+?)$', re.MULTILINE), lambda match: r'
      %s
      ' % (match.group('val'), match.group('text'))), + (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
      ' % match.group('val')), (re.compile(r'\\n'), lambda match: ''), (re.compile(r'\\s'), lambda match: ''), (re.compile(r'\\b(?P.+?)\\b', re.DOTALL), lambda match: '%s' % match.group('text')), # \b is deprecated; \B should be used instead. @@ -58,14 +58,6 @@ PML_HTML_RULES = [ (re.compile(r'\\\\'), lambda match: '\\'), ] -FOOTNOTE_HTML_RULES = [ - (re.compile('(?P.+?)', re.DOTALL), lambda match: '
      %s
      ') -] - -SIDEBAR_HTML_RULES = [ - (re.compile('(?P.+?)', re.DOTALL), lambda match: '') -] - HTML_PML_RULES = [ (re.compile(r'\\'), lambda match: '\\\\'), (re.compile('(?<=[^\n])[ ]*'), lambda match: '\n

      '), @@ -109,23 +101,9 @@ def pml_to_html(pml): return html -def footnote_to_html(footnotes): - html = footnotes - for rule in FOOTNOTE_HTML_RULES: - html = rule[0].sub(rule[1], html) - - html = pml_to_html(html) - - return html - -def sidebar_to_html(sidebars): - html = sidebars - for rule in FOOTNOTE_HTML_RULES: - html = rule[0].sub(rule[1], html) - - html = pml_to_html(html) - - return html +def footnote_sidebar_to_html(id, pml): + html = '

      ' % (id, pml_to_html(pml)) + return html def html_to_pml(html): pml = html diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 8a0abb970e..b47dac1af0 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, sys, struct, zlib +import os, re, sys, struct, zlib from calibre import CurrentDir from calibre.ebooks import DRMError @@ -16,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ - footnote_to_html, sidebar_to_html + footnote_sidebar_to_html from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator @@ -42,14 +42,6 @@ class HeaderRecord(object): self.num_text_pages = self.non_text_offset -1 self.num_image_pages = self.metadata_offset - self.image_data_offset - - # Can't tell which is sidebar and footnote if they have same offset. - # They don't exist if offset is larget than last_record. - # Todo: Determine if the subtraction is necessary and find out - # what _rec means. - end_footnote_offset = self.sidebar_offset if self.sidebar_offset != self.footnote_offset else self.last_data_offset - self.num_footnote_pages = end_footnote_offset - self.footnote_offset if self.footnote_offset < self.last_data_offset else 0 - self.num_sidebar_pages = self.sidebar_offset - self.last_data_offset if self.footnote_offset < self.last_data_offset else 0 class Reader(FormatReader): @@ -94,44 +86,10 @@ class Reader(FormatReader): assumed to be encoded as Windows-1252. The encoding is part of the eReader file spec and should always be this encoding. 
''' - if number not in range(1, self.header_record.num_text_pages): + if number not in range(1, self.header_record.num_text_pages + 1): return '' return self.decompress_text(number) - - def get_footnote_page(self, number): - if number not in range(self.header_record.footnote_offset, self.header_record.footnote_offset + self.header_record.num_footnote_pages): - return '' - - return self.decompress_text(number) - - def get_sidebar_page(self, number): - if number not in range(self.header_record.sidebar_offset, self.header_record.sidebar_offset + self.header_record.num_sidebar_pages - 1): - return '' - - return self.decompress_text(number) - - def has_footnotes(self): - if self.header_record.num_footnote_pages > 1: - try: - content = self.decompress_text(self.header_record.footnote_offset) - - if content.contains(''): - return True - except: - pass - return False - - def has_sidebar(self): - if self.header_record.num_sidebar_pages > 1: - try: - content = self.decompress_text(self.header_record.sidebar_offset) - - if content.contains(''): - return True - except: - pass - return False def extract_content(self, output_dir): output_dir = os.path.abspath(output_dir) @@ -144,22 +102,20 @@ class Reader(FormatReader): for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) html += pml_to_html(self.get_text_page(i)) - - # Untested: The num_.._pages variable may not be correct! - # Possibly use .._rec instead? - ''' - if has_footnotes(): + + if self.header_record.footnote_rec > 0: html += '

      %s

      ' % _('Footnotes') - for i in range(self.header_record.footnote_offset, self.header_record.num_footnote_pages): + footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): self.log.debug('Extracting footnote page %i' % i) - html += footnote_to_html(self.get_footnote_page(i)) + html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - if has_sidebar(): + if self.header_record.sidebar_rec > 0: html += '

      %s

      ' % _('Sidebar') - for i in range(self.header_record.sidebar_offset, self.header_record.num_sidebar_pages): + sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) + for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): self.log.debug('Extracting sidebar page %i' % i) - html += sidebar_to_html(self.get_sidebar_page(i)) - ''' + html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) html += '' From ae86b2a44b7cd613db48f9e4953d498de66f0f15 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 24 Apr 2009 21:10:58 -0400 Subject: [PATCH 134/319] ereader input: support footnotes and sidebars --- .../ebooks/pdb/ereader/pmlconverter.py | 36 +++++++++++-------- src/calibre/ebooks/pdb/ereader/reader.py | 5 +++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 61a91febda..347bde951c 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -12,10 +12,12 @@ import re from calibre.ebooks.htmlsymbols import HTML_SYMBOLS +from BeautifulSoup import BeautifulSoup + PML_HTML_RULES = [ (re.compile(r'\\p'), lambda match: '

      '), (re.compile(r'\\x(?P.+?)\\x', re.DOTALL), lambda match: '

      %s

      ' % match.group('text')), - (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile(r'\\X(?P[0-4])(?P.+?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile(r'\\C\d=".+"'), lambda match: ''), # This should be made to create a TOC entry (re.compile(r'\\c(?P.+?)\\c', re.DOTALL), lambda match: '
      %s
      ' % match.group('text')), (re.compile(r'\\r(?P.+?)\\r', re.DOTALL), lambda match: '
      %s
      ' % match.group('text')), @@ -34,10 +36,10 @@ PML_HTML_RULES = [ (re.compile(r'\\Sp(?P.+?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\Sb(?P.+?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')), - (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%i;' % match.group('num')), - (re.compile(r'\\U(?P\d\d\d\d)'), lambda match: '&#%i;' % int(match.group('num'))), + (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), + (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % match.group('name')), - (re.compile(r'\\q="(?P#.+?)"(?P)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), + (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
      ' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), @@ -67,23 +69,23 @@ HTML_PML_RULES = [ (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%i' % int(match.group('num'))), - (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%i' % match.group('num')), + #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), + (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), - (re.compile('\d+)%%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), - (re.compile('\d+)%%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%i%%"%s$' % (match.group('val'), match.group('text'))), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), + (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), + (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), 
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), - (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%i%s\\X%i' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), + (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('(?P.+?)

    ', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), (re.compile(''), lambda match: '\\p'), (re.compile('<.*?>'), lambda match: ''), @@ -102,13 +104,19 @@ def pml_to_html(pml): return html def footnote_sidebar_to_html(id, pml): - html = '' % (id, pml_to_html(pml)) + html = '
    %s
    ' % (id, id, pml_to_html(pml)) return html def html_to_pml(html): - pml = html - for rule in HTML_PML_RULES: - pml = rule[0].sub(rule[1], pml) + pml = '' + + for dom_tree in BeautifulSoup(html).findAll('body'): + body = unicode(dom_tree.pretty_print()) + + for rule in HTML_PML_RULES: + body = rule[0].sub(rule[1], pml) + + pml += body # Replace symbols outside of cp1512 wtih \Uxxxx diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index b47dac1af0..e0e42e40fd 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -108,14 +108,19 @@ class Reader(FormatReader): footnoteids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding)) for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_rec)): self.log.debug('Extracting footnote page %i' % i) + html += '
    ' html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) + html += '
    ' + if self.header_record.sidebar_rec > 0: html += '

    %s

    ' % _('Sidebar') sidebarids = re.findall('\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding)) for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_rec)): self.log.debug('Extracting sidebar page %i' % i) + html += '
    ' html += footnote_sidebar_to_html(sidebarids[sid], self.decompress_text(i)) + html += '
    ' html += '' From 1f6737eeb0e4dfc41d841050778469e7aa7471b1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 24 Apr 2009 18:42:36 -0700 Subject: [PATCH 135/319] Re-implement palmdoc compress/uncompress in C for speed --- setup.py | 21 +-- src/calibre/__init__.py | 16 +-- src/calibre/constants.py | 2 +- src/calibre/ebooks/mobi/palmdoc.c | 204 +++++++++++++++++++++++++++++ src/calibre/ebooks/mobi/palmdoc.py | 61 +++++---- 5 files changed, 258 insertions(+), 46 deletions(-) create mode 100644 src/calibre/ebooks/mobi/palmdoc.c diff --git a/setup.py b/setup.py index b0ff04a983..003067b34f 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import sys, re, os, shutil, cStringIO, tempfile, subprocess, time +import sys, re, os, subprocess sys.path.append('src') iswindows = re.search('win(32|64)', sys.platform) isosx = 'darwin' in sys.platform @@ -54,7 +54,7 @@ if __name__ == '__main__': build_osx, upload_installers, upload_user_manual, \ upload_to_pypi, stage3, stage2, stage1, upload, \ upload_rss - + entry_points['console_scripts'].append( 'calibre_postinstall = calibre.linux:post_install') ext_modules = [ @@ -65,12 +65,15 @@ if __name__ == '__main__': 'src/calibre/utils/lzx/lzc.c', 'src/calibre/utils/lzx/lzxc.c'], include_dirs=['src/calibre/utils/lzx']), - + Extension('calibre.plugins.msdes', sources=['src/calibre/utils/msdes/msdesmodule.c', 'src/calibre/utils/msdes/des.c'], include_dirs=['src/calibre/utils/msdes']), - + + Extension('calibre.plugins.cPalmdoc', + sources=['src/calibre/ebooks/mobi/palmdoc.c']), + PyQtExtension('calibre.plugins.pictureflow', ['src/calibre/gui2/pictureflow/pictureflow.cpp', 'src/calibre/gui2/pictureflow/pictureflow.h'], @@ -81,7 +84,7 @@ if __name__ == '__main__': ext_modules.append(Extension('calibre.plugins.winutil', sources=['src/calibre/utils/windows/winutil.c'], libraries=['shell32', 'setupapi'], - 
include_dirs=os.environ.get('INCLUDE', + include_dirs=os.environ.get('INCLUDE', 'C:/WinDDK/6001.18001/inc/api/;' 'C:/WinDDK/6001.18001/inc/crt/').split(';'), extra_compile_args=['/X'] @@ -91,7 +94,7 @@ if __name__ == '__main__': sources=['src/calibre/devices/usbobserver/usbobserver.c'], extra_link_args=['-framework', 'IOKit']) ) - + if not iswindows: plugins = ['plugins/%s.so'%(x.name.rpartition('.')[-1]) for x in ext_modules] else: @@ -99,7 +102,7 @@ if __name__ == '__main__': ['plugins/%s.pyd.manifest'%(x.name.rpartition('.')[-1]) \ for x in ext_modules if 'pictureflow' not in x.name] - + setup( name = APPNAME, packages = find_packages('src'), @@ -152,9 +155,9 @@ if __name__ == '__main__': 'Topic :: System :: Hardware :: Hardware Drivers' ], cmdclass = { - 'build_ext' : build_ext, + 'build_ext' : build_ext, 'build' : build, - 'build_py' : build_py, + 'build_py' : build_py, 'pot' : pot, 'manual' : manual, 'resources' : resources, diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 9e18af3cf9..5656079ead 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, re, logging, time, subprocess, atexit, mimetypes, \ +import sys, os, re, logging, time, subprocess, mimetypes, \ __builtin__, warnings __builtin__.__dict__['dynamic_property'] = lambda(func): func(None) from htmlentitydefs import name2codepoint @@ -71,7 +71,7 @@ def sanitize_file_name(name, substitute='_', as_unicode=False): **WARNING:** This function also replaces path separators, so only pass file names and not full paths to it. *NOTE:* This function always returns byte strings, not unicode objects. The byte strings - are encoded in the filesystem encoding of the platform, or UTF-8. + are encoded in the filesystem encoding of the platform, or UTF-8. 
''' if isinstance(name, unicode): name = name.encode(filesystem_encoding, 'ignore') @@ -159,7 +159,7 @@ def extract(path, dir): def get_proxies(): proxies = {} - + for q in ('http', 'ftp'): proxy = os.environ.get(q+'_proxy', None) if not proxy: continue @@ -194,8 +194,8 @@ def get_proxies(): def browser(honor_time=True, max_time=2, mobile_browser=False): ''' Create a mechanize browser for web scraping. The browser handles cookies, - refresh requests and ignores robots.txt. Also uses proxy if avaialable. - + refresh requests and ignores robots.txt. Also uses proxy if avaialable. + :param honor_time: If True honors pause time in refresh requests :param max_time: Maximum time in seconds to wait during a refresh request ''' @@ -232,16 +232,16 @@ def fit_image(width, height, pwidth, pheight): return scaled, int(width), int(height) class CurrentDir(object): - + def __init__(self, path): self.path = path self.cwd = None - + def __enter__(self, *args): self.cwd = os.getcwd() os.chdir(self.path) return self.cwd - + def __exit__(self, *args): os.chdir(self.cwd) diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 186eb37e34..ff641cfbeb 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -53,7 +53,7 @@ if plugins is None: plugin_path = getattr(pkg_resources, 'resource_filename')('calibre', 'plugins') sys.path.insert(0, plugin_path) - for plugin in ['pictureflow', 'lzx', 'msdes'] + \ + for plugin in ['pictureflow', 'lzx', 'msdes', 'cPalmdoc'] + \ (['winutil'] if iswindows else []) + \ (['usbobserver'] if isosx else []): try: diff --git a/src/calibre/ebooks/mobi/palmdoc.c b/src/calibre/ebooks/mobi/palmdoc.c new file mode 100644 index 0000000000..87c6c32da8 --- /dev/null +++ b/src/calibre/ebooks/mobi/palmdoc.c @@ -0,0 +1,204 @@ +/* +:mod:`cPalmdoc` -- Palmdoc compression/decompression +===================================================== + +.. 
module:: cPalmdoc + :platform: All + :synopsis: Compression decompression of Palmdoc implemented in C for speed + +.. moduleauthor:: Kovid Goyal Copyright 2009 + +*/ + +#define PY_SSIZE_T_CLEAN +#include +#include + +#define DELTA sizeof(Byte)*4096 + +#define BUFFER 6000 + +#define MIN(x, y) ( ((x) < (y)) ? (x) : (y) ) + +typedef unsigned short int Byte; +typedef struct { + Byte *data; + Py_ssize_t len; +} buffer; + +#ifdef bool +#undef bool +#endif +#define bool int + +#ifdef false +#undef false +#endif +#define false 0 + +#ifdef true +#undef true +#endif +#define true 1 + +#define CHAR(x) (( (x) > 127 ) ? (x)-256 : (x)) + +static PyObject * +cpalmdoc_decompress(PyObject *self, PyObject *args) { + const char *_input = NULL; Py_ssize_t input_len = 0; + Py_ssize_t i = 0, o = 0, j = 0, di, n; + if (!PyArg_ParseTuple(args, "t#", &_input, &input_len)) + return NULL; + Byte *input = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len); + if (input == NULL) return PyErr_NoMemory(); + // Map chars to bytes + for (j = 0; j < input_len; j++) + input[j] = (_input[j] < 0) ? 
_input[j]+256 : _input[j]; + char *output = (char *)PyMem_Malloc(sizeof(char)*BUFFER); + Byte c; + PyObject *ans; + if (output == NULL) return PyErr_NoMemory(); + + while (i < input_len) { + c = input[i++]; + if (c >= 1 && c <= 8) // copy 'c' bytes + while (c--) output[o++] = input[i++]; + + else if (c <= 0x7F) // 0, 09-7F = self + output[o++] = c; + + else if (c >= 0xC0) { // space + ASCII char + output[o++] = ' '; + output[o++] = c ^ 0x80; + } + else { // 80-BF repeat sequences + c = (c << 8) + input[i++]; + di = (c & 0x3FFF) >> 3; + for ( n = (c & 7) + 3; n--; ++o ) + output[o] = output[o - di]; + } + } + ans = Py_BuildValue("s#", output, o); + if (output != NULL) PyMem_Free(output); + if (input != NULL) PyMem_Free(input); + return ans; +} + +static bool +cpalmdoc_memcmp( Byte *a, Byte *b, Py_ssize_t len) { + Py_ssize_t i; + for (i = 0; i < len; i++) if (a[i] != b[i]) return false; + return true; +} + +static Py_ssize_t +cpalmdoc_rfind(Byte *data, Py_ssize_t pos, Py_ssize_t chunk_length) { + Py_ssize_t i; + for (i = pos - chunk_length; i > -1; i--) + if (cpalmdoc_memcmp(data+i, data+pos, chunk_length)) return i; + return pos; +} + + +static Py_ssize_t +cpalmdoc_do_compress(buffer *b, char *output) { + Py_ssize_t i = 0, j, chunk_len, dist; + unsigned compound; + Byte c, n; + bool found; + char *head; + head = output; + buffer temp; + temp.data = (Byte *)PyMem_Malloc(sizeof(Byte)*8); temp.len = 0; + if (temp.data == NULL) return 0; + while (i < b->len) { + c = b->data[i]; + //do repeats + if ( i > 10 && (b->len - i) > 10) { + found = false; + for (chunk_len = 10; chunk_len > 2; chunk_len--) { + j = cpalmdoc_rfind(b->data, i, chunk_len); + if (j < i) { + found = true; + dist = i - j; + compound = (dist << 3) + chunk_len-3; + *(output++) = CHAR(0x80 + (compound >> 8 )); + *(output++) = CHAR(compound & 0xFF); + i += chunk_len; + break; + } + } + if (found) continue; + } + + //write single character + i++; + if (c == 32 && i < b->len) { + n = b->data[i]; + if ( n >= 
0x40 && n <= 0x7F) { + *(output++) = CHAR(n^0x80); i++; continue; + } + } + if (c == 0 || (c > 8 && c < 0x80)) + *(output++) = CHAR(c); + else { // Write binary data + j = i; + temp.data[0] = c; temp.len = 1; + while (j < b->len && temp.len < 8) { + c = b->data[j]; + if (c == 0 || (c > 8 && c < 0x80)) break; + temp.data[temp.len++] = c; j++; + } + i += temp.len - 1; + *(output++) = temp.len; + for (j=0; j < temp.len; j++) *(output++) = temp.data[j]; + } + } + return output - head; +} + +static PyObject * +cpalmdoc_compress(PyObject *self, PyObject *args) { + const char *_input = NULL; Py_ssize_t input_len = 0; + Py_ssize_t j = 0; + buffer b; + if (!PyArg_ParseTuple(args, "t#", &_input, &input_len)) + return NULL; + b.data = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len); + if (b.data == NULL) return PyErr_NoMemory(); + // Map chars to bytes + for (j = 0; j < input_len; j++) + b.data[j] = (_input[j] < 0) ? _input[j]+256 : _input[j]; + b.len = input_len; + char *output = (char *)PyMem_Malloc(sizeof(char) * b.len); + if (output == NULL) return PyErr_NoMemory(); + j = cpalmdoc_do_compress(&b, output); + if ( j == 0) return PyErr_NoMemory(); + PyObject *ans = Py_BuildValue("s#", output, j); + PyMem_Free(output); + PyMem_Free(b.data); + return ans; +} + +static PyMethodDef cPalmdocMethods[] = { + {"decompress", cpalmdoc_decompress, METH_VARARGS, + "decompress(bytestring) -> decompressed bytestring\n\n" + "Decompress a palmdoc compressed byte string. " + }, + + {"compress", cpalmdoc_compress, METH_VARARGS, + "compress(bytestring) -> compressed bytestring\n\n" + "Palmdoc compress a byte string. " + }, + {NULL, NULL, 0, NULL} +}; + +PyMODINIT_FUNC +initcPalmdoc(void) { + PyObject *m; + m = Py_InitModule3("cPalmdoc", cPalmdocMethods, + "Compress and decompress palmdoc strings." 
+ ); + if (m == NULL) return; +} + diff --git a/src/calibre/ebooks/mobi/palmdoc.py b/src/calibre/ebooks/mobi/palmdoc.py index eedab1c88f..90dabcb5a8 100644 --- a/src/calibre/ebooks/mobi/palmdoc.py +++ b/src/calibre/ebooks/mobi/palmdoc.py @@ -2,41 +2,46 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' \ - 'and Marshall T. Vandegrift ' +__copyright__ = '2008, Kovid Goyal ' from cStringIO import StringIO from struct import pack -COUNT_BITS = 3 +from calibre.constants import plugins +cPalmdoc = plugins['cPalmdoc'][0] +if not cPalmdoc: + raise RuntimeError(('Failed to load required cPalmdoc module: ' + '%s')%plugins['cPalmdoc'][1]) def decompress_doc(data): - buffer = [ord(i) for i in data] - res = [] - i = 0 - while i < len(buffer): - c = buffer[i] - i += 1 - if c >= 1 and c <= 8: - res.extend(buffer[i:i+c]) - i += c - elif c <= 0x7f: - res.append(c) - elif c >= 0xc0: - res.extend( (ord(' '), c^0x80) ) - else: - c = (c << 8) + buffer[i] - i += 1 - di = (c & 0x3fff) >> COUNT_BITS - j = len(res) - num = (c & ((1 << COUNT_BITS) - 1)) + 3 - - for k in range( num ): - res.append(res[j - di+k]) - - return ''.join([chr(i) for i in res]) + return cPalmdoc.decompress(data) def compress_doc(data): + return cPalmdoc.compress(data) + +def test(): + TESTS = [ + 'abc\x03\x04\x05\x06ms', # Test binary writing + 'a b c \xfed ', # Test encoding of spaces + '0123456789axyz2bxyz2cdfgfo9iuyerh', + '0123456789asd0123456789asd|yyzzxxffhhjjkk', + ('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei ' + 'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ') + ] + for test in TESTS: + print 'Test:', repr(test) + print '\tTesting compression...' + good = py_compress_doc(test) + x = compress_doc(test) + print '\t\tgood:', repr(good) + print '\t\tx :', repr(x) + assert x == good + print '\tTesting decompression...' 
+ print '\t\t', repr(decompress_doc(x)) + assert decompress_doc(x) == test + print + +def py_compress_doc(data): out = StringIO() i = 0 ldata = len(data) @@ -85,4 +90,4 @@ def compress_doc(data): out.write(''.join(binseq)) i += len(binseq) - 1 return out.getvalue() - + From 7a6afe4ee47a1fb4ef379450ccd49b56b9b457c1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 24 Apr 2009 20:21:37 -0700 Subject: [PATCH 136/319] Fix various regressions --- src/calibre/ebooks/mobi/palmdoc.c | 4 ++-- src/calibre/ebooks/mobi/reader.py | 22 ++++++++++++++++------ src/calibre/ebooks/oeb/base.py | 2 ++ src/calibre/ebooks/oeb/transforms/guide.py | 6 ++++-- src/calibre/ebooks/oeb/transforms/split.py | 9 +++------ 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/mobi/palmdoc.c b/src/calibre/ebooks/mobi/palmdoc.c index 87c6c32da8..29e9579140 100644 --- a/src/calibre/ebooks/mobi/palmdoc.c +++ b/src/calibre/ebooks/mobi/palmdoc.c @@ -118,9 +118,9 @@ cpalmdoc_do_compress(buffer *b, char *output) { found = false; for (chunk_len = 10; chunk_len > 2; chunk_len--) { j = cpalmdoc_rfind(b->data, i, chunk_len); - if (j < i) { + dist = i - j; + if (j < i && dist <= 2047) { found = true; - dist = i - j; compound = (dist << 3) + chunk_len-3; *(output++) = CHAR(0x80 + (compound >> 8 )); *(output++) = CHAR(compound & 0xFF); diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index b68263ab28..38de3476d1 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -226,7 +226,7 @@ class MobiReader(object): page-break-after: always; margin: 0; display: block } ''') - self.tag_css_rules = [] + self.tag_css_rules = {} if hasattr(filename_or_stream, 'read'): stream = filename_or_stream @@ -328,10 +328,10 @@ class MobiReader(object): with open('styles.css', 'wb') as s: s.write(self.base_css_rules+'\n\n') - for rule in self.tag_css_rules: + for cls, rule in self.tag_css_rules.items(): if isinstance(rule, unicode): 
rule = rule.encode('utf-8') - s.write(rule+'\n\n') + s.write('.%s { %s }\n\n'%(cls, rule)) if self.book_header.exth is not None or self.embedded_mi is not None: @@ -389,6 +389,7 @@ class MobiReader(object): 'xx-large' : '6', } mobi_version = self.book_header.mobi_version + style_map = {} for i, tag in enumerate(root.iter(etree.Element)): if tag.tag in ('country-region', 'place', 'placetype', 'placename', 'state', 'city', 'street', 'address', 'content'): @@ -455,9 +456,18 @@ class MobiReader(object): except ValueError: pass if styles: - attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i) - self.tag_css_rules.append('#%s {%s}'%(attrib['id'], - '; '.join(styles))) + cls = None + rule = '; '.join(styles) + for sel, srule in self.tag_css_rules.items(): + if srule == rule: + cls = sel + break + if cls is None: + ncls = 'calibre_%d'%i + self.tag_css_rules[ncls] = rule + cls = attrib.get('class', '') + cls = cls + (' ' if cls else '') + ncls + attrib['class'] = cls def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 81120aaf2e..783f09e5cc 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -726,6 +726,7 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _parse_xhtml(self, data): + self.oeb.log.debug('Parsing', self.href, '...') # Convert to Unicode and normalize line endings data = self.oeb.decode(data) data = self.oeb.html_preprocessor(data) @@ -804,6 +805,7 @@ class Manifest(object): return data def _parse_css(self, data): + self.oeb.log.debug('Parsing', self.href, '...') data = self.oeb.decode(data) data = self.oeb.css_preprocessor(data) data = XHTML_CSS_NAMESPACE + data diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index 00830b1a8c..dc7123446b 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ 
b/src/calibre/ebooks/oeb/transforms/guide.py @@ -41,10 +41,12 @@ class Clean(object): for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] - if x.lower() != ('cover', 'titlepage'): + if x.lower() not in ('cover', 'titlepage'): try: if href not in protected_hrefs: - self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) + item = self.oeb.manifest.hrefs[href] + if item not in self.oeb.spine: + self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) except KeyError: pass self.oeb.guide.remove(x) diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index d3505a5fd9..21d71da5bb 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -44,14 +44,14 @@ class Split(object): self.split_on_page_breaks = split_on_page_breaks self.page_breaks_xpath = page_breaks_xpath self.max_flow_size = max_flow_size + self.page_break_selectors = None if self.page_breaks_xpath is not None: - self.page_breaks_xpath = XPath(self.page_breaks_xpath) + self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)] def __call__(self, oeb, context): self.oeb = oeb self.log = oeb.log self.map = {} - self.page_break_selectors = None for item in list(self.oeb.manifest.items): if item.spine_position is not None and etree.iselement(item.data): self.split_item(item) @@ -60,10 +60,7 @@ class Split(object): def split_item(self, item): if self.split_on_page_breaks: - if self.page_breaks_xpath is None: - page_breaks, page_break_ids = self.find_page_breaks(item) - else: - page_breaks, page_break_ids = self.page_breaks_xpath(item.data) + page_breaks, page_break_ids = self.find_page_breaks(item) splitter = FlowSplitter(item, page_breaks, page_break_ids, self.max_flow_size, self.oeb) From 188f630c35e127e2ab3964b322b8f181b18480af Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 25 Apr 2009 08:14:22 -0400 Subject: [PATCH 137/319] New pdftohtml processing rules. Best yet. 
--- src/calibre/ebooks/conversion/preprocess.py | 33 ++++++++++----------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index fb55ee74fb..0421534f65 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -31,6 +31,12 @@ def chap_head(match): else: return '

    '+chap+'
    '+title+'


    ' +def wrap_lines(match): + ital = match.group('ital') + if not ital: + return ' ' + else: + return ital+' ' def line_length(raw, percent): ''' @@ -93,17 +99,11 @@ class HTMLPreProcessor(object): (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
    tags (re.compile(r'', re.IGNORECASE), lambda match: '
    '), - # Remove page numbers - (re.compile(r'\d+
    ', re.IGNORECASE), lambda match: ''), # Replace

    with

    (re.compile(r'\s*', re.IGNORECASE), lambda match: '

    '), - # Remove
    - (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if \ - re.match('<', match.group(1).lstrip()) or \ - len(match.group(1)) < 40 else match.group(1)), + # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), + (re.compile(r'-\n\r?'), lambda match: ''), # Remove gray background (re.compile(r']+>'), lambda match : ''), @@ -112,15 +112,12 @@ class HTMLPreProcessor(object): (re.compile(ur'\u00a0'), lambda match : ' '), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(]*>)?(]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), - (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), - # Un wrap lines - (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), - # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -162,12 +159,12 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - # Add rules that require matching line 
length here - #line_length_rules = [ - # (re.compile('%i' % line_length(html, .85)), lambda match:) - #] + line_length_rules = [ + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines), + ] - rules = self.PDFTOHTML # + line_length_rules + rules = self.PDFTOHTML + line_length_rules else: rules = [] for rule in self.PREPROCESS + rules: From 91bb71ed8467cf9a5608b27c4d505141caa87a21 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 25 Apr 2009 08:26:58 -0700 Subject: [PATCH 138/319] Misc. minor fixes --- src/calibre/ebooks/epub/output.py | 23 +++++++++++----------- src/calibre/ebooks/mobi/reader.py | 5 +++++ src/calibre/ebooks/oeb/transforms/guide.py | 14 ------------- 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index a43ca4e5e3..1b37f054b0 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -91,7 +91,7 @@ class EPUBOutput(OutputFormatPlugin): self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\ if x.endswith('.ncx')][0]) - from calibre.epub import initialize_container + from calibre.ebooks.epub import initialize_container epub = initialize_container(output_path, os.path.basename(opf)) epub.add_dir(tdir) epub.close() @@ -136,7 +136,7 @@ class EPUBOutput(OutputFormatPlugin): if 'cover' in g: tp = self.TITLEPAGE_COVER%unquote(g['cover'].href) id, href = m.generate('titlepage', 'titlepage.xhtml') - item = m.add(id, href, guess_type('t.xhtml'), + item = m.add(id, href, guess_type('t.xhtml')[0], data=etree.fromstring(tp)) else: item = self.default_cover() @@ -146,7 +146,8 @@ class EPUBOutput(OutputFormatPlugin): if item is not None: self.oeb.spine.insert(0, item, True) self.oeb.guide.refs['cover'].href = item.href - self.oeb.guide.refs['titlepage'].href = item.href + if 'titlepage' in 
self.oeb.guide.refs: + self.oeb.guide.refs['titlepage'].href = item.href @@ -180,7 +181,7 @@ class EPUBOutput(OutputFormatPlugin): body = body[0] # Replace <br> that are children of <body> as ADE doesn't handle them if hasattr(body, 'xpath'): - for br in body.xpath('./h:br'): + for br in XPath('./h:br')(body): if br.getparent() is None: continue try: @@ -204,29 +205,29 @@ class EPUBOutput(OutputFormatPlugin): if self.opts.output_profile.remove_object_tags: - for tag in root.xpath('//h:embed'): + for tag in XPath('//h:embed')(root): tag.getparent().remove(tag) - for tag in root.xpath('//h:object'): + for tag in XPath('//h:object')(root): if tag.get('type', '').lower().strip() in ('image/svg+xml',): continue tag.getparent().remove(tag) - for tag in root.xpath('//h:title|//h:style'): + for tag in XPath('//h:title|//h:style')(root): if not tag.text: tag.getparent().remove(tag) - for tag in root.xpath('//h:script'): + for tag in XPath('//h:script')(root): if not tag.text and not tag.get('src', False): tag.getparent().remove(tag) - for tag in root.xpath('//h:form'): + for tag in XPath('//h:form')(root): tag.getparent().remove(tag) - for tag in root.xpath('//h:center'): + for tag in XPath('//h:center')(root): tag.tag = XHTML('div') tag.set('style', 'text-align:center') # ADE can't handle & in an img url - for tag in self.root.xpath('//h:img[@src]'): + for tag in XPath('//h:img[@src]')(root): tag.set('src', tag.get('src', '').replace('&', '')) stylesheet = self.oeb.manifest.hrefs['stylesheet.css'] diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 38de3476d1..25b4114cc2 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -298,6 +298,11 @@ class MobiReader(object): self.log.debug('Parsing HTML...') root = html.fromstring(self.processed_html) + if root.xpath('descendant::p/descendant::p'): + from lxml.html import soupparser + self.log.warning('Markup contains unclosed <p> tags, parsing using', + 
'BeatifulSoup') + root = soupparser.fromstring(self.processed_html) self.upshift_markup(root) guides = root.xpath('//guide') guide = guides[0] if guides else None diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index dc7123446b..aaeba67d80 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -14,10 +14,6 @@ class Clean(object): from calibre.ebooks.oeb.base import urldefrag self.oeb, self.log, self.opts = oeb, oeb.log, opts - protected_hrefs = set([]) - if 'titlepage' in self.oeb.guide: - protected_hrefs.add(urldefrag( - self.oeb.guide['titlepage'].href)[0]) if 'cover' not in self.oeb.guide: covers = [] for x in ('other.ms-coverimage-standard', @@ -35,20 +31,10 @@ class Clean(object): self.log('Choosing %s:%s as the cover'%(ref.type, ref.href)) ref.type = 'cover' self.oeb.guide.refs['cover'] = ref - protected_hrefs.add(urldefrag(ref.href)[0]) - else: - protected_hrefs.add(urldefrag(self.oeb.guide.refs['cover'].href)[0]) for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] if x.lower() not in ('cover', 'titlepage'): - try: - if href not in protected_hrefs: - item = self.oeb.manifest.hrefs[href] - if item not in self.oeb.spine: - self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) - except KeyError: - pass self.oeb.guide.remove(x) From 19e7ed0cb7049e7fccc75a6c0adb67757789a602 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 25 Apr 2009 10:03:00 -0700 Subject: [PATCH 139/319] More miscellaneous bug fixes --- src/calibre/ebooks/epub/input.py | 2 +- src/calibre/ebooks/epub/output.py | 23 +- src/calibre/ebooks/html_old.py | 1190 ----------------- src/calibre/ebooks/mobi/reader.py | 3 +- src/calibre/ebooks/oeb/base.py | 6 + .../ebooks/oeb/transforms/structure.py | 2 +- src/pyPdf/pdf.py | 3 +- todo | 2 + 8 files changed, 35 insertions(+), 1196 deletions(-) delete mode 100644 src/calibre/ebooks/html_old.py diff 
--git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 919416ffdc..f134ea6abd 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -110,7 +110,7 @@ class EPUBInput(InputFormatPlugin): parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) - if len(parts) > 1: + if len(parts) > 1 and parts[0]: delta = '/'.join(parts[:-1])+'/' for elem in opf.itermanifest(): elem.set('href', delta+elem.get('href')) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 1b37f054b0..d5f0a9349a 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -6,13 +6,15 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import os +import os, shutil from urllib import unquote from calibre.customize.conversion import OutputFormatPlugin from calibre.ptempfile import TemporaryDirectory from calibre.constants import __appname__, __version__ from calibre import strftime, guess_type +from calibre.customize.conversion import OptionRecommendation + from lxml import etree @@ -22,6 +24,14 @@ class EPUBOutput(OutputFormatPlugin): author = 'Kovid Goyal' file_type = 'epub' + options = set([ + OptionRecommendation(name='extract_to', + help=_('Extract the contents of the generated EPUB file to the ' + 'specified directory. The contents of the directory are first ' + 'deleted, so be careful.')) + ]) + + TITLEPAGE_COVER = '''\ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> <head> @@ -43,6 +53,7 @@ class EPUBOutput(OutputFormatPlugin): TITLEPAGE = '''\ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> <head> + <title>%(title)s ', re.DOTALL), lambda match: ''), + + # Reflow paragraphs + (re.compile('(?P.*?)

    ', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')), + + # HTML to PML (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))), (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))), (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), - (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), + #(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), + (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))), #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + 
(re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), - (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), + (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), - (re.compile(''), lambda match: '\\p'), + (re.compile('
    '), lambda match: '\\p'), + (re.compile('
    '), lambda match: '\\p'), + + # Remove remaining HTML tags (re.compile('<.*?>'), lambda match: ''), + + # Remove redundant page break markers (re.compile(r'(\\p){2,}'), lambda match: r'\p'), + + # Remove whitespace on empty lines + (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''), + + # Remove excess newlines at the beginning and end + (re.compile('^(\r\n){1,}'), lambda match: ''), + (re.compile('^\n{1,}'), lambda match: ''), + (re.compile('(\r\n){3,}$'), lambda match: ''), + (re.compile('\n{3,}$'), lambda match: ''), ] def pml_to_html(pml): @@ -111,13 +146,13 @@ def html_to_pml(html): pml = '' for dom_tree in BeautifulSoup(html).findAll('body'): - body = unicode(dom_tree.pretty_print()) + body = unicode(dom_tree.prettify()) for rule in HTML_PML_RULES: - body = rule[0].sub(rule[1], pml) + body = rule[0].sub(rule[1], body) pml += body - + # Replace symbols outside of cp1512 wtih \Uxxxx return pml diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index e0e42e40fd..c6f520ecb2 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -40,7 +40,7 @@ class HeaderRecord(object): self.sidebar_offset, = struct.unpack('>H', raw[50:52]) self.last_data_offset, = struct.unpack('>H', raw[52:54]) - self.num_text_pages = self.non_text_offset -1 + self.num_text_pages = self.non_text_offset - 1 self.num_image_pages = self.metadata_offset - self.image_data_offset diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index c9493d2915..1605e15f32 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -4,17 +4,90 @@ from __future__ import with_statement Write content to ereader pdb file. 
''' +import struct, zlib + +from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.ebooks.pdb.header import PdbHeaderBuilder +from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml +IDENTITY = 'PNPdPPrs' + class Writer(object): def __init__(self, log): - self.oeb_book = oeb_book + self.log = log - def dump(oeb_book): + def dump(self, oeb_book, out_stream, metadata=None): + text = self._text(oeb_book.spine) + images = self._images(oeb_book.manifest) + metadata = [self._metadata(metadata)] + + hr = [self._header_record(len(text), len(images))] + + sections = hr+text+images+metadata + + lengths = [len(i) for i in sections] + + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '') + pdbHeaderBuilder.build_header(lengths, out_stream) + + for item in sections: + out_stream.write(item) + + def _text(self, pages): pml_pages = [] - for page in oeb_book.spine: - pml_pages.append(html_to_pml(page)) + for page in pages: + pml_pages.append(zlib.compress(html_to_pml(unicode(page)))) + + return pml_pages - \ No newline at end of file + def _images(self, manifest): + images = [] + + for item in manifest: + if item.media_type in OEB_IMAGES: + image = '\x00\x00\x00\x00' + + image += image_name(item.href) + image = image.ljust(62, '\x00') + image += item.data + + images.append(image) + + return images + + def _metadata(self, metadata): + return '' + + def _header_record(self, text_items, image_items): + ''' + text_items = the number of text pages + image_items = the number of images + ''' + version = 10 + non_text_offset = text_items + + if image_items > 0: + image_data_offset = text_items + 1 + meta_data_offset = image_data_offset + image_items + else: + meta_data_offset = text_items + 1 + image_data_offset = meta_data_offset + + record = u'' + + # Version + record += struct.pack('>H', version) + record = record.ljust(12, '\x00') + record += struct.pack('>H', non_text_offset) + record = record.ljust(40, '\x00') + record += 
struct.pack('>H', image_data_offset) + record = record.ljust(44, '\x00') + record += struct.pack('>H', meta_data_offset) + record = record.ljust(52, '\x00') + record += struct.pack('>H', meta_data_offset) + + return record + diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py index d270c0ef71..8a9b7b105c 100644 --- a/src/calibre/ebooks/pdb/header.py +++ b/src/calibre/ebooks/pdb/header.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import os, struct +import os, re, struct, time class PdbHeaderReader(object): @@ -60,18 +60,26 @@ class PdbHeaderReader(object): return self.stream.read(end - start) -class PdbHeaderWriter(object): +class PdbHeaderBuilder(object): def __init__(self, identity, title): self.identity = identity.ljust(3, '\x00')[:8] - self.title = title.ljust(32, '\x00')[:32] + self.title = re.sub('[^-A-Za-z0-9]+', '_', title).ljust(32, '\x00')[:32] - def build_header(self, offsets): + def build_header(self, section_lengths, out_stream): ''' - Offsets is a list of section offsets + section_lengths = Lenght of each section in file. 
''' + + now = int(time.time()) + nrecords = len(section_lengths) + + out_stream.write(self.title + struct.pack('>HHIIIIII', 0, 0, now, now, 0, 0, 0, 0)) + out_stream.write(self.identity + struct.pack('>IIH', nrecords, 0, nrecords)) + + offset = 78 + (8 * nrecords) + 2 + for id, record in enumerate(section_lengths): + out_stream.write(struct.pack('>LBBBB', long(offset), 0, 0, 0, 0)) + offset += record + out_stream.write('\x00\x00') - - - - return header diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index dd87394507..62c07c3d04 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -55,3 +55,4 @@ class TXTOutput(OutputFormatPlugin): if close: out_stream.close() + From e80fcc13fcffef68f7eccb7d0f135f08dce91f12 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 25 Apr 2009 12:22:39 -0700 Subject: [PATCH 141/319] More miscellaneous fixes --- src/calibre/ebooks/oeb/base.py | 5 ++++- src/calibre/ebooks/oeb/transforms/structure.py | 1 + src/calibre/gui2/dialogs/metadata_single.py | 7 +++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 33bb44840b..9d8598c766 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -941,7 +941,10 @@ class Manifest(object): href = urlunparse(purl) path, frag = urldefrag(href) if not path: - return '#'.join((self.href, frag)) + if frag: + return '#'.join((self.href, frag)) + else: + return self.href if '/' not in self.href: return href dirname = os.path.dirname(self.href) diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 197a265139..605cdaa7cf 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -102,6 +102,7 @@ class DetectStructure(object): play_order=self.oeb.toc.next_play_order()) + def elem_to_link(self, item, elem, counter): text 
= u' '.join([t.strip() for t in elem.xpath('descendant::text()')]) text = text[:100].strip() diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index e3e2080cc0..4d5471caf0 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -159,9 +159,12 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): row = self.formats.currentRow() fmt = self.formats.item(row) if fmt is None: - error_dialog(self, _('No format selected'), + if self.formats.count() == 1: + fmt = self.formats.item(0) + if fmt is None: + error_dialog(self, _('No format selected'), _('No format selected')).exec_() - return + return ext = fmt.ext.lower() if fmt.path is None: stream = self.db.format(self.row, ext, as_file=True) From 6ee829ff794bba0820f6c21ed44d62760df5eca3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 25 Apr 2009 15:50:52 -0400 Subject: [PATCH 142/319] ereader output work --- src/calibre/ebooks/conversion/preprocess.py | 8 ++++---- src/calibre/ebooks/pdb/ereader/pmlconverter.py | 11 ++++++++--- src/calibre/ebooks/pdb/ereader/reader.py | 2 +- src/calibre/ebooks/pdb/ereader/writer.py | 4 ++-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 0421534f65..43f1f619d0 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -27,9 +27,9 @@ def chap_head(match): chap = match.group('chap') title = match.group('title') if not title: - return '

    '+chap+'


    ' + return '

    '+chap+'


    \n' else: - return '

    '+chap+'
    '+title+'


    ' + return '

    '+chap+'
    \n'+title+'


    \n' def wrap_lines(match): ital = match.group('ital') @@ -121,7 +121,7 @@ class HTMLPreProcessor(object): # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics - (re.compile(r'(?'), lambda match: ' '), + (re.compile(u'(?'), lambda match: ' '), (re.compile(r'(?=\w)'), lambda match: ' '), ] @@ -161,7 +161,7 @@ class HTMLPreProcessor(object): elif self.is_pdftohtml(html): line_length_rules = [ # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), ] rules = self.PDFTOHTML + line_length_rules diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index 88c841b81f..a9c9d2f7a7 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -50,6 +50,7 @@ PML_HTML_RULES = [ # eReader files are one paragraph per line. # This forces the lines to wrap properly. (re.compile('^(?P.+)$', re.MULTILINE), lambda match: '

    %s

    ' % match.group('text')), + (re.compile('

    [ ]*

    '), lambda match: ''), # Remove unmatched plm codes. (re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''), @@ -82,7 +83,7 @@ HTML_PML_RULES = [ (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), #(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))), + (re.compile('.+?)".*?>(.*?)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))), #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), @@ -93,6 +94,8 @@ HTML_PML_RULES = [ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), + (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), @@ -108,8 +111,8 @@ HTML_PML_RULES = [ (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), - (re.compile('
    '), lambda match: '\\p'), - (re.compile('
    '), lambda match: '\\p'), + (re.compile('
    '), lambda match: '\n'), + (re.compile('
    '), lambda match: '\n'), # Remove remaining HTML tags (re.compile('<.*?>'), lambda match: ''), @@ -119,6 +122,8 @@ HTML_PML_RULES = [ # Remove whitespace on empty lines (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''), + # Remove excess whitespace in lines + (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '), # Remove excess newlines at the beginning and end (re.compile('^(\r\n){1,}'), lambda match: ''), diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index c6f520ecb2..e0953753f4 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -154,7 +154,7 @@ class Reader(FormatReader): for i in images: manifest.append((os.path.join('images/', i), None)) - + opf.create_manifest(manifest) opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 1605e15f32..cc90b41fb6 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -39,7 +39,7 @@ class Writer(object): pml_pages = [] for page in pages: - pml_pages.append(zlib.compress(html_to_pml(unicode(page)))) + pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8'))) return pml_pages @@ -67,7 +67,7 @@ class Writer(object): image_items = the number of images ''' version = 10 - non_text_offset = text_items + non_text_offset = text_items + 1 if image_items > 0: image_data_offset = text_items + 1 From e7ec12575d51ad4e2a645fdb74295e5d4cbc0058 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 25 Apr 2009 16:57:29 -0400 Subject: [PATCH 143/319] ereader writer working --- src/calibre/ebooks/pdb/ereader/pmlconverter.py | 4 ++-- src/calibre/ebooks/pdb/ereader/reader.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py 
b/src/calibre/ebooks/pdb/ereader/pmlconverter.py index a9c9d2f7a7..391f70a504 100644 --- a/src/calibre/ebooks/pdb/ereader/pmlconverter.py +++ b/src/calibre/ebooks/pdb/ereader/pmlconverter.py @@ -39,7 +39,7 @@ PML_HTML_RULES = [ (re.compile(r'\\k(?P.+?)\\k', re.DOTALL), lambda match: '%s' % match.group('text')), (re.compile(r'\\a(?P\d\d\d)'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), - (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % match.group('name')), + (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
    ' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), @@ -83,7 +83,7 @@ HTML_PML_RULES = [ (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), #(re.compile('.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), - (re.compile('.+?)".*?>(.*?)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))), + (re.compile('.+?)".*?>(.*?)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))), #(re.compile('&#(?P\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), (re.compile('&#(?P\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index e0953753f4..d36e01ed69 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -76,7 +76,7 @@ class Reader(FormatReader): if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: return 'empty', '' data = self.section_data(number) - name = data[4:4+32].strip('\0') + name = data[4:4+32].strip('\x00') img = data[62:] return name, img @@ -97,7 +97,7 @@ class Reader(FormatReader): if not os.path.exists(output_dir): os.makedirs(output_dir) - html = '' + html = u'' for i in range(1, self.header_record.num_text_pages + 1): self.log.debug('Extracting text page %i' % i) @@ -110,8 +110,7 @@ class Reader(FormatReader): self.log.debug('Extracting footnote page %i' % i) html += '
    ' html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) - html += '
    ' - + html += '' if self.header_record.sidebar_rec > 0: html += '

    %s

    ' % _('Sidebar') @@ -127,7 +126,8 @@ class Reader(FormatReader): with CurrentDir(output_dir): with open('index.html', 'wb') as index: self.log.debug('Writing text to index.html') - index.write(html.encode('utf-8')) + index.write(html) +# print html if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) From 0d07ad2610b8b58d237075392353fb35e45d2ae7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 25 Apr 2009 14:12:23 -0700 Subject: [PATCH 144/319] Strip 0 bytes from HTML before parsing --- src/calibre/ebooks/conversion/preprocess.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index fb55ee74fb..42e6654127 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,9 +26,9 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') - if not title: + if not title: return '

    '+chap+'


    ' - else: + else: return '

    '+chap+'
    '+title+'


    ' @@ -49,19 +49,19 @@ def line_length(raw, percent): total = sum(lengths) avg = total / len(lengths) max_line = avg * 2 - + lengths = sorted(lengths) for i in range(len(lengths) - 1, -1, -1): if lengths[i] > max_line: del lengths[i] - + if percent > 1: percent = 1 if percent < 0: percent = 0 index = int(len(lengths) * percent) - 1 - + return lengths[index] @@ -110,17 +110,17 @@ class HTMLPreProcessor(object): # Remove non breaking spaces (re.compile(ur'\u00a0'), lambda match : ' '), - + # Detect Chapters to match default XPATH in GUI (re.compile(r'(]*>)?(]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), - + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), - + # Un wrap lines (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '), - + # Clean up spaces (re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -157,6 +157,7 @@ class HTMLPreProcessor(object): def __call__(self, html, remove_special_chars=None): if remove_special_chars is not None: html = remove_special_chars.sub('', html) + html = html.replace('\0', '') if self.is_baen(html): rules = [] elif self.is_book_designer(html): @@ -166,7 +167,7 @@ class HTMLPreProcessor(object): #line_length_rules = [ # (re.compile('%i' % line_length(html, .85)), lambda match:) #] - + rules = self.PDFTOHTML # + line_length_rules else: rules = [] From d253544a1f311aa692e78e5ff333af6d870fece3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 25 Apr 2009 14:38:23 -0700 Subject: [PATCH 145/319] Implement a 
--page-breaks-before option --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 8 ++++++++ src/calibre/ebooks/oeb/transforms/structure.py | 8 ++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index ae0af532ab..e12686a36c 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -128,7 +128,7 @@ def add_pipeline_options(parser, plumber): [ 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', - 'insert_comments', + 'insert_comments', 'page_breaks_before', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index f55d677d08..da9c9f11e2 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -227,6 +227,14 @@ OptionRecommendation(name='extra_css', 'rules.') ), +OptionRecommendation(name='page_breaks_before', + recommended_value="//*[name()='h1' or name()='h2']", + level=OptionRecommendation.LOW, + help=_('An XPath expression. Page breaks are inserted ' + 'before the specified elements.') + ), + + OptionRecommendation(name='margin_top', recommended_value=5.0, level=OptionRecommendation.LOW, help=_('Set the top margin in pts. 
Default is %default')), diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 605cdaa7cf..8ec3c7737a 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -45,6 +45,14 @@ class DetectStructure(object): if not node.title or regexp.search(node.title) is not None: self.oeb.toc.remove(node) + if opts.page_breaks_before is not None: + pb_xpath = XPath(opts.page_breaks_before) + for item in oeb.spine: + for elem in pb_xpath(item.data): + style = elem.get('style', '') + if style: + style += '; ' + elem.set('style', style+'page-break-before:always') def detect_chapters(self): self.detected_chapters = [] From 1daf7bd86a950e7b21676341098d8c3f01e00f39 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 25 Apr 2009 19:57:21 -0400 Subject: [PATCH 146/319] ereader writer changes --- src/calibre/ebooks/conversion/preprocess.py | 7 ----- src/calibre/ebooks/pdb/ereader/writer.py | 34 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 230d759755..dad77ea3aa 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,17 +26,10 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') -<<<<<<< TREE if not title: return '<h1>'+chap+'</h1><br/>\n' else: return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n' -======= - if not title: - return '<h1>'+chap+'</h1><br/>' - else: - return '<h1>'+chap+'<br/>'+title+'</h1><br/>' ->>>>>>> MERGE-SOURCE def wrap_lines(match): ital = match.group('ital') diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index cc90b41fb6..65eb35157e 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py 
@@ -6,6 +6,8 @@ Write content to ereader pdb file. import struct, zlib +import Image, cStringIO + from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.ereader import image_name @@ -52,14 +54,22 @@ class Writer(object): image += image_name(item.href) image = image.ljust(62, '\x00') - image += item.data - images.append(image) + im = Image.open(cStringIO.StringIO(item.data)) + + data = cStringIO.StringIO() + im.save(data, 'PNG') + data = data.getvalue() + + image += data + + if len(image) < 65505: + images.append(image) return images def _metadata(self, metadata): - return '' + return '\x00\x00\x00\x00\x00' def _header_record(self, text_items, image_items): ''' @@ -72,22 +82,36 @@ class Writer(object): if image_items > 0: image_data_offset = text_items + 1 meta_data_offset = image_data_offset + image_items + last_data_offset = meta_data_offset + 1 else: meta_data_offset = text_items + 1 - image_data_offset = meta_data_offset + last_data_offset = meta_data_offset + 1 + image_data_offset = last_data_offset record = u'' # Version record += struct.pack('>H', version) record = record.ljust(12, '\x00') + # Non-text offset, everything between record 0 and non_text_offset is text pages record += struct.pack('>H', non_text_offset) + record = record.ljust(28, '\x00') + # Footnote and Sidebar rec + record += struct.pack('>H', 0) + record += struct.pack('>H', 0) + record += struct.pack('>H', last_data_offset) record = record.ljust(40, '\x00') + # image pages record += struct.pack('>H', image_data_offset) record = record.ljust(44, '\x00') + # metadata string record += struct.pack('>H', meta_data_offset) + record = record.ljust(48, '\x00') + # footnote and sidebar offsets + record += struct.pack('>H', last_data_offset) + record += struct.pack('>H', last_data_offset) record = record.ljust(52, '\x00') - record += struct.pack('>H', meta_data_offset) + record += struct.pack('>H', last_data_offset) return record 
From ccdb99299247d83cf1a540cd15e41d68a6d6a7c1 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 26 Apr 2009 17:09:23 -0400 Subject: [PATCH 147/319] Fix bug 2342. ereader inspector script to aid in implementing writer. ereader writer tweaks. --- src/calibre/ebooks/conversion/preprocess.py | 12 +++ src/calibre/ebooks/pdb/ereader/inspector.py | 87 +++++++++++++++++++++ src/calibre/ebooks/pdb/ereader/writer.py | 35 ++++++++- 3 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 src/calibre/ebooks/pdb/ereader/inspector.py diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index dad77ea3aa..9bfe6d4255 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -95,6 +95,18 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ + # Fix umlauts + (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'), + (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'), + (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'), + (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'), + (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'), + (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'), + (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'), + (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'), + (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'), + (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'), + # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), # Remove <hr> tags diff --git a/src/calibre/ebooks/pdb/ereader/inspector.py b/src/calibre/ebooks/pdb/ereader/inspector.py new file mode 100644 index 0000000000..a3875daad4 --- /dev/null +++ b/src/calibre/ebooks/pdb/ereader/inspector.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +''' 
+Inspect the header of ereader files. This is primarily used for debugging. +''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import struct, sys + +from calibre.ebooks.pdb.header import PdbHeaderReader +from calibre.ebooks.pdb.ereader.reader import HeaderRecord + +def pdb_header_info(header): + print 'PDB Header Info:' + print '' + print 'Identity: %s' % header.ident + print 'Total Sectons: %s' % header.num_sections + print 'Title: %s' % header.title + print '' + +def ereader_header_info(header): + h0 = header.section_data(0) + + print 'Ereader Record 0 (Header) Info:' + print '' + print '0-2 Version: %i' % struct.unpack('>H', h0[0:2])[0] + print '2-4: %i' % struct.unpack('>H', h0[2:4])[0] + print '4-6: %i' % struct.unpack('>H', h0[4:6])[0] + print '6-8: %i' % struct.unpack('>H', h0[6:8])[0] + print '8-10: %i' % struct.unpack('>H', h0[8:10])[0] + print '10-12: %i' % struct.unpack('>H', h0[10:12])[0] + print '12-14 Non-Text: %i' % struct.unpack('>H', h0[12:14])[0] + print '14-16: %i' % struct.unpack('>H', h0[14:16])[0] + print '16-18: %i' % struct.unpack('>H', h0[16:18])[0] + print '18-20: %i' % struct.unpack('>H', h0[18:20])[0] + print '20-22: %i' % struct.unpack('>H', h0[20:22])[0] + print '22-24: %i' % struct.unpack('>H', h0[22:24])[0] + print '24-26: %i' % struct.unpack('>H', h0[24:26])[0] + print '26-28: %i' % struct.unpack('>H', h0[26:28])[0] + print '28-30 footnote_rec: %i' % struct.unpack('>H', h0[28:30])[0] + print '30-32 sidebar_rec: %i' % struct.unpack('>H', h0[30:32])[0] + print '32-34 bookmark_offset: %i' % struct.unpack('>H', h0[32:34])[0] + print '34-36: %i' % struct.unpack('>H', h0[34:36])[0] + print '36-38: %i' % struct.unpack('>H', h0[36:38])[0] + print '38-40: %i' % struct.unpack('>H', h0[38:40])[0] + print '40-42 image_data_offset: %i' % struct.unpack('>H', h0[40:42])[0] + print '42-44: %i' % struct.unpack('>H', h0[42:44])[0] + print '44-46 metadata_offset: %i' % 
struct.unpack('>H', h0[44:46])[0] + print '46-48: %i' % struct.unpack('>H', h0[46:48])[0] + print '48-50 footnote_offset: %i' % struct.unpack('>H', h0[48:50])[0] + print '50-52 sidebar_offset: %i' % struct.unpack('>H', h0[50:52])[0] + print '52-54 last_data_offset: %i' % struct.unpack('>H', h0[52:54])[0] + + print '' + +def section_lengths(header): + print 'Section Sizes' + print '' + + for i in range(0, header.section_count()): + size = len(header.section_data(i)) + if size > 65505: + message = '<--- Over!' + else: + message = '' + + print 'Section %i: %i %s' % (i, size, message) + +def main(args=sys.argv): + if len(args) < 2: + print 'Error: requires input file.' + return 1 + + f = open(sys.argv[1], 'rb') + + pheader = PdbHeaderReader(f) + + pdb_header_info(pheader) + ereader_header_info(pheader) + section_lengths(pheader) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 65eb35157e..b831849488 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -27,7 +27,7 @@ class Writer(object): hr = [self._header_record(len(text), len(images))] - sections = hr+text+images+metadata + sections = hr+text+images+metadata+['MeTaInFo\x00'] lengths = [len(i) for i in sections] @@ -82,7 +82,7 @@ class Writer(object): if image_items > 0: image_data_offset = text_items + 1 meta_data_offset = image_data_offset + image_items - last_data_offset = meta_data_offset + 1 + last_data_offset = meta_data_offset + 2 else: meta_data_offset = text_items + 1 last_data_offset = meta_data_offset + 1 @@ -90,6 +90,35 @@ class Writer(object): record = u'' + record += struct.pack('>H', version) # [0:2] + record += struct.pack('>H', 0) # [2:4] + record += struct.pack('>H', 0) # [4:6] + record += struct.pack('>H', 25152) # [6:8] # 25152 is MAGIC + record += struct.pack('>H', last_data_offset) # [8:10] + record += struct.pack('>H', 
last_data_offset) # [10:12] + record += struct.pack('>H', non_text_offset) # [12:14] # non_text_offset + record += struct.pack('>H', non_text_offset) # [14:16] + record += struct.pack('>H', 1) # [16:18] + record += struct.pack('>H', 1) # [18:20] + record += struct.pack('>H', 0) # [20:22] + record += struct.pack('>H', 1) # [22:24] + record += struct.pack('>H', 1) # [24:26] + record += struct.pack('>H', 0) # [26:28] + record += struct.pack('>H', 0) # [28:30] # footnote_rec + record += struct.pack('>H', 0) # [30:32] # sidebar_rec + record += struct.pack('>H', last_data_offset) # [32:34] # bookmark_offset + record += struct.pack('>H', 2560) # [34:36] # 2560 is MAGIC + record += struct.pack('>H', non_text_offset) # [36:38] + record += struct.pack('>H', non_text_offset + 1) # [38:40] + record += struct.pack('>H', image_data_offset) # [40:42] + record += struct.pack('>H', image_data_offset) # [42:44] + record += struct.pack('>H', meta_data_offset) # [44:46] + record += struct.pack('>H', meta_data_offset) # [46:48] + record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset + record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset + record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset + + ''' # Version record += struct.pack('>H', version) record = record.ljust(12, '\x00') @@ -112,6 +141,6 @@ class Writer(object): record += struct.pack('>H', last_data_offset) record = record.ljust(52, '\x00') record += struct.pack('>H', last_data_offset) - + ''' return record From 3938bbdaebfd62cdc53009208349edc59c66485d Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 26 Apr 2009 22:43:12 -0700 Subject: [PATCH 148/319] Plugin for comic input --- src/calibre/customize/builtins.py | 3 +- src/calibre/customize/conversion.py | 5 + src/calibre/customize/profiles.py | 17 +- .../ebooks/{lrf => }/comic/__init__.py | 0 src/calibre/ebooks/comic/input.py | 454 ++++++++++++++ src/calibre/ebooks/conversion/cli.py | 2 +- 
src/calibre/ebooks/lrf/comic/convert_from.py | 562 ------------------ src/calibre/ebooks/oeb/iterator.py | 2 + src/calibre/libunzip.py | 12 +- src/calibre/parallel.py | 76 +-- 10 files changed, 524 insertions(+), 609 deletions(-) rename src/calibre/ebooks/{lrf => }/comic/__init__.py (100%) create mode 100755 src/calibre/ebooks/comic/input.py delete mode 100755 src/calibre/ebooks/lrf/comic/convert_from.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index bf70a828a9..9a686e0d94 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -286,6 +286,7 @@ from calibre.ebooks.fb2.input import FB2Input from calibre.ebooks.odt.input import ODTInput from calibre.ebooks.rtf.input import RTFInput from calibre.ebooks.html.input import HTMLInput +from calibre.ebooks.comic.input import ComicInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.txt.output import TXTOutput @@ -294,7 +295,7 @@ from calibre.ebooks.pdb.ereader.output import EREADEROutput from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, - TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, + TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, ODTInput, RTFInput, EPUBOutput, EREADEROutput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index b334816adf..c358986d18 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -96,6 +96,11 @@ class InputFormatPlugin(Plugin): #: For example: ``set(['azw', 'mobi', 'prc'])`` file_types = set([]) + #: If True, this input plugin generates a collection of images, + #: one per HTML file. 
You can obtain access to the images via + #: convenience method, :method:`get_image_collection`. + is_image_collection = False + #: Options shared by all Input format plugins. Do not override #: in sub-classes. Use :member:`options` instead. Every option must be an #: instance of :class:`OptionRecommendation`. diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 67dd920135..f60f7b5e7b 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -148,6 +148,8 @@ class OutputProfile(Plugin): remove_special_chars = re.compile(u'[\u200b\u00ad]') # ADE falls to the ground in a dead faint when it sees an <object> remove_object_tags = True + # The image size for comics + comic_screen_size = (584, 754) class SonyReaderOutput(OutputProfile): @@ -162,6 +164,18 @@ class SonyReaderOutput(OutputProfile): fbase = 12 fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24] +class SonyReaderLandscapeOutput(SonyReaderOutput): + + name = 'Sony Reader Landscape' + short_name = 'sony-landscape' + description = _('This profile is intended for the SONY PRS line. ' + 'The 500/505/700 etc, in landscape mode. 
Mainly useful ' + 'for comics.') + + screen_size = (784, 1012) + comic_screen_size = (784, 1012) + + class MSReaderOutput(OutputProfile): name = 'Microsoft Reader' @@ -223,4 +237,5 @@ class KindleOutput(OutputProfile): fsizes = [12, 12, 14, 16, 18, 20, 22, 24] output_profiles = [OutputProfile, SonyReaderOutput, MSReaderOutput, - MobipocketOutput, HanlinV3Output, CybookG3Output, KindleOutput] + MobipocketOutput, HanlinV3Output, CybookG3Output, KindleOutput, + SonyReaderLandscapeOutput] diff --git a/src/calibre/ebooks/lrf/comic/__init__.py b/src/calibre/ebooks/comic/__init__.py similarity index 100% rename from src/calibre/ebooks/lrf/comic/__init__.py rename to src/calibre/ebooks/comic/__init__.py diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py new file mode 100755 index 0000000000..f6d6557ee4 --- /dev/null +++ b/src/calibre/ebooks/comic/input.py @@ -0,0 +1,454 @@ +from __future__ import with_statement +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Based on ideas from comiclrf created by FangornUK. +''' + +import os, shutil, traceback, textwrap + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre import extract, CurrentDir +from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.parallel import Server, ParallelJob + +def extract_comic(path_to_comic_file): + ''' + Un-archive the comic file. + ''' + tdir = PersistentTemporaryDirectory(suffix='_comic_extract') + extract(path_to_comic_file, tdir) + return tdir + +def find_pages(dir, sort_on_mtime=False, verbose=False): + ''' + Find valid comic pages in a previously un-archived comic. + + :param dir: Directory in which extracted comic lives + :param sort_on_mtime: If True sort pages based on their last modified time. + Otherwise, sort alphabetically. 
+ ''' + extensions = ['jpeg', 'jpg', 'gif', 'png'] + pages = [] + for datum in os.walk(dir): + for name in datum[-1]: + path = os.path.join(datum[0], name) + for ext in extensions: + if path.lower().endswith('.'+ext): + pages.append(path) + break + if sort_on_mtime: + comparator = lambda x, y : cmp(os.stat(x).st_mtime, os.stat(y).st_mtime) + else: + comparator = lambda x, y : cmp(os.path.basename(x), os.path.basename(y)) + + pages.sort(cmp=comparator) + if verbose: + print 'Found comic pages...' + print '\t'+'\n\t'.join([os.path.basename(p) for p in pages]) + return pages + +class PageProcessor(list): + ''' + Contains the actual image rendering logic. See :method:`render` and + :method:`process_pages`. + ''' + + def __init__(self, path_to_page, dest, opts, num): + list.__init__(self) + self.path_to_page = path_to_page + self.opts = opts + self.num = num + self.dest = dest + self.rotate = False + self.render() + + + def render(self): + import calibre.utils.PythonMagickWand as pw + img = pw.NewMagickWand() + if img < 0: + raise RuntimeError('Cannot create wand.') + if not pw.MagickReadImage(img, self.path_to_page): + raise IOError('Failed to read image from: %s'%self.path_to_page) + width = pw.MagickGetImageWidth(img) + height = pw.MagickGetImageHeight(img) + if self.num == 0: # First image so create a thumbnail from it + thumb = pw.CloneMagickWand(img) + if thumb < 0: + raise RuntimeError('Cannot create wand.') + pw.MagickThumbnailImage(thumb, 60, 80) + pw.MagickWriteImage(thumb, os.path.join(self.dest, 'thumbnail.png')) + pw.DestroyMagickWand(thumb) + self.pages = [img] + if width > height: + if self.opts.landscape: + self.rotate = True + else: + split1, split2 = map(pw.CloneMagickWand, (img, img)) + pw.DestroyMagickWand(img) + if split1 < 0 or split2 < 0: + raise RuntimeError('Cannot create wand.') + pw.MagickCropImage(split1, (width/2)-1, height, 0, 0) + pw.MagickCropImage(split2, (width/2)-1, height, width/2, 0 ) + self.pages = [split2, split1] if 
self.opts.right2left else [split1, split2] + self.process_pages() + + def process_pages(self): + import calibre.utils.PythonMagickWand as p + for i, wand in enumerate(self.pages): + pw = p.NewPixelWand() + try: + if pw < 0: + raise RuntimeError('Cannot create wand.') + p.PixelSetColor(pw, 'white') + + p.MagickSetImageBorderColor(wand, pw) + if self.rotate: + p.MagickRotateImage(wand, pw, -90) + + # 25 percent fuzzy trim? + if not self.opts.disable_trim: + p.MagickTrimImage(wand, 25*65535/100) + p.MagickSetImagePage(wand, 0,0,0,0) #Clear page after trim, like a "+repage" + # Do the Photoshop "Auto Levels" equivalent + if not self.opts.dont_normalize: + p.MagickNormalizeImage(wand) + sizex = p.MagickGetImageWidth(wand) + sizey = p.MagickGetImageHeight(wand) + + SCRWIDTH, SCRHEIGHT = self.opts.output_profile.comic_screen_size + + if self.opts.keep_aspect_ratio: + # Preserve the aspect ratio by adding border + aspect = float(sizex) / float(sizey) + if aspect <= (float(SCRWIDTH) / float(SCRHEIGHT)): + newsizey = SCRHEIGHT + newsizex = int(newsizey * aspect) + deltax = (SCRWIDTH - newsizex) / 2 + deltay = 0 + else: + newsizex = SCRWIDTH + newsizey = int(newsizex / aspect) + deltax = 0 + deltay = (SCRHEIGHT - newsizey) / 2 + p.MagickResizeImage(wand, newsizex, newsizey, p.CatromFilter, 1.0) + p.MagickSetImageBorderColor(wand, pw) + p.MagickBorderImage(wand, pw, deltax, deltay) + elif self.opts.wide: + # Keep aspect and Use device height as scaled image width so landscape mode is clean + aspect = float(sizex) / float(sizey) + screen_aspect = float(SCRWIDTH) / float(SCRHEIGHT) + # Get dimensions of the landscape mode screen + # Add 25px back to height for the battery bar. 
+ wscreenx = SCRHEIGHT + 25 + wscreeny = int(wscreenx / screen_aspect) + if aspect <= screen_aspect: + newsizey = wscreeny + newsizex = int(newsizey * aspect) + deltax = (wscreenx - newsizex) / 2 + deltay = 0 + else: + newsizex = wscreenx + newsizey = int(newsizex / aspect) + deltax = 0 + deltay = (wscreeny - newsizey) / 2 + p.MagickResizeImage(wand, newsizex, newsizey, p.CatromFilter, 1.0) + p.MagickSetImageBorderColor(wand, pw) + p.MagickBorderImage(wand, pw, deltax, deltay) + else: + p.MagickResizeImage(wand, SCRWIDTH, SCRHEIGHT, p.CatromFilter, 1.0) + + if not self.opts.dont_sharpen: + p.MagickSharpenImage(wand, 0.0, 1.0) + + p.MagickSetImageType(wand, p.GrayscaleType) + + if self.opts.despeckle: + p.MagickDespeckleImage(wand) + + p.MagickQuantizeImage(wand, self.opts.colors, p.RGBColorspace, 0, 1, 0) + dest = '%d_%d.png'%(self.num, i) + dest = os.path.join(self.dest, dest) + p.MagickWriteImage(wand, dest+'8') + os.rename(dest+'8', dest) + self.append(dest) + finally: + if pw > 0: + p.DestroyPixelWand(pw) + p.DestroyMagickWand(wand) + +def render_pages(tasks, dest, opts, notification=None): + ''' + Entry point for the job server. + ''' + failures, pages = [], [] + from calibre.utils.PythonMagickWand import ImageMagick + with ImageMagick(): + for num, path in tasks: + try: + pages.extend(PageProcessor(path, dest, opts, num)) + msg = _('Rendered %s') + except: + failures.append(path) + msg = _('Failed %s') + if opts.verbose: + msg += '\n' + traceback.format_exc() + msg = msg%path + if notification is not None: + notification(0.5, msg) + + return pages, failures + + +class JobManager(object): + ''' + Simple job manager responsible for keeping track of overall progress. 
+ ''' + + def __init__(self, total, update): + self.total = total + self.update = update + self.done = 0 + self.add_job = lambda j: j + self.output = lambda j: j + self.start_work = lambda j: j + self.job_done = lambda j: j + + def status_update(self, job): + self.done += 1 + #msg = msg%os.path.basename(job.args[0]) + self.update(float(self.done)/self.total, job.msg) + +def process_pages(pages, opts, update, tdir): + ''' + Render all identified comic pages. + ''' + from calibre.utils.PythonMagickWand import ImageMagick + ImageMagick + + job_manager = JobManager(len(pages), update) + server = Server() + jobs = [] + tasks = server.split(pages) + for task in tasks: + jobs.append(ParallelJob('render_pages', lambda s:s, job_manager=job_manager, + args=[task, tdir, opts])) + server.add_job(jobs[-1]) + server.wait() + server.killall() + server.close() + ans, failures = [], [] + + for job in jobs: + if job.result is None: + raise Exception(_('Failed to process comic: %s\n\n%s')%(job.exception, job.traceback)) + pages, failures_ = job.result + ans += pages + failures += failures_ + return ans, failures + + +class ComicInput(InputFormatPlugin): + + name = 'Comic Input' + author = 'Kovid Goyal' + description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices' + file_types = set(['cbz', 'cbr', 'cbc']) + is_image_collection = True + + options = set([ + OptionRecommendation(name='colors', recommended_value=64, + help=_('Number of colors for grayscale image conversion. Default: %default')), + OptionRecommendation(name='dont_normalize', recommended_value=False, + help=_('Disable normalize (improve contrast) color range ' + 'for pictures. Default: False')), + OptionRecommendation(name='keep_aspect_ratio', recommended_value=False, + help=_('Maintain picture aspect ratio. 
Default is to fill the screen.')), + OptionRecommendation(name='dont_sharpen', recommended_value=False, + help=_('Disable sharpening.')), + OptionRecommendation(name='disable_trim', recommended_value=False, + help=_('Disable trimming of comic pages. For some comics, ' + 'trimming might remove content as well as borders.')), + OptionRecommendation(name='landscape', recommended_value=False, + help=_("Don't split landscape images into two portrait images")), + OptionRecommendation(name='wide', recommended_value=False, + help=_("Keep aspect ratio and scale image using screen height as " + "image width for viewing in landscape mode.")), + OptionRecommendation(name='right2left', recommended_value=False, + help=_('Used for right-to-left publications like manga. ' + 'Causes landscape pages to be split into portrait pages ' + 'from right to left.')), + OptionRecommendation(name='despeckle', recommended_value=False, + help=_('Enable Despeckle. Reduces speckle noise. ' + 'May greatly increase processing time.')), + OptionRecommendation(name='no_sort', recommended_value=False, + help=_("Don't sort the files found in the comic " + "alphabetically by name. 
Instead use the order they were " + "added to the comic.")), + OptionRecommendation(name='no_process', recommended_value=False, + help=_("Apply no processing to the image")), + ]) + + recommendations = set([ + ('margin_left', 0, OptionRecommendation.HIGH), + ('margin_top', 0, OptionRecommendation.HIGH), + ('margin_right', 0, OptionRecommendation.HIGH), + ('margin_bottom', 0, OptionRecommendation.HIGH), + ('insert_blank_line', False, OptionRecommendation.HIGH), + ('remove_paragraph_spacing', False, OptionRecommendation.HIGH), + ('dont_justify', True, OptionRecommendation.HIGH), + ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH), + ('chapter', None, OptionRecommendation.HIGH), + ('page_breaks_before', None, OptionRecommendation.HIGH), + ('use_auto_toc', False, OptionRecommendation.HIGH), + ]) + + def get_comics_from_collection(self, stream): + from calibre.libunzip import extract as zipextract + tdir = PersistentTemporaryDirectory('_comic_collection') + zipextract(stream, tdir) + comics = [] + with CurrentDir(tdir): + if not os.path.exists('comics.txt'): + raise ValueError('%s is not a valid comic collection' + %stream.name) + for line in open('comics.txt', + 'rb').read().decode('utf-8').splitlines(): + fname, title = line.partition(':')[0], line.partition(':')[-1] + fname = os.path.join(tdir, *fname.split('/')) + if not title: + title = os.path.basename(fname).rpartition('.')[0] + if os.access(fname, os.R_OK): + comics.append([title, fname]) + if not comics: + raise ValueError('%s has no comics'%stream.name) + return comics + + def get_pages(self, comic, tdir2): + tdir = extract_comic(comic) + new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort, + verbose=self.opts.verbose) + thumbnail = None + if not new_pages: + raise ValueError('Could not find any pages in the comic: %s' + %comic) + if self.opts.no_process: + n2 = [] + for page in new_pages: + n2.append(os.path.join(tdir2, os.path.basename(page))) + shutil.copyfile(page, n2[-1]) + new_pages 
= n2 + else: + new_pages, failures = process_pages(new_pages, self.opts, + self.progress, tdir2) + if not new_pages: + raise ValueError('Could not find any valid pages in comic: %s' + % comic) + if failures: + self.log.warning('Could not process the following pages ' + '(run with --verbose to see why):') + for f in failures: + self.log.warning('\t', f) + thumbnail = os.path.join(tdir2, 'thumbnail.png') + if not os.access(thumbnail, os.R_OK): + thumbnail = None + return new_pages + + def convert(self, stream, opts, file_ext, log, accelerators, + progress=lambda p, m : m): + from calibre.ebooks.metadata import MetaInformation + from calibre.ebooks.metadata.opf2 import OPFCreator + from calibre.ebooks.metadata.toc import TOC + + self.opts, self.log, self.progress = opts, log, progress + if file_ext == 'cbc': + comics_ = self.get_comics_from_collection(stream) + else: + comics_ = [['Comic', os.path.abspath(stream.name)]] + stream.close() + comics = [] + for i, x in enumerate(comics_): + title, fname = x + cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.' 
+ cdir = os.path.abspath(cdir) + if not os.path.exists(cdir): + os.makedirs(cdir) + pages = self.get_pages(fname, cdir) + if not pages: continue + wrappers = self.create_wrappers(pages) + comics.append((title, pages, wrappers)) + + if not comics: + raise ValueError('No comic pages found in %s'%stream.name) + + mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0], + [_('Unknown')]) + opf = OPFCreator(os.path.abspath('.'), mi) + entries = [] + + def href(x): + if len(comics) == 1: return os.path.basename(x) + return '/'.join(x.split(os.sep)[-2:]) + + for comic in comics: + pages, wrappers = comic[1:] + entries += [(w, None) for w in map(href, wrappers)] + \ + [(x, None) for x in map(href, pages)] + opf.create_manifest(entries) + spine = [] + for comic in comics: + spine.extend(map(href, comic[2])) + opf.create_spine(spine) + toc = TOC() + if len(comics) == 1: + wrappers = comics[0][2] + for i, x in enumerate(wrappers): + toc.add_item(href(x), None, _('Page')+' %d'%(i+1), + play_order=i) + else: + po = 0 + for comic in comics: + po += 1 + wrappers = comic[2] + stoc = toc.add_item(href(wrappers[0]), + None, comic[0], play_order=po) + for i, x in enumerate(wrappers): + stoc.add_item(href(x), None, + _('Page')+' %d'%(i+1), play_order=po) + po += 1 + opf.set_toc(toc) + m, n = open('metadata.opf', 'wb'), open('toc.ncx', 'wb') + opf.render(m, n, 'toc.ncx') + return os.path.abspath('metadata.opf') + + def create_wrappers(self, pages): + from calibre.ebooks.oeb.base import XHTML_NS + wrappers = [] + WRAPPER = textwrap.dedent('''\ + <html xmlns="%s"> + <head> + <title>Page #%d + + + +
    + comic page #%d +
    + + + ''') + dir = os.path.dirname(pages[0]) + for i, page in enumerate(pages): + wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1) + page = os.path.join(dir, 'page_%d.xhtml'%(i+1)) + open(page, 'wb').write(wrapper) + wrappers.append(page) + return wrappers + diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index e12686a36c..941a1ec5fc 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -47,7 +47,7 @@ def print_help(parser, log): def check_command_line_options(parser, args, log): if len(args) < 3 or args[1].startswith('-') or args[2].startswith('-'): - print_help(parser) + print_help(parser, log) log.error('\n\nYou must specify the input AND output files') raise SystemExit(1) diff --git a/src/calibre/ebooks/lrf/comic/convert_from.py b/src/calibre/ebooks/lrf/comic/convert_from.py deleted file mode 100755 index 50f5e1e72e..0000000000 --- a/src/calibre/ebooks/lrf/comic/convert_from.py +++ /dev/null @@ -1,562 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Based on ideas from comiclrf created by FangornUK. 
-''' - -import os, sys, shutil, traceback, textwrap, fnmatch -from uuid import uuid4 - - - - -from calibre import extract, terminal_controller, __appname__, __version__ -from calibre.utils.config import Config, StringConfig -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.parallel import Server, ParallelJob -from calibre.utils.terminfo import ProgressBar -from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf import OPFCreator -from calibre.ebooks.epub.from_html import config as html2epub_config, convert as html2epub -from calibre.customize.ui import run_plugins_on_preprocess -try: - from calibre.utils.PythonMagickWand import \ - NewMagickWand, NewPixelWand, \ - MagickSetImageBorderColor, \ - MagickReadImage, MagickRotateImage, \ - MagickTrimImage, PixelSetColor,\ - MagickNormalizeImage, MagickGetImageWidth, \ - MagickGetImageHeight, \ - MagickResizeImage, MagickSetImageType, \ - GrayscaleType, CatromFilter, MagickSetImagePage, \ - MagickBorderImage, MagickSharpenImage, MagickDespeckleImage, \ - MagickQuantizeImage, RGBColorspace, \ - MagickWriteImage, DestroyPixelWand, \ - DestroyMagickWand, CloneMagickWand, \ - MagickThumbnailImage, MagickCropImage, ImageMagick - _imagemagick_loaded = True -except: - _imagemagick_loaded = False - -PROFILES = { - # Name : (width, height) in pixels - 'prs500':(584, 754), - # The SONY's LRF renderer (on the PRS500) only uses the first 800x600 block of the image - 'prs500-landscape': (784, 1012) - } - -def extract_comic(path_to_comic_file): - ''' - Un-archive the comic file. - ''' - tdir = PersistentTemporaryDirectory(suffix='_comic_extract') - extract(path_to_comic_file, tdir) - return tdir - -def find_pages(dir, sort_on_mtime=False, verbose=False): - ''' - Find valid comic pages in a previously un-archived comic. 
- - :param dir: Directory in which extracted comic lives - :param sort_on_mtime: If True sort pages based on their last modified time. - Otherwise, sort alphabetically. - ''' - extensions = ['jpeg', 'jpg', 'gif', 'png'] - pages = [] - for datum in os.walk(dir): - for name in datum[-1]: - path = os.path.join(datum[0], name) - for ext in extensions: - if path.lower().endswith('.'+ext): - pages.append(path) - break - if sort_on_mtime: - comparator = lambda x, y : cmp(os.stat(x).st_mtime, os.stat(y).st_mtime) - else: - comparator = lambda x, y : cmp(os.path.basename(x), os.path.basename(y)) - - pages.sort(cmp=comparator) - if verbose: - print 'Found comic pages...' - print '\t'+'\n\t'.join([os.path.basename(p) for p in pages]) - return pages - -class PageProcessor(list): - ''' - Contains the actual image rendering logic. See :method:`render` and - :method:`process_pages`. - ''' - - def __init__(self, path_to_page, dest, opts, num): - list.__init__(self) - self.path_to_page = path_to_page - self.opts = opts - self.num = num - self.dest = dest - self.rotate = False - self.render() - - - def render(self): - img = NewMagickWand() - if img < 0: - raise RuntimeError('Cannot create wand.') - if not MagickReadImage(img, self.path_to_page): - raise IOError('Failed to read image from: %'%self.path_to_page) - width = MagickGetImageWidth(img) - height = MagickGetImageHeight(img) - if self.num == 0: # First image so create a thumbnail from it - thumb = CloneMagickWand(img) - if thumb < 0: - raise RuntimeError('Cannot create wand.') - MagickThumbnailImage(thumb, 60, 80) - MagickWriteImage(thumb, os.path.join(self.dest, 'thumbnail.png')) - DestroyMagickWand(thumb) - self.pages = [img] - if width > height: - if self.opts.landscape: - self.rotate = True - else: - split1, split2 = map(CloneMagickWand, (img, img)) - DestroyMagickWand(img) - if split1 < 0 or split2 < 0: - raise RuntimeError('Cannot create wand.') - MagickCropImage(split1, (width/2)-1, height, 0, 0) - 
MagickCropImage(split2, (width/2)-1, height, width/2, 0 ) - self.pages = [split2, split1] if self.opts.right2left else [split1, split2] - self.process_pages() - - def process_pages(self): - for i, wand in enumerate(self.pages): - pw = NewPixelWand() - try: - if pw < 0: - raise RuntimeError('Cannot create wand.') - PixelSetColor(pw, 'white') - - MagickSetImageBorderColor(wand, pw) - if self.rotate: - MagickRotateImage(wand, pw, -90) - - # 25 percent fuzzy trim? - if not self.opts.disable_trim: - MagickTrimImage(wand, 25*65535/100) - MagickSetImagePage(wand, 0,0,0,0) #Clear page after trim, like a "+repage" - # Do the Photoshop "Auto Levels" equivalent - if not self.opts.dont_normalize: - MagickNormalizeImage(wand) - sizex = MagickGetImageWidth(wand) - sizey = MagickGetImageHeight(wand) - - SCRWIDTH, SCRHEIGHT = PROFILES[self.opts.profile] - - if self.opts.keep_aspect_ratio: - # Preserve the aspect ratio by adding border - aspect = float(sizex) / float(sizey) - if aspect <= (float(SCRWIDTH) / float(SCRHEIGHT)): - newsizey = SCRHEIGHT - newsizex = int(newsizey * aspect) - deltax = (SCRWIDTH - newsizex) / 2 - deltay = 0 - else: - newsizex = SCRWIDTH - newsizey = int(newsizex / aspect) - deltax = 0 - deltay = (SCRHEIGHT - newsizey) / 2 - MagickResizeImage(wand, newsizex, newsizey, CatromFilter, 1.0) - MagickSetImageBorderColor(wand, pw) - MagickBorderImage(wand, pw, deltax, deltay) - elif self.opts.wide: - # Keep aspect and Use device height as scaled image width so landscape mode is clean - aspect = float(sizex) / float(sizey) - screen_aspect = float(SCRWIDTH) / float(SCRHEIGHT) - # Get dimensions of the landscape mode screen - # Add 25px back to height for the battery bar. 
- wscreenx = SCRHEIGHT + 25 - wscreeny = int(wscreenx / screen_aspect) - if aspect <= screen_aspect: - newsizey = wscreeny - newsizex = int(newsizey * aspect) - deltax = (wscreenx - newsizex) / 2 - deltay = 0 - else: - newsizex = wscreenx - newsizey = int(newsizex / aspect) - deltax = 0 - deltay = (wscreeny - newsizey) / 2 - MagickResizeImage(wand, newsizex, newsizey, CatromFilter, 1.0) - MagickSetImageBorderColor(wand, pw) - MagickBorderImage(wand, pw, deltax, deltay) - else: - MagickResizeImage(wand, SCRWIDTH, SCRHEIGHT, CatromFilter, 1.0) - - if not self.opts.dont_sharpen: - MagickSharpenImage(wand, 0.0, 1.0) - - MagickSetImageType(wand, GrayscaleType) - - if self.opts.despeckle: - MagickDespeckleImage(wand) - - MagickQuantizeImage(wand, self.opts.colors, RGBColorspace, 0, 1, 0) - dest = '%d_%d.png'%(self.num, i) - dest = os.path.join(self.dest, dest) - MagickWriteImage(wand, dest+'8') - os.rename(dest+'8', dest) - self.append(dest) - finally: - if pw > 0: - DestroyPixelWand(pw) - DestroyMagickWand(wand) - -def render_pages(tasks, dest, opts, notification=None): - ''' - Entry point for the job server. - ''' - failures, pages = [], [] - with ImageMagick(): - for num, path in tasks: - try: - pages.extend(PageProcessor(path, dest, opts, num)) - msg = _('Rendered %s') - except: - failures.append(path) - msg = _('Failed %s') - if opts.verbose: - msg += '\n' + traceback.format_exc() - msg = msg%path - if notification is not None: - notification(0.5, msg) - - return pages, failures - - -class JobManager(object): - ''' - Simple job manager responsible for keeping track of overall progress. 
- ''' - - def __init__(self, total, update): - self.total = total - self.update = update - self.done = 0 - self.add_job = lambda j: j - self.output = lambda j: j - self.start_work = lambda j: j - self.job_done = lambda j: j - - def status_update(self, job): - self.done += 1 - #msg = msg%os.path.basename(job.args[0]) - self.update(float(self.done)/self.total, job.msg) - -def process_pages(pages, opts, update): - ''' - Render all identified comic pages. - ''' - if not _imagemagick_loaded: - raise RuntimeError('Failed to load ImageMagick') - - tdir = PersistentTemporaryDirectory('_comic2lrf_pp') - job_manager = JobManager(len(pages), update) - server = Server() - jobs = [] - tasks = server.split(pages) - for task in tasks: - jobs.append(ParallelJob('render_pages', lambda s:s, job_manager=job_manager, - args=[task, tdir, opts])) - server.add_job(jobs[-1]) - server.wait() - server.killall() - server.close() - ans, failures = [], [] - - for job in jobs: - if job.result is None: - raise Exception(_('Failed to process comic: %s\n\n%s')%(job.exception, job.traceback)) - pages, failures_ = job.result - ans += pages - failures += failures_ - return ans, failures, tdir - -def config(defaults=None,output_format='lrf'): - desc = _('Options to control the conversion of comics (CBR, CBZ) files into ebooks') - if defaults is None: - c = Config('comic', desc) - else: - c = StringConfig(defaults, desc) - c.add_opt('title', ['-t', '--title'], - help=_('Title for generated ebook. Default is to use the filename.')) - c.add_opt('author', ['-a', '--author'], - help=_('Set the author in the metadata of the generated ebook. Default is %default'), - default=_('Unknown')) - c.add_opt('output', ['-o', '--output'], - help=_('Path to output file. By default a file is created in the current directory.')) - c.add_opt('colors', ['-c', '--colors'], type='int', default=64, - help=_('Number of colors for grayscale image conversion. 
Default: %default')) - c.add_opt('dont_normalize', ['-n', '--disable-normalize'], default=False, - help=_('Disable normalize (improve contrast) color range for pictures. Default: False')) - c.add_opt('keep_aspect_ratio', ['-r', '--keep-aspect-ratio'], default=False, - help=_('Maintain picture aspect ratio. Default is to fill the screen.')) - c.add_opt('dont_sharpen', ['-s', '--disable-sharpen'], default=False, - help=_('Disable sharpening.')) - c.add_opt('disable_trim', ['--disable-trim'], default=False, - help=_('Disable trimming of comic pages. For some comics, ' - 'trimming might remove content as well as borders.')) - c.add_opt('landscape', ['-l', '--landscape'], default=False, - help=_("Don't split landscape images into two portrait images")) - c.add_opt('wide', ['-w', '--wide-aspect'], default=False, - help=_("Keep aspect ratio and scale image using screen height as image width for viewing in landscape mode.")) - c.add_opt('right2left', ['--right2left'], default=False, action='store_true', - help=_('Used for right-to-left publications like manga. Causes landscape pages to be split into portrait pages from right to left.')) - c.add_opt('despeckle', ['-d', '--despeckle'], default=False, - help=_('Enable Despeckle. Reduces speckle noise. May greatly increase processing time.')) - c.add_opt('no_sort', ['--no-sort'], default=False, - help=_("Don't sort the files found in the comic alphabetically by name. Instead use the order they were added to the comic.")) - c.add_opt('profile', ['-p', '--profile'], default='prs500', choices=PROFILES.keys(), - help=_('Choose a profile for the device you are generating this file for. The default is the SONY PRS-500 with a screen size of 584x754 pixels. This is suitable for any reader with the same screen size. Choices are %s')%PROFILES.keys()) - c.add_opt('verbose', ['-v', '--verbose'], default=0, action='count', - help=_('Be verbose, useful for debugging. 
Can be specified multiple times for greater verbosity.')) - c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False, - help=_("Don't show progress bar.")) - if output_format == 'pdf': - c.add_opt('no_process',['--no_process'], default=False, - help=_("Apply no processing to the image")) - return c - -def option_parser(output_format='lrf'): - c = config(output_format=output_format) - return c.option_parser(usage=_('''\ -%prog [options] comic.cb[z|r] - -Convert a comic in a CBZ or CBR file to an ebook. -''')) - -def create_epub(pages, profile, opts, thumbnail=None): - wrappers = [] - WRAPPER = textwrap.dedent('''\ - - - Page #%d - - - -
    - comic page #%d -
    - - - ''') - dir = os.path.dirname(pages[0]) - for i, page in enumerate(pages): - wrapper = WRAPPER%(i+1, os.path.basename(page), i+1) - page = os.path.join(dir, 'page_%d.html'%(i+1)) - open(page, 'wb').write(wrapper) - wrappers.append(page) - - mi = MetaInformation(opts.title, [opts.author]) - opf = OPFCreator(dir, mi) - opf.create_manifest([(w, None) for w in wrappers]) - opf.create_spine(wrappers) - metadata = os.path.join(dir, 'metadata.opf') - opf.render(open(metadata, 'wb')) - opts2 = html2epub_config('margin_left=0\nmargin_right=0\nmargin_top=0\nmargin_bottom=0').parse() - opts2.output = opts.output - html2epub(metadata, opts2) - -def create_lrf(pages, profile, opts, thumbnail=None): - width, height = PROFILES[profile] - ps = {} - ps['topmargin'] = 0 - ps['evensidemargin'] = 0 - ps['oddsidemargin'] = 0 - ps['textwidth'] = width - ps['textheight'] = height - book = Book(title=opts.title, author=opts.author, - bookid=uuid4().hex, - publisher='%s %s'%(__appname__, __version__), thumbnail=thumbnail, - category='Comic', pagestyledefault=ps, - booksetting=BookSetting(screenwidth=width, screenheight=height)) - for page in pages: - imageStream = ImageStream(page) - _page = book.create_page() - _page.append(ImageBlock(refstream=imageStream, - blockwidth=width, blockheight=height, xsize=width, - ysize=height, x1=width, y1=height)) - book.append(_page) - - book.renderLrf(open(opts.output, 'wb')) - print _('Output written to'), opts.output - - -def create_pdf(pages, profile, opts, thumbnail=None,toc=None): - width, height = PROFILES[profile] - - from reportlab.pdfgen import canvas - - cur_page=0 - heading = [] - if toc != None: - if len(toc) == 1: - toc = None - else: - toc_index = 0 - base_cur = 0 - rem = 0 - breaker = False - while True: - letter=toc[0][0][base_cur] - for i in range(len(toc)): - if letter != toc[i][0][base_cur]: - breaker = True - if breaker: - break - if letter == os.sep: - rem=base_cur - base_cur += 1 - toc.append(("Not seen",-1)) - - - pdf = 
canvas.Canvas(filename=opts.output, pagesize=(width,height+15)) - pdf.setAuthor(opts.author) - pdf.setTitle(opts.title) - - - for page in pages: - if opts.keep_aspect_ratio: - img = NewMagickWand() - if img < 0: - raise RuntimeError('Cannot create wand.') - if not MagickReadImage(img, page): - raise IOError('Failed to read image from: %'%page) - sizex = MagickGetImageWidth(img) - sizey = MagickGetImageHeight(img) - if opts.keep_aspect_ratio: - # Preserve the aspect ratio by adding border - aspect = float(sizex) / float(sizey) - if aspect <= (float(width) / float(height)): - newsizey = height - newsizex = int(newsizey * aspect) - deltax = (width - newsizex) / 2 - deltay = 0 - else: - newsizex = width - newsizey = int(newsizex / aspect) - deltax = 0 - deltay = (height - newsizey) / 2 - pdf.drawImage(page, x=deltax,y=deltay,width=newsizex, height=newsizey) - else: - pdf.drawImage(page, x=0,y=0,width=width, height=height) - if toc != None: - if toc[toc_index][1] == cur_page: - tmp=toc[toc_index][0] - toc_current=tmp[rem:len(tmp)-4] - index=0 - while True: - key = 'page%d-%d' % (cur_page, index) - pdf.bookmarkPage(key) - (head,dummy,list)=toc_current.partition(os.sep) - try: - if heading[index] != head: - heading[index] = head - pdf.addOutlineEntry(title=head,key=key,level=index) - except: - heading.append(head) - pdf.addOutlineEntry(title=head,key=key,level=index) - index += 1 - toc_current=list - if dummy == "": - break - toc_index += 1 - cur_page += 1 - pdf.showPage() - # Write the document to disk - pdf.save() - - -def do_convert(path_to_file, opts, notification=lambda m, p: p, output_format='lrf'): - path_to_file = run_plugins_on_preprocess(path_to_file) - source = path_to_file - to_delete = [] - toc = [] - list = [] - pages = [] - - - if not opts.title: - opts.title = os.path.splitext(os.path.basename(source))[0] - if not opts.output: - opts.output = os.path.abspath(os.path.splitext(os.path.basename(source))[0]+'.'+output_format) - if os.path.isdir(source): - for 
path in all_files( source , '*.cbr|*.cbz' ): - list.append( path ) - else: - list= [ os.path.abspath(source) ] - - for source in list: - tdir = extract_comic(source) - new_pages = find_pages(tdir, sort_on_mtime=opts.no_sort, verbose=opts.verbose) - thumbnail = None - if not new_pages: - raise ValueError('Could not find any pages in the comic: %s'%source) - if not getattr(opts, 'no_process', False): - new_pages, failures, tdir2 = process_pages(new_pages, opts, notification) - if not new_pages: - raise ValueError('Could not find any valid pages in the comic: %s'%source) - if failures: - print 'Could not process the following pages (run with --verbose to see why):' - for f in failures: - print '\t', f - thumbnail = os.path.join(tdir2, 'thumbnail.png') - if not os.access(thumbnail, os.R_OK): - thumbnail = None - toc.append((source,len(pages))) - pages.extend(new_pages) - to_delete.append(tdir) - - - if output_format == 'lrf': - create_lrf(pages, opts.profile, opts, thumbnail=thumbnail) - if output_format == 'epub': - create_epub(pages, opts.profile, opts, thumbnail=thumbnail) - if output_format == 'pdf': - create_pdf(pages, opts.profile, opts, thumbnail=thumbnail,toc=toc) - for tdir in to_delete: - shutil.rmtree(tdir) - - -def all_files(root, patterns='*'): - # Expand patterns from semicolon-separated string to list - patterns = patterns.split('|') - for path, subdirs, files in os.walk(root): - files.sort( ) - for name in files: - for pattern in patterns: - if fnmatch.fnmatch(name, pattern): - yield os.path.join(path, name) - break - - -def main(args=sys.argv, notification=None, output_format='lrf'): - parser = option_parser(output_format=output_format) - opts, args = parser.parse_args(args) - if len(args) < 2: - parser.print_help() - print '\nYou must specify a file to convert' - return 1 - - if not callable(notification): - pb = ProgressBar(terminal_controller, _('Rendering comic pages...'), - no_progress_bar=opts.no_progress_bar or getattr(opts, 'no_process', 
False)) - notification = pb.update - - source = os.path.abspath(args[1]) - do_convert(source, opts, notification, output_format=output_format) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index ffafa6d1a2..ea965c3410 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -128,6 +128,8 @@ class EbookIterator(object): plumber.setup_options() if hasattr(plumber.opts, 'dont_package'): plumber.opts.dont_package = True + if hasattr(plumber.opts, 'no_process'): + plumber.opts.no_process = True self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), plumber.opts, plumber.input_fmt, self.log, {}, self.base) diff --git a/src/calibre/libunzip.py b/src/calibre/libunzip.py index 55d71014a0..f384af1073 100644 --- a/src/calibre/libunzip.py +++ b/src/calibre/libunzip.py @@ -3,19 +3,19 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' -import os, re +import re from calibre.utils import zipfile def update(pathtozip, patterns, filepaths, names, compression=zipfile.ZIP_DEFLATED, verbose=True): ''' - Update files in the zip file at `pathtozip` matching the given + Update files in the zip file at `pathtozip` matching the given `patterns` with the given `filepaths`. If more than - one file matches, all of the files are replaced. - + one file matches, all of the files are replaced. + :param patterns: A list of compiled regular expressions :param filepaths: A list of paths to the replacement files. Must have the same length as `patterns`. - :param names: A list of archive names for each file in filepaths. + :param names: A list of archive names for each file in filepaths. A name can be `None` in which case the name of the existing file in the archive is used. :param compression: The compression to use when replacing files. 
Can be @@ -48,4 +48,4 @@ def extract_member(filename, match=re.compile(r'\.(jpg|jpeg|gif|png)\s*$', re.I) names = zf.namelist() for name in names: if match.search(name): - return name, zf.read(name) \ No newline at end of file + return name, zf.read(name) diff --git a/src/calibre/parallel.py b/src/calibre/parallel.py index 90a2969c86..cb14c4ed20 100644 --- a/src/calibre/parallel.py +++ b/src/calibre/parallel.py @@ -43,7 +43,7 @@ PARALLEL_FUNCS = { 'lrfviewer' : ('calibre.gui2.lrf_renderer.main', 'main', {}, None), - + 'ebook-viewer' : ('calibre.gui2.viewer.main', 'main', {}, None), @@ -52,34 +52,34 @@ PARALLEL_FUNCS = { 'render_table' : ('calibre.ebooks.lrf.html.table_as_image', 'do_render', {}, None), - + 'render_pages' : - ('calibre.ebooks.lrf.comic.convert_from', 'render_pages', {}, 'notification'), + ('calibre.ebooks.comic.input', 'render_pages', {}, 'notification'), 'comic2lrf' : ('calibre.ebooks.lrf.comic.convert_from', 'do_convert', {}, 'notification'), - + 'any2epub' : ('calibre.ebooks.epub.from_any', 'any2epub', {}, None), - + 'feeds2epub' : ('calibre.ebooks.epub.from_feeds', 'main', {}, 'notification'), - + 'comic2epub' : ('calibre.ebooks.epub.from_comic', 'convert', {}, 'notification'), - + 'any2mobi' : ('calibre.ebooks.mobi.from_any', 'any2mobi', {}, None), - + 'any2pdf' : - ('calibre.ebooks.pdf.from_any', 'any2pdf', {}, None), - + ('calibre.ebooks.pdf.from_any', 'any2pdf', {}, None), + 'feeds2mobi' : ('calibre.ebooks.mobi.from_feeds', 'main', {}, 'notification'), - + 'comic2mobi' : ('calibre.ebooks.mobi.from_comic', 'convert', {}, 'notification'), - + 'ebook-convert' : ('calibre.ebooks.conversion.cli', 'main', {}, None), } @@ -174,7 +174,7 @@ class WorkerMother(object): contents = os.path.join(contents, 'console.app', 'Contents') self.executable = os.path.join(contents, 'MacOS', os.path.basename(sys.executable)) - + resources = os.path.join(contents, 'Resources') fd = os.path.join(contents, 'Frameworks') sp = os.path.join(resources, 'lib', 
'python'+sys.version[:3], 'site-packages.zip') @@ -198,7 +198,7 @@ class WorkerMother(object): for func in ('spawn_free_spirit', 'spawn_worker'): setattr(self, func, getattr(self, func+'_'+ext)) - + def cleanup_child_windows(self, child, name=None, fd=None): try: child.kill() @@ -526,8 +526,8 @@ class JobKilled(Exception): pass class Job(object): - - def __init__(self, job_done, job_manager=None, + + def __init__(self, job_done, job_manager=None, args=[], kwargs={}, description=None): self.args = args self.kwargs = kwargs @@ -540,9 +540,9 @@ class Job(object): self.description = description self.start_time = None self.running_time = None - + self.result = self.exception = self.traceback = self.log = None - + def __cmp__(self, other): sstatus, ostatus = self.status(), other.status() if sstatus == ostatus or (self.has_run and other.has_run): @@ -557,8 +557,8 @@ class Job(object): return -1 if ostatus == 'WAITING': return 1 - - + + def job_done(self): self.is_running, self.has_run = False, True self.running_time = (time.time() - self.start_time) if \ @@ -566,14 +566,14 @@ class Job(object): if self.job_manager is not None: self.job_manager.job_done(self) self._job_done(self) - + def start_work(self): self.is_running = True self.has_run = False self.start_time = time.time() if self.job_manager is not None: self.job_manager.start_work(self) - + def update_status(self, percent, msg=None): self.percent = percent self.msg = msg @@ -582,7 +582,7 @@ class Job(object): self.job_manager.status_update(self) except: traceback.print_exc() - + def status(self): if self.is_running: return 'WORKING' @@ -592,7 +592,7 @@ class Job(object): if self.exception is None: return 'DONE' return 'ERROR' - + def console_text(self): ans = [u'Job: '] if self.description: @@ -610,13 +610,13 @@ class Job(object): if self.traceback: ans.append(u'**Traceback**:') ans.extend(self.traceback.split('\n')) - + if self.log: if isinstance(self.log, str): self.log = unicode(self.log, 'utf-8', 'replace') 
ans.append(self.log) return (u'\n'.join(ans)).encode('utf-8') - + def gui_text(self): ans = [u'Job: '] if self.description: @@ -641,19 +641,19 @@ class Job(object): if isinstance(self.log, str): self.log = unicode(self.log, 'utf-8', 'replace') ans.extend(self.log.split('\n')) - + ans = [x.decode(preferred_encoding, 'replace') if isinstance(x, str) else x for x in ans] - + return u'
    '.join(ans) class ParallelJob(Job): - + def __init__(self, func, *args, **kwargs): Job.__init__(self, *args, **kwargs) self.func = func self.done = self.job_done - + def output(self, msg): if not self.log: self.log = u'' @@ -663,7 +663,7 @@ class ParallelJob(Job): self.log += msg if self.job_manager is not None: self.job_manager.output(self) - + def remove_ipc_socket(path): os = __import__('os') @@ -702,7 +702,7 @@ class Server(Thread): self.result_lock = RLock() self.pool_lock = RLock() self.start() - + def split(self, tasks): ''' Split a list into a list of sub lists, with the number of sub lists being @@ -720,7 +720,7 @@ class Server(Thread): ans.append(section) pos += delta return ans - + def close(self): try: @@ -733,7 +733,7 @@ class Server(Thread): self.jobs.append(job) if job.job_manager is not None: job.job_manager.add_job(job) - + def poll(self): ''' Return True if the server has either working or queued jobs @@ -741,14 +741,14 @@ class Server(Thread): with self.job_lock: with self.working_lock: return len(self.jobs) + len(self.working) > 0 - + def wait(self, sleep=1): ''' Wait until job queue is empty ''' while self.poll(): time.sleep(sleep) - + def run(self): while True: job = None @@ -935,7 +935,7 @@ def work(client_socket, func, args, kwdargs): func(*args, **kwargs) except (Exception, SystemExit): continue - + time.sleep(5) # Give any in progress BufferedSend time to complete @@ -948,7 +948,7 @@ def worker(host, port): if msg != 'OK': return 1 write(client_socket, 'WAITING') - + sys.stdout = BufferedSender(client_socket) sys.stderr = sys.stdout From 0749f44979ea69ab05109fbad777331650ba0658 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 26 Apr 2009 23:00:47 -0700 Subject: [PATCH 149/319] IGN:... 
--- src/calibre/customize/conversion.py | 8 ++++++ src/calibre/ebooks/comic/input.py | 6 +++++ src/calibre/ebooks/pdf/output.py | 13 +++++---- src/calibre/ebooks/pdf/writer.py | 41 ++++++++++++++--------------- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index c358986d18..7573dddeac 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -133,6 +133,14 @@ class InputFormatPlugin(Plugin): #: (option_name, recommended_value, recommendation_level) recommendations = set([]) + def get_images(self): + ''' + Return a list of absolute paths to the images, if this input plugin + represents an image collection. The list of images is in the same order + as the spine and the TOC. + ''' + raise NotImplementedError() + def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py index f6d6557ee4..82070bbc72 100755 --- a/src/calibre/ebooks/comic/input.py +++ b/src/calibre/ebooks/comic/input.py @@ -357,6 +357,9 @@ class ComicInput(InputFormatPlugin): thumbnail = None return new_pages + def get_images(self): + return self._images + def convert(self, stream, opts, file_ext, log, accelerators, progress=lambda p, m : m): from calibre.ebooks.metadata import MetaInformation @@ -401,6 +404,9 @@ class ComicInput(InputFormatPlugin): spine = [] for comic in comics: spine.extend(map(href, comic[2])) + self._images = [] + for comic in comics: + self._images.extend(comic[1]) opf.create_spine(spine) toc = TOC() if len(comics) == 1: diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 20ba5028b0..7b8b0323ab 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -40,7 +40,7 @@ class PDFOutput(OutputFormatPlugin): 
OptionRecommendation(name='margin_right', recommended_value='1', level=OptionRecommendation.LOW, help=_('The right margin around the document.')), - + OptionRecommendation(name='unit', recommended_value='inch', level=OptionRecommendation.LOW, short_switch='u', choices=UNITS.keys(), help=_('The unit of measure. Default is inch. Choices ' @@ -58,15 +58,18 @@ class PDFOutput(OutputFormatPlugin): help=_('The orientation of the page. Default is portrait. Choices ' 'are %s' % ORIENTATIONS.keys())), ]) - + def convert(self, oeb_book, output_path, input_plugin, opts, log): + self.opts, self.log = opts, log + if input_plugin.is_image_collection: + self.convert_images(input_plugin.get_images()) with TemporaryDirectory('_pdf_out') as oebdir: OEBOutput(None).convert(oeb_book, oebdir, input_plugin, opts, log) opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] writer = PDFWriter(log, opts) - + close = False if not hasattr(output_path, 'write'): close = True @@ -75,10 +78,10 @@ class PDFOutput(OutputFormatPlugin): out_stream = open(output_path, 'wb') else: out_stream = output_path - + out_stream.seek(0) out_stream.truncate() writer.dump(opf, out_stream, PDFMetadata(oeb_book.metadata)) - + if close: out_stream.close() diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index f91dae44fd..7a9973c6d7 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -9,12 +9,11 @@ __docformat__ = 'restructuredtext en' Write content to PDF. 
''' -import os, shutil, sys +import os, shutil from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.customize.profiles import OutputProfile from calibre.ebooks.pdf.pageoptions import unit, paper_size, \ - orientation, size + orientation, size from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata.opf2 import OPF @@ -24,12 +23,12 @@ from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, \ from PyQt4.QtWebKit import QWebView from pyPdf import PdfFileWriter, PdfFileReader - + class PDFMetadata(object): def __init__(self, oeb_metadata=None): self.title = _('Unknown') self.author = _('Unknown') - + if oeb_metadata != None: if len(oeb_metadata.title) >= 1: self.title = oeb_metadata.title[0].value @@ -42,16 +41,16 @@ class PDFWriter(QObject): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) - + self.logger = log - + self.loop = QEventLoop() self.view = QWebView() self.connect(self.view, SIGNAL('loadFinished(bool)'), self._render_html) self.render_queue = [] self.combine_queue = [] self.tmp_path = PersistentTemporaryDirectory('_pdf_output_parts') - + self.custom_size = None if opts.custom_size != None: width, sep, height = opts.custom_size.partition('x') @@ -62,44 +61,44 @@ class PDFWriter(QObject): self.custom_size = (width, height) except: self.custom_size = None - + self.opts = opts - + def dump(self, opfpath, out_stream, pdf_metadata): self.metadata = pdf_metadata self._delete_tmpdir() - + opf = OPF(opfpath, os.path.dirname(opfpath)) self.render_queue = [i.path for i in opf.spine] self.combine_queue = [] self.out_stream = out_stream - + QMetaObject.invokeMethod(self, "_render_book", Qt.QueuedConnection) self.loop.exec_() - + @QtCore.pyqtSignature('_render_book()') def _render_book(self): if len(self.render_queue) == 0: self._write() else: self._render_next() - + def _render_next(self): item = str(self.render_queue.pop(0)) self.combine_queue.append(os.path.join(self.tmp_path, '%i.pdf' % 
(len(self.combine_queue) + 1))) - + self.logger.info('Processing %s...' % item) - + self.view.load(QUrl(item)) def _render_html(self, ok): if ok: item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue)) - + self.logger.debug('\tRendering item %s as %i' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) - + printer = QPrinter(QPrinter.HighResolution) - + if self.opts.output_profile.short_name == 'default': if self.custom_size == None: printer.setPaperSize(paper_size(self.opts.paper_size)) @@ -107,7 +106,7 @@ class PDFWriter(QObject): printer.setPaperSize(QSizeF(self.custom_size[0], self.custom_size[1]), unit(self.opts.unit)) else: printer.setPaperSize(QSizeF(self.opts.output_profile.width / self.opts.output_profile.dpi, self.opts.output_profile.height / self.opts.output_profile.dpi), QPrinter.Inch) - + printer.setPageMargins(size(self.opts.margin_left), size(self.opts.margin_top), size(self.opts.margin_right), size(self.opts.margin_bottom), unit(self.opts.unit)) printer.setOrientation(orientation(self.opts.orientation)) printer.setOutputFormat(QPrinter.PdfFormat) @@ -122,7 +121,7 @@ class PDFWriter(QObject): def _write(self): self.logger.info('Combining individual PDF parts...') - + try: outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: From 996dda3ffea65144cdb62fcd0e2c8c231f4f2325 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 27 Apr 2009 12:02:18 -0700 Subject: [PATCH 150/319] Fix regression in LIT metadata reader --- src/calibre/ebooks/lit/reader.py | 3 +++ src/calibre/ebooks/metadata/lit.py | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 79249fe7c3..37328328b7 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -882,6 +882,9 @@ class LitContainer(object): unbin = UnBinary(raw, path, self._litfile.manifest, 
OPF_MAP) return str(unbin) + def get_metadata(self): + return self._read_meta() + class LitReader(OEBReader): Container = LitContainer diff --git a/src/calibre/ebooks/metadata/lit.py b/src/calibre/ebooks/metadata/lit.py index 2a57d2f2d2..0a37b6c768 100644 --- a/src/calibre/ebooks/metadata/lit.py +++ b/src/calibre/ebooks/metadata/lit.py @@ -4,15 +4,16 @@ __copyright__ = '2008, Kovid Goyal ' Support for reading the metadata from a LIT file. ''' -import sys, cStringIO, os +import cStringIO, os from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPF -from calibre.ebooks.lit.reader import LitReader def get_metadata(stream): - litfile = LitReader(stream) - src = litfile.meta.encode('utf-8') + from calibre.ebooks.lit.reader import LitContainer + litfile = LitContainer(stream) + src = litfile.get_metadata().encode('utf-8') + litfile = litfile._litfile opf = OPF(cStringIO.StringIO(src), os.getcwd()) mi = MetaInformation(opf) covers = [] From 2da5589964160c991f13ecfaea1b84b6ce93a92a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 27 Apr 2009 15:41:10 -0700 Subject: [PATCH 151/319] Input plugin for recipes --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/conversion/cli.py | 5 +- src/calibre/ebooks/conversion/plumber.py | 18 +++++- src/calibre/ebooks/oeb/base.py | 8 +-- src/calibre/ebooks/oeb/transforms/split.py | 1 + src/calibre/web/__init__.py | 3 +- src/calibre/web/feeds/input.py | 65 ++++++++++++++++++++++ src/calibre/web/feeds/news.py | 36 +++++------- 8 files changed, 108 insertions(+), 31 deletions(-) create mode 100644 src/calibre/web/feeds/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 9a686e0d94..dcbffade92 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -287,6 +287,7 @@ from calibre.ebooks.odt.input import ODTInput from calibre.ebooks.rtf.input import RTFInput from calibre.ebooks.html.input import HTMLInput from 
calibre.ebooks.comic.input import ComicInput +from calibre.web.feeds.input import RecipeInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.txt.output import TXTOutput @@ -296,7 +297,7 @@ from calibre.customize.profiles import input_profiles, output_profiles plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, - FB2Input, ODTInput, RTFInput, EPUBOutput, EREADEROutput] + FB2Input, ODTInput, RTFInput, EPUBOutput, EREADEROutput, RecipeInput] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 941a1ec5fc..d8de702915 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -52,7 +52,7 @@ def check_command_line_options(parser, args, log): raise SystemExit(1) input = os.path.abspath(args[1]) - if not os.access(input, os.R_OK): + if not input.endswith('.recipe') and not os.access(input, os.R_OK): log.error('Cannot read from', input) raise SystemExit(1) @@ -169,6 +169,9 @@ def add_pipeline_options(parser, plumber): if rec.level < rec.HIGH: option_recommendation_to_cli_option(add_option, rec) + option_recommendation_to_cli_option(parser.add_option, + plumber.get_option_by_name('list_recipes')) + def option_parser(): return OptionParser(usage=USAGE) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index da9c9f11e2..1ef58e1d95 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -360,6 +360,10 @@ OptionRecommendation(name='book_producer', OptionRecommendation(name='language', recommended_value=None, level=OptionRecommendation.LOW, help=_('Set the 
language.')), + +OptionRecommendation(name='list_recipes', + recommended_value=False, help=_('List available recipes.')), + ] input_fmt = os.path.splitext(self.input)[1] @@ -525,6 +529,13 @@ OptionRecommendation(name='language', self.setup_options() if self.opts.verbose: self.log.filter_level = self.log.DEBUG + if self.opts.list_recipes: + from calibre.web.feeds.recipes import titles + self.log('Available recipes:') + for title in sorted(titles): + self.log('\t'+title) + self.log('%d recipes available'%len(titles)) + raise SystemExit(0) # Run any preprocess plugins from calibre.customize.ui import run_plugins_on_preprocess @@ -535,8 +546,13 @@ OptionRecommendation(name='language', accelerators = {} tdir = PersistentTemporaryDirectory('_plumber') + stream = self.input if self.input_fmt == 'recipe' else \ + open(self.input, 'rb') - self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts, + if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf': + self.opts.lrf = True + + self.oeb = self.input_plugin(stream, self.opts, self.input_fmt, self.log, accelerators, tdir) if self.opts.debug_input is not None: diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 9d8598c766..f5395e04fe 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1578,15 +1578,15 @@ class OEBBook(object): return data.decode('utf-16') except UnicodeDecodeError: pass - try: - return data.decode('utf-8') - except UnicodeDecodeError: - pass if self.encoding is not None: try: return data.decode(self.encoding) except UnicodeDecodeError: pass + try: + return data.decode('utf-8') + except UnicodeDecodeError: + pass data, _ = xml_to_unicode(data) data = data.replace('\r\n', '\n') data = data.replace('\r', '\n') diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 21d71da5bb..ec3d63192d 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ 
b/src/calibre/ebooks/oeb/transforms/split.py @@ -59,6 +59,7 @@ class Split(object): self.fix_links() def split_item(self, item): + page_breaks, page_break_ids = [], [] if self.split_on_page_breaks: page_breaks, page_break_ids = self.find_page_breaks(item) diff --git a/src/calibre/web/__init__.py b/src/calibre/web/__init__.py index cadf21c39f..b14dc0ce28 100644 --- a/src/calibre/web/__init__.py +++ b/src/calibre/web/__init__.py @@ -2,5 +2,6 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' +class Recipe(object): + pass - \ No newline at end of file diff --git a/src/calibre/web/feeds/input.py b/src/calibre/web/feeds/input.py new file mode 100644 index 0000000000..21324293d3 --- /dev/null +++ b/src/calibre/web/feeds/input.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation + +class RecipeInput(InputFormatPlugin): + + name = 'Recipe Input' + author = 'Kovid Goyal' + description = _('Download periodical content from the internet') + file_types = set(['recipe']) + + recommendations = set([ + ('chapter_mark', 'none', OptionRecommendation.HIGH), + ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH), + ('use_auto_toc', False, OptionRecommendation.HIGH), + ]) + + options = set([ + OptionRecommendation(name='test', recommended_value=False, + help=_('Useful for recipe development. 
Forces ' + 'max_articles_per_feed to 2 and downloads at most 2 feeds.')), + OptionRecommendation(name='username', recommended_value=None, + help=_('Username for sites that require a login to access ' + 'content.')), + OptionRecommendation(name='password', recommended_value=None, + help=_('Password for sites that require a login to access ' + 'content.')), + OptionRecommendation(name='lrf', recommended_value=False, + help='Optimize fetching for subsequent conversion to LRF.'), + ]) + + def convert(self, recipe_or_file, opts, file_ext, log, + accelerators, progress=lambda x, y: x): + from calibre.web.feeds.recipes import \ + get_builtin_recipe, compile_recipe + if os.access(recipe_or_file, os.R_OK): + recipe = compile_recipe(open(recipe_or_file, 'rb').read()) + else: + title = os.path.basename(recipe_or_file).rpartition('.')[0] + recipe = get_builtin_recipe(title) + + if recipe is None: + raise ValueError('%s is not a valid recipe file or builtin recipe' % + recipe_or_file) + + ro = recipe(opts, log, progress) + ro.download() + + opts.output_profile.flow_size = 0 + + for f in os.listdir('.'): + if f.endswith('.opf'): + return os.path.abspath(f) + + + + diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 6a248b6992..216a827326 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -20,6 +20,7 @@ from calibre import browser, __appname__, iswindows, \ from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.lrf import entity_to_unicode +from calibre.web import Recipe from calibre.ebooks import render_html from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation @@ -27,12 +28,11 @@ from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed from calibre.web.fetch.simple import option_parser as web2disk_option_parser from calibre.web.fetch.simple import 
RecursiveFetcher from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending -from calibre.utils.logging import Log from calibre.ptempfile import PersistentTemporaryFile, \ PersistentTemporaryDirectory -class BasicNewsRecipe(object): +class BasicNewsRecipe(Recipe): ''' Abstract base class that contains logic needed in all feed fetchers. ''' @@ -443,40 +443,34 @@ class BasicNewsRecipe(object): ''' raise NotImplementedError - def __init__(self, options, parser, progress_reporter): + def __init__(self, options, log, progress_reporter): ''' Initialize the recipe. :param options: Parsed commandline options :param parser: Command line option parser. Used to intelligently merge options. :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. ''' - self.log = Log() - if options.verbose: - self.log.filter_level = self.log.DEBUG + self.log = log if not isinstance(self.title, unicode): self.title = unicode(self.title, 'utf-8', 'replace') - for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'): - setattr(self, attr, getattr(options, attr)) + self.debug = options.verbose > 1 + self.output_dir = os.getcwd() + self.verbose = options.verbose + self.test = options.test + self.username = options.username + self.password = options.password + self.lrf = options.lrf + self.output_dir = os.path.abspath(self.output_dir) if options.test: self.max_articles_per_feed = 2 self.simultaneous_downloads = min(4, self.simultaneous_downloads) - if self.debug: self.verbose = True self.report_progress = progress_reporter - self.username = self.password = None - #: If True optimize downloading for eventual conversion to LRF - self.lrf = False - defaults = parser.get_default_values() - - for opt in options.__dict__.keys(): - if getattr(options, opt) != getattr(defaults, opt, None): - setattr(self, opt, getattr(options, opt)) - if isinstance(self.feeds, 
basestring): self.feeds = eval(self.feeds) if isinstance(self.feeds, basestring): @@ -493,7 +487,6 @@ class BasicNewsRecipe(object): '--timeout', str(self.timeout), '--max-recursions', str(self.recursions), '--delay', str(self.delay), - '--timeout', str(self.timeout), ] if self.encoding is not None: web2disk_cmdline.extend(['--encoding', self.encoding]) @@ -520,9 +513,6 @@ class BasicNewsRecipe(object): self.simultaneous_downloads = 1 self.navbar = templates.NavBarTemplate() - self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine', '--header', '--encoding', 'utf-8']) - if '--base-font-size' not in self.html2lrf_options: - self.html2lrf_options.extend(['--base-font-size', '12']) self.failed_downloads = [] self.partial_failures = [] @@ -557,7 +547,7 @@ class BasicNewsRecipe(object): return self.postprocess_html(soup, first_fetch) - def download(self, for_lrf=False): + def download(self): ''' Download and pre-process all articles from the feeds in this recipe. This method should be called only one on a particular Recipe instance. 
From a9a18aa565c0681b494bf957e9bc486ec2443c66 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 27 Apr 2009 18:48:15 -0400 Subject: [PATCH 152/319] Ereader writer still not working --- src/calibre/ebooks/pdb/ereader/inspector.py | 3 +++ src/calibre/ebooks/pdb/ereader/writer.py | 14 +++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/inspector.py b/src/calibre/ebooks/pdb/ereader/inspector.py index a3875daad4..b5f2341cb5 100644 --- a/src/calibre/ebooks/pdb/ereader/inspector.py +++ b/src/calibre/ebooks/pdb/ereader/inspector.py @@ -53,6 +53,9 @@ def ereader_header_info(header): print '50-52 sidebar_offset: %i' % struct.unpack('>H', h0[50:52])[0] print '52-54 last_data_offset: %i' % struct.unpack('>H', h0[52:54])[0] + for i in range(54, 131, 2): + print '%i-%i: %i' % (i, i+2, struct.unpack('>H', h0[i:i+2])[0]) + print '' def section_lengths(header): diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index b831849488..7e3fdc30ea 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -31,7 +31,7 @@ class Writer(object): lengths = [len(i) for i in sections] - pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '') + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, 'test book') pdbHeaderBuilder.build_header(lengths, out_stream) for item in sections: @@ -69,7 +69,7 @@ class Writer(object): return images def _metadata(self, metadata): - return '\x00\x00\x00\x00\x00' + return 'test\x00\x00\x00\x00\x00' def _header_record(self, text_items, image_items): ''' @@ -88,7 +88,7 @@ class Writer(object): last_data_offset = meta_data_offset + 1 image_data_offset = last_data_offset - record = u'' + record = '' record += struct.pack('>H', version) # [0:2] record += struct.pack('>H', 0) # [2:4] @@ -115,8 +115,12 @@ class Writer(object): record += struct.pack('>H', meta_data_offset) # [44:46] record += struct.pack('>H', meta_data_offset) # 
[46:48] record += struct.pack('>H', last_data_offset) # [48:50] # footnote_offset - record += struct.pack('>H', last_data_offset) # [52:54] # sidebar_offset - record += struct.pack('>H', last_data_offset) # [54:56] # last_data_offset + record += struct.pack('>H', last_data_offset) # [50:52] # sidebar_offset + record += struct.pack('>H', last_data_offset) # [52:54] # last_data_offset + + record += struct.pack('>H', 1) # [54:56] + for i in range(56, 132, 2): + record += struct.pack('>H', 0) ''' # Version From ed13e58801098860ad9c765f2494d25604dd7591 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 27 Apr 2009 19:04:20 -0400 Subject: [PATCH 153/319] PDF output: Remove redundant margin options, start image conversion. --- src/calibre/ebooks/pdf/output.py | 33 +++++++++++++++----------------- src/calibre/ebooks/pdf/writer.py | 11 +++++++++-- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 7b8b0323ab..a2674b83eb 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -28,23 +28,11 @@ class PDFOutput(OutputFormatPlugin): file_type = 'pdf' options = set([ - OptionRecommendation(name='margin_top', recommended_value='1', - level=OptionRecommendation.LOW, - help=_('The top margin around the document.')), - OptionRecommendation(name='margin_bottom', recommended_value='1', - level=OptionRecommendation.LOW, - help=_('The bottom margin around the document.')), - OptionRecommendation(name='margin_left', recommended_value='1', - level=OptionRecommendation.LOW, - help=_('The left margin around the document.')), - OptionRecommendation(name='margin_right', recommended_value='1', - level=OptionRecommendation.LOW, - help=_('The right margin around the document.')), - OptionRecommendation(name='unit', recommended_value='inch', level=OptionRecommendation.LOW, short_switch='u', choices=UNITS.keys(), help=_('The unit of measure. Default is inch. 
Choices ' - 'are %s' % UNITS.keys())), + 'are %s ' + 'Note: This does not override the unit for margins!' % UNITS.keys())), OptionRecommendation(name='paper_size', recommended_value='letter', level=OptionRecommendation.LOW, choices=PAPER_SIZES.keys(), help=_('The size of the paper. Default is letter. Choices ' @@ -60,15 +48,23 @@ class PDFOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - self.opts, self.log = opts, log + self.input_plugin, self.opts, self.log = input_plugin, opts, log + if input_plugin.is_image_collection: - self.convert_images(input_plugin.get_images()) + self.convert_images(input_plugin.get_images(), output_path) + else: + self.convert_text(oeb_book, output_path) + + def convert_images(self, images, output_path): + raise NotImplementedError() + + def convert_text(self, oeb_book, output_path): with TemporaryDirectory('_pdf_out') as oebdir: - OEBOutput(None).convert(oeb_book, oebdir, input_plugin, opts, log) + OEBOutput(None).convert(oeb_book, oebdir, self.input_plugin, self.opts, self.log) opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] - writer = PDFWriter(log, opts) + writer = PDFWriter(self.opts, self.log) close = False if not hasattr(output_path, 'write'): @@ -85,3 +81,4 @@ class PDFOutput(OutputFormatPlugin): if close: out_stream.close() + diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 7a9973c6d7..e82c6bd257 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -37,7 +37,7 @@ class PDFMetadata(object): class PDFWriter(QObject): - def __init__(self, log, opts): + def __init__(self, opts, log): if QApplication.instance() is None: QApplication([]) QObject.__init__(self) @@ -107,7 +107,7 @@ class PDFWriter(QObject): else: printer.setPaperSize(QSizeF(self.opts.output_profile.width / self.opts.output_profile.dpi, self.opts.output_profile.height / self.opts.output_profile.dpi), QPrinter.Inch) - 
printer.setPageMargins(size(self.opts.margin_left), size(self.opts.margin_top), size(self.opts.margin_right), size(self.opts.margin_bottom), unit(self.opts.unit)) + printer.setPageMargins(0, 0, 0, 0, QPrinter.Point) printer.setOrientation(orientation(self.opts.orientation)) printer.setOutputFormat(QPrinter.PdfFormat) printer.setOutputFileName(item_path) @@ -132,3 +132,10 @@ class PDFWriter(QObject): finally: self._delete_tmpdir() self.loop.exit(0) + + +class ImagePDFWriter(object): + + def __init__(self, opts, log): + self.opts, self.log = opts, log + From ee777c539aa14030530258a91e477a7f5bb5b732 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 28 Apr 2009 21:33:35 -0400 Subject: [PATCH 154/319] beginnings of working comic to pdf output. --- src/calibre/ebooks/pdf/output.py | 46 ++++++++++++++++++-------------- src/calibre/ebooks/pdf/writer.py | 25 ++++++++++------- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index a2674b83eb..f4d9cb4631 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -17,7 +17,7 @@ from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation from calibre.ebooks.oeb.output import OEBOutput from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.pdf.writer import PDFWriter, PDFMetadata +from calibre.ebooks.pdf.writer import PDFWriter, ImagePDFWriter, PDFMetadata from calibre.ebooks.pdf.pageoptions import UNITS, PAPER_SIZES, \ ORIENTATIONS @@ -49,36 +49,42 @@ class PDFOutput(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): self.input_plugin, self.opts, self.log = input_plugin, opts, log + self.output_path = output_path + self.metadata = oeb_book.metadata if input_plugin.is_image_collection: - self.convert_images(input_plugin.get_images(), output_path) + self.convert_images(input_plugin.get_images()) else: - self.convert_text(oeb_book, 
output_path) + self.convert_text(oeb_book) - def convert_images(self, images, output_path): - raise NotImplementedError() + def convert_images(self, images): + # process images to document size + self.write(ImagePDFWriter, images) - def convert_text(self, oeb_book, output_path): + def convert_text(self, oeb_book): with TemporaryDirectory('_pdf_out') as oebdir: OEBOutput(None).convert(oeb_book, oebdir, self.input_plugin, self.opts, self.log) opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] + + self.write(PDFWriter, [s.path for s in opf.spine]) - writer = PDFWriter(self.opts, self.log) + def write(self, Writer, items): + writer = Writer(self.opts, self.log) - close = False - if not hasattr(output_path, 'write'): - close = True - if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': - os.makedirs(os.path.dirname(output_path)) - out_stream = open(output_path, 'wb') - else: - out_stream = output_path + close = False + if not hasattr(self.output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(self.output_path)) and os.path.dirname(self.output_path) != '': + os.makedirs(os.path.dirname(self.output_path)) + out_stream = open(self.output_path, 'wb') + else: + out_stream = self.output_path - out_stream.seek(0) - out_stream.truncate() - writer.dump(opf, out_stream, PDFMetadata(oeb_book.metadata)) + out_stream.seek(0) + out_stream.truncate() + writer.dump(items, out_stream, PDFMetadata(self.metadata)) - if close: - out_stream.close() + if close: + out_stream.close() diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index e82c6bd257..adaae10d16 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -64,12 +64,11 @@ class PDFWriter(QObject): self.opts = opts - def dump(self, opfpath, out_stream, pdf_metadata): + def dump(self, items, out_stream, pdf_metadata): self.metadata = pdf_metadata self._delete_tmpdir() - opf = OPF(opfpath, 
os.path.dirname(opfpath)) - self.render_queue = [i.path for i in opf.spine] + self.render_queue = items self.combine_queue = [] self.out_stream = out_stream @@ -87,7 +86,7 @@ class PDFWriter(QObject): item = str(self.render_queue.pop(0)) self.combine_queue.append(os.path.join(self.tmp_path, '%i.pdf' % (len(self.combine_queue) + 1))) - self.logger.info('Processing %s...' % item) + self.logger.debug('Processing %s...' % item) self.view.load(QUrl(item)) @@ -120,7 +119,7 @@ class PDFWriter(QObject): self.tmp_path = PersistentTemporaryDirectory('_pdf_output_parts') def _write(self): - self.logger.info('Combining individual PDF parts...') + self.logger.debug('Combining individual PDF parts...') try: outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) @@ -134,8 +133,16 @@ class PDFWriter(QObject): self.loop.exit(0) -class ImagePDFWriter(object): +class ImagePDFWriter(PDFWriter): - def __init__(self, opts, log): - self.opts, self.log = opts, log - + def _render_next(self): + item = str(self.render_queue.pop(0)) + self.combine_queue.append(os.path.join(self.tmp_path, '%i.pdf' % (len(self.combine_queue) + 1))) + + self.logger.debug('Processing %s...' % item) + + html = '' % item + + self.view.setHtml(html) + + From 49aec87b59093f6ddacee3282914a58cebc4b11c Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 29 Apr 2009 06:47:34 -0400 Subject: [PATCH 155/319] pdf image output tweaks --- src/calibre/ebooks/pdf/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index adaae10d16..80bcacde2c 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -141,7 +141,7 @@ class ImagePDFWriter(PDFWriter): self.logger.debug('Processing %s...' 
% item) - html = '' % item + html = '' % item self.view.setHtml(html) From 6c54eca8b3267139708facbb28c5048e3fc85052 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 29 Apr 2009 07:00:01 -0400 Subject: [PATCH 156/319] pdfmanipulate: rotate command. --- src/calibre/ebooks/pdf/manipulate/cli.py | 3 ++- src/calibre/ebooks/pdf/manipulate/crop.py | 2 +- src/calibre/ebooks/pdf/manipulate/info.py | 8 ++++---- src/calibre/ebooks/pdf/manipulate/merge.py | 2 +- src/calibre/ebooks/pdf/manipulate/reverse.py | 2 +- src/calibre/ebooks/pdf/manipulate/split.py | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/pdf/manipulate/cli.py b/src/calibre/ebooks/pdf/manipulate/cli.py index edbba54a8d..4876cbd8f5 100644 --- a/src/calibre/ebooks/pdf/manipulate/cli.py +++ b/src/calibre/ebooks/pdf/manipulate/cli.py @@ -16,7 +16,7 @@ from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation from calibre.ebooks.pdf.manipulate import crop, decrypt, encrypt, \ - info, merge, reverse, split + info, merge, reverse, rotate, split COMMANDS = { 'crop' : crop, @@ -25,6 +25,7 @@ COMMANDS = { 'info' : info, 'merge' : merge, 'reverse' : reverse, + 'rotate' : rotate, 'split' : split, } diff --git a/src/calibre/ebooks/pdf/manipulate/crop.py b/src/calibre/ebooks/pdf/manipulate/crop.py index 7627823a89..0f24f04638 100644 --- a/src/calibre/ebooks/pdf/manipulate/crop.py +++ b/src/calibre/ebooks/pdf/manipulate/crop.py @@ -19,7 +19,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted from pyPdf import PdfFileWriter, PdfFileReader diff --git a/src/calibre/ebooks/pdf/manipulate/info.py 
b/src/calibre/ebooks/pdf/manipulate/info.py index d1b52a602c..13a39d10f6 100644 --- a/src/calibre/ebooks/pdf/manipulate/info.py +++ b/src/calibre/ebooks/pdf/manipulate/info.py @@ -16,7 +16,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted +from calibre.ebooks.pdf.verify import is_valid_pdfs, is_encrypted, is_encrypted from pyPdf import PdfFileWriter, PdfFileReader @@ -42,10 +42,10 @@ def print_info(pdf_path): print _('Subject: %s' % pdf.documentInfo.subject) print _('Creator: %s' % pdf.documentInfo.creator) print _('Producer: %s' % pdf.documentInfo.producer) - print _('Creation Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path)))) - print _('Modification Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getmtime(pdf_path)))) + #print _('Creation Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getctime(pdf_path)))) + #print _('Modification Date: %s' % time.strftime('%a %b %d %H:%M:%S %Y', time.gmtime(os.path.getmtime(pdf_path)))) print _('Pages: %s' % pdf.numPages) - print _('Encrypted: %s' % pdf.isEncrypted) + #print _('Encrypted: %s' % pdf.isEncrypted) try: print _('File Size: %s bytes' % os.path.getsize(pdf_path)) except: pass diff --git a/src/calibre/ebooks/pdf/manipulate/merge.py b/src/calibre/ebooks/pdf/manipulate/merge.py index fce7076e85..cb89368f31 100644 --- a/src/calibre/ebooks/pdf/manipulate/merge.py +++ b/src/calibre/ebooks/pdf/manipulate/merge.py @@ -18,7 +18,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdfs +from calibre.ebooks.pdf.verify import is_valid_pdfs, 
is_encrypted from pyPdf import PdfFileWriter, PdfFileReader diff --git a/src/calibre/ebooks/pdf/manipulate/reverse.py b/src/calibre/ebooks/pdf/manipulate/reverse.py index f2f3fa16da..b4bbe27a40 100644 --- a/src/calibre/ebooks/pdf/manipulate/reverse.py +++ b/src/calibre/ebooks/pdf/manipulate/reverse.py @@ -18,7 +18,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted from pyPdf import PdfFileWriter, PdfFileReader diff --git a/src/calibre/ebooks/pdf/manipulate/split.py b/src/calibre/ebooks/pdf/manipulate/split.py index 19012797ae..957b78f1e6 100644 --- a/src/calibre/ebooks/pdf/manipulate/split.py +++ b/src/calibre/ebooks/pdf/manipulate/split.py @@ -18,7 +18,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.constants import preferred_encoding from calibre.customize.conversion import OptionRecommendation -from calibre.ebooks.pdf.verify import is_valid_pdf +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted from pyPdf import PdfFileWriter, PdfFileReader From 73d203fa862b0bf2059e631c8922d3488d91e6a9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 29 Apr 2009 19:04:18 -0400 Subject: [PATCH 157/319] pdfmanipulage rotate command. image to pdf output working. 
--- src/calibre/ebooks/pdf/manipulate/rotate.py | 105 ++++++++++++++++++++ src/calibre/ebooks/pdf/writer.py | 63 +++++++++++- 2 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/pdf/manipulate/rotate.py diff --git a/src/calibre/ebooks/pdf/manipulate/rotate.py b/src/calibre/ebooks/pdf/manipulate/rotate.py new file mode 100644 index 0000000000..ac46a8e0c8 --- /dev/null +++ b/src/calibre/ebooks/pdf/manipulate/rotate.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Rotate pages of a PDF. +''' + +import os, sys +from optparse import OptionGroup, Option + +from calibre.ebooks.metadata.meta import metadata_from_formats +from calibre.ebooks.metadata import authors_to_string +from calibre.utils.config import OptionParser +from calibre.utils.logging import Log +from calibre.constants import preferred_encoding +from calibre.customize.conversion import OptionRecommendation +from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted + +from pyPdf import PdfFileWriter, PdfFileReader + +USAGE = '\n%prog %%name ' + _('''\ +file.pdf degrees + +Rotate pages of a PDF clockwise. +''') + +OPTIONS = set([ + OptionRecommendation(name='output', recommended_value='rotated.pdf', + level=OptionRecommendation.HIGH, long_switch='output', short_switch='o', + help=_('Path to output file. 
By default a file is created in the current directory.')), +]) + +def print_help(parser, log): + help = parser.format_help().encode(preferred_encoding, 'replace') + log(help) + +def option_parser(name): + usage = USAGE.replace('%%name', name) + return OptionParser(usage=usage) + +def option_recommendation_to_cli_option(add_option, rec): + opt = rec.option + switches = ['-'+opt.short_switch] if opt.short_switch else [] + switches.append('--'+opt.long_switch) + attrs = dict(dest=opt.name, help=opt.help, + choices=opt.choices, default=rec.recommended_value) + add_option(Option(*switches, **attrs)) + +def add_options(parser): + group = OptionGroup(parser, _('Rotate Options:'), _('Options to control the transformation of pdf')) + parser.add_option_group(group) + add_option = group.add_option + + for rec in OPTIONS: + option_recommendation_to_cli_option(add_option, rec) + +def rotate(pdf_path, out_path, degrees, metadata=None): + if metadata == None: + title = _('Unknown') + author = _('Unknown') + else: + title = metadata.title + author = authors_to_string(metadata.authors) + + out_pdf = PdfFileWriter(title=title, author=author) + + pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb')) + for page in pdf.pages: + out_pdf.addPage(page.rotateClockwise(int(degrees))) + + with open(out_path, 'wb') as out_file: + out_pdf.write(out_file) + +def main(args=sys.argv, name=''): + log = Log() + parser = option_parser(name) + add_options(parser) + + opts, args = parser.parse_args(args) + args = args[1:] + + if len(args) < 2: + print 'Error: A PDF file and how many degrees to rotate is required.\n' + print_help(parser, log) + return 1 + + if not is_valid_pdf(args[0]): + print 'Error: Could not read file `%s`.' % args[0] + return 1 + + if is_encrypted(args[0]): + print 'Error: file `%s` is encrypted.' 
% args[0] + return 1 + + mi = metadata_from_formats([args[0]]) + + rotate(args[0], opts.output, args[1], mi) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 80bcacde2c..1700d58025 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -140,9 +140,70 @@ class ImagePDFWriter(PDFWriter): self.combine_queue.append(os.path.join(self.tmp_path, '%i.pdf' % (len(self.combine_queue) + 1))) self.logger.debug('Processing %s...' % item) + + import Image + + size = self._size() + #height = 'height: %ipx;' % size[1] if Image.open(item).size[1] > size[1] else '' + #height = 'height: %icm;' % size[1] + #height = 'height: %ipx;' % self.opts.output_profile.comic_screen_size[1] +### height = 'height: 594px;' # self.opts.output_profile.comic_screen_size[1] * .78 + + #print + #print size[1] + #print Image.open(item).size[1] + #print Image.open(item).size[1] / self.opts.output_profile.dpi + #print height - html = '' % item + + #height = 'height: %ipx;' % (self.opts.output_profile.comic_screen_size[1] - 160) + #height = 'height: %ipx;' % (self.opts.output_profile.comic_screen_size[1] * .78) + height = 'height: %fcm;' % (size[1] * 1.3) + #print height + + html = '' % (item, height) self.view.setHtml(html) + def _render_html(self, ok): + if ok: + item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue)) + + self.logger.debug('\tRendering item %s as %i' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) + + printer = QPrinter(QPrinter.HighResolution) + + if self.opts.output_profile.short_name == 'default': + if self.custom_size == None: + printer.setPaperSize(paper_size(self.opts.paper_size)) + else: + printer.setPaperSize(QSizeF(self.custom_size[0], self.custom_size[1]), unit(self.opts.unit)) + else: + printer.setPaperSize(QSizeF(self.opts.output_profile.comic_screen_size[0] / self.opts.output_profile.dpi, 
self.opts.output_profile.comic_screen_size[1] / self.opts.output_profile.dpi), QPrinter.Inch) + + printer.setPageMargins(0, 0, 0, 0, QPrinter.Point) + printer.setOrientation(orientation(self.opts.orientation)) + printer.setOutputFormat(QPrinter.PdfFormat) + printer.setOutputFileName(item_path) + self.view.print_(printer) + self._render_book() + + def _size(self): + printer = QPrinter(QPrinter.HighResolution) + + if self.opts.output_profile.short_name == 'default': + if self.custom_size == None: + printer.setPaperSize(paper_size(self.opts.paper_size)) + else: + printer.setPaperSize(QSizeF(self.custom_size[0], self.custom_size[1]), unit(self.opts.unit)) + else: + printer.setPaperSize(QSizeF(self.opts.output_profile.comic_screen_size[0] / self.opts.output_profile.dpi, self.opts.output_profile.comic_screen_size[1] / self.opts.output_profile.dpi), QPrinter.Inch) + + printer.setPageMargins(0, 0, 0, 0, QPrinter.Point) + printer.setOrientation(orientation(self.opts.orientation)) + printer.setOutputFormat(QPrinter.PdfFormat) + + size = printer.paperSize(QPrinter.Millimeter) + + return size.width() / 10, size.height() / 10 From 3a2da56ccc2401af43fb3d203c8e3fbcab0cca72 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 29 Apr 2009 19:19:24 -0400 Subject: [PATCH 158/319] image to pdf output complete. 
--- src/calibre/ebooks/pdf/output.py | 5 +- src/calibre/ebooks/pdf/writer.py | 78 +++++++++++--------------------- 2 files changed, 29 insertions(+), 54 deletions(-) diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index f4d9cb4631..3f1e2db907 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -16,6 +16,7 @@ import os, glob from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation from calibre.ebooks.oeb.output import OEBOutput +from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.pdf.writer import PDFWriter, ImagePDFWriter, PDFMetadata from calibre.ebooks.pdf.pageoptions import UNITS, PAPER_SIZES, \ @@ -58,14 +59,14 @@ class PDFOutput(OutputFormatPlugin): self.convert_text(oeb_book) def convert_images(self, images): - # process images to document size self.write(ImagePDFWriter, images) def convert_text(self, oeb_book): with TemporaryDirectory('_pdf_out') as oebdir: OEBOutput(None).convert(oeb_book, oebdir, self.input_plugin, self.opts, self.log) - opf = glob.glob(os.path.join(oebdir, '*.opf'))[0] + opfpath = glob.glob(os.path.join(oebdir, '*.opf'))[0] + opf = OPF(opfpath, os.path.dirname(opfpath)) self.write(PDFWriter, [s.path for s in opf.spine]) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 1700d58025..97eaeb9244 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -15,7 +15,6 @@ from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ebooks.pdf.pageoptions import unit, paper_size, \ orientation, size from calibre.ebooks.metadata import authors_to_string -from calibre.ebooks.metadata.opf2 import OPF from PyQt4 import QtCore from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, \ @@ -63,6 +62,8 @@ class PDFWriter(QObject): self.custom_size = None self.opts = opts + + self.size = self._size() def dump(self, items, 
out_stream, pdf_metadata): self.metadata = pdf_metadata @@ -75,6 +76,28 @@ class PDFWriter(QObject): QMetaObject.invokeMethod(self, "_render_book", Qt.QueuedConnection) self.loop.exec_() + def _size(self): + ''' + The size of a pdf page in cm. + ''' + printer = QPrinter(QPrinter.HighResolution) + + if self.opts.output_profile.short_name == 'default': + if self.custom_size == None: + printer.setPaperSize(paper_size(self.opts.paper_size)) + else: + printer.setPaperSize(QSizeF(self.custom_size[0], self.custom_size[1]), unit(self.opts.unit)) + else: + printer.setPaperSize(QSizeF(self.opts.output_profile.width / self.opts.output_profile.dpi, self.opts.output_profile.height / self.opts.output_profile.dpi), QPrinter.Inch) + + printer.setPageMargins(0, 0, 0, 0, QPrinter.Point) + printer.setOrientation(orientation(self.opts.orientation)) + printer.setOutputFormat(QPrinter.PdfFormat) + + size = printer.paperSize(QPrinter.Millimeter) + + return size.width() / 10, size.height() / 10 + @QtCore.pyqtSignature('_render_book()') def _render_book(self): if len(self.render_queue) == 0: @@ -97,15 +120,7 @@ class PDFWriter(QObject): self.logger.debug('\tRendering item %s as %i' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) printer = QPrinter(QPrinter.HighResolution) - - if self.opts.output_profile.short_name == 'default': - if self.custom_size == None: - printer.setPaperSize(paper_size(self.opts.paper_size)) - else: - printer.setPaperSize(QSizeF(self.custom_size[0], self.custom_size[1]), unit(self.opts.unit)) - else: - printer.setPaperSize(QSizeF(self.opts.output_profile.width / self.opts.output_profile.dpi, self.opts.output_profile.height / self.opts.output_profile.dpi), QPrinter.Inch) - + printer.setPaperSize(QSizeF(self.size[0] * 10, self.size[1] * 10), QPrinter.Millimeter) printer.setPageMargins(0, 0, 0, 0, QPrinter.Point) printer.setOrientation(orientation(self.opts.orientation)) printer.setOutputFormat(QPrinter.PdfFormat) @@ -141,53 +156,12 @@ 
class ImagePDFWriter(PDFWriter): self.logger.debug('Processing %s...' % item) - import Image + height = 'height: %fcm;' % (self.size[1] * 1.3) - size = self._size() - #height = 'height: %ipx;' % size[1] if Image.open(item).size[1] > size[1] else '' - #height = 'height: %icm;' % size[1] - #height = 'height: %ipx;' % self.opts.output_profile.comic_screen_size[1] -### height = 'height: 594px;' # self.opts.output_profile.comic_screen_size[1] * .78 - - #print - #print size[1] - #print Image.open(item).size[1] - #print Image.open(item).size[1] / self.opts.output_profile.dpi - #print height - - - #height = 'height: %ipx;' % (self.opts.output_profile.comic_screen_size[1] - 160) - #height = 'height: %ipx;' % (self.opts.output_profile.comic_screen_size[1] * .78) - height = 'height: %fcm;' % (size[1] * 1.3) - #print height - html = '' % (item, height) self.view.setHtml(html) - def _render_html(self, ok): - if ok: - item_path = os.path.join(self.tmp_path, '%i.pdf' % len(self.combine_queue)) - - self.logger.debug('\tRendering item %s as %i' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) - - printer = QPrinter(QPrinter.HighResolution) - - if self.opts.output_profile.short_name == 'default': - if self.custom_size == None: - printer.setPaperSize(paper_size(self.opts.paper_size)) - else: - printer.setPaperSize(QSizeF(self.custom_size[0], self.custom_size[1]), unit(self.opts.unit)) - else: - printer.setPaperSize(QSizeF(self.opts.output_profile.comic_screen_size[0] / self.opts.output_profile.dpi, self.opts.output_profile.comic_screen_size[1] / self.opts.output_profile.dpi), QPrinter.Inch) - - printer.setPageMargins(0, 0, 0, 0, QPrinter.Point) - printer.setOrientation(orientation(self.opts.orientation)) - printer.setOutputFormat(QPrinter.PdfFormat) - printer.setOutputFileName(item_path) - self.view.print_(printer) - self._render_book() - def _size(self): printer = QPrinter(QPrinter.HighResolution) From 13e7d6334b97afab58927c484fa78fb178cabe90 Mon 
Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 29 Apr 2009 20:15:31 -0700 Subject: [PATCH 159/319] Support for progress reporting in the conversion pipeline and fix size based splitting code --- src/calibre/customize/conversion.py | 13 +++++++ src/calibre/ebooks/comic/input.py | 7 ++-- src/calibre/ebooks/conversion/cli.py | 14 +++++++- src/calibre/ebooks/conversion/plumber.py | 40 +++++++++++++++++++--- src/calibre/ebooks/oeb/base.py | 14 ++++---- src/calibre/ebooks/oeb/transforms/split.py | 27 ++++++++------- src/calibre/web/feeds/input.py | 4 +-- 7 files changed, 88 insertions(+), 31 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 7573dddeac..7920b823de 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -79,6 +79,10 @@ class OptionRecommendation(object): repr(self.recommended_value) + ' is not a string or a number') +class DummyReporter(object): + + def __call__(self, percent, msg=''): + pass class InputFormatPlugin(Plugin): ''' @@ -133,6 +137,10 @@ class InputFormatPlugin(Plugin): #: (option_name, recommended_value, recommendation_level) recommendations = set([]) + def __init__(self, *args): + Plugin.__init__(self, *args) + self.report_progress = DummyReporter() + def get_images(self): ''' Return a list of absolute paths to the images, if this input plugin @@ -242,6 +250,11 @@ class OutputFormatPlugin(Plugin): #: (option_name, recommended_value, recommendation_level) recommendations = set([]) + def __init__(self, *args): + Plugin.__init__(self, *args) + self.report_progress = DummyReporter() + + def convert(self, oeb_book, output, input_plugin, opts, log): ''' Render the contents of `oeb_book` (which is an instance of diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py index 82070bbc72..046acb4232 100755 --- a/src/calibre/ebooks/comic/input.py +++ b/src/calibre/ebooks/comic/input.py @@ -343,7 +343,7 @@ class 
ComicInput(InputFormatPlugin): new_pages = n2 else: new_pages, failures = process_pages(new_pages, self.opts, - self.progress, tdir2) + self.report_progress, tdir2) if not new_pages: raise ValueError('Could not find any valid pages in comic: %s' % comic) @@ -360,13 +360,12 @@ class ComicInput(InputFormatPlugin): def get_images(self): return self._images - def convert(self, stream, opts, file_ext, log, accelerators, - progress=lambda p, m : m): + def convert(self, stream, opts, file_ext, log, accelerators): from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC - self.opts, self.log, self.progress = opts, log, progress + self.opts, self.log= opts, log if file_ext == 'cbc': comics_ = self.get_comics_from_collection(stream) else: diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index d8de702915..b165fbf8f4 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -175,8 +175,20 @@ def add_pipeline_options(parser, plumber): def option_parser(): return OptionParser(usage=USAGE) + +class ProgressBar(object): + + def __init__(self, log): + self.log = log + + def __call__(self, frac, msg=''): + if msg: + percent = int(frac*100) + self.log('%d%% %s'%(percent, msg)) + def main(args=sys.argv): log = Log() + reporter = ProgressBar(log) parser = option_parser() if len(args) < 3: print_help(parser, log) @@ -186,7 +198,7 @@ def main(args=sys.argv): from calibre.ebooks.conversion.plumber import Plumber - plumber = Plumber(input, output, log) + plumber = Plumber(input, output, log, reporter) add_input_output_options(parser, plumber) add_pipeline_options(parser, plumber) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 1ef58e1d95..9987ec0243 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -5,7 +5,7 @@ 
__docformat__ = 'restructuredtext en' import os, re -from calibre.customize.conversion import OptionRecommendation +from calibre.customize.conversion import OptionRecommendation, DummyReporter from calibre.customize.ui import input_profiles, output_profiles, \ plugin_for_input_format, plugin_for_output_format from calibre.ebooks.conversion.preprocess import HTMLPreProcessor @@ -22,6 +22,17 @@ def supported_input_formats(): class OptionValues(object): pass +class CompositeProgressReporter(object): + + def __init__(self, global_min, global_max, global_reporter): + self.global_min, self.global_max = global_min, global_max + self.global_reporter = global_reporter + + def __call__(self, fraction, msg=''): + global_frac = self.global_min + fraction * \ + (self.global_max - self.global_min) + self.global_reporter(global_frac, msg) + class Plumber(object): ''' The `Plumber` manages the conversion pipeline. An UI should call the methods @@ -35,7 +46,7 @@ class Plumber(object): 'tags', 'book_producer', 'language' ] - def __init__(self, input, output, log): + def __init__(self, input, output, log, report_progress=DummyReporter()): ''' :param input: Path to input file. :param output: Path to output file/directory @@ -43,6 +54,7 @@ class Plumber(object): self.input = os.path.abspath(input) self.output = os.path.abspath(output) self.log = log + self.ui_reporter = report_progress # Initialize the conversion options that are independent of input and # output formats. The input and output plugins can still disable these @@ -63,7 +75,8 @@ OptionRecommendation(name='input_profile', 'conversion system information on how to interpret ' 'various information in the input document. For ' 'example resolution dependent lengths (i.e. lengths in ' - 'pixels).') + 'pixels). 
Choices are:')+\ + ', '.join([x.short_name for x in input_profiles()]) ), OptionRecommendation(name='output_profile', @@ -73,8 +86,9 @@ OptionRecommendation(name='output_profile', 'tells the conversion system how to optimize the ' 'created document for the specified device. In some cases, ' 'an output profile is required to produce documents that ' - 'will work on a device. For example EPUB on the SONY reader.' - ) + 'will work on a device. For example EPUB on the SONY reader. ' + 'Choices are:') + \ + ', '.join([x.short_name for x in output_profiles()]) ), OptionRecommendation(name='base_font_size', @@ -552,6 +566,9 @@ OptionRecommendation(name='list_recipes', if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf': self.opts.lrf = True + self.ui_reporter(0.01, _('Converting input to HTML...')) + ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter) + self.input_plugin.report_progress = ir self.oeb = self.input_plugin(stream, self.opts, self.input_fmt, self.log, accelerators, tdir) @@ -560,9 +577,12 @@ OptionRecommendation(name='list_recipes', return if not hasattr(self.oeb, 'manifest'): self.oeb = create_oebbook(self.log, self.oeb, self.opts) + pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) + pr(0., _('Running transforms on ebook...')) from calibre.ebooks.oeb.transforms.guide import Clean Clean()(self.oeb, self.opts) + pr(0.1) self.opts.source = self.opts.input_profile self.opts.dest = self.opts.output_profile @@ -570,9 +590,11 @@ OptionRecommendation(name='list_recipes', from calibre.ebooks.oeb.transforms.metadata import MergeMetadata MergeMetadata()(self.oeb, self.user_metadata, self.opts.prefer_metadata_cover) + pr(0.2) from calibre.ebooks.oeb.transforms.structure import DetectStructure DetectStructure()(self.oeb, self.opts) + pr(0.35) from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener fbase = self.opts.base_font_size @@ -586,6 +608,7 @@ OptionRecommendation(name='list_recipes', from 
calibre.ebooks.oeb.transforms.jacket import Jacket Jacket()(self.oeb, self.opts) + pr(0.4) if self.opts.extra_css and os.path.exists(self.opts.extra_css): self.opts.extra_css = open(self.opts.extra_css, 'rb').read() @@ -598,6 +621,7 @@ OptionRecommendation(name='list_recipes', if self.opts.linearize_tables: from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables LinearizeTables()(self.oeb, self.opts) + pr(0.7) from calibre.ebooks.oeb.transforms.split import Split pbx = accelerators.get('pagebreaks', None) @@ -605,6 +629,7 @@ OptionRecommendation(name='list_recipes', max_flow_size=self.opts.output_profile.flow_size, page_breaks_xpath=pbx) split(self.oeb, self.opts) + pr(0.9) from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer @@ -613,10 +638,15 @@ OptionRecommendation(name='list_recipes', trimmer(self.oeb, self.opts) self.oeb.toc.rationalize_play_orders() + pr(1.) self.log.info('Creating %s...'%self.output_plugin.name) + our = CompositeProgressReporter(0.67, 1., self.ui_reporter) + self.output_plugin.report_progress = our + our(0., _('Creating')+' %s'%self.output_plugin.name) self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, self.log) + self.ui_reporter(1.) 
def create_oebbook(log, path_or_stream, opts, reader=None): ''' diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index f5395e04fe..faf2d02dc4 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1571,26 +1571,26 @@ class OEBBook(object): def decode(self, data): """Automatically decode :param:`data` into a `unicode` object.""" + def fix_data(d): + return d.replace('\r\n', '\n').replace('\r', '\n') if isinstance(data, unicode): - return data + return fix_data(data) if data[:2] in ('\xff\xfe', '\xfe\xff'): try: - return data.decode('utf-16') + return fix_data(data.decode('utf-16')) except UnicodeDecodeError: pass if self.encoding is not None: try: - return data.decode(self.encoding) + return fix_data(data.decode(self.encoding)) except UnicodeDecodeError: pass try: - return data.decode('utf-8') + return fix_data(data.decode('utf-8')) except UnicodeDecodeError: pass data, _ = xml_to_unicode(data) - data = data.replace('\r\n', '\n') - data = data.replace('\r', '\n') - return data + return fix_data(data) def to_opf1(self): """Produce OPF 1.2 representing the book's metadata and structure. 
diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index ec3d63192d..e83f211fb0 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -173,18 +173,22 @@ class FlowSplitter(object): if self.max_flow_size > 0: lt_found = False - self.log('\tLooking for large trees...') + self.log('\tLooking for large trees in %s...'%item.href) trees = list(self.trees) - for i, tree in enumerate(list(self.trees)): - self.trees = [] + self.tree_map = {} + for i, tree in enumerate(trees): size = len(tostring(tree.getroot())) - if size > self.opts.profile.flow_size: + if size > self.max_flow_size: + self.log('\tFound large tree #%d'%i) lt_found = True + self.split_trees = [] self.split_to_size(tree) - trees[i:i+1] = list(self.trees) + self.tree_map[tree] = self.split_trees if not lt_found: - self.log_info('\tNo large trees found') - self.trees = trees + self.log('\tNo large trees found') + self.trees = [] + for x in trees: + self.trees.extend(self.tree_map.get(x, [x])) self.was_split = len(self.trees) > 1 self.commit() @@ -347,11 +351,10 @@ class FlowSplitter(object): continue size = len(tostring(r)) if size <= self.max_flow_size: - self.trees.append(t) - #print tostring(t.getroot(), pretty_print=True) - self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)', - len(self.trees), size/1024.) 
- self.split_size += size + self.split_trees.append(t) + self.log.debug( + '\t\t\tCommitted sub-tree #%d (%d KB)'%( + len(self.split_trees), size/1024.)) else: self.split_to_size(t) diff --git a/src/calibre/web/feeds/input.py b/src/calibre/web/feeds/input.py index 21324293d3..e0a8b807c8 100644 --- a/src/calibre/web/feeds/input.py +++ b/src/calibre/web/feeds/input.py @@ -38,7 +38,7 @@ class RecipeInput(InputFormatPlugin): ]) def convert(self, recipe_or_file, opts, file_ext, log, - accelerators, progress=lambda x, y: x): + accelerators): from calibre.web.feeds.recipes import \ get_builtin_recipe, compile_recipe if os.access(recipe_or_file, os.R_OK): @@ -51,7 +51,7 @@ class RecipeInput(InputFormatPlugin): raise ValueError('%s is not a valid recipe file or builtin recipe' % recipe_or_file) - ro = recipe(opts, log, progress) + ro = recipe(opts, log, self.report_progress) ro.download() opts.output_profile.flow_size = 0 From d794904319a34818a2a907c3ecbccdb6156fa00c Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 30 Apr 2009 17:17:56 -0400 Subject: [PATCH 160/319] Pluginize device drivers. 
--- src/calibre/customize/__init__.py | 3 +-- src/calibre/customize/builtins.py | 10 ++++++++++ src/calibre/customize/ui.py | 7 +++++++ src/calibre/devices/__init__.py | 12 ------------ src/calibre/devices/blackberry/driver.py | 10 ++++++---- src/calibre/devices/cybookg3/driver.py | 6 ++++++ src/calibre/devices/eb600/driver.py | 5 +++++ src/calibre/devices/interface.py | 9 ++++++--- src/calibre/devices/kindle/driver.py | 12 +++++++++++- src/calibre/devices/prs500/driver.py | 10 +++++++--- src/calibre/devices/prs505/driver.py | 9 ++++++++- src/calibre/devices/prs700/driver.py | 5 +++++ src/calibre/devices/usbms/device.py | 6 +++--- src/calibre/devices/usbms/driver.py | 14 +++++++++++--- src/calibre/gui2/device.py | 10 +++++----- 15 files changed, 91 insertions(+), 37 deletions(-) diff --git a/src/calibre/customize/__init__.py b/src/calibre/customize/__init__.py index b43b242fd8..0e6bad8d2e 100644 --- a/src/calibre/customize/__init__.py +++ b/src/calibre/customize/__init__.py @@ -221,5 +221,4 @@ class MetadataWriterPlugin(Plugin): ''' pass - - \ No newline at end of file + diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index dcbffade92..0b5a0295ea 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -295,9 +295,19 @@ from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pdb.ereader.output import EREADEROutput from calibre.customize.profiles import input_profiles, output_profiles +from calibre.devices.prs500.driver import PRS500 +from calibre.devices.prs505.driver import PRS505 +from calibre.devices.prs700.driver import PRS700 +from calibre.devices.cybookg3.driver import CYBOOKG3 +from calibre.devices.kindle.driver import KINDLE +from calibre.devices.kindle.driver import KINDLE2 +from calibre.devices.blackberry.driver import BLACKBERRY +from calibre.devices.eb600.driver import EB600 + plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, 
TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, ODTInput, RTFInput, EPUBOutput, EREADEROutput, RecipeInput] +plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 4bd5c9b284..ad321a1f83 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -10,6 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OutputFormatPlugin from calibre.customize.profiles import InputProfile, OutputProfile from calibre.customize.builtins import plugins as builtin_plugins from calibre.constants import __version__, iswindows, isosx +from calibre.devices.interface import DevicePlugin from calibre.ebooks.metadata import MetaInformation from calibre.utils.config import make_config_dir, Config, ConfigProxy, \ plugin_dir, OptionParser @@ -286,6 +287,12 @@ def available_output_formats(): formats.add(plugin.file_type) return formats +def device_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, DevicePlugin): + if not is_disabled(plugin): + yield plugin + def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) plugin = find_plugin(x) diff --git a/src/calibre/devices/__init__.py b/src/calibre/devices/__init__.py index 9c515c07fd..874de7c070 100644 --- a/src/calibre/devices/__init__.py +++ b/src/calibre/devices/__init__.py @@ -5,18 +5,6 @@ __copyright__ = '2008, Kovid Goyal ' Device drivers. 
''' -def devices(): - from calibre.devices.prs500.driver import PRS500 - from calibre.devices.prs505.driver import PRS505 - from calibre.devices.prs700.driver import PRS700 - from calibre.devices.cybookg3.driver import CYBOOKG3 - from calibre.devices.kindle.driver import KINDLE - from calibre.devices.kindle.driver import KINDLE2 - from calibre.devices.blackberry.driver import BLACKBERRY - from calibre.devices.eb600.driver import EB600 - return (PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, - BLACKBERRY, EB600) - import time DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6) diff --git a/src/calibre/devices/blackberry/driver.py b/src/calibre/devices/blackberry/driver.py index f6c615b0de..da2328419a 100644 --- a/src/calibre/devices/blackberry/driver.py +++ b/src/calibre/devices/blackberry/driver.py @@ -7,6 +7,12 @@ __docformat__ = 'restructuredtext en' from calibre.devices.usbms.driver import USBMS class BLACKBERRY(USBMS): + + name = 'Blackberry Device Interface' + description = _('Communicate with the Blackberry smart phone.') + author = _('Kovid Goyal') + supported_platforms = ['windows', 'linux'] + # Ordered list of supported formats FORMATS = ['mobi', 'prc'] @@ -16,15 +22,11 @@ class BLACKBERRY(USBMS): VENDOR_NAME = 'RIM' WINDOWS_MAIN_MEM = 'BLACKBERRY_SD' - #WINDOWS_CARD_MEM = 'CARD_STORAGE' #OSX_MAIN_MEM = 'Kindle Internal Storage Media' - #OSX_CARD_MEM = 'Kindle Card Storage Media' MAIN_MEMORY_VOLUME_LABEL = 'Blackberry Main Memory' - #STORAGE_CARD_VOLUME_LABEL = 'Kindle Storage Card' EBOOK_DIR_MAIN = 'ebooks' - #EBOOK_DIR_CARD = "documents" SUPPORTS_SUB_DIRS = True diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 1cdf9863b4..7f3e5a82f4 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -12,6 +12,12 @@ from calibre.devices.usbms.driver import USBMS import calibre.devices.cybookg3.t2b as t2b class CYBOOKG3(USBMS): + name = 'Cybook Gen 3 Device 
Interface' + description = _('Communicate with the Cybook eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] + + # Ordered list of supported formats # Be sure these have an entry in calibre.devices.mime FORMATS = ['mobi', 'prc', 'html', 'pdf', 'rtf', 'txt'] diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py index cb2f25d2f9..638dea42ba 100644 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -7,6 +7,11 @@ Device driver for the Netronix EB600 from calibre.devices.usbms.driver import USBMS class EB600(USBMS): + name = 'Netronix EB600 Device Interface' + description = _('Communicate with the EB600 eBook reader.') + author = _('Kovid Goyal') + supported_platforms = ['windows', 'osx', 'linux'] + # Ordered list of supported formats FORMATS = ['epub', 'prc', 'chm', 'djvu', 'html', 'rtf', 'txt', 'pdf'] DRM_FORMATS = ['prc', 'mobi', 'html', 'pdf', 'txt'] diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index 0ad01e7493..94cecd65b6 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -6,8 +6,9 @@ the GUI. A device backend must subclass the L{Device} class. See prs500.py for a backend that implement the Device interface for the SONY PRS500 Reader. """ +from calibre.customize import Plugin -class Device(object): +class DevicePlugin(Plugin): """ Defines the interface that should be implemented by backends that communicate with an ebook reader. @@ -16,6 +17,8 @@ class Device(object): the front-end needs to call several methods one after another, in which case the USB session should not be closed after each method call. """ + type = _('Device Interface') + # Ordered list of supported formats FORMATS = ["lrf", "rtf", "pdf", "txt"] VENDOR_ID = 0x0000 @@ -27,8 +30,8 @@ class Device(object): # Whether the metadata on books can be set via the GUI. 
CAN_SET_METADATA = True - def __init__(self, key='-1', log_packets=False, report_progress=None) : - """ + def reset(self, key='-1', log_packets=False, report_progress=None) : + """ @param key: The key to unlock the device @param log_packets: If true the packet stream to/from the device is logged @param report_progress: Function that is called with a % progress diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py index d598e2a503..6f17cd335a 100644 --- a/src/calibre/devices/kindle/driver.py +++ b/src/calibre/devices/kindle/driver.py @@ -6,9 +6,14 @@ Device driver for Amazon's Kindle import os, re, sys -from calibre.devices.usbms.driver import USBMS, metadata_from_formats +from calibre.devices.usbms.driver import USBMS class KINDLE(USBMS): + name = 'Kindle Device Interface' + description = _('Communicate with the Kindle eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] + # Ordered list of supported formats FORMATS = ['azw', 'mobi', 'prc', 'azw1', 'tpz', 'txt'] @@ -46,6 +51,7 @@ class KINDLE(USBMS): @classmethod def metadata_from_path(cls, path): + from calibre.devices.usbms.driver import metadata_from_formats mi = metadata_from_formats([path]) if mi.title == _('Unknown') or ('-asin' in mi.title and '-type' in mi.title): match = cls.WIRELESS_FILE_NAME_PATTERN.match(os.path.basename(path)) @@ -58,6 +64,10 @@ class KINDLE(USBMS): class KINDLE2(KINDLE): + name = 'Kindle 2 Device Interface' + description = _('Communicate with the Kindle 2 eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] PRODUCT_ID = [0x0002] BCD = [0x0100] diff --git a/src/calibre/devices/prs500/driver.py b/src/calibre/devices/prs500/driver.py index a905a314ae..874ed494e4 100644 --- a/src/calibre/devices/prs500/driver.py +++ b/src/calibre/devices/prs500/driver.py @@ -40,7 +40,7 @@ from array import array from functools import wraps from StringIO import StringIO -from 
calibre.devices.interface import Device +from calibre.devices.interface import DevicePlugin from calibre.devices.libusb import Error as USBError from calibre.devices.libusb import get_device_by_id from calibre.devices.prs500.prstypes import * @@ -76,12 +76,16 @@ class File(object): return self.name -class PRS500(Device): +class PRS500(DevicePlugin): """ Implements the backend for communication with the SONY Reader. Each method decorated by C{safe} performs a task. """ + name = 'PRS-500 Device Interface' + description = _('Communicate with the Sony PRS-500 eBook reader.') + author = _('Kovid Goyal') + supported_platforms = ['windows', 'osx', 'linux'] VENDOR_ID = 0x054c #: SONY Vendor Id PRODUCT_ID = 0x029b #: Product Id for the PRS-500 @@ -181,7 +185,7 @@ class PRS500(Device): return run_session - def __init__(self, key='-1', log_packets=False, report_progress=None) : + def reset(self, key='-1', log_packets=False, report_progress=None) : """ @param key: The key to unlock the device @param log_packets: If true the packet stream to/from the device is logged diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index a704eb1ec3..f569667ba1 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -1,5 +1,6 @@ __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' +__copyright__ = '2008, Kovid Goyal ' \ + '2009, John Schember ' ''' Device driver for the SONY PRS-505 ''' @@ -14,6 +15,12 @@ from calibre import iswindows, islinux, isosx, __appname__ from calibre.devices.errors import PathError class PRS505(CLI, Device): + + name = 'PRS-505 Device Interface' + description = _('Communicate with the Sony PRS-505 eBook reader.') + author = _('Kovid Goyal and John Schember') + supported_platforms = ['windows', 'osx', 'linux'] + FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt'] VENDOR_ID = [0x054c] #: SONY Vendor Id diff --git a/src/calibre/devices/prs700/driver.py 
b/src/calibre/devices/prs700/driver.py index 2b82eb3e34..a79902fe10 100644 --- a/src/calibre/devices/prs700/driver.py +++ b/src/calibre/devices/prs700/driver.py @@ -8,6 +8,11 @@ Device driver for the SONY PRS-700 from calibre.devices.prs505.driver import PRS505 class PRS700(PRS505): + + name = 'PRS-700 Device Interface' + description = _('Communicate with the Sony PRS-700 eBook reader.') + author = _('Kovid Goyal and John Schember') + supported_platforms = ['windows', 'osx', 'linux'] BCD = [0x31a] diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 8f2755d3fa..c4bbe7839f 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -8,11 +8,11 @@ device. This class handles device detection. import os, subprocess, time, re -from calibre.devices.interface import Device as _Device +from calibre.devices.interface import DevicePlugin as Device from calibre.devices.errors import DeviceError from calibre import iswindows, islinux, isosx, __appname__ -class Device(_Device): +class Device(Device): ''' This class provides logic common to all drivers for devices that export themselves as USB Mass Storage devices. If you are writing such a driver, inherit from this @@ -83,7 +83,7 @@ class Device(_Device): FDI_BCD_TEMPLATE = '' - def __init__(self, key='-1', log_packets=False, report_progress=None) : + def reset(self, key='-1', log_packets=False, report_progress=None) : self._main_prefix = self._card_a_prefix = self._card_b_prefix = None @classmethod diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index aa40f90c25..156e3a5eb5 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -10,7 +10,6 @@ for a particular device. 
import os, fnmatch, shutil from itertools import cycle -from calibre.ebooks.metadata.meta import metadata_from_formats, path_to_ext from calibre.ebooks.metadata import authors_to_string from calibre.devices.usbms.cli import CLI from calibre.devices.usbms.device import Device @@ -21,6 +20,12 @@ from calibre.devices.mime import mime_type_ext # CLI must come before Device as it implments the CLI functions that # are inherited from the device interface in Device. class USBMS(CLI, Device): + + name = 'USBMS Base Device Interface' + description = _('Communicate with an eBook reader.') + author = _('John Schember') + supported_platforms = ['windows', 'osx', 'linux'] + FORMATS = [] EBOOK_DIR_MAIN = '' EBOOK_DIR_CARD_A = '' @@ -28,8 +33,8 @@ class USBMS(CLI, Device): SUPPORTS_SUB_DIRS = False CAN_SET_METADATA = False - def __init__(self, key='-1', log_packets=False, report_progress=None): - Device.__init__(self, key=key, log_packets=log_packets, + def reset(self, key='-1', log_packets=False, report_progress=None): + Device.reset(self, key=key, log_packets=log_packets, report_progress=report_progress) def get_device_information(self, end_session=True): @@ -40,6 +45,7 @@ class USBMS(CLI, Device): return (self.__class__.__name__, '', '', '') def books(self, oncard=None, end_session=True): + from calibre.ebooks.metadata.meta import path_to_ext bl = BookList() if oncard == 'carda' and not self._card_a_prefix: @@ -185,10 +191,12 @@ class USBMS(CLI, Device): @classmethod def metadata_from_path(cls, path): + from calibre.ebooks.metadata.meta import metadata_from_formats return metadata_from_formats([path]) @classmethod def book_from_path(cls, path): + from calibre.ebooks.metadata.meta import path_to_ext fileext = path_to_ext(path) mi = cls.metadata_from_path(path) mime = mime_type_ext(fileext) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 239fd4d37d..bbdc62746f 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -10,8 +10,8 @@ 
from binascii import unhexlify from PyQt4.Qt import QMenu, QAction, QActionGroup, QIcon, SIGNAL, QPixmap, \ Qt -from calibre.customize.ui import available_input_formats, available_output_formats -from calibre.devices import devices +from calibre.customize.ui import available_input_formats, available_output_formats, \ + device_plugins from calibre.constants import iswindows from calibre.gui2.dialogs.choose_format import ChooseFormatDialog from calibre.parallel import Job @@ -21,7 +21,6 @@ from calibre.gui2 import config, error_dialog, Dispatcher, dynamic, \ info_dialog from calibre.ebooks.metadata import authors_to_string from calibre.gui2.dialogs.conversion_error import ConversionErrorDialog -from calibre.devices.interface import Device from calibre import sanitize_file_name, preferred_encoding from calibre.utils.filenames import ascii_filename from calibre.devices.errors import FreeSpaceError @@ -54,7 +53,7 @@ class DeviceManager(Thread): ''' Thread.__init__(self) self.setDaemon(True) - self.devices = [[d, False] for d in devices()] + self.devices = [[d, False] for d in device_plugins()] self.device = None self.device_class = None self.sleep_time = sleep_time @@ -71,7 +70,8 @@ class DeviceManager(Thread): connected = self.scanner.is_device_connected(device[0]) if connected and not device[1]: try: - dev = device[0]() + dev = device[0] + dev.reset() if iswindows: import pythoncom pythoncom.CoInitialize() From 23a4537a47582ec8dec67bb02c83c3205d8356b3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 1 May 2009 12:43:13 -0700 Subject: [PATCH 161/319] IGN:... 
--- src/calibre/ebooks/comic/input.py | 6 +-- src/calibre/ebooks/oeb/transforms/split.py | 54 +++++++++------------- 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/src/calibre/ebooks/comic/input.py b/src/calibre/ebooks/comic/input.py index 046acb4232..e2a522a356 100755 --- a/src/calibre/ebooks/comic/input.py +++ b/src/calibre/ebooks/comic/input.py @@ -35,6 +35,7 @@ def find_pages(dir, sort_on_mtime=False, verbose=False): for datum in os.walk(dir): for name in datum[-1]: path = os.path.join(datum[0], name) + if '__MACOSX' in path: continue for ext in extensions: if path.lower().endswith('.'+ext): pages.append(path) @@ -190,13 +191,12 @@ def render_pages(tasks, dest, opts, notification=None): for num, path in tasks: try: pages.extend(PageProcessor(path, dest, opts, num)) - msg = _('Rendered %s') + msg = _('Rendered %s')%path except: failures.append(path) - msg = _('Failed %s') + msg = _('Failed %s')%path if opts.verbose: msg += '\n' + traceback.format_exc() - msg = msg%path if notification is not None: notification(0.5, msg) diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index e83f211fb0..86e60a7784 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -19,10 +19,8 @@ from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ urldefrag, rewrite_links, urlunquote from calibre.ebooks.epub import rules - XPath = functools.partial(_XPath, namespaces=NAMESPACES) -SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' def tostring(root): @@ -66,7 +64,9 @@ class Split(object): splitter = FlowSplitter(item, page_breaks, page_break_ids, self.max_flow_size, self.oeb) if splitter.was_split: - self.map[item.href] = dict(splitter.anchor_map) + am = splitter.anchor_map + self.map[item.href] = collections.defaultdict( + am.default_factory, **am) def find_page_breaks(self, item): if self.page_break_selectors is None: @@ -161,6 +161,7 @@ class 
FlowSplitter(object): self.page_break_ids = page_break_ids self.max_flow_size = max_flow_size self.base = item.href + self.csp_counter = 0 base, ext = os.path.splitext(self.base) self.base = base.replace('%', '%%')+'_split_%d'+ext @@ -191,6 +192,8 @@ class FlowSplitter(object): self.trees.extend(self.tree_map.get(x, [x])) self.was_split = len(self.trees) > 1 + if self.was_split: + self.log('\tSplit into %d parts'%len(self.trees)) self.commit() def split_on_page_breaks(self, orig_tree): @@ -237,35 +240,21 @@ class FlowSplitter(object): split_point2 = root2.xpath(path)[0] def nix_element(elem, top=True): - if True: - parent = elem.getparent() - index = parent.index(elem) - if top: - parent.remove(elem) - else: - index = parent.index(elem) - parent[index:index+1] = list(elem.iterchildren()) + parent = elem.getparent() + index = parent.index(elem) + if top: + parent.remove(elem) else: - elem.text = u'' - elem.tail = u'' - elem.set(SPLIT_ATTR, '1') - if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']: - elem.set('style', 'display:none') - - def fix_split_point(sp): - if not self.splitting_on_page_breaks: - sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid') + index = parent.index(elem) + parent[index:index+1] = list(elem.iterchildren()) # Tree 1 hit_split_point = False for elem in list(body.iterdescendants(etree.Element)): - if elem.get(SPLIT_ATTR, '0') == '1': - continue if elem is split_point: hit_split_point = True if before: nix_element(elem) - fix_split_point(elem) continue if hit_split_point: nix_element(elem) @@ -274,13 +263,10 @@ class FlowSplitter(object): # Tree 2 hit_split_point = False for elem in list(body2.iterdescendants(etree.Element)): - if elem.get(SPLIT_ATTR, '0') == '1': - continue if elem is split_point2: hit_split_point = True if not before: nix_element(elem, top=False) - fix_split_point(elem) continue if not hit_split_point: nix_element(elem, top=False) @@ -374,8 +360,8 @@ class FlowSplitter(object): 
''' def pick_elem(elems): if elems: - elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != '1'\ - and i.get(SPLIT_ATTR, '0') != '1'] + elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != + '1'] if elems: i = int(math.floor(len(elems)/2.)) elems[i].set(SPLIT_POINT_ATTR, '1') @@ -417,14 +403,16 @@ class FlowSplitter(object): for i, tree in enumerate(self.trees): root = tree.getroot() self.files.append(self.base%i) - for elem in root.xpath('//*[@id]'): - if elem.get(SPLIT_ATTR, '0') == '0': - self.anchor_map[elem.get('id')] = self.files[-1] - for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)): - elem.attrib.pop(SPLIT_ATTR, None) + for elem in root.xpath('//*[@id or @name]'): + anchor = elem.get('id', '') + if not anchor: + anchor = elem.get('name') + self.anchor_map[anchor] = self.files[-1] + for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR): elem.attrib.pop(SPLIT_POINT_ATTR, '0') spine_pos = self.item.spine_position + for current, tree in zip(*map(reversed, (self.files, self.trees))): for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): href = a.get('href').strip() From 27557bb1e43c66df5bc383aa180749771f71e8a7 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 1 May 2009 21:46:15 -0400 Subject: [PATCH 162/319] pluginize device drivers: configuration and fix build errors. 
--- src/calibre/devices/interface.py | 26 +++++- src/calibre/devices/prs500/cli/main.py | 6 +- src/calibre/devices/usbms/device.py | 5 +- src/calibre/devices/usbms/deviceconfig.py | 38 +++++++++ src/calibre/ebooks/lrf/html/convert_from.py | 2 +- .../gui2/device_drivers/configwidget.py | 46 ++++++++++ .../gui2/device_drivers/configwidget.ui | 85 +++++++++++++++++++ src/calibre/linux.py | 6 +- 8 files changed, 204 insertions(+), 10 deletions(-) create mode 100644 src/calibre/devices/usbms/deviceconfig.py create mode 100644 src/calibre/gui2/device_drivers/configwidget.py create mode 100644 src/calibre/gui2/device_drivers/configwidget.ui diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index 94cecd65b6..61393f9988 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -210,7 +210,31 @@ class DevicePlugin(Plugin): Read the file at C{path} on the device and write it to outfile. @param outfile: file object like C{sys.stdout} or the result of an C{open} call ''' - raise NotImplementedError() + raise NotImplementedError() + + @classmethod + def config_widget(cls): + ''' + Should return a QWidget. The QWidget contains the settings for the device interface + ''' + raise NotImplementedError() + + @classmethod + def save_settings(cls, settings_widget): + ''' + Should save settings to disk. Takes the widget created in config_widget + and saves all settings to disk. + ''' + raise NotImplementedError() + + @classmethod + def settings(cls): + ''' + Should return an opts object. The opts object should have one attribute + `formats` whihc is an ordered list of formats for the device. 
+ ''' + raise NotImplementedError() + diff --git a/src/calibre/devices/prs500/cli/main.py b/src/calibre/devices/prs500/cli/main.py index 4a94bf41af..2484ff2902 100755 --- a/src/calibre/devices/prs500/cli/main.py +++ b/src/calibre/devices/prs500/cli/main.py @@ -13,7 +13,7 @@ from calibre import __version__, iswindows, __appname__ from calibre.devices.errors import PathError from calibre.utils.terminfo import TerminalController from calibre.devices.errors import ArgumentError, DeviceError, DeviceLocked -from calibre.devices import devices +from calibre.customize.ui import device_plugins from calibre.devices.scanner import DeviceScanner MINIMUM_COL_WIDTH = 12 #: Minimum width of columns in ls output @@ -203,7 +203,7 @@ def main(): _wmi = wmi.WMI() scanner = DeviceScanner(_wmi) scanner.scan() - for d in devices(): + for d in device_plugins(): if scanner.is_device_connected(d): dev = d(log_packets=options.log_packets) @@ -334,4 +334,4 @@ def main(): return 0 if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index c4bbe7839f..ca8b07cf0b 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -10,9 +10,10 @@ import os, subprocess, time, re from calibre.devices.interface import DevicePlugin as Device from calibre.devices.errors import DeviceError +from calibre.devices.usbms.deviceconfig import DeviceConfig from calibre import iswindows, islinux, isosx, __appname__ -class Device(Device): +class Device(DeviceConfig, Device): ''' This class provides logic common to all drivers for devices that export themselves as USB Mass Storage devices. 
If you are writing such a driver, inherit from this @@ -94,7 +95,7 @@ class Device(Device): for pid in cls.PRODUCT_ID: fdi_base_values = dict( app=__appname__, - deviceclass=cls.__name__, + deviceclass=cls.__class__.__name__, vendor_id=hex(vid), product_id=hex(pid), main_memory=cls.MAIN_MEMORY_VOLUME_LABEL, diff --git a/src/calibre/devices/usbms/deviceconfig.py b/src/calibre/devices/usbms/deviceconfig.py new file mode 100644 index 0000000000..999f2ea1b7 --- /dev/null +++ b/src/calibre/devices/usbms/deviceconfig.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +from calibre.utils.config import Config, ConfigProxy + +class DeviceConfig(object): + + HELP_MESSAGE = _('Ordered list of formats the device will accept') + + @classmethod + def _config(cls): + c = Config('device_drivers_%s' % cls.__class__.__name__, _('settings for device drivers')) + c.add_opt('format_map', default=cls.FORMATS, help=cls.HELP_MESSAGE) + return c + + def _configProxy(cls): + return ConfigProxy(cls._config()) + + @classmethod + def config_widget(cls): + from calibre.gui2.device_drivers.configwidget import ConfigWidget + cw = ConfigWidget(cls.configProxy(cls._config()), cls.FORMATS) + return cw + + @classmethod + def save_settings(cls, config_widget): + cls.configProxy(cls._config())['format_map'] = config_widget.format_map() + + @classmethod + def settings(cls): + return cls._config().parse() + + def customization_help(cls, gui=False): + return cls.HELP_MESSAGE + diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index 9ec4857126..ebfdecc6f4 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -33,7 +33,7 @@ from calibre.ebooks.lrf.html.table import Table from calibre import filename_to_utf8, setup_cli_handlers, __appname__, \ fit_image, preferred_encoding from calibre.ptempfile import 
PersistentTemporaryFile -from calibre.devices.interface import Device +from calibre.devices.interface import DevicePlugin as Device from calibre.ebooks.lrf.html.color_map import lrs_color from calibre.ebooks.chardet import xml_to_unicode diff --git a/src/calibre/gui2/device_drivers/configwidget.py b/src/calibre/gui2/device_drivers/configwidget.py new file mode 100644 index 0000000000..6c144d16c4 --- /dev/null +++ b/src/calibre/gui2/device_drivers/configwidget.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +from PyQt4.Qt import QWidget, QListWidgetItem, Qt, QVariant, SIGNAL + +from calibre.gui2.device_drivers.configwidget_ui import Ui_ConfigWidget + +class ConfigWidget(QWidget, Ui_ConfigWidget): + + def __init__(self, config, all_formats): + QWidget.__init__(self) + Ui_ConfigWidget.__init__(self) + self.setupUi(self) + + self.config = config + + format_map = config['format_map'] + disabled_formats = list(set(all_formats).difference(format_map)) + for format in format_map + disabled_formats: + item = QListWidgetItem(format, self.columns) + item.setData(Qt.UserRole, QVariant(format)) + item.setFlags(Qt.ItemIsEnabled|Qt.ItemIsUserCheckable|Qt.ItemIsSelectable) + item.setCheckState(Qt.Checked if format in format_map else Qt.Unchecked) + + self.connect(self.column_up, SIGNAL('clicked()'), self.up_column) + self.connect(self.column_down, SIGNAL('clicked()'), self.down_column) + + def up_column(self): + idx = self.columns.currentRow() + if idx > 0: + self.columns.insertItem(idx-1, self.columns.takeItem(idx)) + self.columns.setCurrentRow(idx-1) + + def down_column(self): + idx = self.columns.currentRow() + if idx < self.columns.count()-1: + self.columns.insertItem(idx+1, self.columns.takeItem(idx)) + self.columns.setCurrentRow(idx+1) + + def format_map(self): + formats = [unicode(self.columns.item(i).data(Qt.UserRole).toString()) for i in range(self.columns.count()) if 
self.columns.item(i).checkState()==Qt.Checked] + return formats + diff --git a/src/calibre/gui2/device_drivers/configwidget.ui b/src/calibre/gui2/device_drivers/configwidget.ui new file mode 100644 index 0000000000..2f0359189e --- /dev/null +++ b/src/calibre/gui2/device_drivers/configwidget.ui @@ -0,0 +1,85 @@ + + + ConfigWidget + + + + 0 + 0 + 442 + 332 + + + + Form + + + + + + Select avaliable formats and their order for this device + + + + + + + + true + + + QAbstractItemView::SelectRows + + + + + + + + + ... + + + + :/images/arrow-up.svg:/images/arrow-up.svg + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + + + ... + + + + :/images/arrow-down.svg:/images/arrow-down.svg + + + + + + + + + + + + + + + + + diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 1d641de51c..da98f4de42 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -5,9 +5,9 @@ import sys, os, shutil from subprocess import check_call, call from calibre import __version__, __appname__ -from calibre.devices import devices +from calibre.customize.ui import device_plugins -DEVICES = devices() +DEVICES = device_plugins() DESTDIR = '' if os.environ.has_key('DESTDIR'): @@ -293,7 +293,7 @@ def setup_udev_rules(group_file, reload, fatal_errors): -'''%dict(cls=cls.__name__, vendor_id=cls.VENDOR_ID, product_id=cls.PRODUCT_ID, +'''%dict(cls=cls.__class__.__name__, vendor_id=cls.VENDOR_ID, product_id=cls.PRODUCT_ID, prog=__appname__, bcd=cls.BCD)) fdi.write('\n'+cls.get_fdi()) fdi.write('\n\n') From 8eb19437129e4327fcc9820f2aea9ed8d4c7e31f Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 1 May 2009 22:11:18 -0400 Subject: [PATCH 163/319] Fix class reference --- src/calibre/devices/usbms/device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 4ad14d8739..50abbaf5f6 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -13,7 +13,7 @@ from 
calibre.devices.errors import DeviceError from calibre.devices.usbms.deviceconfig import DeviceConfig from calibre import iswindows, islinux, isosx, __appname__ -class Device(DeviceConfig, Device): +class Device(DeviceConfig, DevicePlugin): ''' This class provides logic common to all drivers for devices that export themselves as USB Mass Storage devices. If you are writing such a driver, inherit from this From e73bd3b25f8c760e9f681e94f77adee63eaf9392 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 1 May 2009 22:40:30 -0400 Subject: [PATCH 164/319] Add jetbook to device plugin list --- src/calibre/customize/builtins.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 0b5a0295ea..4086ce187c 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -303,6 +303,7 @@ from calibre.devices.kindle.driver import KINDLE from calibre.devices.kindle.driver import KINDLE2 from calibre.devices.blackberry.driver import BLACKBERRY from calibre.devices.eb600.driver import EB600 +from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, From 5ce22bb6baf73678e26a119ae6f25f7fc47c017f Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 1 May 2009 23:00:12 -0400 Subject: [PATCH 165/319] Add jetbook to device plugin list --- src/calibre/customize/builtins.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 4086ce187c..e229ffb766 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -308,7 +308,8 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, ODTInput, 
RTFInput, EPUBOutput, EREADEROutput, RecipeInput] -plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600] +plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \ + JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ From 88bafa75e21872f1383e5eba842a9315ad938db3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 08:15:26 -0400 Subject: [PATCH 166/319] Move jetbook import. Add note that 72 pts = 1 inch. --- src/calibre/devices/jetbook/driver.py | 5 +++-- src/calibre/ebooks/conversion/plumber.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index d2054cd7a1..bdeb3a4032 100644 --- a/src/calibre/devices/jetbook/driver.py +++ b/src/calibre/devices/jetbook/driver.py @@ -7,7 +7,7 @@ Device driver for Ectaco Jetbook firmware >= JL04_v030e import os, re, sys, shutil from itertools import cycle -from calibre.devices.usbms.driver import USBMS, metadata_from_formats +from calibre.devices.usbms.driver import USBMS from calibre import sanitize_file_name as sanitize class JETBOOK(USBMS): @@ -98,7 +98,8 @@ class JETBOOK(USBMS): return txt.decode(sys.getfilesystemencoding(), 'replace') return txt - + + from calibre.devices.usbms.driver import metadata_from_formats mi = metadata_from_formats([path]) if (mi.title==_('Unknown') or mi.authors==[_('Unknown')]) \ diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 9987ec0243..b4c418547b 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -251,19 +251,23 @@ OptionRecommendation(name='page_breaks_before', OptionRecommendation(name='margin_top', recommended_value=5.0, level=OptionRecommendation.LOW, - help=_('Set the top margin 
in pts. Default is %default')), + help=_('Set the top margin in pts. Default is %default. ' + 'Note: 72 pts equals 1 inch')), OptionRecommendation(name='margin_bottom', recommended_value=5.0, level=OptionRecommendation.LOW, - help=_('Set the bottom margin in pts. Default is %default')), + help=_('Set the bottom margin in pts. Default is %default. ' + 'Note: 72 pts equals 1 inch')), OptionRecommendation(name='margin_left', recommended_value=5.0, level=OptionRecommendation.LOW, - help=_('Set the left margin in pts. Default is %default')), + help=_('Set the left margin in pts. Default is %default. ' + 'Note: 72 pts equals 1 inch')), OptionRecommendation(name='margin_right', recommended_value=5.0, level=OptionRecommendation.LOW, - help=_('Set the right margin in pts. Default is %default')), + help=_('Set the right margin in pts. Default is %default. ' + 'Note: 72 pts equals 1 inch')), OptionRecommendation(name='dont_justify', recommended_value=False, level=OptionRecommendation.LOW, From 1cdc8e7877b785cea3a092e4bbf4f085ddb5a518 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 08:25:42 -0400 Subject: [PATCH 167/319] move pml to own location. Disable eReader output because it doesn't work. 
--- src/calibre/customize/builtins.py | 3 +-- src/calibre/ebooks/pdb/ereader/reader.py | 2 +- src/calibre/ebooks/pdb/ereader/writer.py | 2 +- src/calibre/ebooks/{pdb/ereader => pml}/pmlconverter.py | 0 4 files changed, 3 insertions(+), 4 deletions(-) rename src/calibre/ebooks/{pdb/ereader => pml}/pmlconverter.py (100%) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e229ffb766..d284beca3b 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -292,7 +292,6 @@ from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput -from calibre.ebooks.pdb.ereader.output import EREADEROutput from calibre.customize.profiles import input_profiles, output_profiles from calibre.devices.prs500.driver import PRS500 @@ -307,7 +306,7 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, - FB2Input, ODTInput, RTFInput, EPUBOutput, EREADEROutput, RecipeInput] + FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput] plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \ JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index d36e01ed69..9b5fbf82da 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -15,7 +15,7 @@ from calibre.ebooks import DRMError from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ereader import EreaderError -from calibre.ebooks.pdb.ereader.pmlconverter import pml_to_html, \ +from calibre.ebooks.pml.pmlconverter import pml_to_html, \ 
footnote_sidebar_to_html from calibre.ebooks.mobi.palmdoc import decompress_doc from calibre.ebooks.metadata.opf2 import OPFCreator diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 7e3fdc30ea..55d3993171 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -11,7 +11,7 @@ import Image, cStringIO from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.ereader import image_name -from calibre.ebooks.pdb.ereader.pmlconverter import html_to_pml +from calibre.ebooks.pml.pmlconverter import html_to_pml IDENTITY = 'PNPdPPrs' diff --git a/src/calibre/ebooks/pdb/ereader/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py similarity index 100% rename from src/calibre/ebooks/pdb/ereader/pmlconverter.py rename to src/calibre/ebooks/pml/pmlconverter.py From 9787215d5571319fe08292e238013f4a1b25545a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 08:30:50 -0400 Subject: [PATCH 168/319] Document eReader output a bit more. 
--- src/calibre/ebooks/pdb/ereader/writer.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 55d3993171..1446cc3d74 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -31,13 +31,14 @@ class Writer(object): lengths = [len(i) for i in sections] - pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, 'test book') + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '') pdbHeaderBuilder.build_header(lengths, out_stream) for item in sections: out_stream.write(item) def _text(self, pages): + # Todo: Split pages over 65505 Bytes pml_pages = [] for page in pages: @@ -46,6 +47,7 @@ class Writer(object): return pml_pages def _images(self, manifest): + # Todo: resize images over 65505 Bytes images = [] for item in manifest: @@ -69,9 +71,19 @@ class Writer(object): return images def _metadata(self, metadata): - return 'test\x00\x00\x00\x00\x00' + ''' + Metadata takes the form: + title\x00 + author\x00 + copyright\x00 + publisher\x00 + isbn\x00 + ''' + return '\x00\x00\x00\x00\x00' def _header_record(self, text_items, image_items): + # Todo: Find out more about header and add correct values to the file + # can be read by eReader reader software. ''' text_items = the number of text pages image_items = the number of images From db159de0664850ece3831461a2ecb0448ce38c97 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 09:45:51 -0400 Subject: [PATCH 169/319] PML input plugin. 
--- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/pml/__init__.py | 0 src/calibre/ebooks/pml/input.py | 99 ++++++++++++++++++++++++++ src/calibre/ebooks/pml/pmlconverter.py | 6 +- 4 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/pml/__init__.py create mode 100644 src/calibre/ebooks/pml/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index d284beca3b..e68b6b80a8 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -292,6 +292,7 @@ from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput +from calibre.ebooks.pml.input import PMLInput from calibre.customize.profiles import input_profiles, output_profiles from calibre.devices.prs500.driver import PRS500 @@ -306,7 +307,7 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, - FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput] + FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput] plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \ JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/pml/__init__.py b/src/calibre/ebooks/pml/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py new file mode 100644 index 0000000000..36a9e3b526 --- /dev/null +++ b/src/calibre/ebooks/pml/input.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import glob, os, shutil + +from calibre.customize.conversion import InputFormatPlugin +from 
calibre.ptempfile import TemporaryDirectory +from calibre.utils.zipfile import ZipFile +from calibre.ebooks.pml.pmlconverter import pml_to_html +from calibre.ebooks.metadata.opf2 import OPFCreator + +class PMLInput(InputFormatPlugin): + + name = 'PML Input' + author = 'John Schember' + description = 'Convert PML to OEB' + # pmlz is a zip file containing pml files and png images. + file_types = set(['pml', 'pmlz']) + + def process_pml(self, pml_path, html_path): + pclose = False + hclose = False + + if not hasattr(pml_path, 'read'): + pml_stream = open(pml_path, 'rb') + pclose = True + else: + pml_stream = pml_path + + if not hasattr(html_path, 'write'): + html_stream = open(html_path, 'wb') + hclose = True + else: + html_stream = html_path + + ienc = pml_stream.encoding if pml_stream.encoding else 'utf-8' + if self.options.input_encoding: + ienc = self.options.input_encoding + + html = pml_to_html(pml_stream.read().decode(ienc)) + html_stream.write('</head><body>' + html + '</body></html>') + + if pclose: + pml_stream.close() + if hclose: + html_stream.close() + + def convert(self, stream, options, file_ext, log, + accelerators): + self.options = options + pages, images = [], [] + + if file_ext == 'pmlz': + with TemporaryDirectory('_unpmlz') as tdir: + zf = ZipFile(stream) + zf.extractall(tdir) + + pmls = glob.glob(os.path.join(tdir, '*.pml')) + for pml in pmls: + html_name = os.path.splitext(os.path.basename(pml))[0]+'.html' + html_path = os.path.join(os.getcwd(), html_name) + + pages.append(html_name) + self.process_pml(pml, html_path) + + imgs = glob.glob(os.path.join(tdir, '*.png')) + for img in imgs: + pimg_name = os.path.basename(img) + pimg_path = os.path.join(os.getcwd(), pimg_name) + + images.append(pimg_name) + + shutil.move(img, pimg_path) + else: + self.process_pml(stream, 'index.html') + + pages.append('index.html') + images = [] + + # We want pages to be orded alphabetically. 
+ pages.sort() + + manifest_items = [] + for item in pages+images: + manifest_items.append((item, None)) + + from calibre.ebooks.metadata.meta import get_metadata + mi = get_metadata(stream, 'pml') + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest(manifest_items) + opf.create_spine(pages) + with open('metadata.opf', 'wb') as opffile: + opf.render(opffile) + + return os.path.join(os.getcwd(), 'metadata.opf') + diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 391f70a504..14a6280338 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement + ''' Convert pml markup to and from html ''' @@ -47,6 +47,10 @@ PML_HTML_RULES = [ (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))), (re.compile(r'\\I'), lambda match: ''), + # Sidebar and Footnotes + (re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.+?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="sidebar-%s">%s</div>' % (match.group('target'), match.group('text'))), + (re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.+?)\s*</footnote>', re.DOTALL), lambda match: '<div id="footnote-%s">%s</div>' % (match.group('target'), match.group('text'))), + # eReader files are one paragraph per line. # This forces the lines to wrap properly. (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')), From b2e749e00d46991948f91148d19c48067c3ef670 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 2 May 2009 09:49:40 -0400 Subject: [PATCH 170/319] PML input: Put images in proper location. 
--- src/calibre/ebooks/pml/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index 36a9e3b526..a3dbc98568 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -69,7 +69,7 @@ class PMLInput(InputFormatPlugin): imgs = glob.glob(os.path.join(tdir, '*.png')) for img in imgs: pimg_name = os.path.basename(img) - pimg_path = os.path.join(os.getcwd(), pimg_name) + pimg_path = os.path.join(os.getcwd(), 'images', pimg_name) images.append(pimg_name) From b0993c006f14e269eea0beb2ef81ecf24fdce3bd Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 2 May 2009 10:19:01 -0400 Subject: [PATCH 171/319] PML output plugin. --- src/calibre/customize/builtins.py | 4 +- src/calibre/ebooks/pdb/ereader/output.py | 1 + src/calibre/ebooks/pdf/output.py | 6 ++- src/calibre/ebooks/pml/output.py | 55 ++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 src/calibre/ebooks/pml/output.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e68b6b80a8..f52c42811b 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -293,6 +293,7 @@ from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.input import PMLInput +from calibre.ebooks.pml.output import PMLOutput from calibre.customize.profiles import input_profiles, output_profiles from calibre.devices.prs500.driver import PRS500 @@ -307,7 +308,8 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, - FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput] + FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput, + PMLOutput] 
plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \ JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/pdb/ereader/output.py b/src/calibre/ebooks/pdb/ereader/output.py index 4b188ae2f1..f217c04415 100644 --- a/src/calibre/ebooks/pdb/ereader/output.py +++ b/src/calibre/ebooks/pdb/ereader/output.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- + __license__ = 'GPL 3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/pdf/output.py b/src/calibre/ebooks/pdf/output.py index 3f1e2db907..4eb23877d9 100644 --- a/src/calibre/ebooks/pdf/output.py +++ b/src/calibre/ebooks/pdf/output.py @@ -63,8 +63,10 @@ class PDFOutput(OutputFormatPlugin): def convert_text(self, oeb_book): with TemporaryDirectory('_pdf_out') as oebdir: - OEBOutput(None).convert(oeb_book, oebdir, self.input_plugin, self.opts, self.log) - + from calibre.customize.ui import plugin_for_output_format + oeb_output = plugin_for_output_format('oeb') + oeb_output.convert(oeb, oeb_dir, self.input_plugin, self.opts, self.log) + opfpath = glob.glob(os.path.join(oebdir, '*.opf'))[0] opf = OPF(opfpath, os.path.dirname(opfpath)) diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py new file mode 100644 index 0000000000..c5fbc990af --- /dev/null +++ b/src/calibre/ebooks/pml/output.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +import os + +import Image, cStringIO + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.zipfile import ZipFile +from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.ebooks.pml.pmlconverter import html_to_pml + +class PMLOutput(OutputFormatPlugin): + + name = 'PML Output' + author = 'John Schember' + 
file_type = 'pmlz' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + with TemporaryDirectory('_pmlz_output') as tdir: + self.process_spine(oeb_book.spine, tdir) + self.write_images(oeb_book.manifest, tdir) + + pmlz = ZipFile(output_path, 'w') + pmlz.add_dir(tdir) + + def process_spine(self, spine, out_dir): + for item in spine: + html = html_to_pml(unicode(item)).encode('utf-8') + + name = os.path.splitext(os.path.basename(item.href))[0] + '.pml' + path = os.path.join(out_dir, name) + + with open(path, 'wb') as out: + out.write(html) + + def write_images(self, manifest, out_dir): + for item in manifest: + if item.media_type in OEB_IMAGES: + im = Image.open(cStringIO.StringIO(item.data)) + + data = cStringIO.StringIO() + im.save(data, 'PNG') + data = data.getvalue() + + name = os.path.splitext(os.path.basename(item.href))[0] + '.png' + path = os.path.join(out_dir, name) + + with open(path, 'wb') as out: + out.write(data) + From 789061a7ae6885ae82b3dcef628a1864f01e5014 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 2 May 2009 11:10:29 -0700 Subject: [PATCH 172/319] Miscellaneous minor fixes. 
Add tags and series metadata to the book jacket, not just comments --- src/calibre/devices/prs505/driver.py | 42 +++++++++-------- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 8 ++-- src/calibre/ebooks/oeb/transforms/jacket.py | 50 ++++++++++++++++----- src/calibre/linux.py | 1 - 5 files changed, 69 insertions(+), 34 deletions(-) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index f569667ba1..702dd37592 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -4,15 +4,14 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \ ''' Device driver for the SONY PRS-505 ''' -import sys, os, shutil, time, subprocess, re +import os, time from itertools import cycle from calibre.devices.usbms.cli import CLI from calibre.devices.usbms.device import Device from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.prs505.books import BookList, fix_ids -from calibre import iswindows, islinux, isosx, __appname__ -from calibre.devices.errors import PathError +from calibre import __appname__ class PRS505(CLI, Device): @@ -22,7 +21,7 @@ class PRS505(CLI, Device): supported_platforms = ['windows', 'osx', 'linux'] FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt'] - + VENDOR_ID = [0x054c] #: SONY Vendor Id PRODUCT_ID = [0x031e] #: Product Id for the PRS-505 BCD = [0x229] #: Needed to disambiguate 505 and 700 on linux @@ -46,27 +45,34 @@ class PRS505(CLI, Device): def open(self): Device.open(self) - + def write_cache(prefix): try: cachep = os.path.join(prefix, self.CACHE_XML) if not os.path.exists(cachep): - os.makedirs(os.path.dirname(cachep), mode=0777) - f = open(cachep, 'wb') - f.write(u'''<?xml version="1.0" encoding="UTF-8"?> -<cache xmlns="http://www.kinoma.com/FskCache/1"> -</cache> -'''.encode('utf8')) - f.close() + try: + os.makedirs(os.path.dirname(cachep), mode=0777) + except: + time.sleep(5) + 
os.makedirs(os.path.dirname(cachep), mode=0777) + with open(cachep, 'wb') as f: + f.write(u'''<?xml version="1.0" encoding="UTF-8"?> + <cache xmlns="http://www.kinoma.com/FskCache/1"> + </cache> + '''.encode('utf8')) + return True except: self._card_prefix = None import traceback traceback.print_exc() + return False if self._card_a_prefix is not None: - write_cache(self._card_a_prefix) + if not write_cache(self._card_a_prefix): + self._card_a_prefix = None if self._card_b_prefix is not None: - write_cache(self._card_b_prefix) + if not write_cache(self._card_b_prefix): + self._card_b_prefix = None def get_device_information(self, end_session=True): return (self.__class__.__name__, '', '', '') @@ -132,7 +138,7 @@ class PRS505(CLI, Device): if not hasattr(infile, 'read'): infile, close = open(infile, 'rb'), True infile.seek(0) - + newpath = path mdata = metadata.next() @@ -159,11 +165,11 @@ class PRS505(CLI, Device): paths.append(filepath) self.put_file(infile, paths[-1], replace_file=True) - + if close: infile.close() ctimes.append(os.path.getctime(paths[-1])) - + return zip(paths, sizes, ctimes, cycle([on_card])) @classmethod @@ -199,7 +205,7 @@ class PRS505(CLI, Device): f = open(self._main_prefix + self.__class__.MEDIA_XML, 'wb') booklists[0].write(f) f.close() - + def write_card_prefix(prefix, listid): if prefix is not None and hasattr(booklists[listid], 'write'): if not os.path.exists(prefix): diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index b165fbf8f4..53b1a2065d 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -128,7 +128,7 @@ def add_pipeline_options(parser, plumber): [ 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', - 'insert_comments', 'page_breaks_before', + 'insert_metadata', 'page_breaks_before', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 
9987ec0243..37611dcea7 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -300,11 +300,11 @@ OptionRecommendation(name='remove_first_image', ) ), -OptionRecommendation(name='insert_comments', +OptionRecommendation(name='insert_metadata', recommended_value=False, level=OptionRecommendation.LOW, - help=_('Insert the comments/summary from the book metadata at the start of ' + help=_('Insert the book metadata at the start of ' 'the book. This is useful if your ebook reader does not support ' - 'displaying the comments from the metadata.' + 'displaying/searching metadata directly.' ) ), @@ -607,7 +607,7 @@ OptionRecommendation(name='list_recipes', fkey = map(float, fkey.split(',')) from calibre.ebooks.oeb.transforms.jacket import Jacket - Jacket()(self.oeb, self.opts) + Jacket()(self.oeb, self.opts, self.user_metadata) pr(0.4) if self.opts.extra_css and os.path.exists(self.opts.extra_css): diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index c182faedfa..78f4ab871e 100644 --- a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -25,9 +25,13 @@ class Jacket(object): <title>%(title)s -

    %(title)s

    -

    %(jacket)s

    -
    +
    +

    %(title)s

    +

    %(jacket)s

    +
    %(series)s
    +
    %(tags)s
    +
    +
    %(comments)s
    @@ -46,21 +50,47 @@ class Jacket(object): img.getparent().remove(img) return - def insert_comments(self, comments): - self.log('Inserting metadata comments into book...') + def insert_metadata(self, mi): + self.log('Inserting metadata into book...') + comments = mi.comments + if not comments: + try: + comments = unicode(self.oeb.metadata.description[0]) + except: + comments = '' + if not comments.strip(): + comments = '' comments = comments.replace('\r\n', '\n').replace('\n\n', '

    ') + series = 'Series: ' + mi.series if mi.series else '' + if series and mi.series_index is not None: + series += ' [%s]'%mi.series_index + tags = mi.tags + if not tags: + try: + tags = map(unicode, self.oeb.metadata.subject) + except: + tags = [] + tags = u'/'.join(tags) + if tags: + tags = 'Tags: ' + u'/%s/'%tags + else: + tags = '' + try: + title = mi.title if mi.title else unicode(self.oeb.metadata.title[0]) + except: + title = _('Unknown') html = self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'], - title=self.opts.title, comments=comments, - jacket=_('Book Jacket')) + title=title, comments=comments, + jacket=_('Book Jacket'), series=series, tags=tags) id, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml') root = etree.fromstring(html) item = self.oeb.manifest.add(id, href, guess_type(href)[0], data=root) self.oeb.spine.insert(0, item, True) - def __call__(self, oeb, opts): + def __call__(self, oeb, opts, metadata): self.oeb, self.opts, self.log = oeb, opts, oeb.log if opts.remove_first_image: self.remove_fisrt_image() - if opts.insert_comments and opts.comments: - self.insert_comments(opts.comments) + if opts.insert_metadata: + self.insert_metadata(metadata) diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 1d641de51c..e6c0504316 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -5,7 +5,6 @@ import sys, os, shutil from subprocess import check_call, call from calibre import __version__, __appname__ -from calibre.devices import devices DEVICES = devices() From a0d9b093da93bd29dd3036290d0e600b0583cd22 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 14:27:00 -0400 Subject: [PATCH 173/319] Move toggle check into toggle if. 
--- src/calibre/gui2/dialogs/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/dialogs/config.py b/src/calibre/gui2/dialogs/config.py index 1b2a2b8702..b3b36db880 100644 --- a/src/calibre/gui2/dialogs/config.py +++ b/src/calibre/gui2/dialogs/config.py @@ -525,11 +525,11 @@ class ConfigDialog(QDialog, Ui_Dialog): index = self.plugin_view.currentIndex() if index.isValid(): plugin = self._plugin_model.index_to_plugin(index) - if not plugin.can_be_disabled: - error_dialog(self,_('Plugin cannot be disabled'), - _('The plugin: %s cannot be disabled')%plugin.name).exec_() - return if op == 'toggle': + if not plugin.can_be_disabled: + error_dialog(self,_('Plugin cannot be disabled'), + _('The plugin: %s cannot be disabled')%plugin.name).exec_() + return if is_disabled(plugin): enable_plugin(plugin) else: From bf6ed98e35f871e3d5ee8ac1b2afed228a91732a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 14:57:19 -0400 Subject: [PATCH 174/319] Device interfaces can be configured in GUI. 
--- src/calibre/devices/usbms/deviceconfig.py | 5 ++-- src/calibre/gui2/device_drivers/__init__.py | 0 src/calibre/gui2/dialogs/config.py | 30 ++++++++++++++++----- 3 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 src/calibre/gui2/device_drivers/__init__.py diff --git a/src/calibre/devices/usbms/deviceconfig.py b/src/calibre/devices/usbms/deviceconfig.py index 999f2ea1b7..50d548b823 100644 --- a/src/calibre/devices/usbms/deviceconfig.py +++ b/src/calibre/devices/usbms/deviceconfig.py @@ -16,18 +16,19 @@ class DeviceConfig(object): c.add_opt('format_map', default=cls.FORMATS, help=cls.HELP_MESSAGE) return c + @classmethod def _configProxy(cls): return ConfigProxy(cls._config()) @classmethod def config_widget(cls): from calibre.gui2.device_drivers.configwidget import ConfigWidget - cw = ConfigWidget(cls.configProxy(cls._config()), cls.FORMATS) + cw = ConfigWidget(cls._configProxy(), cls.FORMATS) return cw @classmethod def save_settings(cls, config_widget): - cls.configProxy(cls._config())['format_map'] = config_widget.format_map() + cls._configProxy()['format_map'] = config_widget.format_map() @classmethod def settings(cls): diff --git a/src/calibre/gui2/device_drivers/__init__.py b/src/calibre/gui2/device_drivers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/gui2/dialogs/config.py b/src/calibre/gui2/dialogs/config.py index b3b36db880..0de7826212 100644 --- a/src/calibre/gui2/dialogs/config.py +++ b/src/calibre/gui2/dialogs/config.py @@ -7,7 +7,8 @@ from PyQt4.Qt import QDialog, QMessageBox, QListWidgetItem, QIcon, \ QDesktopServices, QVBoxLayout, QLabel, QPlainTextEdit, \ QStringListModel, QAbstractItemModel, QFont, \ SIGNAL, QTimer, Qt, QSize, QVariant, QUrl, \ - QModelIndex, QInputDialog, QAbstractTableModel + QModelIndex, QInputDialog, QAbstractTableModel, \ + QDialogButtonBox from calibre.constants import islinux, iswindows from calibre.gui2.dialogs.config_ui import Ui_Dialog @@ -540,11 +541,28 @@ 
class ConfigDialog(QDialog, Ui_Dialog): info_dialog(self, _('Plugin not customizable'), _('Plugin: %s does not need customization')%plugin.name).exec_() return - help = plugin.customization_help() - text, ok = QInputDialog.getText(self, _('Customize %s')%plugin.name, - help) - if ok: - customize_plugin(plugin, unicode(text)) + if hasattr(plugin, 'config_widget'): + config_dialog = QDialog(self) + button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel) + + config_dialog.connect(button_box, SIGNAL('accepted()'), config_dialog.accept) + config_dialog.connect(button_box, SIGNAL('rejected()'), config_dialog.reject) + + config_widget = plugin.config_widget() + v = QVBoxLayout(config_dialog) + v.addWidget(config_widget) + v.addWidget(button_box) + config_dialog.exec_() + + if config_dialog.result() == QDialog.Accepted: + plugin.save_settings(config_widget) + self._plugin_model.refresh_plugin(plugin) + else: + help = plugin.customization_help() + text, ok = QInputDialog.getText(self, _('Customize %s')%plugin.name, + help) + if ok: + customize_plugin(plugin, unicode(text)) self._plugin_model.refresh_plugin(plugin) if op == 'remove': if remove_plugin(plugin): From dc0e0f26a1a285bcaf3c1dd7b622bc54e0017a58 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 May 2009 12:40:29 -0700 Subject: [PATCH 175/319] Add a --preprocess-html option --- src/calibre/customize/conversion.py | 12 ++++++++++++ src/calibre/ebooks/conversion/cli.py | 1 + src/calibre/ebooks/conversion/plumber.py | 16 +++++++++++++--- src/calibre/ebooks/conversion/preprocess.py | 18 ++++++++++++------ src/calibre/ebooks/html/input.py | 2 +- src/calibre/ebooks/lit/input.py | 2 +- src/calibre/ebooks/oeb/base.py | 2 +- 7 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 7920b823de..3a89a9b156 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -149,6 +149,18 @@ 
class InputFormatPlugin(Plugin): ''' raise NotImplementedError() + def preprocess_html(self, html): + ''' + This method is called by the conversion pipeline on all HTML before it + is parsed. It is meant to be used to do any required preprocessing on + the HTML, like removing hard line breaks, etc. + + :param html: A unicode string + :return: A unicode string + ''' + return html + + def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 53b1a2065d..3274b912ea 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber): 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', + 'preprocess_html', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index d1630a25f2..ed0fd4584e 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata', ) ), +OptionRecommendation(name='preprocess_html', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Attempt to detect and correct hard line breaks and other ' + 'problems in the source file. This may make things worse, so use ' + 'with care.' 
+ ) + ), + OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, @@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes', self.log('Debug input called, aborting the rest of the pipeline.') return if not hasattr(self.oeb, 'manifest'): - self.oeb = create_oebbook(self.log, self.oeb, self.opts) + self.oeb = create_oebbook(self.log, self.oeb, self.opts, + self.input_plugin) pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) pr(0., _('Running transforms on ebook...')) @@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes', self.opts, self.log) self.ui_reporter(1.) -def create_oebbook(log, path_or_stream, opts, reader=None): +def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): ''' Create an OEBBook. ''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor() + html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, + opts.preprocess_html) oeb = OEBBook(log, html_preprocessor=html_preprocessor, pretty_print=opts.pretty_print) # Read OEB Book into OEBBook diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 9bfe6d4255..76fc36708e 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -26,16 +26,16 @@ def sanitize_head(match): def chap_head(match): chap = match.group('chap') title = match.group('title') - if not title: + if not title: return '

    '+chap+'


    \n' - else: + else: return '

    '+chap+'
    \n'+title+'


    \n' def wrap_lines(match): ital = match.group('ital') - if not ital: + if not ital: return ' ' - else: + else: return ital+' ' def line_length(raw, percent): @@ -106,7 +106,7 @@ class HTMLPreProcessor(object): (re.compile(u'¨\s*()*\s*I', re.UNICODE), lambda match: u'Ï'), (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), - + # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
    tags @@ -151,6 +151,9 @@ class HTMLPreProcessor(object): (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

    %s

    '%(match.group(1),)), ] + def __init__(self, input_plugin_preprocess, plugin_preprocess): + self.input_plugin_preprocess = input_plugin_preprocess + self.plugin_preprocess = plugin_preprocess def is_baen(self, src): return re.compile(r')?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), ] - + rules = self.PDFTOHTML + line_length_rules else: rules = [] @@ -192,5 +195,8 @@ class HTMLPreProcessor(object): html = XMLDECL_RE.sub('', html) + if self.plugin_preprocess: + html = self.input_plugin_preprocess(html) + return html diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 252032a23d..255d975b1e 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin): return opfpath from calibre.ebooks.conversion.plumber import create_oebbook - oeb = create_oebbook(log, opfpath, opts) + oeb = create_oebbook(log, opfpath, opts, self) from calibre.ebooks.oeb.transforms.package import Package Package(os.getcwdu())(oeb, opts) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 2d726f7eeb..409482da29 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin): accelerators): from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.conversion.plumber import create_oebbook - return create_oebbook(log, stream, options, reader=LitReader) + return create_oebbook(log, stream, options, self, reader=LitReader) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index faf2d02dc4..728e1711a0 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1506,7 +1506,7 @@ class OEBBook(object): COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') def __init__(self, logger, - html_preprocessor=HTMLPreProcessor(), + html_preprocessor, css_preprocessor=CSSPreProcessor(), 
encoding='utf-8', pretty_print=False): """Create empty book. Arguments: From 7d15d42ec0911843d9259597e8c29953da7dbb02 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 May 2009 12:42:50 -0700 Subject: [PATCH 176/319] IGN:... --- src/calibre/ebooks/conversion/plumber.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index ed0fd4584e..7df5e54f0e 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -668,7 +668,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): from calibre.ebooks.oeb.base import OEBBook html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, opts.preprocess_html) - oeb = OEBBook(log, html_preprocessor=html_preprocessor, + oeb = OEBBook(log, html_preprocessor, pretty_print=opts.pretty_print) # Read OEB Book into OEBBook log('Parsing all content...') From 65c53808daaaabdb8409e1a76877b04e73670079 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 17:01:30 -0400 Subject: [PATCH 177/319] Fix fdi generation bug. GUI send to device respects user settings for device formats. --- src/calibre/devices/prs500/cli/main.py | 3 ++- src/calibre/devices/usbms/device.py | 3 +-- src/calibre/gui2/device.py | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/calibre/devices/prs500/cli/main.py b/src/calibre/devices/prs500/cli/main.py index 2484ff2902..9211fcff41 100755 --- a/src/calibre/devices/prs500/cli/main.py +++ b/src/calibre/devices/prs500/cli/main.py @@ -205,7 +205,8 @@ def main(): scanner.scan() for d in device_plugins(): if scanner.is_device_connected(d): - dev = d(log_packets=options.log_packets) + dev = d + dev.reset(log_packets=options.log_packets) if dev is None: print >>sys.stderr, 'Unable to find a connected ebook reader.' 
diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 50abbaf5f6..709ead05ef 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -90,12 +90,11 @@ class Device(DeviceConfig, DevicePlugin): @classmethod def get_fdi(cls): fdi = '' - for vid in cls.VENDOR_ID: for pid in cls.PRODUCT_ID: fdi_base_values = dict( app=__appname__, - deviceclass=cls.__class__.__name__, + deviceclass=cls.__name__, vendor_id=hex(vid), product_id=hex(pid), main_memory=cls.MAIN_MEMORY_VOLUME_LABEL, diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 800c802747..8dd639d7c2 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -436,7 +436,7 @@ class DeviceGUI(object): fmt = None if specific: d = ChooseFormatDialog(self, _('Choose format to send to device'), - self.device_manager.device_class.FORMATS) + self.device_manager.device_class.settings().format_map) d.exec_() fmt = d.format().lower() dest, sub_dest = dest.split(':') @@ -581,7 +581,7 @@ class DeviceGUI(object): ids = list(dynamic.get('news_to_be_synced', set([]))) ids = [id for id in ids if self.library_view.model().db.has_id(id)] files = self.library_view.model().get_preferred_formats_from_ids( - ids, self.device_manager.device_class.FORMATS) + ids, self.device_manager.device_class.settings().format_map) files = [f for f in files if f is not None] if not files: dynamic.set('news_to_be_synced', set([])) @@ -618,7 +618,7 @@ class DeviceGUI(object): return _files, _auto_rows = self.library_view.model().get_preferred_formats(rows, - self.device_manager.device_class.FORMATS, + self.device_manager.device_class.settings().format_map, paths=True, set_metadata=True, specific_format=specific_format, exclude_auto=do_auto_convert) @@ -667,7 +667,7 @@ class DeviceGUI(object): if specific_format == None: formats = [f.lower() for f in self.library_view.model().db.formats(row).split(',')] formats = formats if formats != None else 
[] - if list(set(formats).intersection(available_input_formats())) != [] and list(set(self.device_manager.device_class.FORMATS).intersection(available_output_formats())) != []: + if list(set(formats).intersection(available_input_formats())) != [] and list(set(self.device_manager.device_class.settings().format_map).intersection(available_output_formats())) != []: auto.append(row) else: bad.append(self.library_view.model().title(row)) @@ -682,8 +682,8 @@ class DeviceGUI(object): autos = '\n'.join('
  • %s
  • '%(i,) for i in autos) d = info_dialog(self, _('No suitable formats'), _('Auto converting the following books before uploading to the device:
      %s
    ')%(autos,)) - for fmt in self.device_manager.device_class.FORMATS: - if fmt in list(set(self.device_manager.device_class.FORMATS).intersection(set(available_output_formats()))): + for fmt in self.device_manager.device_class.settings().format_map: + if fmt in list(set(self.device_manager.device_class.settings().format_map).intersection(set(available_output_formats()))): format = fmt break d.exec_() From 908751e3cba6a20962c9befd951fdd6b737444d1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 May 2009 14:47:30 -0700 Subject: [PATCH 178/319] MOBI Ouput plugin --- src/calibre/customize/builtins.py | 7 +- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 26 +---- src/calibre/ebooks/epub/output.py | 23 +++- src/calibre/ebooks/mobi/mobiml.py | 13 --- src/calibre/ebooks/mobi/output.py | 51 +++++++++ src/calibre/ebooks/mobi/writer.py | 138 +---------------------- 7 files changed, 85 insertions(+), 175 deletions(-) create mode 100644 src/calibre/ebooks/mobi/output.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index f52c42811b..682c82cd1b 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -290,6 +290,7 @@ from calibre.ebooks.comic.input import ComicInput from calibre.web.feeds.input import RecipeInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput +from calibre.ebooks.mobi.output import MOBIOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.input import PMLInput @@ -309,9 +310,9 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput, - PMLOutput] -plugins += [PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, \ - 
JETBOOK] + PMLOutput, MOBIOutput] +plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, + EB600, JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 3274b912ea..f07c2d86ef 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -126,7 +126,7 @@ def add_pipeline_options(parser, plumber): 'STRUCTURE DETECTION' : ( _('Control auto-detection of document structure.'), [ - 'dont_split_on_page_breaks', 'chapter', 'chapter_mark', + 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', 'preprocess_html', diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 7df5e54f0e..7c654f924d 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -131,18 +131,6 @@ OptionRecommendation(name='linearize_tables', ) ), -OptionRecommendation(name='dont_split_on_page_breaks', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Turn off splitting at page breaks. Normally, input ' - 'files are automatically split at every page break into ' - 'two files. This gives an output ebook that can be ' - 'parsed faster and with less resources. However, ' - 'splitting is slow and if your source file contains a ' - 'very large number of page breaks, you should turn off ' - 'splitting on page breaks.' 
- ) - ), - OptionRecommendation(name='level1_toc', recommended_value=None, level=OptionRecommendation.LOW, help=_('XPath expression that specifies all tags that ' @@ -628,20 +616,14 @@ OptionRecommendation(name='list_recipes', flattener = CSSFlattener(fbase=fbase, fkey=fkey, lineh=self.opts.line_height, - untable=self.opts.linearize_tables) + untable=self.output_plugin.file_type in ('mobi','lit'), + unfloat=self.output_plugin.file_type in ('mobi', 'lit')) flattener(self.oeb, self.opts) - if self.opts.linearize_tables: + if self.opts.linearize_tables and \ + self.output_plugin.file_type not in ('mobi', 'lrf'): from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables LinearizeTables()(self.oeb, self.opts) - pr(0.7) - - from calibre.ebooks.oeb.transforms.split import Split - pbx = accelerators.get('pagebreaks', None) - split = Split(not self.opts.dont_split_on_page_breaks, - max_flow_size=self.opts.output_profile.flow_size, - page_breaks_xpath=pbx) - split(self.oeb, self.opts) pr(0.9) from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index d5f0a9349a..aba9bff0d8 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -28,7 +28,21 @@ class EPUBOutput(OutputFormatPlugin): OptionRecommendation(name='extract_to', help=_('Extract the contents of the generated EPUB file to the ' 'specified directory. The contents of the directory are first ' - 'deleted, so be careful.')) + 'deleted, so be careful.')), + + OptionRecommendation(name='dont_split_on_page_breaks', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Turn off splitting at page breaks. Normally, input ' + 'files are automatically split at every page break into ' + 'two files. This gives an output ebook that can be ' + 'parsed faster and with less resources. 
However, ' + 'splitting is slow and if your source file contains a ' + 'very large number of page breaks, you should turn off ' + 'splitting on page breaks.' + ) + ), + + ]) @@ -88,6 +102,13 @@ class EPUBOutput(OutputFormatPlugin): def convert(self, oeb, output_path, input_plugin, opts, log): self.log, self.opts, self.oeb = log, opts, oeb + from calibre.ebooks.oeb.transforms.split import Split + split = Split(not self.opts.dont_split_on_page_breaks, + max_flow_size=self.opts.output_profile.flow_size + ) + split(self.oeb, self.opts) + + self.workaround_ade_quirks() from calibre.ebooks.oeb.transforms.rescale import RescaleImages diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 18f53317e0..a2d999ffc8 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -80,19 +80,6 @@ class MobiMLizer(object): def __init__(self, ignore_tables=False): self.ignore_tables = ignore_tables - @classmethod - def config(cls, cfg): - group = cfg.add_group('mobiml', _('Mobipocket markup options.')) - group('ignore_tables', ['--ignore-tables'], default=False, - help=_('Render HTML tables as blocks of text instead of actual ' - 'tables. 
This is neccessary if the HTML contains very ' - 'large or complex tables.')) - return cfg - - @classmethod - def generate(cls, opts): - return cls(ignore_tables=opts.ignore_tables) - def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py new file mode 100644 index 0000000000..1866888ab1 --- /dev/null +++ b/src/calibre/ebooks/mobi/output.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.customize.conversion import OutputFormatPlugin +from calibre.customize.conversion import OptionRecommendation + +class MOBIOutput(OutputFormatPlugin): + + name = 'MOBI Output' + author = 'Marshall T. Vandegrift' + file_type = 'mobi' + + options = set([ + OptionRecommendation(name='rescale_images', recommended_value=False, + help=_('Modify images to meet Palm device size limitations.') + ), + OptionRecommendation(name='prefer_author_sort', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('When present, use author sort field as author.') + ), + OptionRecommendation(name='toc_title', recommended_value=None, + help=_('Title for any generated in-line table of contents.') + ), + ]) + + def convert(self, oeb, output_path, input_plugin, opts, log): + self.log, self.opts, self.oeb = log, opts, oeb + from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, MobiWriter + from calibre.ebooks.mobi.mobiml import MobiMLizer + from calibre.ebooks.oeb.transforms.manglecase import CaseMangler + from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer + from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder + imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None + tocadder = HTMLTOCAdder(title=opts.toc_title) + 
tocadder(oeb, opts) + mangler = CaseMangler() + mangler(oeb, opts) + rasterizer = SVGRasterizer() + rasterizer(oeb, opts) + mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables) + mobimlizer(oeb, opts) + writer = MobiWriter(imagemax=imagemax, + prefer_author_sort=opts.prefer_author_sort) + writer(oeb, output_path) + diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index c521ba9977..e16deeccda 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -6,8 +6,6 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import sys -import os from struct import pack import time import random @@ -16,24 +14,14 @@ import re from itertools import izip, count from collections import defaultdict from urlparse import urldefrag -import logging from PIL import Image from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \ OEB_RASTER_IMAGES from calibre.ebooks.oeb.base import namespace, prefixname from calibre.ebooks.oeb.base import urlnormalize -from calibre.ebooks.oeb.base import OEBBook -from calibre.ebooks.oeb.profile import Context -from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener -from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer -from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer -from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder -from calibre.ebooks.oeb.transforms.manglecase import CaseMangler from calibre.ebooks.mobi.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi -from calibre.ebooks.mobi.mobiml import MBP_NS, MobiMLizer -from calibre.customize.ui import run_plugins_on_postprocess -from calibre.utils.config import Config, StringConfig +from calibre.ebooks.mobi.mobiml import MBP_NS # TODO: # - Allow override CSS (?) 
@@ -293,58 +281,22 @@ class Serializer(object): buffer.write('%010d' % ioff) -class MobiFlattener(object): - def config(self, cfg): - return cfg - - def generate(self, opts): - return self - - def __call__(self, oeb, context): - fbase = context.dest.fbase - fkey = context.dest.fnums.values() - flattener = CSSFlattener( - fbase=fbase, fkey=fkey, unfloat=True, untable=True) - return flattener(oeb, context) - class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') - DEFAULT_PROFILE = 'CybookG3' - - TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer, - ManifestTrimmer, MobiMLizer] - - def __init__(self, compression=None, imagemax=None, + def __init__(self, compression=PALMDOC, imagemax=None, prefer_author_sort=False): self._compression = compression or UNCOMPRESSED self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort - @classmethod - def config(cls, cfg): - """Add any book-writing options to the :class:`Config` object - :param:`cfg`. - """ - mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('compress', ['--compress'], default=False, - help=_('Compress file text using PalmDOC compression. 
' - 'Results in smaller files, but takes a long time to run.')) - mobi('rescale_images', ['--rescale-images'], default=False, - help=_('Modify images to meet Palm device size limitations.')) - mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, - help=_('When present, use the author sorting information for ' - 'generating the Mobipocket author metadata.')) - return cfg - @classmethod def generate(cls, opts): """Generate a Writer instance from command-line options.""" - compression = PALMDOC if opts.compress else UNCOMPRESSED imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None prefer_author_sort = opts.prefer_author_sort - return cls(compression=compression, imagemax=imagemax, + return cls(compression=PALMDOC, imagemax=imagemax, prefer_author_sort=prefer_author_sort) def __call__(self, oeb, path): @@ -577,88 +529,4 @@ class MobiWriter(object): self._write(record) -def config(defaults=None): - desc = _('Options to control the conversion to MOBI') - _profiles = list(sorted(Context.PROFILES.keys())) - if defaults is None: - c = Config('mobi', desc) - else: - c = StringConfig(defaults, desc) - profiles = c.add_group('profiles', _('Device renderer profiles. ' - 'Affects conversion of font sizes, image rescaling and rasterization ' - 'of tables. Valid profiles are: %s.') % ', '.join(_profiles)) - profiles('source_profile', ['--source-profile'], - default='Browser', choices=_profiles, - help=_("Source renderer profile. Default is %default.")) - profiles('dest_profile', ['--dest-profile'], - default='CybookG3', choices=_profiles, - help=_("Destination renderer profile. Default is %default.")) - c.add_opt('encoding', ['--encoding'], default=None, - help=_('Character encoding for HTML files. Default is to auto detect.')) - return c - - -def option_parser(): - c = config() - parser = c.option_parser(usage='%prog '+_('[options]')+' file.opf') - parser.add_option( - '-o', '--output', default=None, - help=_('Output file. 
Default is derived from input filename.')) - parser.add_option( - '-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def oeb2mobi(opts, inpath): - logger = Logger(logging.getLogger('oeb2mobi')) - logger.setup_cli_handler(opts.verbose) - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] + '.mobi' - source = opts.source_profile - if source not in Context.PROFILES: - logger.error(_('Unknown source profile %r') % source) - return 1 - dest = opts.dest_profile - if dest not in Context.PROFILES: - logger.error(_('Unknown destination profile %r') % dest) - return 1 - compression = PALMDOC if opts.compress else UNCOMPRESSED - imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None - context = Context(source, dest) - oeb = OEBBook(inpath, logger=logger, encoding=opts.encoding) - tocadder = HTMLTOCAdder(title=opts.toc_title) - tocadder.transform(oeb, context) - mangler = CaseMangler() - mangler.transform(oeb, context) - fbase = context.dest.fbase - fkey = context.dest.fnums.values() - flattener = CSSFlattener( - fbase=fbase, fkey=fkey, unfloat=True, untable=True) - flattener.transform(oeb, context) - rasterizer = SVGRasterizer() - rasterizer.transform(oeb, context) - trimmer = ManifestTrimmer() - trimmer.transform(oeb, context) - mobimlizer = MobiMLizer(ignore_tables=opts.ignore_tables) - mobimlizer.transform(oeb, context) - writer = MobiWriter(compression=compression, imagemax=imagemax, - prefer_author_sort=opts.prefer_author_sort) - writer.dump(oeb, outpath) - run_plugins_on_postprocess(outpath, 'mobi') - logger.info(_('Output written to ') + outpath) - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = oeb2mobi(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) From 
a45bcd70bba495207714a6c7e30d4e52c450546c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 2 May 2009 19:37:07 -0400 Subject: [PATCH 179/319] eReader metadata reader --- src/calibre/customize/builtins.py | 11 ++++++ src/calibre/ebooks/metadata/ereader.py | 43 ++++++++++++++++++++++++ src/calibre/ebooks/metadata/pdb.py | 36 ++++++++++++++++++++ src/calibre/ebooks/pdb/ereader/reader.py | 5 ++- src/calibre/ebooks/pdb/ereader/writer.py | 6 +++- 5 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 src/calibre/ebooks/metadata/ereader.py create mode 100644 src/calibre/ebooks/metadata/pdb.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 682c82cd1b..eef7e31ca6 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -171,6 +171,17 @@ class TXTMetadataReader(MetadataReaderPlugin): from calibre.ebooks.metadata.txt import get_metadata return get_metadata(stream) +class PDBMetadataReader(MetadataReaderPlugin): + + name = 'Read PDB metadata' + file_types = set(['pdb']) + description = _('Read metadata from %s files') % 'PDB' + author = 'John Schember' + + def get_metadata(self, stream, ftype): + from calibre.ebooks.metadata.pdb import get_metadata + return get_metadata(stream) + class LRXMetadataReader(MetadataReaderPlugin): name = 'Read LRX metadata' diff --git a/src/calibre/ebooks/metadata/ereader.py b/src/calibre/ebooks/metadata/ereader.py new file mode 100644 index 0000000000..f37ff9ab6d --- /dev/null +++ b/src/calibre/ebooks/metadata/ereader.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +''' +Read meta information from eReader pdb files. 
+''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import re + +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.pdb.header import PdbHeaderReader +from calibre.ebooks.pdb.ereader.reader import HeaderRecord + +def get_metadata(stream, extract_cover=True): + """ + Return metadata as a L{MetaInfo} object + """ + mi = MetaInformation(None, [_('Unknown')]) + stream.seek(0) + + pheader = PdbHeaderReader(stream) + hr = HeaderRecord(pheader.section_data(0)) + + if hr.version in (2, 10): + try: + mdata = pheader.section_data(hr.metadata_offset) + + mdata = mdata.split('\x00') + mi.title = mdata[0] + mi.authors = [mdata[1]] + mi.publisher = mdata[3] + mi.isbn = mdata[4] + except: + pass + + if not mi.title: + mi.title = pheader.title if pheader.title else _('Unknown') + + return mi + diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py new file mode 100644 index 0000000000..e473925b87 --- /dev/null +++ b/src/calibre/ebooks/metadata/pdb.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +''' +Read meta information from eReader pdb files. 
+''' + +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import re + +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.pdb.header import PdbHeaderReader +from calibre.ebooks.metadata.ereader import get_metadata as eReader + +MREADER = { + 'PNPdPPrs' : eReader, + 'PNRdPPrs' : eReader, +} + +def get_metadata(stream, extract_cover=True): + """ + Return metadata as a L{MetaInfo} object + """ + + pheader = PdbHeaderReader(stream) + + MetadataReader = MREADER.get(pheader.ident, None) + + if MetadataReader is None: + return MetaInformation(_('Unknown'), [_('Unknown')]) + + + return MetadataReader(stream, extract_cover) + diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 9b5fbf82da..ecf5c706c4 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement + ''' Read content from ereader pdb file. ''' @@ -127,8 +127,7 @@ class Reader(FormatReader): with open('index.html', 'wb') as index: self.log.debug('Writing text to index.html') index.write(html) -# print html - + if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) images = [] diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 1446cc3d74..ea9144579c 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -1,9 +1,13 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement + ''' Write content to ereader pdb file. 
''' +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + import struct, zlib import Image, cStringIO From 538d310bb848849cf91d8b88fe26ede0656e3ecc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 May 2009 17:34:19 -0700 Subject: [PATCH 180/319] LRF Output --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/conversion/plumber.py | 8 +- src/calibre/ebooks/lrf/__init__.py | 231 +----- src/calibre/ebooks/lrf/any/__init__.py | 2 - src/calibre/ebooks/lrf/any/convert_from.py | 199 ------ src/calibre/ebooks/lrf/epub/__init__.py | 3 - src/calibre/ebooks/lrf/epub/convert_from.py | 75 -- src/calibre/ebooks/lrf/feeds/__init__.py | 4 - src/calibre/ebooks/lrf/feeds/convert_from.py | 59 -- src/calibre/ebooks/lrf/html/convert_from.py | 669 ++++++++---------- src/calibre/ebooks/lrf/lit/__init__.py | 3 - src/calibre/ebooks/lrf/lit/convert_from.py | 90 --- src/calibre/ebooks/lrf/mobi/__init__.py | 0 src/calibre/ebooks/lrf/mobi/convert_from.py | 63 -- src/calibre/ebooks/lrf/objects.py | 3 +- src/calibre/ebooks/lrf/output.py | 135 ++++ src/calibre/ebooks/lrf/pdf/__init__.py | 2 - src/calibre/ebooks/lrf/pdf/convert_from.py | 131 ---- src/calibre/ebooks/lrf/pdf/reflow.py | 426 ----------- src/calibre/ebooks/lrf/txt/__init__.py | 2 - src/calibre/ebooks/lrf/txt/convert_from.py | 112 --- src/calibre/ebooks/lrf/txt/demo/demo.txt | 89 --- src/calibre/ebooks/lrf/txt/demo/small.jpg | Bin 2055 -> 0 bytes src/calibre/ebooks/lrf/web/__init__.py | 6 - src/calibre/ebooks/lrf/web/convert_from.py | 183 ----- .../ebooks/lrf/web/profiles/__init__.py | 572 --------------- src/calibre/ebooks/lrf/web/profiles/ap.py | 38 - .../ebooks/lrf/web/profiles/atlantic.py | 47 -- .../ebooks/lrf/web/profiles/automatic.py | 75 -- .../ebooks/lrf/web/profiles/barrons.py | 90 --- src/calibre/ebooks/lrf/web/profiles/bbc.py | 45 -- .../ebooks/lrf/web/profiles/chr_mon.py | 46 -- src/calibre/ebooks/lrf/web/profiles/cnn.py | 51 -- 
.../ebooks/lrf/web/profiles/economist.py | 73 -- src/calibre/ebooks/lrf/web/profiles/faznet.py | 28 - src/calibre/ebooks/lrf/web/profiles/jpost.py | 36 - .../ebooks/lrf/web/profiles/jutarnji.py | 44 -- src/calibre/ebooks/lrf/web/profiles/nasa.py | 91 --- .../ebooks/lrf/web/profiles/newsweek.py | 37 - .../ebooks/lrf/web/profiles/newyorker.py | 56 -- .../ebooks/lrf/web/profiles/newyorkreview.py | 24 - .../ebooks/lrf/web/profiles/nytimes.py | 100 --- .../ebooks/lrf/web/profiles/portfolio.py | 40 -- .../ebooks/lrf/web/profiles/reuters.py | 39 - .../ebooks/lrf/web/profiles/spiegelde.py | 36 - src/calibre/ebooks/lrf/web/profiles/upi.py | 36 - .../ebooks/lrf/web/profiles/usatoday.py | 43 -- .../ebooks/lrf/web/profiles/wash_post.py | 44 -- src/calibre/ebooks/lrf/web/profiles/wsj.py | 108 --- src/calibre/ebooks/lrf/web/profiles/zeitde.py | 26 - src/calibre/ebooks/mobi/output.py | 4 + src/calibre/linux.py | 1 - src/calibre/web/feeds/news.py | 2 +- src/calibre/web/feeds/recipes/__init__.py | 13 +- 54 files changed, 490 insertions(+), 3853 deletions(-) delete mode 100644 src/calibre/ebooks/lrf/any/__init__.py delete mode 100644 src/calibre/ebooks/lrf/any/convert_from.py delete mode 100644 src/calibre/ebooks/lrf/epub/__init__.py delete mode 100644 src/calibre/ebooks/lrf/epub/convert_from.py delete mode 100644 src/calibre/ebooks/lrf/feeds/__init__.py delete mode 100644 src/calibre/ebooks/lrf/feeds/convert_from.py delete mode 100644 src/calibre/ebooks/lrf/lit/__init__.py delete mode 100644 src/calibre/ebooks/lrf/lit/convert_from.py delete mode 100644 src/calibre/ebooks/lrf/mobi/__init__.py delete mode 100644 src/calibre/ebooks/lrf/mobi/convert_from.py create mode 100644 src/calibre/ebooks/lrf/output.py delete mode 100644 src/calibre/ebooks/lrf/pdf/__init__.py delete mode 100644 src/calibre/ebooks/lrf/pdf/convert_from.py delete mode 100644 src/calibre/ebooks/lrf/pdf/reflow.py delete mode 100644 src/calibre/ebooks/lrf/txt/__init__.py delete mode 100644 
src/calibre/ebooks/lrf/txt/convert_from.py delete mode 100644 src/calibre/ebooks/lrf/txt/demo/demo.txt delete mode 100644 src/calibre/ebooks/lrf/txt/demo/small.jpg delete mode 100644 src/calibre/ebooks/lrf/web/__init__.py delete mode 100644 src/calibre/ebooks/lrf/web/convert_from.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/__init__.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/ap.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/atlantic.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/automatic.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/barrons.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/bbc.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/chr_mon.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/cnn.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/economist.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/faznet.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/jpost.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/jutarnji.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/nasa.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/newsweek.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/newyorker.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/newyorkreview.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/nytimes.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/portfolio.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/reuters.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/spiegelde.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/upi.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/usatoday.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/wash_post.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/wsj.py delete mode 100644 src/calibre/ebooks/lrf/web/profiles/zeitde.py diff --git a/src/calibre/customize/builtins.py 
b/src/calibre/customize/builtins.py index 682c82cd1b..4a968966c7 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -291,6 +291,7 @@ from calibre.web.feeds.input import RecipeInput from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.epub.output import EPUBOutput from calibre.ebooks.mobi.output import MOBIOutput +from calibre.ebooks.lrf.output import LRFOutput from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.input import PMLInput @@ -310,7 +311,7 @@ from calibre.devices.jetbook.driver import JETBOOK plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput, TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput, FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput, - PMLOutput, MOBIOutput] + PMLOutput, MOBIOutput, LRFOutput] plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY, EB600, JETBOOK] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 7c654f924d..502102a59a 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -236,7 +236,6 @@ OptionRecommendation(name='page_breaks_before', 'before the specified elements.') ), - OptionRecommendation(name='margin_top', recommended_value=5.0, level=OptionRecommendation.LOW, help=_('Set the top margin in pts. Default is %default. 
' @@ -614,11 +613,18 @@ OptionRecommendation(name='list_recipes', if self.opts.extra_css and os.path.exists(self.opts.extra_css): self.opts.extra_css = open(self.opts.extra_css, 'rb').read() + oibl = self.opts.insert_blank_line + orps = self.opts.remove_paragraph_spacing + if self.output_plugin.file_type == 'lrf': + self.opts.insert_blank_line = False + self.opts.remove_paragraph_spacing = False flattener = CSSFlattener(fbase=fbase, fkey=fkey, lineh=self.opts.line_height, untable=self.output_plugin.file_type in ('mobi','lit'), unfloat=self.output_plugin.file_type in ('mobi', 'lit')) flattener(self.oeb, self.opts) + self.opts.insert_blank_line = oibl + self.opts.remove_paragraph_spacing = orps if self.opts.linearize_tables and \ self.output_plugin.file_type not in ('mobi', 'lrf'): diff --git a/src/calibre/ebooks/lrf/__init__.py b/src/calibre/ebooks/lrf/__init__.py index ae74e429ad..9f6be65e3a 100644 --- a/src/calibre/ebooks/lrf/__init__.py +++ b/src/calibre/ebooks/lrf/__init__.py @@ -1,43 +1,19 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -""" -This package contains logic to read and write LRF files. -The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}. """ -import sys, os -from optparse import OptionValueError -from htmlentitydefs import name2codepoint +This package contains logic to read and write LRF files. +The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}. 
+""" from uuid import uuid4 from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book -from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \ - Paragraph, TextStyle, BlockStyle +from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \ + TextStyle, BlockStyle from calibre.ebooks.lrf.fonts import FONT_FILE_MAP from calibre.ebooks import ConversionError -from calibre import __appname__, __version__, __author__, iswindows -from calibre.utils.config import OptionParser __docformat__ = "epytext" -preferred_source_formats = [ - 'LIT', - 'MOBI', - 'EPUB', - 'ODT', - 'HTML', - 'HTM', - 'XHTM', - 'XHTML', - 'PRC', - 'AZW', - 'FB2', - 'RTF', - 'PDF', - 'TXT', - 'ZIP', - 'RAR' - ] - class LRFParseError(Exception): pass @@ -55,174 +31,8 @@ class PRS500_PROFILE(object): header_height = 30 #: In px default_fonts = { 'sans': "Swis721 BT Roman", 'mono': "Courier10 BT Roman", 'serif': "Dutch801 Rm BT Roman"} - - name = 'prs500' - -profile_map = { - PRS500_PROFILE.name : PRS500_PROFILE, - } - -def profile_from_string(option, opt_str, value, parser): - try: - profile = profile_map[value] - setattr(parser.values, option.dest, profile) - except KeyError: - raise OptionValueError('Profile: '+value+' is not implemented. Implemented profiles: %s'%(profile_map.keys())) - -def option_parser(usage, gui_mode=False): - parser = OptionParser(usage=usage, gui_mode=gui_mode) - metadata = parser.add_option_group('METADATA OPTIONS') - metadata.add_option("-t", "--title", action="store", type="string", default=None,\ - dest="title", help=_("Set the title. Default: filename.")) - metadata.add_option("-a", "--author", action="store", type="string", \ - dest="author", help=_("Set the author(s). Multiple authors should be set as a comma separated list. 
Default: %default"), - default=_('Unknown')) - metadata.add_option("--comment", action="store", type="string", \ - dest="freetext", help=_("Set the comment."), default=_('Unknown')) - metadata.add_option("--category", action="store", type="string", \ - dest="category", help=_("Set the category"), default=_('Unknown')) - metadata.add_option('--title-sort', action='store', default='', dest='title_sort', - help=_('Sort key for the title')) - metadata.add_option('--author-sort', action='store', default='', dest='author_sort', - help=_('Sort key for the author')) - metadata.add_option('--publisher', action='store', default=_('Unknown'), dest='publisher', - help=_('Publisher')) - metadata.add_option('--cover', action='store', dest='cover', default=None, \ - help=_('Path to file containing image to be used as cover')) - metadata.add_option('--use-metadata-cover', action='store_true', default=False, - help=_('If there is a cover graphic detected in the source file, use that instead of the specified cover.')) - - parser.add_option('-o', '--output', action='store', default=None, \ - help=_('Output file name. Default is derived from input filename')) - parser.add_option('--ignore-tables', action='store_true', default=False, dest='ignore_tables', - help=_('Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.')) - laf = parser.add_option_group('LOOK AND FEEL') - laf.add_option('--base-font-size', action='store', type='float', default=10., - help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0. 
Default: %defaultpt''')) - laf.add_option('--enable-autorotation', action='store_true', default=False, - help=_('Enable autorotation of images that are wider than the screen width.'), - dest='autorotation') - laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float', - help=_('Set the space between words in pts. Default is %default')) - laf.add_option('--blank-after-para', action='store_true', default=False, - dest='blank_after_para', help=_('Separate paragraphs by blank lines.')) - laf.add_option('--header', action='store_true', default=False, dest='header', - help=_('Add a header to all the pages with title and author.')) - laf.add_option('--headerformat', default="%t by %a", dest='headerformat', type='string', - help=_('Set the format of the header. %a is replaced by the author and %t by the title. Default is %default')) - laf.add_option('--header-separation', default=0, type='int', - help=_('Add extra spacing below the header. Default is %default px.')) - laf.add_option('--override-css', default=None, dest='_override_css', type='string', - help=_('Override the CSS. Can be either a path to a CSS stylesheet or a string. If it is a string it is interpreted as CSS.')) - laf.add_option('--use-spine', default=False, dest='use_spine', action='store_true', - help=_('Use the element from the OPF file to determine the order in which the HTML files are appended to the LRF. The .opf file must be in the same directory as the base HTML file.')) - laf.add_option('--minimum-indent', default=0, type='float', - help=_('Minimum paragraph indent (the indent of the first line of a paragraph) in pts. Default: %default')) - laf.add_option('--font-delta', action='store', type='float', default=0., \ - help=_("""Increase the font size by 2 * FONT_DELTA pts and """ - '''the line spacing by FONT_DELTA pts. 
FONT_DELTA can be a fraction.''' - """If FONT_DELTA is negative, the font size is decreased."""), - dest='font_delta') - laf.add_option('--ignore-colors', action='store_true', default=False, dest='ignore_colors', - help=_('Render all content as black on white instead of the colors specified by the HTML or CSS.')) - - page = parser.add_option_group('PAGE OPTIONS') - profiles = profile_map.keys() - page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice', - choices=profiles, action='callback', callback=profile_from_string, - help=_('''Profile of the target device for which this LRF is ''' - '''being generated. The profile determines things like the ''' - '''resolution and screen size of the target device. ''' - '''Default: %s Supported profiles: ''')%(PRS500_PROFILE.name,)+\ - ', '.join(profiles)) - page.add_option('--left-margin', default=20, dest='left_margin', type='int', - help=_('''Left margin of page. Default is %default px.''')) - page.add_option('--right-margin', default=20, dest='right_margin', type='int', - help=_('''Right margin of page. Default is %default px.''')) - page.add_option('--top-margin', default=10, dest='top_margin', type='int', - help=_('''Top margin of page. Default is %default px.''')) - page.add_option('--bottom-margin', default=0, dest='bottom_margin', type='int', - help=_('''Bottom margin of page. Default is %default px.''')) - page.add_option('--render-tables-as-images', default=False, action='store_true', - help=_('Render tables in the HTML as images (useful if the document has large or complex tables)')) - page.add_option('--text-size-multiplier-for-rendered-tables', type='float', default=1.0, - help=_('Multiply the size of text in rendered tables by this factor. 
Default is %default')) - - link = parser.add_option_group('LINK PROCESSING OPTIONS') - link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \ - dest='link_levels', - help=_(r'''The maximum number of levels to recursively process ''' - '''links. A value of 0 means thats links are not followed. ''' - '''A negative value means that tags are ignored.''')) - link.add_option('--link-exclude', dest='link_exclude', default='@', - help=_('''A regular expression. tags whose href ''' - '''matches will be ignored. Defaults to %default''')) - link.add_option('--no-links-in-toc', action='store_true', default=False, - dest='no_links_in_toc', - help=_('''Don't add links to the table of contents.''')) - chapter = parser.add_option_group('CHAPTER OPTIONS') - chapter.add_option('--disable-chapter-detection', action='store_true', - default=False, dest='disable_chapter_detection', - help=_('''Prevent the automatic detection chapters.''')) - chapter.add_option('--chapter-regex', dest='chapter_regex', - default='chapter|book|appendix', - help=_('''The regular expression used to detect chapter titles.''' - ''' It is searched for in heading tags (h1-h6). Defaults to %default''')) - chapter.add_option('--chapter-attr', default='$,,$', - help=_('Detect a chapter beginning at an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". You can set the attribute to "none" to match only on tag names. So for example, to match all h2 tags, you would use "h2,none,". 
Default is %default''')) - chapter.add_option('--page-break-before-tag', dest='page_break', default='h[12]', - help=_('''If html2lrf does not find any page breaks in the ''' - '''html file and cannot detect chapter headings, it will ''' - '''automatically insert page-breaks before the tags whose ''' - '''names match this regular expression. Defaults to %default. ''' - '''You can disable it by setting the regexp to "$". ''' - '''The purpose of this option is to try to ensure that ''' - '''there are no really long pages as this degrades the page ''' - '''turn performance of the LRF. Thus this option is ignored ''' - '''if the current page has only a few elements.''')) - chapter.add_option('--force-page-break-before-tag', dest='force_page_break', - default='$', help=_('Force a page break before tags whose names match this regular expression.')) - chapter.add_option('--force-page-break-before-attr', dest='force_page_break_attr', - default='$,,$', help=_('Force a page break before an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". 
Default is %default''')) - chapter.add_option('--add-chapters-to-toc', action='store_true', - default=False, dest='add_chapters_to_toc', - help=_('''Add detected chapters to the table of contents.''')) - prepro = parser.add_option_group('PREPROCESSING OPTIONS') - prepro.add_option('--baen', action='store_true', default=False, dest='baen', - help=_('''Preprocess Baen HTML files to improve generated LRF.''')) - prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml', - help=_('''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')) - prepro.add_option('--book-designer', action='store_true', default=False, dest='book_designer', - help=_('''Use this option on html0 files from Book Designer.''')) - - fonts = parser.add_option_group('FONT FAMILIES', - _('''Specify trutype font families for serif, sans-serif and monospace fonts. ''' - '''These fonts will be embedded in the LRF file. Note that custom fonts lead to ''' - '''slower page turns. ''' - '''For example: ''' - '''--serif-family "Times New Roman" - ''')) - fonts.add_option('--serif-family', - default=None, dest='serif_family', type='string', - help=_('The serif family of fonts to embed')) - fonts.add_option('--sans-family', - default=None, dest='sans_family', type='string', - help=_('The sans-serif family of fonts to embed')) - fonts.add_option('--mono-family', - default=None, dest='mono_family', type='string', - help=_('The monospace family of fonts to embed')) - - debug = parser.add_option_group('DEBUG OPTIONS') - debug.add_option('--verbose', dest='verbose', action='store_true', default=False, - help=_('''Be verbose while processing''')) - debug.add_option('--lrs', action='store_true', dest='lrs', \ - help=_('Convert to LRS'), default=False) - parser.add_option('--minimize-memory-usage', action='store_true', default=False, - help=_('Minimize memory usage at the cost of longer processing times. 
Use this option if you are on a memory constrained machine.')) - parser.add_option('--encoding', default=None, - help=_('Specify the character encoding of the source file. If the output LRF file contains strange characters, try changing this option. A common encoding for files from windows computers is cp-1252. Another common choice is utf-8. The default is to try and guess the encoding.')) - - return parser + + name = 'prs500' def find_custom_fonts(options, logger): from calibre.utils.fontconfig import files_for_family @@ -238,16 +48,16 @@ def find_custom_fonts(options, logger): f = family(options.sans_family) fonts['sans'] = files_for_family(f) if not fonts['sans']: - logger.warn('Unable to find sans family %s'%f) + logger.warn('Unable to find sans family %s'%f) if options.mono_family: f = family(options.mono_family) fonts['mono'] = files_for_family(f) if not fonts['mono']: - logger.warn('Unable to find mono family %s'%f) + logger.warn('Unable to find mono family %s'%f) return fonts - - -def Book(options, logger, font_delta=0, header=None, + + +def Book(options, logger, font_delta=0, header=None, profile=PRS500_PROFILE, **settings): ps = {} ps['topmargin'] = options.top_margin @@ -258,7 +68,7 @@ def Book(options, logger, font_delta=0, header=None, - profile.fudge if header: hdr = Header() - hb = TextBlock(textStyle=TextStyle(align='foot', + hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=int(profile.header_font_size*10)), blockStyle=BlockStyle(blockwidth=ps['textwidth'])) hb.append(header) @@ -269,20 +79,20 @@ def Book(options, logger, font_delta=0, header=None, ps['topmargin'] = 0 ps['textheight'] = profile.screen_height - (options.bottom_margin + ps['topmargin']) \ - ps['headheight'] - ps['headsep'] - profile.fudge - + fontsize = int(10*profile.font_size+font_delta*20) baselineskip = fontsize + 20 fonts = find_custom_fonts(options, logger) - tsd = dict(fontsize=fontsize, - parindent=int(10*profile.parindent), + tsd = dict(fontsize=fontsize, + 
parindent=int(10*profile.parindent), linespace=int(10*profile.line_space), baselineskip=baselineskip, wordspace=10*options.wordspace) if fonts['serif'] and fonts['serif'].has_key('normal'): tsd['fontfacename'] = fonts['serif']['normal'][1] - - book = _Book(textstyledefault=tsd, - pagestyledefault=ps, + + book = _Book(textstyledefault=tsd, + pagestyledefault=ps, blockstyledefault=dict(blockwidth=ps['textwidth']), bookid=uuid4().hex, **settings) @@ -291,7 +101,7 @@ def Book(options, logger, font_delta=0, header=None, for font in fonts[family].values(): book.embed_font(*font) FONT_FILE_MAP[font[1]] = font[0] - + for family in ['serif', 'sans', 'mono']: if not fonts[family]: fonts[family] = { 'normal' : (None, profile.default_fonts[family]) } @@ -299,4 +109,3 @@ def Book(options, logger, font_delta=0, header=None, raise ConversionError, 'Could not find the normal version of the ' + family + ' font' return book, fonts -from calibre import entity_to_unicode diff --git a/src/calibre/ebooks/lrf/any/__init__.py b/src/calibre/ebooks/lrf/any/__init__.py deleted file mode 100644 index f832dbb7fc..0000000000 --- a/src/calibre/ebooks/lrf/any/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' diff --git a/src/calibre/ebooks/lrf/any/convert_from.py b/src/calibre/ebooks/lrf/any/convert_from.py deleted file mode 100644 index fdfe1c54d5..0000000000 --- a/src/calibre/ebooks/lrf/any/convert_from.py +++ /dev/null @@ -1,199 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -'''Convert any ebook file into a LRF file.''' - -import sys, os, logging, shutil, tempfile, re - -from calibre.ebooks import UnknownFormatError -from calibre.ebooks.lrf import option_parser as _option_parser -from calibre import __appname__, setup_cli_handlers, extract -from calibre.ptempfile import TemporaryDirectory -from calibre.ebooks.lrf.lit.convert_from import process_file as lit2lrf -from 
calibre.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf -from calibre.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf -from calibre.ebooks.lrf.txt.convert_from import process_file as txt2lrf -from calibre.ebooks.lrf.html.convert_from import process_file as html2lrf -from calibre.ebooks.lrf.epub.convert_from import process_file as epub2lrf -from calibre.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf -from calibre.ebooks.lrf.fb2.convert_from import process_file as fb22lrf - -from calibre.customize.ui import run_plugins_on_postprocess, run_plugins_on_preprocess - -def largest_file(files): - maxsize, file = 0, None - for f in files: - size = os.stat(f).st_size - if size > maxsize: - maxsize = size - file = f - return file - -def find_htmlfile(dir): - ext_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE) - toc_pat = re.compile(r'toc', re.IGNORECASE) - index_pat = re.compile(r'index', re.IGNORECASE) - toc_files, index_files, files = [], [], [] - - for root, dirs, _files in os.walk(dir): - for f in _files: - f = os.path.abspath(os.path.join(root, f)) - ext = os.path.splitext(f)[1] - if ext and ext_pat.match(ext): - toc_files.append(f) if toc_pat.search(f) else \ - index_files.append(f) if index_pat.search(f) else \ - files.append(f) - a = toc_files if toc_files else index_files if index_files else files - if a: - return largest_file(a) - -def number_of_unhidden_files(base, listing): - ans = 0 - for i in listing: - i = os.path.join(base, i) - if os.path.isdir(i) or os.path.basename(i).startswith('.'): - continue - ans += 1 - return ans - -def unhidden_directories(base, listing): - ans = [] - for i in listing: - if os.path.isdir(os.path.join(base, i)) and not i.startswith('__') and \ - not i.startswith('.'): - ans.append(i) - return ans - -def traverse_subdirs(tdir): - temp = os.listdir(tdir) - if number_of_unhidden_files(tdir, temp) == 0: - try: - cdir = os.path.join(tdir, unhidden_directories(tdir, temp)[0]) - return 
traverse_subdirs(cdir) - except IndexError: - pass - return tdir - -def handle_archive(path): - tdir = tempfile.mkdtemp(prefix=__appname__+'_'+'archive_') - extract(path, tdir) - files = [] - cdir = traverse_subdirs(tdir) - file = None - exts = ['lit', 'rtf', 'fb2','pdf', 'txt', 'epub', 'mobi', 'prc'] - candidates = map(lambda x:os.path.join(cdir, x), os.listdir(cdir)) - for ext in exts: - for f in candidates: - if f.lower().endswith('.'+ext): - files.append(f) - file = largest_file(files) - if not file: - file = find_htmlfile(cdir) - if isinstance(file, str): - file = file.decode(sys.getfilesystemencoding()) - return tdir, file - -def odt2lrf(path, options, logger): - from calibre.ebooks.odt.to_oeb import Extract - from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file - - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('odt2lrf') - setup_cli_handlers(logger, level) - - with TemporaryDirectory('_odt2lrf') as tdir: - opf = Extract()(path, tdir) - options.use_spine = True - options.encoding = 'utf-8' - html_process_file(opf.replace('metadata.opf', 'index.html'), options, logger) - -def process_file(path, options, logger=None): - path = os.path.abspath(os.path.expanduser(path)) - path = run_plugins_on_preprocess(path) - tdir = None - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('any2lrf') - setup_cli_handlers(logger, level) - if not os.access(path, os.R_OK): - logger.critical('Cannot read from %s', path) - return 1 - ext = os.path.splitext(path)[1] - if not ext or ext == '.': - logger.critical('Unknown file type: %s', path) - return 1 - ext = ext[1:].lower() - cwd = os.getcwd() - if not options.output: - fmt = '.lrs' if options.lrs else '.lrf' - options.output = os.path.splitext(os.path.basename(path))[0] + fmt - options.output = os.path.abspath(os.path.expanduser(options.output)) - if ext in ['zip', 'rar', 
'oebzip']: - newpath = None - try: - tdir, newpath = handle_archive(path) - except: - logger.exception(' ') - if not newpath: - raise UnknownFormatError('Could not find ebook in archive') - path = newpath - logger.info('Found ebook in archive: %s', repr(path)) - try: - ext = os.path.splitext(path)[1][1:].lower() - convertor = None - if 'htm' in ext: - convertor = html2lrf - elif 'lit' == ext: - convertor = lit2lrf - elif 'pdf' == ext: - convertor = pdf2lrf - elif 'rtf' == ext: - convertor = rtf2lrf - elif 'txt' == ext: - convertor = txt2lrf - elif 'epub' == ext: - convertor = epub2lrf - elif ext in ['mobi', 'prc', 'azw']: - convertor = mobi2lrf - elif ext == 'fb2': - convertor = fb22lrf - elif ext == 'odt': - convertor = odt2lrf - if not convertor: - raise UnknownFormatError(_('Converting from %s to LRF is not supported.')%ext) - convertor(path, options, logger) - - finally: - os.chdir(cwd) - if tdir and os.path.exists(tdir): - shutil.rmtree(tdir) - return 0 - - -def option_parser(gui_mode=False): - return _option_parser(usage=_('''\ -any2lrf [options] myfile - -Convert any ebook format into LRF. Supported formats are: -LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or -ZIP archive, looking for an ebook inside the archive. 
- '''), gui_mode=gui_mode) - - -def main(args=sys.argv, logger=None, gui_mode=False): - parser = option_parser(gui_mode) - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print _('No file to convert specified.') - return 1 - - src = args[1] - if not isinstance(src, unicode): - src = src.decode(sys.getfilesystemencoding()) - return process_file(src, options, logger) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/epub/__init__.py b/src/calibre/ebooks/lrf/epub/__init__.py deleted file mode 100644 index ab32bc9c41..0000000000 --- a/src/calibre/ebooks/lrf/epub/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - diff --git a/src/calibre/ebooks/lrf/epub/convert_from.py b/src/calibre/ebooks/lrf/epub/convert_from.py deleted file mode 100644 index c564930ea5..0000000000 --- a/src/calibre/ebooks/lrf/epub/convert_from.py +++ /dev/null @@ -1,75 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - -import os, sys, shutil, logging -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks import ConversionError, DRMError -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre.ebooks.metadata.opf import OPF -from calibre.ebooks.metadata.epub import OCFDirReader -from calibre.utils.zipfile import ZipFile -from calibre import setup_cli_handlers -from calibre.ptempfile import PersistentTemporaryDirectory - - -def option_parser(): - return lrf_option_parser( -_('''Usage: %prog [options] mybook.epub - - -%prog converts mybook.epub to mybook.lrf''') - ) - -def generate_html(pathtoepub, logger): - if not os.access(pathtoepub, os.R_OK): - raise ConversionError('Cannot read from ' + pathtoepub) - tdir = PersistentTemporaryDirectory('_epub2lrf') - #os.rmdir(tdir) - try: - ZipFile(pathtoepub).extractall(tdir) - except: - raise ConversionError, '.epub extraction failed' 
- if os.path.exists(os.path.join(tdir, 'META-INF', 'encryption.xml')): - raise DRMError(os.path.basename(pathtoepub)) - - return tdir - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('epub2lrf') - setup_cli_handlers(logger, level) - epub = os.path.abspath(os.path.expanduser(path)) - tdir = generate_html(epub, logger) - try: - ocf = OCFDirReader(tdir) - htmlfile = ocf.opf.spine[0].path - options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE]) - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - options.use_spine = True - - html_process_file(htmlfile, options, logger=logger) - finally: - try: - shutil.rmtree(tdir) - except: - logger.warning('Failed to delete temporary directory '+tdir) - - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No epub file specified' - return 1 - process_file(args[1], options, logger) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/feeds/__init__.py b/src/calibre/ebooks/lrf/feeds/__init__.py deleted file mode 100644 index ec763fbda7..0000000000 --- a/src/calibre/ebooks/lrf/feeds/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' diff --git a/src/calibre/ebooks/lrf/feeds/convert_from.py b/src/calibre/ebooks/lrf/feeds/convert_from.py deleted file mode 100644 index 6965ea7bf3..0000000000 --- a/src/calibre/ebooks/lrf/feeds/convert_from.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -Convert web feeds to LRF files. 
-''' -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.lrf.html.convert_from import process_file -from calibre.web.feeds.main import option_parser as feeds_option_parser -from calibre.web.feeds.main import run_recipe -from calibre.ptempfile import TemporaryDirectory -from calibre import sanitize_file_name, strftime - -import sys, os - -def option_parser(): - parser = feeds_option_parser() - parser.remove_option('--output-dir') - parser.remove_option('--lrf') - parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk')) - lrf_parser = lrf_option_parser('') - lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf')) - parser.merge(lrf_parser) - return parser - -def main(args=sys.argv, notification=None, handler=None): - parser = option_parser() - opts, args = parser.parse_args(args) - opts.lrf = True - - if len(args) != 2 and opts.feeds is None: - parser.print_help() - return 1 - - recipe_arg = args[1] if len(args) > 1 else None - - with TemporaryDirectory('_feeds2lrf') as tdir: - opts.output_dir = tdir - - recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler) - - htmlfile = os.path.join(tdir, 'index.html') - if not os.access(htmlfile, os.R_OK): - raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg) - - lparser = lrf_option_parser('') - ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0] - parser.merge_options(ropts, opts) - - if not opts.output: - ext = '.lrs' if opts.lrs else '.lrf' - fname = recipe.title + strftime(recipe.timefmt)+ext - opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname)) - print 'Generating LRF...' 
- process_file(htmlfile, opts) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index ebfdecc6f4..515ec4182d 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -1,12 +1,12 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -""" +""" Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. """ -import os, re, sys, copy, glob, logging, tempfile +import os, re, sys, copy, glob, tempfile from collections import deque from urllib import unquote from urlparse import urlparse @@ -16,6 +16,7 @@ from calibre.customize.ui import run_plugins_on_postprocess try: from PIL import Image as PILImage + PILImage except ImportError: import Image as PILImage @@ -25,13 +26,12 @@ from calibre.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError, Sup, Sub, EmpLine -from calibre.ebooks.lrf.pylrs.pylrs import Span -from calibre.ebooks.lrf import Book, entity_to_unicode -from calibre.ebooks.lrf import option_parser as lrf_option_parser +from calibre.ebooks.lrf.pylrs.pylrs import Span +from calibre.ebooks.lrf import Book from calibre.ebooks import ConversionError -from calibre.ebooks.lrf.html.table import Table -from calibre import filename_to_utf8, setup_cli_handlers, __appname__, \ - fit_image, preferred_encoding +from calibre.ebooks.lrf.html.table import Table +from calibre import filename_to_utf8, __appname__, \ + fit_image, preferred_encoding, entity_to_unicode from calibre.ptempfile import PersistentTemporaryFile from calibre.devices.interface import DevicePlugin as Device from calibre.ebooks.lrf.html.color_map import lrs_color @@ -43,7 +43,7 @@ def update_css(ncss, ocss): 
ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] - + def munge_paths(basepath, url): purl = urlparse(unquote(url),) path, fragment = purl[2], purl[5] @@ -74,7 +74,7 @@ def strip_style_comments(match): return src def tag_regex(tagname): - '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' + '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname), \ close=r''%dict(t=tagname)) @@ -82,49 +82,49 @@ class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) - - + + MARKUP_MASSAGE = [ # Close tags - (re.compile(r']*)?/>', re.IGNORECASE), + (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), - # Strip comments from )', re.IGNORECASE|re.DOTALL), strip_style_comments), - + # Remove self closing script tags as they also mess up BeautifulSoup (re.compile(r'(?i)]+?/>'), lambda match: ''), - + # BeautifulSoup treats self closing
    tags as open
    tags - (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), + (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), lambda match: '
    '%match.group(1)) - + ] # Fix Baen markup - BAEN = [ - (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), + BAEN = [ + (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), lambda match: match.group(1)), - (re.compile(r'

    \s*(\s*)\s*

    ', re.IGNORECASE), + (re.compile(r'

    \s*(\s*)\s*

    ', re.IGNORECASE), lambda match: match.group(1)), - (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), + (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: ''), ] # Fix pdftohtml markup @@ -135,14 +135,14 @@ class HTMLConverter(object): (re.compile(r'\d+
    ', re.IGNORECASE), lambda match: ''), # Remove
    and replace

    with

    (re.compile(r'\s*', re.IGNORECASE), lambda match: '

    '), - (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 + (re.compile(r'(.*)', re.IGNORECASE), + lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 else match.group(1)), # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: ''), - + ] - + # Fix Book Designer markup BOOK_DESIGNER = [ # HR @@ -161,23 +161,23 @@ class HTMLConverter(object): (re.compile('<]*?>( ){4}

    ', re.IGNORECASE), lambda match : '

    '), ] - + def __hasattr__(self, attr): if hasattr(self.options, attr): return True return object.__hasattr__(self, attr) - + def __getattr__(self, attr): if hasattr(self.options, attr): return getattr(self.options, attr) return object.__getattribute__(self, attr) - + def __setattr__(self, attr, val): if hasattr(self.options, attr): setattr(self.options, attr, val) else: object.__setattr__(self, attr, val) - + CSS = { 'h1' : {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'}, 'h2' : {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'}, @@ -201,27 +201,28 @@ class HTMLConverter(object): 'sup' : {'vertical-align': 'super', 'font-size': '60%'}, 'sub' : {'vertical-align': 'sub', 'font-size': '60%'}, } - + def __init__(self, book, fonts, options, logger, paths): ''' Convert HTML files at C{paths} and add to C{book}. After creating the object, you must call L{self.writeto} to output the LRF/S file. - - @param book: The LRF book + + @param book: The LRF book @type book: L{lrf.pylrs.Book} @param fonts: dict specifying the font families to use ''' - # Defaults for various formatting tags + # Defaults for various formatting tags object.__setattr__(self, 'options', options) + self.log = logger self.fonts = fonts #: dict specifying font families to use - # Memory - self.scaled_images = {} #: Temporary files with scaled version of images - self.rotated_images = {} #: Temporary files with rotated version of images + # Memory + self.scaled_images = {} #: Temporary files with scaled version of images + self.rotated_images = {} #: Temporary files with rotated version of images self.text_styles = [] #: Keep track of already used textstyles self.block_styles = [] #: Keep track of already used blockstyles self.images = {} #: Images referenced in the HTML document self.targets = {} #: and id elements - self.links = deque() #: elements + self.links = deque() #: elements self.processed_files = [] self.extra_toc_entries = [] #: TOC entries gleaned 
from semantic information self.image_memory = [] @@ -235,30 +236,30 @@ class HTMLConverter(object): self.preserve_block_style = False #: Used so that

    tags in

    elements are handled properly self.avoid_page_break = False self.current_page = book.create_page() - - # Styles - self.blockquote_style = book.create_block_style(sidemargin=60, + + # Styles + self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) - - + + self.in_table = False # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 - + self.book = book #: The Book object representing a BBeB book - + self.override_css = {} self.override_pcss = {} - + if self._override_css is not None: if os.access(self._override_css, os.R_OK): src = open(self._override_css, 'rb').read() else: src = self._override_css - match = self.PAGE_BREAK_PAT.search(src) + match = self.PAGE_BREAK_PAT.search(src) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) @@ -266,12 +267,12 @@ class HTMLConverter(object): update_css(ncss, self.override_css) if npcss: update_css(npcss, self.override_pcss) - - - + + + paths = [os.path.abspath(path) for path in paths] paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, unicode) else path for path in paths] - + while len(paths) > 0 and self.link_level <= self.link_levels: for path in paths: if path in self.processed_files: @@ -287,62 +288,62 @@ class HTMLConverter(object): if link['path'] == path: self.links.remove(link) break - self.log_warn('Could not process '+path) + self.log.warn('Could not process '+path) if self.verbose: - self.log_exception(' ') + self.log.exception(' ') self.links = self.process_links() self.link_level += 1 paths = [link['path'] for link in self.links] - + if self.current_page is not None and self.current_page.has_text(): self.book.append(self.current_page) - + for text, tb in self.extra_toc_entries: self.book.addTocEntry(text, tb) - + if self.base_font_size > 0: - self.log_info('\tRationalizing font sizes...') + 
self.log.info('\tRationalizing font sizes...') self.book.rationalize_font_sizes(self.base_font_size) - + def is_baen(self, soup): - return bool(soup.find('meta', attrs={'name':'Publisher', + return bool(soup.find('meta', attrs={'name':'Publisher', 'content':re.compile('Baen', re.IGNORECASE)})) - + def is_book_designer(self, raw): return bool(re.search('<]*id=BookTitle', raw)) - + def preprocess(self, raw): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE) - + if not self.book_designer and self.is_book_designer(raw): self.book_designer = True - self.log_info(_('\tBook Designer file detected.')) - - self.log_info(_('\tParsing HTML...')) - + self.log.info(_('\tBook Designer file detected.')) + + self.log.info(_('\tParsing HTML...')) + if self.baen: nmassage.extend(HTMLConverter.BAEN) - + if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) if self.book_designer: nmassage.extend(HTMLConverter.BOOK_DESIGNER) try: - soup = BeautifulSoup(raw, + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) except ConversionError, err: if 'Failed to coerce to unicode' in str(err): raw = unicode(raw, 'utf8', 'replace') - soup = BeautifulSoup(raw, + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) else: raise if not self.baen and self.is_baen(soup): self.baen = True - self.log_info(_('\tBaen file detected. Re-parsing...')) + self.log.info(_('\tBaen file detected. 
Re-parsing...')) return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') @@ -358,13 +359,13 @@ class HTMLConverter(object): try: dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') dump.write(unicode(soup).encode('utf-8')) - self.log_info(_('Written preprocessed HTML to ')+dump.name) + self.log.info(_('Written preprocessed HTML to ')+dump.name) dump.close() except: pass - + return soup - + def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() @@ -373,13 +374,13 @@ class HTMLConverter(object): self.css[selector].update(self.override_css[selector]) else: self.css[selector] = self.override_css[selector] - + upath = path.encode(sys.getfilesystemencoding()) if isinstance(path, unicode) else path self.file_name = os.path.basename(upath.decode(sys.getfilesystemencoding())) - self.log_info(_('Processing %s'), repr(upath) if self.verbose else repr(self.file_name)) - + self.log.info(_('Processing %s')%( repr(upath) if self.verbose else repr(self.file_name))) + if not os.path.exists(upath): - upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names + upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names f = open(upath, 'rb') raw = f.read() if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files @@ -390,7 +391,7 @@ class HTMLConverter(object): raw = xml_to_unicode(raw, self.verbose)[0] f.close() soup = self.preprocess(raw) - self.log_info(_('\tConverting to BBeB...')) + self.log.info(_('\tConverting to BBeB...')) self.current_style = {} self.page_break_found = False if not isinstance(path, unicode): @@ -399,9 +400,9 @@ class HTMLConverter(object): self.previous_text = '\n' self.tops[path] = self.parse_file(soup) self.processed_files.append(path) - - - + + + def parse_css(self, style): """ Parse the contents of a - - -%(body)s - - -''' - res = [] - para = [] - styles = [] - for page in self.pages: - 
res.append(u''%page.id) - for group in page.groups: - if group.is_header or group.is_footer: - continue - if group.style is not None: - styles.append(u'.%s %s\n'%(group.id, group.style.to_css())) - for line in group.lines: - if line.is_para_start: - indent = group.left_margin - line.left - if para: - res.append(u'

    %s

    '%(indent, ''.join(para))) - para = [] - para.append(line.to_xhtml(group.id)) - if page.page_break_after: - res.append(u'
    ') - if para: - res.append(u'

    %s

    '%(''.join(para))) - para = [] - - return (header%dict(style='\n'.join(styles), body='\n'.join(res))).encode('utf-8') - -class PDFConverter(object): - - @classmethod - def generate_xml(cls, pathtopdf, logger): - pathtopdf = os.path.abspath(pathtopdf) - tdir = tempfile.mkdtemp('pdf2xml', __appname__) - atexit.register(shutil.rmtree, tdir) - xmlfile = os.path.basename(pathtopdf)+'.xml' - os.chdir(tdir) - cmd = PDFTOXML + ' -outline "%s" "%s"'%(pathtopdf, xmlfile) - p = subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT, - stdout=subprocess.PIPE) - log = p.stdout.read() - ret = p.wait() - if ret != 0: - raise ConversionError, log - xmlfile = os.path.join(tdir, xmlfile) - if os.stat(xmlfile).st_size < 20: - raise ConversionError(os.path.basename(pathtopdf) + ' does not allow copying of text.') - return xmlfile - - - def __init__(self, pathtopdf, logger, opts): - self.cwd = os.getcwdu() - self.logger = logger - self.opts = opts - try: - self.logger.info('Converting PDF to XML') - self.xmlfile = self.generate_xml(pathtopdf, self.logger) - self.tdir = os.path.dirname(self.xmlfile) - self.data_dir = self.xmlfile + '_data' - outline_file = self.xmlfile.rpartition('.')[0]+'_outline.xml' - self.logger.info('Parsing XML') - self.document = PDFDocument(self.xmlfile) - self.outline = parse(outline_file) - finally: - os.chdir(self.cwd) - - def convert(self, output_dir): - doc = self.document.to_xhtml() - open(os.path.join(output_dir, 'document.html'), 'wb').write(doc) - - - -def option_parser(): - parser = OptionParser(usage=\ -''' -%prog [options] myfile.pdf - -Convert a PDF file to a HTML file. -''') - parser.add_option('-o', '--output-dir', default='.', - help=_('Path to output directory in which to create the HTML file. 
Defaults to current directory.')) - parser.add_option('--verbose', default=False, action='store_true', - help=_('Be more verbose.')) - return parser - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args() - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('pdf2html') - setup_cli_handlers(logger, level) - if len(args) != 1: - parser.print_help() - print _('You must specify a single PDF file.') - return 1 - options.output_dir = os.path.abspath(options.output_dir) - converter = PDFConverter(os.path.abspath(args[0]), logger, options) - converter.convert(options.output_dir) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/txt/__init__.py b/src/calibre/ebooks/lrf/txt/__init__.py deleted file mode 100644 index c705e32a66..0000000000 --- a/src/calibre/ebooks/lrf/txt/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/txt/convert_from.py b/src/calibre/ebooks/lrf/txt/convert_from.py deleted file mode 100644 index 89441f9d6d..0000000000 --- a/src/calibre/ebooks/lrf/txt/convert_from.py +++ /dev/null @@ -1,112 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -""" -Convert .txt files to .lrf -""" -import os, sys, codecs, logging, re, shutil - -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks import ConversionError -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre.ebooks.markdown import markdown -from calibre import setup_cli_handlers -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf import OPFCreator - -def option_parser(): - parser = lrf_option_parser( -_('''%prog [options] 
mybook.txt - - -%prog converts mybook.txt to mybook.lrf''')) - parser.add_option('--debug-html-generation', action='store_true', default=False, - dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.')) - return parser - -def fix_image_includes(sdir, tdir, match): - path = match.group(1).split('/') - src = os.path.join(sdir, *path) - dest = os.path.join(tdir, *path) - p = os.path.dirname(dest) - if not os.path.exists(p): - os.makedirs(p) - if not os.path.exists(dest): - shutil.copyfile(src, dest) - - -def generate_html(txtfile, encoding, tdir): - ''' - Convert txtfile to html and return a PersistentTemporaryFile object pointing - to the file with the HTML. - ''' - txtfile = os.path.abspath(txtfile) - enc = encoding - if not encoding: - encodings = ['cp1252', 'latin-1', 'utf8', 'iso-8859-1', 'koi8_r', 'koi8_u'] - txt, enc = None, None - for encoding in encodings: - try: - txt = codecs.open(txtfile, 'rb', encoding).read() - except UnicodeDecodeError: - continue - enc = encoding - break - if txt == None: - raise ConversionError, 'Could not detect encoding of %s'%(txtfile,) - else: - txt = codecs.open(txtfile, 'rb', enc).read() - - print 'Converting text to HTML...' 
- md = markdown.Markdown( - extensions=['footnotes', 'tables', 'toc'], - safe_mode=False, - ) - html = ''+md.convert(txt)+'' - for match in re.finditer(r']*src="([^"]+)"', html): - fix_image_includes(os.path.dirname(txtfile), tdir, match) - p = os.path.join(tdir, 'index.html') - open(p, 'wb').write(html.encode('utf-8')) - mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')]) - opf = OPFCreator(tdir, mi) - opf.create_manifest([(os.path.join(tdir, 'index.html'), None)]) - opf.create_spine([os.path.join(tdir, 'index.html')]) - opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb')) - return p - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('txt2lrf') - setup_cli_handlers(logger, level) - txt = os.path.abspath(os.path.expanduser(path)) - if not hasattr(options, 'debug_html_generation'): - options.debug_html_generation = False - tdir = PersistentTemporaryDirectory('_txt2lrf') - htmlfile = generate_html(txt, options.encoding, tdir) - options.encoding = 'utf-8' - if not options.debug_html_generation: - options.force_page_break = 'h2' - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not options.title: - options.title = os.path.splitext(os.path.basename(path))[0] - html_process_file(htmlfile, options, logger) - else: - print open(htmlfile, 'rb').read() - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No txt file specified' - return 1 - process_file(args[1], options, logger) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/txt/demo/demo.txt b/src/calibre/ebooks/lrf/txt/demo/demo.txt deleted 
file mode 100644 index af4139241b..0000000000 --- a/src/calibre/ebooks/lrf/txt/demo/demo.txt +++ /dev/null @@ -1,89 +0,0 @@ -Demonstration of `txt2lrf` -========================== - -`txt2lrf` provides a convenient way to create LRF files with good formatting. -`txt2lrf` recognizes a simple markup language called *markdown*. - -The idea is to provide a lightweight markup that can be used to create -TXT files that can be read by themselves or automatically converted to LRF. -[{@name=toc}]() - -

    - -///Table of Contents/// - - -Text formatting ---------------- -**Bold** and *italic* text is easily specified. - -> Blockquotes are also very simple to specify. -> This is a basic blockquote paragraph. I absolutely -> love block quotes don't you? - - This is a preformatted code block. No formatting rules are applied to text in this block and it is rendered in a monospaced font. - - -For details on the text formatting syntax visit - - http://daringfireball.net/projects/markdown/syntax -___ -[Table of Contents](#toc) - -Lists ------ -Both ordered and unordered lists are supported. - - -### Unordered lists - -+ What a -+ *nice* -+ list - - - -### Ordered lists - -1. One -2. Two -3. Three - -**Note:** Nested lists are not supported - -___ -[Table of Contents](#toc) - -Tables ------- - -Simple tables are easily generated - -| |* Col 1 *|* Col 2 *| -|* Row 1 *| (1, 1) | (1, 2) | -|* Row 2 *| (2, 1) | (2, 2) | - -**Note:** Nested tables are not supported - -___ -[Table of Contents](#toc) - -Images ------- - -`txt2lrf` also has support for inline images like -![this one](small.jpg) this one. - -___ -[Table of Contents](#toc) - -Automatic TOC Creation ----------------------- - -By inserting `///Table of Contents///` into the text at some point -a table of contents is automatically generated with links that point -to all headings underlined with `-------`. - -___ -[Table of Contents](#toc) - diff --git a/src/calibre/ebooks/lrf/txt/demo/small.jpg b/src/calibre/ebooks/lrf/txt/demo/small.jpg deleted file mode 100644 index 6dae5fde42430e0b1638c20dccc7c31d0911051c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2055 zcmb79dpy$%8~^Rj5;l{p*~EmEipohzZOFFFWy>v7){PN|xkROe6eTm)mXXWYh{`>V zh%!zQZG;k0Vx``W_asry&UruY`~UlWKF{}`@8$D6E5j=jfHr+UZ9f14fdHy%fR#7E z8pjy!NdRCl01^NIYN}OXtvmv7004%lz5xB_TouC+00gE61%XxSFWLYI0)c=ba3~bf! 
zac_P`fqFm?4KH=aH^VEVZc03$cWMt^b#XG@T67Dw5OKfwNsiyD%En$J7%vx{i5$3K#!T!8yeQIl5PpI-%1L@!gg%_nLy^B9s1!DV$pse&{y1l?@yAZwjH^*?TPp` z?(?dg2iv|BL#p)dk8P+pEk1nrkCNV&%BXOIq zM5edr3AKnOuu_oN9xnL{`du!c!l$jaw-gIMVn&{%blM;4x9hF_TyyT_*r6&pmQuaD zY($#0%%G&sUv-l$^{DT^Q102c?Cdt#cTFeM^}JA=JT810FRmv*%dhGB>(pgsUI*E7 z8Nu>R4V6qzcQUhSQ_(_zAM>o3N{sjQ*B*x+v!Ow+Yqsq<(tL|u7igVU-8qQZQ-2vL zc=l^7{i1z{AwXl^1+`Q4Z3d2R!8Jb@Om(TllJE=lv6^~;?OBgzoEfPT;g???NZS-+ zPCQPKdwsyl%DPK@B6ry~qTgQGK5e|KF8`9nYB4e0IqlSJJ)gff<-1tFdpg&ypdvC) z$$2(ar2a|9r4zsQsXsI~vuTGe&LD4iKL|XS6TmutQ;@pEY2ON&rRgv9lj&F9n?e^g zVrJT^vhVA8JkuNty@Hlxo}lnAGxg?-Yt|%t3(H$v($H!=4XQ)ZZ&MYX6L&98Cg@Wq2-gCrjh6Fl$2w)b_>u;yZ-iW?1g`+F zAU{k=#1Z*%#YQtRiyQ9{uHcZuxsayPDj#D$0=HD2(HDFTU>G$ z<2vkuPrd(hzyklc)`vA1JXm`0T~9S4ZzSn;^povraGEhlQ_pQ1=-mkFS$vixg+`v8xw$FX62jp($RwWJd}a z6)iBvAvDEL(+mIWD4Rp;wt#0g6_8<_-SP$Yje*esHoM#*bciNqu3NvU)jLZtd_2Z_ zE$s2XcJ>U%MGcuDhehLSY=%k?V;2vfxuBc+2mO4L8#~P{F^2WEM(L@D4bA8bqR;Ek z&*Wm}U4XwgUE%1A|ic6tU9X*#$WBI>~^g{^ibp1Gu( zd$L;But+DS2PaC+{L}kC6eR2U^s9^b&`40rM{TjSnI9ZhGfb_{a^uujG0z>l8rgFn}9AzeZi zEZq~lmGs!h<&;XKjG9SG3&&m-BnWfOzy(V~S{~|F_zTB2pMg&%)>rj(FA^;KvVLgI zuKOr&ZJ6L%PCb~3LapoWprDfsW9QVV?-N{H_2pl;M;~(vNygeUtVI$E9PHG+T7B>& z|Ft7%%zL}~xqlZt{B0wpKFoKZ?BH%1*L9-}lO?*94!KqkE@CivZXIO^r(F>M@livG zRm^gY`;U87(cjMoHg<502}bjU-xZ3%euvG*36jc4h*pa1{> diff --git a/src/calibre/ebooks/lrf/web/__init__.py b/src/calibre/ebooks/lrf/web/__init__.py deleted file mode 100644 index c25b6259a8..0000000000 --- a/src/calibre/ebooks/lrf/web/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - - -builtin_profiles = [] -available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] diff --git a/src/calibre/ebooks/lrf/web/convert_from.py b/src/calibre/ebooks/lrf/web/convert_from.py deleted file mode 100644 index ca523e869b..0000000000 --- a/src/calibre/ebooks/lrf/web/convert_from.py +++ /dev/null @@ -1,183 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' 
-'''Convert websites into LRF files.''' - -import sys, tempfile, shutil, os, logging, imp, inspect, re -from urlparse import urlsplit - -from calibre import __appname__, setup_cli_handlers, CommandLineError, strftime -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.lrf.html.convert_from import process_file - -from calibre.web.fetch.simple import create_fetcher - -from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile, create_class -from calibre.ebooks.lrf.web import builtin_profiles, available_profiles - - -def option_parser(): - parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n''' - '''%prog downloads a site from the web and converts it ''' - '''into a LRF file for use with the SONY Reader. ''' - '''website_profile is one of '''+str(available_profiles)+\ - ''' If you specify a website_profile of default or do not specify ''' - '''it, you must specify the --url option.''' - ) - - parser.add_option('-u', '--url', dest='url', default=None, - help='The URL to download. You only need to specify this if you are not specifying a website_profile.') - parser.add_option('--user-profile', default=None, - help='Path to a python file containing a user created profile. For help visit http://%s.kovidgoyal.net/wiki/UserProfiles'%__appname__) - parser.add_option('--username', dest='username', default=None, - help='Specify the username to be used while downloading. Only used if the profile supports it.') - parser.add_option('--password', dest='password', default=None, - help='Specify the password to be used while downloading. Only used if the profile supports it.') - parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout, - default=None, type='int', dest='timeout') - parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. 
Default %d'%DefaultProfile.timeout, - default=None, type='int', dest='max_recursions') - parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files', - help='The maximum number of files to download. This only applies to files from
    tags. Default is %d'%DefaultProfile.timeout) - parser.add_option('--delay', default=None, dest='delay', type='int', - help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.timeout) - parser.add_option('--dont-download-stylesheets', action='store_true', default=None, - help='Do not download CSS stylesheets.', dest='no_stylesheets') - parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append', - help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.') - parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps', - help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. 
If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.') - parser.add_option('--keep-downloaded-files', default=False, action='store_true', - help='''Do not delete the downloaded files after creating the LRF''') - return parser - -def fetch_website(options, logger): - tdir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf') - options.dir = tdir - fetcher = create_fetcher(options, logger) - fetcher.preprocess_regexps = options.preprocess_regexps - return fetcher.start_fetch(options.url), tdir - -def create_lrf(htmlfile, options, logger): - if not options.author or options.author.lower() == 'unknown': - options.author = __appname__ - options.header = True - if options.output: - options.output = os.path.abspath(os.path.expanduser(options.output)) - else: - options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf'))) - - process_file(htmlfile, options, logger) - -def process_profile(args, options, logger=None): - tdir = None - try: - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('web2lrf') - setup_cli_handlers(logger, level) - index = -1 - - if len(args) == 2 and re.search(r'class\s+\S+\(\S+\)\s*\:', args[1]): - profile = create_class(args[1]) - else: - if options.user_profile is not None: - path = os.path.abspath(options.user_profile) - name = os.path.splitext(os.path.basename(path))[0] - res = imp.find_module(name, [os.path.dirname(path)]) - module = imp.load_module(name, *res) - classes = inspect.getmembers(module, - lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\ - and x is not DefaultProfile and x is not FullContentProfile) - if not classes: - raise CommandLineError('Invalid user profile '+path) - builtin_profiles.append(classes[0][1]) - available_profiles.append(name) - if len(args) < 2: - args.append(name) - args[1] = name - index = -1 - if len(args) == 2: - try: - if isinstance(args[1], 
basestring): - if args[1] != 'default': - index = available_profiles.index(args[1]) - except ValueError: - raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles)) - else: - raise CommandLineError('Only one profile at a time is allowed.') - profile = DefaultProfile if index == -1 else builtin_profiles[index] - - - - profile = profile(logger, options.verbose, options.username, options.password) - if profile.browser is not None: - options.browser = profile.browser - - for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'): - val = getattr(options, opt) - if val is None: - setattr(options, opt, getattr(profile, opt)) - - if not options.url: - options.url = profile.url - - if not options.url: - raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,)) - - if not options.title: - title = profile.title - if not title: - title = urlsplit(options.url).netloc - options.title = title + strftime(profile.timefmt) - - options.match_regexps += profile.match_regexps - options.preprocess_regexps = profile.preprocess_regexps - options.filter_regexps += profile.filter_regexps - - options.encoding = profile.encoding if options.encoding is None else options.encoding - - if len(args) == 2 and args[1] != 'default': - options.anchor_ids = False - - htmlfile, tdir = fetch_website(options, logger) - options.encoding = 'utf-8' - cwd = os.getcwd() - if not options.output: - title = options.title.encode(sys.getfilesystemencoding()) if isinstance(options.title, unicode) else options.title - options.output = os.path.join(cwd, options.title+('.lrs' if options.lrs else '.lrf')) - if not os.path.isabs(options.output): - options.output = os.path.join(cwd, options.output) - - option_parser().parse_args(profile.html2lrf_options, options) - - try: - os.chdir(os.path.dirname(htmlfile)) - create_lrf(os.path.basename(htmlfile), options, logger) - finally: - os.chdir(cwd) - finally: 
- try: - profile.cleanup() - except: - pass - if tdir and os.path.isdir(tdir): - if options.keep_downloaded_files: - print 'Downloaded files in ', tdir - else: - shutil.rmtree(tdir) - - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) > 2 or (len(args) == 1 and not options.user_profile): - parser.print_help() - return 1 - try: - process_profile(args, options, logger=logger) - except CommandLineError, err: - print >>sys.stderr, err - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/web/profiles/__init__.py b/src/calibre/ebooks/lrf/web/profiles/__init__.py deleted file mode 100644 index 9544cad7c3..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/__init__.py +++ /dev/null @@ -1,572 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -Contains the Base Profiles that can be used to easily create profiles to download -particular websites. -''' - -import tempfile, time, calendar, re, operator, atexit, shutil, os -from htmlentitydefs import name2codepoint -from email.utils import formatdate - -from calibre import __appname__, iswindows, browser, strftime -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag - - -class DefaultProfile(object): - - #: The title to use for the LRF file - #: @type: string - title = 'Default Profile' - - #: Maximum number of articles to download from each feed - #: @type: integer - max_articles_per_feed = 10 - - #: If True process the element of the feed as HTML - #: @type: boolean - html_description = True - - #: How many days old should the oldest article downloaded from the feeds be - #: @type: integer - oldest_article = 7 - - #: Recommend frequency at which to download this profile. In days. 
- recommended_frequency = 7 - - #: Number of levels of links to follow - #: @type: integer - max_recursions = 1 - - #: Maximum number of files to download - #: @type: integer - max_files = 3000 - - #: Delay between consecutive downloads in seconds - #: @type: integer - delay = 0 - - #: Timeout for fetching files from server in seconds - #: @type: integer - timeout = 10 - - #: The format string for the date shown on the first page - #: @type: string - timefmt = ' [%a %d %b %Y]' - - #: The order of elements to search for a URL when parsing the RSS feed. You - #: can replace these elements by completely arbitrary elements to customize - #: feed processing. - #: @type: list of strings - url_search_order = ['guid', 'link'] - - #: The format string used to parse the publication date in the RSS feed. - #: If set to None some default heuristics are used, these may fail, - #: in which case set this to the correct string or re-implement - #: L{DefaultProfile.strptime} in your subclass. - #: @type: string or None - pubdate_fmt = None - - #: If True will look for a publication date for each article. - #: If False assumes the publication date is the current time. - #: @type: boolean - use_pubdate = True, - - #: Max number of characters in the short description. 
- #: Used by L{FullContentProfile} - #: @type: integer - summary_length = 500 - - #: If True stylesheets are not downloaded and processed - #: Convenient flag to disable loading of stylesheets for websites - #: that have overly complex stylesheets unsuitable for conversion - #: to ebooks formats - #: @type: boolean - no_stylesheets = False - - #: If False articles with the same title in the same feed - #: are not downloaded multiple times - #: @type: boolean - allow_duplicates = False - - #: If True the GUI will ask the user for a username and password - #: to use while downloading - #: @type: boolean - needs_subscription = False - - #: Specify an override encoding for sites that have an incorrect - #: charset specification. THe most common being specifying latin1 and - #: using cp1252 - encoding = None - - #: List of regular expressions that determines which links to follow - #: If empty, it is ignored. - #: Only one of L{match_regexps} or L{filter_regexps} should be defined - #: @type: list of strings - match_regexps = [] - - #: List of regular expressions that determines which links to ignore - #: If empty it is ignored - #: Only one of L{match_regexps} or L{filter_regexps} should be defined - #: @type: list of strings - filter_regexps = [] - - #: List of options to pass to html2lrf, to customize conversion - #: to LRF - #: @type: list of strings - html2lrf_options = [] - - #: List of regexp substitution rules to run on the downloaded HTML. Each element of the - #: list should be a two element tuple. The first element of the tuple should - #: be a compiled regular expression and the second a callable that takes - #: a single match object and returns a string to replace the match. - #: @type: list of tuples - preprocess_regexps = [] - - # See the built-in profiles for examples of these settings. 
- - #: The URL of the website - #: @type: string - url = '' - - feeds = [] - CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL) - - def get_feeds(self): - ''' - Return a list of RSS feeds to fetch for this profile. Each element of the list - must be a 2-element tuple of the form (title, url). - ''' - if not self.feeds: - raise NotImplementedError - return self.feeds - - @classmethod - def print_version(cls, url): - ''' - Take a URL pointing to an article and returns the URL pointing to the - print version of the article. - ''' - return url - - @classmethod - def get_browser(cls): - ''' - Return a browser instance used to fetch documents from the web. - - If your profile requires that you login first, override this method - in your subclass. See for example the nytimes profile. - ''' - return browser() - - - - - def __init__(self, logger, verbose=False, username=None, password=None, lrf=True): - self.logger = logger - self.username = username - self.password = password - self.verbose = verbose - self.lrf = lrf - self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_') - self.browser = self.get_browser() - try: - self.url = 'file:'+ ('' if iswindows else '//') + self.build_index() - except NotImplementedError: - self.url = None - atexit.register(cleanup, self.temp_dir) - - def build_index(self): - '''Build an RSS based index.html''' - articles = self.parse_feeds() - encoding = 'utf-8' if self.encoding is None else self.encoding - def build_sub_index(title, items): - ilist = '' - li = u'
  • %(title)s [%(date)s]
    \n'+\ - u'
    %(description)s
  • \n' - for item in items: - if not item.has_key('date'): - item['date'] = time.strftime('%a, %d %b', time.localtime()) - ilist += li%item - return u'''\ - - -

    %(title)s

    -
      - %(items)s -
    - - - '''%dict(title=title, items=ilist.rstrip()) - - cnum = 0 - clist = '' - categories = articles.keys() - categories.sort() - for category in categories: - cnum += 1 - cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html') - prefix = 'file:' if iswindows else '' - clist += u'
  • %s
  • \n'%(prefix+cfile, category) - src = build_sub_index(category, articles[category]) - open(cfile, 'wb').write(src.encode(encoding)) - - title = self.title - if not isinstance(title, unicode): - title = unicode(title, 'utf-8', 'replace') - src = u'''\ - - -

    %(title)s

    -
    %(date)s
    -
      - %(categories)s -
    - - - '''%dict(date=strftime('%a, %d %B, %Y'), - categories=clist, title=title) - index = os.path.join(self.temp_dir, 'index.html') - open(index, 'wb').write(src.encode(encoding)) - - return index - - - @classmethod - def tag_to_string(cls, tag, use_alt=True): - ''' - Convenience method to take a BeautifulSoup Tag and extract the text from it - recursively, including any CDATA sections and alt tag attributes. - @param use_alt: If True try to use the alt attribute for tags that don't have any textual content - @type use_alt: boolean - @return: A unicode (possibly empty) object - @rtype: unicode string - ''' - if not tag: - return '' - if isinstance(tag, basestring): - return tag - strings = [] - for item in tag.contents: - if isinstance(item, (NavigableString, CData)): - strings.append(item.string) - elif isinstance(item, Tag): - res = cls.tag_to_string(item) - if res: - strings.append(res) - elif use_alt and item.has_key('alt'): - strings.append(item['alt']) - return u''.join(strings) - - def get_article_url(self, item): - ''' - Return the article URL given an item Tag from a feed, or None if no valid URL is found - @type item: BeatifulSoup.Tag - @param item: A BeautifulSoup Tag instance corresponding to the tag from a feed. - @rtype: string or None - ''' - url = None - for element in self.url_search_order: - url = item.find(element.lower()) - if url: - break - return url - - - def parse_feeds(self, require_url=True): - ''' - Create list of articles from a list of feeds. - @param require_url: If True skip articles that don't have a link to a HTML page with the full article contents. - @type require_url: boolean - @rtype: dictionary - @return: A dictionary whose keys are feed titles and whose values are each - a list of dictionaries. 
Each list contains dictionaries of the form:: - { - 'title' : article title, - 'url' : URL of print version, - 'date' : The publication date of the article as a string, - 'description' : A summary of the article - 'content' : The full article (can be an empty string). This is used by FullContentProfile - } - ''' - added_articles = {} - feeds = self.get_feeds() - articles = {} - for title, url in feeds: - try: - src = self.browser.open(url).read() - except Exception, err: - self.logger.error('Could not fetch feed: %s\nError: %s'%(url, err)) - if self.verbose: - self.logger.exception(' ') - continue - - articles[title] = [] - added_articles[title] = [] - soup = BeautifulStoneSoup(src) - for item in soup.findAll('item'): - try: - atitle = item.find('title') - if not atitle: - continue - - atitle = self.tag_to_string(atitle) - if self.use_pubdate: - pubdate = item.find('pubdate') - if not pubdate: - pubdate = item.find('dc:date') - if not pubdate or not pubdate.string: - pubdate = formatdate() - pubdate = self.tag_to_string(pubdate) - pubdate = pubdate.replace('+0000', 'GMT') - - - url = self.get_article_url(item) - url = self.tag_to_string(url) - if require_url and not url: - self.logger.debug('Skipping article %s as it does not have a link url'%atitle) - continue - purl = url - try: - purl = self.print_version(url) - except Exception, err: - self.logger.debug('Skipping %s as could not find URL for print version. 
Error:\n%s'%(url, err)) - continue - - content = item.find('content:encoded') - if not content: - content = item.find('description') - if content: - content = self.process_html_description(content, strip_links=False) - else: - content = '' - - d = { - 'title' : atitle, - 'url' : purl, - 'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(), - 'date' : pubdate if self.use_pubdate else formatdate(), - 'content' : content, - } - delta = time.time() - d['timestamp'] - if not self.allow_duplicates: - if d['title'] in added_articles[title]: - continue - added_articles[title].append(d['title']) - if delta > self.oldest_article*3600*24: - continue - - except Exception, err: - if self.verbose: - self.logger.exception('Error parsing article:\n%s'%(item,)) - continue - try: - desc = '' - for c in item.findAll('description'): - desc = self.tag_to_string(c) - if desc: - break - d['description'] = self.process_html_description(desc) if self.html_description else desc.string - except: - d['description'] = '' - articles[title].append(d) - articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True) - articles[title] = articles[title][:self.max_articles_per_feed+1] - #for item in articles[title]: - # item.pop('timestamp') - if not articles[title]: - articles.pop(title) - return articles - - - def cleanup(self): - ''' - Called after LRF file has been generated. Use it to do any cleanup like - logging out of subscription sites, etc. - ''' - pass - - @classmethod - def process_html_description(cls, tag, strip_links=True): - ''' - Process a tag that contains HTML markup, either - entity encoded or escaped in a CDATA section. 
- @return: HTML - @rtype: string - ''' - src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag - match = cls.CDATA_PAT.match(src.lstrip()) - if match: - src = match.group(1) - else: - replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] - for e in replaced_entities: - ent = '&'+e+';' - src = src.replace(ent, unichr(name2codepoint[e])) - if strip_links: - src = re.compile(r'(.*?)', re.IGNORECASE|re.DOTALL).sub(r'\1', src) - - return src - - - DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6) - FULL_DAY_MAP = dict(Sunday=0, Monday=1, Tueday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6) - MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12) - FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6, - July=7, August=8, September=9, October=10, - November=11, December=12) - - @classmethod - def strptime(cls, src): - ''' - Take a string and return the date that string represents, in UTC as - an epoch (i.e. number of seconds since Jan 1, 1970). This function uses - a bunch of heuristics and is a prime candidate for being overridden in a - subclass. 
- @param src: Timestamp as a string - @type src: string - @return: time ans a epoch - @rtype: number - ''' - delta = 0 - zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src) - if zone: - delta = zone.group(1) - hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip()) - delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1) - src = src.replace(zone.group(), '') - if cls.pubdate_fmt is None: - src = src.strip().split() - try: - src[0] = str(cls.DAY_MAP[src[0][:-1]])+',' - except KeyError: - src[0] = str(cls.FULL_DAY_MAP[src[0][:-1]])+',' - try: - src[2] = str(cls.MONTH_MAP[src[2]]) - except KeyError: - src[2] = str(cls.FULL_MONTH_MAP[src[2]]) - fmt = '%w, %d %m %Y %H:%M:%S' - src = src[:5] # Discard extra information - try: - time_t = time.strptime(' '.join(src), fmt) - except ValueError: - time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y')) - return calendar.timegm(time_t)-delta - else: - return calendar.timegm(time.strptime(src, cls.pubdate_fmt)) - - def command_line_options(self): - args = [] - args.append('--max-recursions='+str(self.max_recursions)) - args.append('--delay='+str(self.delay)) - args.append('--max-files='+str(self.max_files)) - for i in self.match_regexps: - args.append('--match-regexp="'+i+'"') - for i in self.filter_regexps: - args.append('--filter-regexp="'+i+'"') - return args - - -class FullContentProfile(DefaultProfile): - ''' - This profile is designed for feeds that embed the full article content in the RSS file. - ''' - - max_recursions = 0 - article_counter = 0 - - - def build_index(self): - '''Build an RSS based index.html. ''' - articles = self.parse_feeds(require_url=False) - - def build_sub_index(title, items): - ilist = '' - li = u'
  • %(title)s [%(date)s]
    \n'+\ - u'
    %(description)s
  • \n' - for item in items: - content = item['content'] - if not content: - self.logger.debug('Skipping article as it has no content:%s'%item['title']) - continue - item['description'] = cutoff(item['description'], self.summary_length)+'…' - self.article_counter = self.article_counter + 1 - url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter) - item['url'] = url - open(url, 'wb').write((u'''\ - - -

    %s

    -
    - %s -
    - - '''%(item['title'], content)).encode('utf-8') - ) - ilist += li%item - return u'''\ - - -

    %(title)s

    -
      - %(items)s -
    - - - '''%dict(title=title, items=ilist.rstrip()) - - cnum = 0 - clist = '' - categories = articles.keys() - categories.sort() - for category in categories: - cnum += 1 - cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html') - prefix = 'file:' if iswindows else '' - clist += u'
  • %s
  • \n'%(prefix+cfile, category) - src = build_sub_index(category, articles[category]) - open(cfile, 'wb').write(src.encode('utf-8')) - - src = '''\ - - -

    %(title)s

    -
    %(date)s
    -
      - %(categories)s -
    - - - '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), - categories=clist, title=self.title) - index = os.path.join(self.temp_dir, 'index.html') - open(index, 'wb').write(src.encode('utf-8')) - return index - -def cutoff(src, pos, fuzz=50): - si = src.find(';', pos) - if si > 0 and si-pos > fuzz: - si = -1 - gi = src.find('>', pos) - if gi > 0 and gi-pos > fuzz: - gi = -1 - npos = max(si, gi) - if npos < 0: - npos = pos - return src[:npos+1] - -def create_class(src): - environment = {'FullContentProfile':FullContentProfile, 'DefaultProfile':DefaultProfile} - exec src in environment - for item in environment.values(): - if hasattr(item, 'build_index'): - if item.__name__ not in ['DefaultProfile', 'FullContentProfile']: - return item - -def cleanup(tdir): - try: - if os.path.isdir(tdir): - shutil.rmtree(tdir) - except: - pass - \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/web/profiles/ap.py b/src/calibre/ebooks/lrf/web/profiles/ap.py deleted file mode 100644 index 161699941a..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/ap.py +++ /dev/null @@ -1,38 +0,0 @@ -import re -from calibre.ebooks.lrf.web.profiles import DefaultProfile - - -class AssociatedPress(DefaultProfile): - - title = 'Associated Press' - max_recursions = 2 - max_articles_per_feed = 15 - html2lrf_options = ['--force-page-break-before-tag="chapter"'] - - - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in -[ - (r'.*?' , lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'
    .*?

    ', lambda match : '

    '), - (r'

    ', lambda match : '

    '), - (r'Learn more about our Privacy Policy.*?', lambda match : ''), - ] - ] - - - - def get_feeds(self): - return [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'), - ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'), - ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'), - ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'), - ('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'), - ('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'), - ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'), - ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'), - ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'), - ] \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/web/profiles/atlantic.py b/src/calibre/ebooks/lrf/web/profiles/atlantic.py deleted file mode 100644 index eebbe84d96..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/atlantic.py +++ /dev/null @@ -1,47 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -import re -from calibre.ebooks.lrf.web.profiles import DefaultProfile -from calibre.ebooks.BeautifulSoup import BeautifulSoup - -class Atlantic(DefaultProfile): - - title = 'The Atlantic' - max_recursions = 2 - INDEX = 'http://www.theatlantic.com/doc/current' - - preprocess_regexps = [ - (re.compile(r'

    .*?<\!--\s+INVISIBLE SKIP .*?\s+-->', - lambda match : ''), - (r'', lambda match: ''), - ] - ] - - def __init__(self, logger, verbose=False, username=None, password=None): - DefaultProfile.__init__(self, username, password) - self.browser = None # Needed as otherwise there are timeouts while fetching actual articles - - def print_version(self, url): - return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '') - - def get_feeds(self): - src = self.browser.open('http://economist.com/rss/').read() - soup = BeautifulSoup(src) - feeds = [] - for ul in soup.findAll('ul'): - lis = ul.findAll('li') - try: - title, link = lis[0], lis[1] - except IndexError: - continue - title = title.string - if title: - title = title.strip() - if title not in self.__class__.TITLES: - continue - a = link.find('a') - feeds.append((title, a['href'].strip())) - - return feeds diff --git a/src/calibre/ebooks/lrf/web/profiles/faznet.py b/src/calibre/ebooks/lrf/web/profiles/faznet.py deleted file mode 100644 index 53f2cde752..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/faznet.py +++ /dev/null @@ -1,28 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -Profile to download FAZ.net -''' -import re - -from calibre.ebooks.lrf.web.profiles import DefaultProfile - -class FazNet(DefaultProfile): - - title = 'FAZ NET' - max_recursions = 2 - html_description = True - max_articles_per_feed = 30 - - preprocess_regexps = [ - (re.compile(r'Zum Thema.*?', re.IGNORECASE | re.DOTALL), - lambda match : ''), - ] - - - def get_feeds(self): - return [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ] - - def print_version(self, url): - return url.replace('.html?rss_aktuell', '~Afor~Eprint.html') - diff --git a/src/calibre/ebooks/lrf/web/profiles/jpost.py b/src/calibre/ebooks/lrf/web/profiles/jpost.py deleted file mode 100644 index ddc2a00e35..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/jpost.py +++ /dev/null @@ -1,36 +0,0 @@ -import re 
-from calibre.ebooks.lrf.web.profiles import DefaultProfile - -class JerusalemPost(DefaultProfile): - - title = 'Jerusalem Post' - max_recursions = 2 - max_articles_per_feed = 10 - - - - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in -[ - (r'.*?' , lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'
    ', lambda match : ''), - (r'\'NWAnews.com', lambda match : ''), - (r'', lambda match : ''), - (r'

    .*?', lambda match : ''), - - ] - ] - - def get_feeds(self): - return [ ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'), - ('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'), - ('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'), - ('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'), - ('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'), - ] - - def print_version(self, url): - return ('http://www.jpost.com/servlet/Satellite?cid=' + url.rpartition('&')[2] + '&pagename=JPost%2FJPArticle%2FPrinter') - diff --git a/src/calibre/ebooks/lrf/web/profiles/jutarnji.py b/src/calibre/ebooks/lrf/web/profiles/jutarnji.py deleted file mode 100644 index 93da341edd..0000000000 --- a/src/calibre/ebooks/lrf/web/profiles/jutarnji.py +++ /dev/null @@ -1,44 +0,0 @@ -''' - Profile to download Jutarnji.hr by Valloric -''' - -import re - -from calibre.ebooks.lrf.web.profiles import DefaultProfile - -class Jutarnji(DefaultProfile): - - title = 'Jutarnji' - max_recursions = 2 - timefmt = ' [%d %b %Y]' - max_articles_per_feed = 80 - html_description = True - no_stylesheets = True - - preprocess_regexps = [ - (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''), - (re.compile(r'

    .*?', re.IGNORECASE | re.DOTALL), lambda match : '
    '), - (re.compile(r')|(
    )|(
    )|(

    )|())', lambda match: '

    '), - - ## Remove any links/ads/comments/cruft from the end of the body of the article. - (r'(()|(
    )|(

    ©)|(