diff --git a/setup.py b/setup.py index b0ff04a983..003067b34f 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import sys, re, os, shutil, cStringIO, tempfile, subprocess, time +import sys, re, os, subprocess sys.path.append('src') iswindows = re.search('win(32|64)', sys.platform) isosx = 'darwin' in sys.platform @@ -54,7 +54,7 @@ if __name__ == '__main__': build_osx, upload_installers, upload_user_manual, \ upload_to_pypi, stage3, stage2, stage1, upload, \ upload_rss - + entry_points['console_scripts'].append( 'calibre_postinstall = calibre.linux:post_install') ext_modules = [ @@ -65,12 +65,15 @@ if __name__ == '__main__': 'src/calibre/utils/lzx/lzc.c', 'src/calibre/utils/lzx/lzxc.c'], include_dirs=['src/calibre/utils/lzx']), - + Extension('calibre.plugins.msdes', sources=['src/calibre/utils/msdes/msdesmodule.c', 'src/calibre/utils/msdes/des.c'], include_dirs=['src/calibre/utils/msdes']), - + + Extension('calibre.plugins.cPalmdoc', + sources=['src/calibre/ebooks/mobi/palmdoc.c']), + PyQtExtension('calibre.plugins.pictureflow', ['src/calibre/gui2/pictureflow/pictureflow.cpp', 'src/calibre/gui2/pictureflow/pictureflow.h'], @@ -81,7 +84,7 @@ if __name__ == '__main__': ext_modules.append(Extension('calibre.plugins.winutil', sources=['src/calibre/utils/windows/winutil.c'], libraries=['shell32', 'setupapi'], - include_dirs=os.environ.get('INCLUDE', + include_dirs=os.environ.get('INCLUDE', 'C:/WinDDK/6001.18001/inc/api/;' 'C:/WinDDK/6001.18001/inc/crt/').split(';'), extra_compile_args=['/X'] @@ -91,7 +94,7 @@ if __name__ == '__main__': sources=['src/calibre/devices/usbobserver/usbobserver.c'], extra_link_args=['-framework', 'IOKit']) ) - + if not iswindows: plugins = ['plugins/%s.so'%(x.name.rpartition('.')[-1]) for x in ext_modules] else: @@ -99,7 +102,7 @@ if __name__ == '__main__': ['plugins/%s.pyd.manifest'%(x.name.rpartition('.')[-1]) \ for x in ext_modules if 'pictureflow' not in x.name] - + setup( name = APPNAME, packages = find_packages('src'), @@ -152,9 +155,9 @@ if __name__ == '__main__': 'Topic :: System :: Hardware :: Hardware Drivers' ], cmdclass = { - 'build_ext' : build_ext, + 'build_ext' : build_ext, 'build' : build, - 'build_py' : build_py, + 'build_py' : build_py, 'pot' : pot, 'manual' : manual, 'resources' : resources, diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 9e18af3cf9..5656079ead 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, re, logging, time, subprocess, atexit, mimetypes, \ +import sys, os, re, logging, time, subprocess, mimetypes, \ __builtin__, warnings __builtin__.__dict__['dynamic_property'] = lambda(func): func(None) from htmlentitydefs import name2codepoint @@ -71,7 +71,7 @@ def sanitize_file_name(name, substitute='_', as_unicode=False): **WARNING:** This function also replaces path separators, so only pass file names and not full paths to it. *NOTE:* This function always returns byte strings, not unicode objects. The byte strings - are encoded in the filesystem encoding of the platform, or UTF-8. + are encoded in the filesystem encoding of the platform, or UTF-8. ''' if isinstance(name, unicode): name = name.encode(filesystem_encoding, 'ignore') @@ -159,7 +159,7 @@ def extract(path, dir): def get_proxies(): proxies = {} - + for q in ('http', 'ftp'): proxy = os.environ.get(q+'_proxy', None) if not proxy: continue @@ -194,8 +194,8 @@ def get_proxies(): def browser(honor_time=True, max_time=2, mobile_browser=False): ''' Create a mechanize browser for web scraping. The browser handles cookies, - refresh requests and ignores robots.txt. Also uses proxy if avaialable. - + refresh requests and ignores robots.txt. Also uses proxy if avaialable. + :param honor_time: If True honors pause time in refresh requests :param max_time: Maximum time in seconds to wait during a refresh request ''' @@ -232,16 +232,16 @@ def fit_image(width, height, pwidth, pheight): return scaled, int(width), int(height) class CurrentDir(object): - + def __init__(self, path): self.path = path self.cwd = None - + def __enter__(self, *args): self.cwd = os.getcwd() os.chdir(self.path) return self.cwd - + def __exit__(self, *args): os.chdir(self.cwd) diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 186eb37e34..ff641cfbeb 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -53,7 +53,7 @@ if plugins is None: plugin_path = getattr(pkg_resources, 'resource_filename')('calibre', 'plugins') sys.path.insert(0, plugin_path) - for plugin in ['pictureflow', 'lzx', 'msdes'] + \ + for plugin in ['pictureflow', 'lzx', 'msdes', 'cPalmdoc'] + \ (['winutil'] if iswindows else []) + \ (['usbobserver'] if isosx else []): try: diff --git a/src/calibre/ebooks/mobi/palmdoc.c b/src/calibre/ebooks/mobi/palmdoc.c new file mode 100644 index 0000000000..29e9579140 --- /dev/null +++ b/src/calibre/ebooks/mobi/palmdoc.c @@ -0,0 +1,204 @@ +/* +:mod:`cPalmdoc` -- Palmdoc compression/decompression +===================================================== + +.. module:: cPalmdoc + :platform: All + :synopsis: Compression decompression of Palmdoc implemented in C for speed + +.. moduleauthor:: Kovid Goyal Copyright 2009 + +*/ + +#define PY_SSIZE_T_CLEAN +#include +#include + +#define DELTA sizeof(Byte)*4096 + +#define BUFFER 6000 + +#define MIN(x, y) ( ((x) < (y)) ? (x) : (y) ) + +typedef unsigned short int Byte; +typedef struct { + Byte *data; + Py_ssize_t len; +} buffer; + +#ifdef bool +#undef bool +#endif +#define bool int + +#ifdef false +#undef false +#endif +#define false 0 + +#ifdef true +#undef true +#endif +#define true 1 + +#define CHAR(x) (( (x) > 127 ) ? (x)-256 : (x)) + +static PyObject * +cpalmdoc_decompress(PyObject *self, PyObject *args) { + const char *_input = NULL; Py_ssize_t input_len = 0; + Py_ssize_t i = 0, o = 0, j = 0, di, n; + if (!PyArg_ParseTuple(args, "t#", &_input, &input_len)) + return NULL; + Byte *input = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len); + if (input == NULL) return PyErr_NoMemory(); + // Map chars to bytes + for (j = 0; j < input_len; j++) + input[j] = (_input[j] < 0) ? _input[j]+256 : _input[j]; + char *output = (char *)PyMem_Malloc(sizeof(char)*BUFFER); + Byte c; + PyObject *ans; + if (output == NULL) return PyErr_NoMemory(); + + while (i < input_len) { + c = input[i++]; + if (c >= 1 && c <= 8) // copy 'c' bytes + while (c--) output[o++] = input[i++]; + + else if (c <= 0x7F) // 0, 09-7F = self + output[o++] = c; + + else if (c >= 0xC0) { // space + ASCII char + output[o++] = ' '; + output[o++] = c ^ 0x80; + } + else { // 80-BF repeat sequences + c = (c << 8) + input[i++]; + di = (c & 0x3FFF) >> 3; + for ( n = (c & 7) + 3; n--; ++o ) + output[o] = output[o - di]; + } + } + ans = Py_BuildValue("s#", output, o); + if (output != NULL) PyMem_Free(output); + if (input != NULL) PyMem_Free(input); + return ans; +} + +static bool +cpalmdoc_memcmp( Byte *a, Byte *b, Py_ssize_t len) { + Py_ssize_t i; + for (i = 0; i < len; i++) if (a[i] != b[i]) return false; + return true; +} + +static Py_ssize_t +cpalmdoc_rfind(Byte *data, Py_ssize_t pos, Py_ssize_t chunk_length) { + Py_ssize_t i; + for (i = pos - chunk_length; i > -1; i--) + if (cpalmdoc_memcmp(data+i, data+pos, chunk_length)) return i; + return pos; +} + + +static Py_ssize_t +cpalmdoc_do_compress(buffer *b, char *output) { + Py_ssize_t i = 0, j, chunk_len, dist; + unsigned compound; + Byte c, n; + bool found; + char *head; + head = output; + buffer temp; + temp.data = (Byte *)PyMem_Malloc(sizeof(Byte)*8); temp.len = 0; + if (temp.data == NULL) return 0; + while (i < b->len) { + c = b->data[i]; + //do repeats + if ( i > 10 && (b->len - i) > 10) { + found = false; + for (chunk_len = 10; chunk_len > 2; chunk_len--) { + j = cpalmdoc_rfind(b->data, i, chunk_len); + dist = i - j; + if (j < i && dist <= 2047) { + found = true; + compound = (dist << 3) + chunk_len-3; + *(output++) = CHAR(0x80 + (compound >> 8 )); + *(output++) = CHAR(compound & 0xFF); + i += chunk_len; + break; + } + } + if (found) continue; + } + + //write single character + i++; + if (c == 32 && i < b->len) { + n = b->data[i]; + if ( n >= 0x40 && n <= 0x7F) { + *(output++) = CHAR(n^0x80); i++; continue; + } + } + if (c == 0 || (c > 8 && c < 0x80)) + *(output++) = CHAR(c); + else { // Write binary data + j = i; + temp.data[0] = c; temp.len = 1; + while (j < b->len && temp.len < 8) { + c = b->data[j]; + if (c == 0 || (c > 8 && c < 0x80)) break; + temp.data[temp.len++] = c; j++; + } + i += temp.len - 1; + *(output++) = temp.len; + for (j=0; j < temp.len; j++) *(output++) = temp.data[j]; + } + } + return output - head; +} + +static PyObject * +cpalmdoc_compress(PyObject *self, PyObject *args) { + const char *_input = NULL; Py_ssize_t input_len = 0; + Py_ssize_t j = 0; + buffer b; + if (!PyArg_ParseTuple(args, "t#", &_input, &input_len)) + return NULL; + b.data = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len); + if (b.data == NULL) return PyErr_NoMemory(); + // Map chars to bytes + for (j = 0; j < input_len; j++) + b.data[j] = (_input[j] < 0) ? _input[j]+256 : _input[j]; + b.len = input_len; + char *output = (char *)PyMem_Malloc(sizeof(char) * b.len); + if (output == NULL) return PyErr_NoMemory(); + j = cpalmdoc_do_compress(&b, output); + if ( j == 0) return PyErr_NoMemory(); + PyObject *ans = Py_BuildValue("s#", output, j); + PyMem_Free(output); + PyMem_Free(b.data); + return ans; +} + +static PyMethodDef cPalmdocMethods[] = { + {"decompress", cpalmdoc_decompress, METH_VARARGS, + "decompress(bytestring) -> decompressed bytestring\n\n" + "Decompress a palmdoc compressed byte string. " + }, + + {"compress", cpalmdoc_compress, METH_VARARGS, + "compress(bytestring) -> compressed bytestring\n\n" + "Palmdoc compress a byte string. " + }, + {NULL, NULL, 0, NULL} +}; + +PyMODINIT_FUNC +initcPalmdoc(void) { + PyObject *m; + m = Py_InitModule3("cPalmdoc", cPalmdocMethods, + "Compress and decompress palmdoc strings." + ); + if (m == NULL) return; +} + diff --git a/src/calibre/ebooks/mobi/palmdoc.py b/src/calibre/ebooks/mobi/palmdoc.py index eedab1c88f..90dabcb5a8 100644 --- a/src/calibre/ebooks/mobi/palmdoc.py +++ b/src/calibre/ebooks/mobi/palmdoc.py @@ -2,41 +2,46 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' \ - 'and Marshall T. Vandegrift ' +__copyright__ = '2008, Kovid Goyal ' from cStringIO import StringIO from struct import pack -COUNT_BITS = 3 +from calibre.constants import plugins +cPalmdoc = plugins['cPalmdoc'][0] +if not cPalmdoc: + raise RuntimeError(('Failed to load required cPalmdoc module: ' + '%s')%plugins['cPalmdoc'][1]) def decompress_doc(data): - buffer = [ord(i) for i in data] - res = [] - i = 0 - while i < len(buffer): - c = buffer[i] - i += 1 - if c >= 1 and c <= 8: - res.extend(buffer[i:i+c]) - i += c - elif c <= 0x7f: - res.append(c) - elif c >= 0xc0: - res.extend( (ord(' '), c^0x80) ) - else: - c = (c << 8) + buffer[i] - i += 1 - di = (c & 0x3fff) >> COUNT_BITS - j = len(res) - num = (c & ((1 << COUNT_BITS) - 1)) + 3 - - for k in range( num ): - res.append(res[j - di+k]) - - return ''.join([chr(i) for i in res]) + return cPalmdoc.decompress(data) def compress_doc(data): + return cPalmdoc.compress(data) + +def test(): + TESTS = [ + 'abc\x03\x04\x05\x06ms', # Test binary writing + 'a b c \xfed ', # Test encoding of spaces + '0123456789axyz2bxyz2cdfgfo9iuyerh', + '0123456789asd0123456789asd|yyzzxxffhhjjkk', + ('ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei ' + 'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ') + ] + for test in TESTS: + print 'Test:', repr(test) + print '\tTesting compression...' + good = py_compress_doc(test) + x = compress_doc(test) + print '\t\tgood:', repr(good) + print '\t\tx :', repr(x) + assert x == good + print '\tTesting decompression...' + print '\t\t', repr(decompress_doc(x)) + assert decompress_doc(x) == test + print + +def py_compress_doc(data): out = StringIO() i = 0 ldata = len(data) @@ -85,4 +90,4 @@ def compress_doc(data): out.write(''.join(binseq)) i += len(binseq) - 1 return out.getvalue() - + diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index b68263ab28..38de3476d1 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -226,7 +226,7 @@ class MobiReader(object): page-break-after: always; margin: 0; display: block } ''') - self.tag_css_rules = [] + self.tag_css_rules = {} if hasattr(filename_or_stream, 'read'): stream = filename_or_stream @@ -328,10 +328,10 @@ class MobiReader(object): with open('styles.css', 'wb') as s: s.write(self.base_css_rules+'\n\n') - for rule in self.tag_css_rules: + for cls, rule in self.tag_css_rules.items(): if isinstance(rule, unicode): rule = rule.encode('utf-8') - s.write(rule+'\n\n') + s.write('.%s { %s }\n\n'%(cls, rule)) if self.book_header.exth is not None or self.embedded_mi is not None: @@ -389,6 +389,7 @@ class MobiReader(object): 'xx-large' : '6', } mobi_version = self.book_header.mobi_version + style_map = {} for i, tag in enumerate(root.iter(etree.Element)): if tag.tag in ('country-region', 'place', 'placetype', 'placename', 'state', 'city', 'street', 'address', 'content'): @@ -455,9 +456,18 @@ class MobiReader(object): except ValueError: pass if styles: - attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i) - self.tag_css_rules.append('#%s {%s}'%(attrib['id'], - '; '.join(styles))) + cls = None + rule = '; '.join(styles) + for sel, srule in self.tag_css_rules.items(): + if srule == rule: + cls = sel + break + if cls is None: + ncls = 'calibre_%d'%i + self.tag_css_rules[ncls] = rule + cls = attrib.get('class', '') + cls = cls + (' ' if cls else '') + ncls + attrib['class'] = cls def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 81120aaf2e..783f09e5cc 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -726,6 +726,7 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _parse_xhtml(self, data): + self.oeb.log.debug('Parsing', self.href, '...') # Convert to Unicode and normalize line endings data = self.oeb.decode(data) data = self.oeb.html_preprocessor(data) @@ -804,6 +805,7 @@ class Manifest(object): return data def _parse_css(self, data): + self.oeb.log.debug('Parsing', self.href, '...') data = self.oeb.decode(data) data = self.oeb.css_preprocessor(data) data = XHTML_CSS_NAMESPACE + data diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index 00830b1a8c..dc7123446b 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -41,10 +41,12 @@ class Clean(object): for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] - if x.lower() != ('cover', 'titlepage'): + if x.lower() not in ('cover', 'titlepage'): try: if href not in protected_hrefs: - self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) + item = self.oeb.manifest.hrefs[href] + if item not in self.oeb.spine: + self.oeb.manifest.remove(self.oeb.manifest.hrefs[href]) except KeyError: pass self.oeb.guide.remove(x) diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index d3505a5fd9..21d71da5bb 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -44,14 +44,14 @@ class Split(object): self.split_on_page_breaks = split_on_page_breaks self.page_breaks_xpath = page_breaks_xpath self.max_flow_size = max_flow_size + self.page_break_selectors = None if self.page_breaks_xpath is not None: - self.page_breaks_xpath = XPath(self.page_breaks_xpath) + self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)] def __call__(self, oeb, context): self.oeb = oeb self.log = oeb.log self.map = {} - self.page_break_selectors = None for item in list(self.oeb.manifest.items): if item.spine_position is not None and etree.iselement(item.data): self.split_item(item) @@ -60,10 +60,7 @@ class Split(object): def split_item(self, item): if self.split_on_page_breaks: - if self.page_breaks_xpath is None: - page_breaks, page_break_ids = self.find_page_breaks(item) - else: - page_breaks, page_break_ids = self.page_breaks_xpath(item.data) + page_breaks, page_break_ids = self.find_page_breaks(item) splitter = FlowSplitter(item, page_breaks, page_break_ids, self.max_flow_size, self.oeb)