diff --git a/INSTALL b/INSTALL index cb8261eff6..93b119b2e1 100644 --- a/INSTALL +++ b/INSTALL @@ -1,6 +1,9 @@ calibre supports installation from source, only on Linux. -On Windows and OS X use the provided installers and use -the facilities of the calibre-debug command to hack on the calibre source. + +Note that you *do not* need to install from source to hack on +the calibre source code. To get started with calibre development, +use a normal calibre install and follow the instructions at +http://calibre-ebook.com/user_manual/develop.html On Linux, there are two kinds of installation from source possible. Note that both kinds require lots of dependencies as well as a @@ -45,3 +48,4 @@ This type of install can be run with the command:: sudo python setup.py develop Use the -h flag for help on the develop command. + diff --git a/README b/README index 2c916fc7d7..b518e977c8 100644 --- a/README +++ b/README @@ -7,7 +7,7 @@ reading. It is cross platform, running on Linux, Windows and OS X. For screenshots: https://calibre-ebook.com/demo For installation/usage instructions please see -http://calibre-ebook.com +http://calibre-ebook.com/user_manual For source code access: bzr branch lp:calibre diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index 5a5e09234e..2e52bf020d 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -193,6 +193,7 @@ class ResultList(list): def search(title=None, author=None, publisher=None, isbn=None, min_viewability='none', verbose=False, max_results=40): br = browser() + br.set_handle_gzip(True) start, entries = 1, [] while start > 0 and len(entries) <= max_results: new, start = Query(title=title, author=author, publisher=publisher, diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 9334d818ec..61b555b041 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -23,7 +23,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.library.comments import sanitize_comments_html from calibre.utils.date import parse_date -class Worker(Thread): # {{{ +class Worker(Thread): # Get details {{{ ''' Get book details from amazons book page in a separate thread @@ -283,6 +283,7 @@ class Amazon(Source): touched_fields = frozenset(['title', 'authors', 'identifier:amazon', 'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate']) has_html_comments = True + supports_gzip_transfer_encoding = True AMAZON_DOMAINS = { 'com': _('US'), diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 08012c3ee8..5903a5e710 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -21,6 +21,7 @@ msprefs = JSONConfig('metadata_sources.json') msprefs.defaults['txt_comments'] = False msprefs.defaults['ignore_fields'] = [] msprefs.defaults['max_tags'] = 10 +msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds def create_log(ostream=None): log = ThreadSafeLog(level=ThreadSafeLog.DEBUG) @@ -92,6 +93,15 @@ class InternalMetadataCompareKeyGen(object): # }}} +def get_cached_cover_urls(mi): + from calibre.customize.ui import metadata_plugins + plugins = list(metadata_plugins['identify']) + for p in plugins: + url = p.get_cached_cover_url(mi.identifiers) + if url: + yield (p, url) + + class Source(Plugin): type = _('Metadata source') @@ -110,6 +120,12 @@ class Source(Plugin): #: Set this to True if your plugin return HTML formatted comments has_html_comments = False + #: Setting this to True means that the browser object will add + #: Accept-Encoding: gzip to all requests. This can speedup downloads + #: but make sure that the source actually supports gzip transfer encoding + #: correctly first + supports_gzip_transfer_encoding = False + def __init__(self, *args, **kwargs): Plugin.__init__(self, *args, **kwargs) self._isbn_to_identifier_cache = {} @@ -133,6 +149,8 @@ class Source(Plugin): def browser(self): if self._browser is None: self._browser = browser(user_agent=random_user_agent()) + if self.supports_gzip_transfer_encoding: + self._browser.set_handle_gzip(True) return self._browser.clone_browser() # }}} diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 989320f710..21c99fdf46 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -160,6 +160,7 @@ class GoogleBooks(Source): touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', 'identifier:google']) # language currently disabled + supports_gzip_transfer_encoding = True GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index ab86e8ffa2..71554595ad 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -21,9 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import utc_tz from calibre.utils.html2text import html2text -# How long to wait for more results after first result is found -WAIT_AFTER_FIRST_RESULT = 30 # seconds - +# Download worker {{{ class Worker(Thread): def __init__(self, plugin, kwargs, abort): @@ -47,99 +45,9 @@ def is_worker_alive(workers): return True return False -def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30): - start_time = time.time() - plugins = list(metadata_plugins['identify']) - - kwargs = { - 'title': title, - 'authors': authors, - 'identifiers': identifiers, - 'timeout': timeout, - } - - log('Running identify query with parameters:') - log(kwargs) - log('Using plugins:', ', '.join([p.name for p in plugins])) - log('The log (if any) from individual plugins is below') - - workers = [Worker(p, kwargs, abort) for p in plugins] - for w in workers: - w.start() - - first_result_at = None - results = dict.fromkeys(plugins, []) - - def get_results(): - found = False - for w in workers: - try: - result = w.rq.get_nowait() - except Empty: - pass - else: - results[w.plugin].append(result) - found = True - return found - - while True: - time.sleep(0.2) - - if get_results() and first_result_at is None: - first_result_at = time.time() - - if not is_worker_alive(workers): - break - - if (first_result_at is not None and time.time() - first_result_at < - WAIT_AFTER_FIRST_RESULT): - log('Not waiting any longer for more results') - abort.set() - break - - get_results() - sort_kwargs = dict(kwargs) - for k in list(sort_kwargs.iterkeys()): - if k not in ('title', 'authors', 'identifiers'): - sort_kwargs.pop(k) - - for plugin, results in results.iteritems(): - results.sort(key=plugin.identify_results_keygen(**sort_kwargs)) - plog = plugin.buf.getvalue().strip() - if plog: - log('\n'+'*'*35, plugin.name, '*'*35) - log('Found %d results'%len(results)) - log(plog) - log('\n'+'*'*80) - - for i, result in enumerate(results): - result.relevance_in_source = i - result.has_cached_cover_url = \ - plugin.get_cached_cover_url(result.identifiers) is not None - result.identify_plugin = plugin - - log('The identify phase took %.2f seconds'%(time.time() - start_time)) - log('Merging results from different sources and finding earliest', - 'publication dates') - start_time = time.time() - results = merge_identify_results(results, log) - log('We have %d merged results, merging took: %.2f seconds' % - (len(results), time.time() - start_time)) - - if msprefs['txt_comments']: - for r in results: - if r.plugin.has_html_comments and r.comments: - r.comments = html2text(r.comments) - - dummy = Metadata(_('Unknown')) - max_tags = msprefs['max_tags'] - for f in msprefs['ignore_fields']: - for r in results: - setattr(r, f, getattr(dummy, f)) - r.tags = r.tags[:max_tags] - - return results +# }}} +# Merge results from different sources {{{ class ISBNMerge(object): @@ -298,6 +206,147 @@ def merge_identify_results(result_map, log): return isbn_merge.finalize() +# }}} +def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30): + start_time = time.time() + plugins = list(metadata_plugins['identify']) + kwargs = { + 'title': title, + 'authors': authors, + 'identifiers': identifiers, + 'timeout': timeout, + } + + log('Running identify query with parameters:') + log(kwargs) + log('Using plugins:', ', '.join([p.name for p in plugins])) + log('The log (if any) from individual plugins is below') + + workers = [Worker(p, kwargs, abort) for p in plugins] + for w in workers: + w.start() + + first_result_at = None + results = dict.fromkeys(plugins, []) + + def get_results(): + found = False + for w in workers: + try: + result = w.rq.get_nowait() + except Empty: + pass + else: + results[w.plugin].append(result) + found = True + return found + + wait_time = msprefs['wait_after_first_identify_result'] + while True: + time.sleep(0.2) + + if get_results() and first_result_at is None: + first_result_at = time.time() + + if not is_worker_alive(workers): + break + + if (first_result_at is not None and time.time() - first_result_at < + wait_time): + log('Not waiting any longer for more results') + abort.set() + break + + get_results() + sort_kwargs = dict(kwargs) + for k in list(sort_kwargs.iterkeys()): + if k not in ('title', 'authors', 'identifiers'): + sort_kwargs.pop(k) + + for plugin, results in results.iteritems(): + results.sort(key=plugin.identify_results_keygen(**sort_kwargs)) + plog = plugin.buf.getvalue().strip() + if plog: + log('\n'+'*'*35, plugin.name, '*'*35) + log('Found %d results'%len(results)) + log(plog) + log('\n'+'*'*80) + + for i, result in enumerate(results): + result.relevance_in_source = i + result.has_cached_cover_url = \ + plugin.get_cached_cover_url(result.identifiers) is not None + result.identify_plugin = plugin + + log('The identify phase took %.2f seconds'%(time.time() - start_time)) + log('Merging results from different sources and finding earliest', + 'publication dates') + start_time = time.time() + results = merge_identify_results(results, log) + log('We have %d merged results, merging took: %.2f seconds' % + (len(results), time.time() - start_time)) + + if msprefs['txt_comments']: + for r in results: + if r.plugin.has_html_comments and r.comments: + r.comments = html2text(r.comments) + + dummy = Metadata(_('Unknown')) + max_tags = msprefs['max_tags'] + for f in msprefs['ignore_fields']: + for r in results: + setattr(r, f, getattr(dummy, f)) + r.tags = r.tags[:max_tags] + + return results + +if __name__ == '__main__': # tests {{{ + # To run these test use: calibre-debug -e + # src/calibre/ebooks/metadata/sources/identify.py + from calibre.ebooks.metadata.sources.test import (test_identify, + title_test, authors_test) + test_identify( + [ + + ( # An e-book ISBN not on Amazon, one of the authors is + # unknown to Amazon + {'identifiers':{'isbn': '9780307459671'}, + 'title':'Invisible Gorilla', 'authors':['Christopher Chabris']}, + [title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us', + exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])] + + ), + + ( # This isbn not on amazon + {'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python', + 'authors':['Lutz']}, + [title_test('Learning Python, 3rd Edition', + exact=True), authors_test(['Mark Lutz']) + ] + + ), + + ( # Sophisticated comment formatting + {'identifiers':{'isbn': '9781416580829'}}, + [title_test('Angels & Demons - Movie Tie-In: A Novel', + exact=True), authors_test(['Dan Brown'])] + ), + + ( # No specific problems + {'identifiers':{'isbn': '0743273567'}}, + [title_test('The great gatsby', exact=True), + authors_test(['F. Scott Fitzgerald'])] + ), + + ( # A newer book + {'identifiers':{'isbn': '9780316044981'}}, + [title_test('The Heroes', exact=True), + authors_test(['Joe Abercrombie'])] + + ), + + ]) +# }}} diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py index de95a9b887..a7dcc2fa14 100644 --- a/src/calibre/ebooks/metadata/sources/test.py +++ b/src/calibre/ebooks/metadata/sources/test.py @@ -14,7 +14,8 @@ from threading import Event from calibre.customize.ui import metadata_plugins from calibre import prints, sanitize_file_name2 from calibre.ebooks.metadata import check_isbn -from calibre.ebooks.metadata.sources.base import create_log +from calibre.ebooks.metadata.sources.base import (create_log, + get_cached_cover_urls) def isbn_test(isbn): isbn_ = check_isbn(isbn) @@ -45,8 +46,75 @@ def authors_test(authors): return test +def init_test(tdir_name): + tdir = tempfile.gettempdir() + lf = os.path.join(tdir, tdir_name.replace(' ', '')+'_identify_test.txt') + log = create_log(open(lf, 'wb')) + abort = Event() + return tdir, lf, log, abort -def test_identify_plugin(name, tests): +def test_identify(tests): # {{{ + ''' + :param tests: List of 2-tuples. Each two tuple is of the form (args, + test_funcs). args is a dict of keyword arguments to pass to + the identify method. test_funcs are callables that accept a + Metadata object and return True iff the object passes the + test. + ''' + from calibre.ebooks.metadata.sources.identify import identify + + tdir, lf, log, abort = init_test('Full Identify') + + times = [] + + for kwargs, test_funcs in tests: + prints('Running test with:', kwargs) + args = (log, abort) + start_time = time.time() + results = identify(*args, **kwargs) + total_time = time.time() - start_time + times.append(total_time) + if not results: + prints('identify failed to find any results') + break + + prints('Found', len(results), 'matches:', end=' ') + prints('Smaller relevance means better match') + + for i, mi in enumerate(results): + prints('*'*30, 'Relevance:', i, '*'*30) + prints(mi) + prints('\nCached cover URLs :', + [x[0].name for x in get_cached_cover_urls(mi)]) + prints('*'*75, '\n\n') + + possibles = [] + for mi in results: + test_failed = False + for tfunc in test_funcs: + if not tfunc(mi): + test_failed = True + break + if not test_failed: + possibles.append(mi) + + if not possibles: + prints('ERROR: No results that passed all tests were found') + prints('Log saved to', lf) + raise SystemExit(1) + + if results[0] is not possibles[0]: + prints('Most relevant result failed the tests') + raise SystemExit(1) + + prints('Average time per query', sum(times)/len(times)) + + if os.stat(lf).st_size > 10: + prints('There were some errors/warnings, see log', lf) + +# }}} + +def test_identify_plugin(name, tests): # {{{ ''' :param name: Plugin name :param tests: List of 2-tuples. Each two tuple is of the form (args, @@ -62,10 +130,7 @@ def test_identify_plugin(name, tests): break prints('Testing the identify function of', plugin.name) - tdir = tempfile.gettempdir() - lf = os.path.join(tdir, plugin.name.replace(' ', '')+'_identify_test.txt') - log = create_log(open(lf, 'wb')) - abort = Event() + tdir, lf, log, abort = init_test(plugin.name) prints('Log saved to', lf) times = [] @@ -159,4 +224,5 @@ def test_identify_plugin(name, tests): if os.stat(lf).st_size > 10: prints('There were some errors/warnings, see log', lf) +# }}} diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 0cd17387fe..42974be355 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -17,6 +17,8 @@ from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \ from cssutils import profile as cssprofiles from lxml import etree from lxml.cssselect import css_to_xpath, ExpressionError, SelectorSyntaxError + +from calibre import force_unicode from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize from calibre.ebooks.oeb.profile import PROFILES @@ -140,13 +142,22 @@ class Stylizer(object): log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: - if elem.tag == XHTML('style') and elem.text \ - and elem.get('type', CSS_MIME) in OEB_STYLES: - text = XHTML_CSS_NAMESPACE + elem.text - text = oeb.css_preprocessor(text) - stylesheet = parser.parseString(text, href=cssname) - stylesheet.namespaces['h'] = XHTML_NS - stylesheets.append(stylesheet) + if (elem.tag == XHTML('style') and + elem.get('type', CSS_MIME) in OEB_STYLES): + text = elem.text if elem.text else u'' + for x in elem: + t = getattr(x, 'text', None) + if t: + text += u'\n\n' + force_unicode(t, u'utf-8') + t = getattr(x, 'tail', None) + if t: + text += u'\n\n' + force_unicode(t, u'utf-8') + if text: + text = XHTML_CSS_NAMESPACE + elem.text + text = oeb.css_preprocessor(text) + stylesheet = parser.parseString(text, href=cssname) + stylesheet.namespaces['h'] = XHTML_NS + stylesheets.append(stylesheet) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: diff --git a/src/calibre/ebooks/pdf/fonts.cpp b/src/calibre/ebooks/pdf/fonts.cpp index 99ab7517c1..c3a709869e 100644 --- a/src/calibre/ebooks/pdf/fonts.cpp +++ b/src/calibre/ebooks/pdf/fonts.cpp @@ -72,6 +72,7 @@ XMLFont::XMLFont(string* font_name, double size, GfxRGB rgb) : size(size-1), line_size(-1.0), italic(false), bold(false), font_name(font_name), font_family(NULL), color(rgb) { + if (!this->font_name) this->font_name = new string(DEFAULT_FONT_FAMILY); this->font_family = family_name(this->font_name); if (strcasestr(font_name->c_str(), "bold")) this->bold = true; @@ -134,7 +135,12 @@ Fonts::size_type Fonts::add_font(XMLFont *f) { } Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) { - XMLFont *f = new XMLFont(font_name, size, rgb); + XMLFont *f = NULL; + if (font_name == NULL) + font_name = new string("Unknown"); + // font_name must not be deleted + f = new XMLFont(font_name, size, rgb); + return this->add_font(f); } diff --git a/src/calibre/gui2/dialogs/book_info.ui b/src/calibre/gui2/dialogs/book_info.ui index 412126a610..9e9e71eda0 100644 --- a/src/calibre/gui2/dialogs/book_info.ui +++ b/src/calibre/gui2/dialogs/book_info.ui @@ -7,15 +7,25 @@ 0 0 917 - 480 + 492 Dialog + + + :/images/metadata.png:/images/metadata.png + + + + 75 + true + + TextLabel @@ -24,86 +34,104 @@ - + - - - - - TextLabel - - - Qt::AlignLeading|Qt::AlignLeft|Qt::AlignTop - - - true - - - - - - - Comments - - - - - - - 0 - 0 - - - - - 350 - 16777215 - - - - - about:blank - - - - - - - - - - - Fit &cover within view - - - - - + + + QFrame::NoFrame + + + true + + + + + 0 + 0 + 435 + 670 + + + - + - &Previous + TextLabel - - - :/images/previous.png:/images/previous.png + + Qt::AlignLeading|Qt::AlignLeft|Qt::AlignTop + + + true - - - &Next - - - - :/images/next.png:/images/next.png + + + Comments + + + + + + 0 + 0 + + + + + 350 + 16777215 + + + + + about:blank + + + + + + + + + + + + Fit &cover within view + + + + + + + + + &Previous + + + + :/images/previous.png:/images/previous.png + + + + + + + &Next + + + + :/images/next.png:/images/next.png + + diff --git a/src/calibre/utils/browser.py b/src/calibre/utils/browser.py index 2f77ede6b3..6f8703ab49 100644 --- a/src/calibre/utils/browser.py +++ b/src/calibre/utils/browser.py @@ -38,10 +38,10 @@ class Browser(B): self._clone_actions['set_handle_equiv'] = ('set_handle_equiv', args, kwargs) - def set_handle_gzip(self, *args, **kwargs): - B.set_handle_gzip(self, *args, **kwargs) + def set_handle_gzip(self, handle): + B._set_handler(self, '_gzip', handle) self._clone_actions['set_handle_gzip'] = ('set_handle_gzip', - args, kwargs) + (handle,), {}) def set_debug_redirect(self, *args, **kwargs): B.set_debug_redirect(self, *args, **kwargs)