Merge from main branch

This commit is contained in:
Tom Scholl 2011-04-04 23:00:15 +00:00
commit b29b3df37b
12 changed files with 368 additions and 183 deletions

View File

@ -1,6 +1,9 @@
calibre supports installation from source, only on Linux.
On Windows and OS X use the provided installers and use
the facilities of the calibre-debug command to hack on the calibre source.
Note that you *do not* need to install from source to hack on
the calibre source code. To get started with calibre development,
use a normal calibre install and follow the instructions at
http://calibre-ebook.com/user_manual/develop.html
On Linux, there are two kinds of installation from source possible.
Note that both kinds require lots of dependencies as well as a
@ -45,3 +48,4 @@ This type of install can be run with the command::
sudo python setup.py develop
Use the -h flag for help on the develop command.

2
README
View File

@ -7,7 +7,7 @@ reading. It is cross platform, running on Linux, Windows and OS X.
For screenshots: https://calibre-ebook.com/demo
For installation/usage instructions please see
http://calibre-ebook.com
http://calibre-ebook.com/user_manual
For source code access:
bzr branch lp:calibre

View File

@ -193,6 +193,7 @@ class ResultList(list):
def search(title=None, author=None, publisher=None, isbn=None,
min_viewability='none', verbose=False, max_results=40):
br = browser()
br.set_handle_gzip(True)
start, entries = 1, []
while start > 0 and len(entries) <= max_results:
new, start = Query(title=title, author=author, publisher=publisher,

View File

@ -23,7 +23,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.library.comments import sanitize_comments_html
from calibre.utils.date import parse_date
class Worker(Thread): # {{{
class Worker(Thread): # Get details {{{
'''
Get book details from amazons book page in a separate thread
@ -283,6 +283,7 @@ class Amazon(Source):
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
has_html_comments = True
supports_gzip_transfer_encoding = True
AMAZON_DOMAINS = {
'com': _('US'),

View File

@ -21,6 +21,7 @@ msprefs = JSONConfig('metadata_sources.json')
msprefs.defaults['txt_comments'] = False
msprefs.defaults['ignore_fields'] = []
msprefs.defaults['max_tags'] = 10
msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds
def create_log(ostream=None):
log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@ -92,6 +93,15 @@ class InternalMetadataCompareKeyGen(object):
# }}}
def get_cached_cover_urls(mi):
from calibre.customize.ui import metadata_plugins
plugins = list(metadata_plugins['identify'])
for p in plugins:
url = p.get_cached_cover_url(mi.identifiers)
if url:
yield (p, url)
class Source(Plugin):
type = _('Metadata source')
@ -110,6 +120,12 @@ class Source(Plugin):
#: Set this to True if your plugin return HTML formatted comments
has_html_comments = False
#: Setting this to True means that the browser object will add
#: Accept-Encoding: gzip to all requests. This can speedup downloads
#: but make sure that the source actually supports gzip transfer encoding
#: correctly first
supports_gzip_transfer_encoding = False
def __init__(self, *args, **kwargs):
Plugin.__init__(self, *args, **kwargs)
self._isbn_to_identifier_cache = {}
@ -133,6 +149,8 @@ class Source(Plugin):
def browser(self):
if self._browser is None:
self._browser = browser(user_agent=random_user_agent())
if self.supports_gzip_transfer_encoding:
self._browser.set_handle_gzip(True)
return self._browser.clone_browser()
# }}}

View File

@ -160,6 +160,7 @@ class GoogleBooks(Source):
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn', 'rating',
'identifier:google']) # language currently disabled
supports_gzip_transfer_encoding = True
GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'

View File

@ -21,9 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import utc_tz
from calibre.utils.html2text import html2text
# How long to wait for more results after first result is found
WAIT_AFTER_FIRST_RESULT = 30 # seconds
# Download worker {{{
class Worker(Thread):
def __init__(self, plugin, kwargs, abort):
@ -47,99 +45,9 @@ def is_worker_alive(workers):
return True
return False
def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
start_time = time.time()
plugins = list(metadata_plugins['identify'])
kwargs = {
'title': title,
'authors': authors,
'identifiers': identifiers,
'timeout': timeout,
}
log('Running identify query with parameters:')
log(kwargs)
log('Using plugins:', ', '.join([p.name for p in plugins]))
log('The log (if any) from individual plugins is below')
workers = [Worker(p, kwargs, abort) for p in plugins]
for w in workers:
w.start()
first_result_at = None
results = dict.fromkeys(plugins, [])
def get_results():
found = False
for w in workers:
try:
result = w.rq.get_nowait()
except Empty:
pass
else:
results[w.plugin].append(result)
found = True
return found
while True:
time.sleep(0.2)
if get_results() and first_result_at is None:
first_result_at = time.time()
if not is_worker_alive(workers):
break
if (first_result_at is not None and time.time() - first_result_at <
WAIT_AFTER_FIRST_RESULT):
log('Not waiting any longer for more results')
abort.set()
break
get_results()
sort_kwargs = dict(kwargs)
for k in list(sort_kwargs.iterkeys()):
if k not in ('title', 'authors', 'identifiers'):
sort_kwargs.pop(k)
for plugin, results in results.iteritems():
results.sort(key=plugin.identify_results_keygen(**sort_kwargs))
plog = plugin.buf.getvalue().strip()
if plog:
log('\n'+'*'*35, plugin.name, '*'*35)
log('Found %d results'%len(results))
log(plog)
log('\n'+'*'*80)
for i, result in enumerate(results):
result.relevance_in_source = i
result.has_cached_cover_url = \
plugin.get_cached_cover_url(result.identifiers) is not None
result.identify_plugin = plugin
log('The identify phase took %.2f seconds'%(time.time() - start_time))
log('Merging results from different sources and finding earliest',
'publication dates')
start_time = time.time()
results = merge_identify_results(results, log)
log('We have %d merged results, merging took: %.2f seconds' %
(len(results), time.time() - start_time))
if msprefs['txt_comments']:
for r in results:
if r.plugin.has_html_comments and r.comments:
r.comments = html2text(r.comments)
dummy = Metadata(_('Unknown'))
max_tags = msprefs['max_tags']
for f in msprefs['ignore_fields']:
for r in results:
setattr(r, f, getattr(dummy, f))
r.tags = r.tags[:max_tags]
return results
# }}}
# Merge results from different sources {{{
class ISBNMerge(object):
@ -298,6 +206,147 @@ def merge_identify_results(result_map, log):
return isbn_merge.finalize()
# }}}
def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
start_time = time.time()
plugins = list(metadata_plugins['identify'])
kwargs = {
'title': title,
'authors': authors,
'identifiers': identifiers,
'timeout': timeout,
}
log('Running identify query with parameters:')
log(kwargs)
log('Using plugins:', ', '.join([p.name for p in plugins]))
log('The log (if any) from individual plugins is below')
workers = [Worker(p, kwargs, abort) for p in plugins]
for w in workers:
w.start()
first_result_at = None
results = dict.fromkeys(plugins, [])
def get_results():
found = False
for w in workers:
try:
result = w.rq.get_nowait()
except Empty:
pass
else:
results[w.plugin].append(result)
found = True
return found
wait_time = msprefs['wait_after_first_identify_result']
while True:
time.sleep(0.2)
if get_results() and first_result_at is None:
first_result_at = time.time()
if not is_worker_alive(workers):
break
if (first_result_at is not None and time.time() - first_result_at <
wait_time):
log('Not waiting any longer for more results')
abort.set()
break
get_results()
sort_kwargs = dict(kwargs)
for k in list(sort_kwargs.iterkeys()):
if k not in ('title', 'authors', 'identifiers'):
sort_kwargs.pop(k)
for plugin, results in results.iteritems():
results.sort(key=plugin.identify_results_keygen(**sort_kwargs))
plog = plugin.buf.getvalue().strip()
if plog:
log('\n'+'*'*35, plugin.name, '*'*35)
log('Found %d results'%len(results))
log(plog)
log('\n'+'*'*80)
for i, result in enumerate(results):
result.relevance_in_source = i
result.has_cached_cover_url = \
plugin.get_cached_cover_url(result.identifiers) is not None
result.identify_plugin = plugin
log('The identify phase took %.2f seconds'%(time.time() - start_time))
log('Merging results from different sources and finding earliest',
'publication dates')
start_time = time.time()
results = merge_identify_results(results, log)
log('We have %d merged results, merging took: %.2f seconds' %
(len(results), time.time() - start_time))
if msprefs['txt_comments']:
for r in results:
if r.plugin.has_html_comments and r.comments:
r.comments = html2text(r.comments)
dummy = Metadata(_('Unknown'))
max_tags = msprefs['max_tags']
for f in msprefs['ignore_fields']:
for r in results:
setattr(r, f, getattr(dummy, f))
r.tags = r.tags[:max_tags]
return results
if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e
# src/calibre/ebooks/metadata/sources/identify.py
from calibre.ebooks.metadata.sources.test import (test_identify,
title_test, authors_test)
test_identify(
[
( # An e-book ISBN not on Amazon, one of the authors is
# unknown to Amazon
{'identifiers':{'isbn': '9780307459671'},
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
),
( # This isbn not on amazon
{'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
'authors':['Lutz']},
[title_test('Learning Python, 3rd Edition',
exact=True), authors_test(['Mark Lutz'])
]
),
( # Sophisticated comment formatting
{'identifiers':{'isbn': '9781416580829'}},
[title_test('Angels & Demons - Movie Tie-In: A Novel',
exact=True), authors_test(['Dan Brown'])]
),
( # No specific problems
{'identifiers':{'isbn': '0743273567'}},
[title_test('The great gatsby', exact=True),
authors_test(['F. Scott Fitzgerald'])]
),
( # A newer book
{'identifiers':{'isbn': '9780316044981'}},
[title_test('The Heroes', exact=True),
authors_test(['Joe Abercrombie'])]
),
])
# }}}

View File

@ -14,7 +14,8 @@ from threading import Event
from calibre.customize.ui import metadata_plugins
from calibre import prints, sanitize_file_name2
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import create_log
from calibre.ebooks.metadata.sources.base import (create_log,
get_cached_cover_urls)
def isbn_test(isbn):
isbn_ = check_isbn(isbn)
@ -45,8 +46,75 @@ def authors_test(authors):
return test
def init_test(tdir_name):
tdir = tempfile.gettempdir()
lf = os.path.join(tdir, tdir_name.replace(' ', '')+'_identify_test.txt')
log = create_log(open(lf, 'wb'))
abort = Event()
return tdir, lf, log, abort
def test_identify_plugin(name, tests):
def test_identify(tests): # {{{
'''
:param tests: List of 2-tuples. Each two tuple is of the form (args,
test_funcs). args is a dict of keyword arguments to pass to
the identify method. test_funcs are callables that accept a
Metadata object and return True iff the object passes the
test.
'''
from calibre.ebooks.metadata.sources.identify import identify
tdir, lf, log, abort = init_test('Full Identify')
times = []
for kwargs, test_funcs in tests:
prints('Running test with:', kwargs)
args = (log, abort)
start_time = time.time()
results = identify(*args, **kwargs)
total_time = time.time() - start_time
times.append(total_time)
if not results:
prints('identify failed to find any results')
break
prints('Found', len(results), 'matches:', end=' ')
prints('Smaller relevance means better match')
for i, mi in enumerate(results):
prints('*'*30, 'Relevance:', i, '*'*30)
prints(mi)
prints('\nCached cover URLs :',
[x[0].name for x in get_cached_cover_urls(mi)])
prints('*'*75, '\n\n')
possibles = []
for mi in results:
test_failed = False
for tfunc in test_funcs:
if not tfunc(mi):
test_failed = True
break
if not test_failed:
possibles.append(mi)
if not possibles:
prints('ERROR: No results that passed all tests were found')
prints('Log saved to', lf)
raise SystemExit(1)
if results[0] is not possibles[0]:
prints('Most relevant result failed the tests')
raise SystemExit(1)
prints('Average time per query', sum(times)/len(times))
if os.stat(lf).st_size > 10:
prints('There were some errors/warnings, see log', lf)
# }}}
def test_identify_plugin(name, tests): # {{{
'''
:param name: Plugin name
:param tests: List of 2-tuples. Each two tuple is of the form (args,
@ -62,10 +130,7 @@ def test_identify_plugin(name, tests):
break
prints('Testing the identify function of', plugin.name)
tdir = tempfile.gettempdir()
lf = os.path.join(tdir, plugin.name.replace(' ', '')+'_identify_test.txt')
log = create_log(open(lf, 'wb'))
abort = Event()
tdir, lf, log, abort = init_test(plugin.name)
prints('Log saved to', lf)
times = []
@ -159,4 +224,5 @@ def test_identify_plugin(name, tests):
if os.stat(lf).st_size > 10:
prints('There were some errors/warnings, see log', lf)
# }}}

View File

@ -17,6 +17,8 @@ from cssutils.css import CSSStyleRule, CSSPageRule, CSSStyleDeclaration, \
from cssutils import profile as cssprofiles
from lxml import etree
from lxml.cssselect import css_to_xpath, ExpressionError, SelectorSyntaxError
from calibre import force_unicode
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES
from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize
from calibre.ebooks.oeb.profile import PROFILES
@ -140,13 +142,22 @@ class Stylizer(object):
log=logging.getLogger('calibre.css'))
self.font_face_rules = []
for elem in head:
if elem.tag == XHTML('style') and elem.text \
and elem.get('type', CSS_MIME) in OEB_STYLES:
text = XHTML_CSS_NAMESPACE + elem.text
text = oeb.css_preprocessor(text)
stylesheet = parser.parseString(text, href=cssname)
stylesheet.namespaces['h'] = XHTML_NS
stylesheets.append(stylesheet)
if (elem.tag == XHTML('style') and
elem.get('type', CSS_MIME) in OEB_STYLES):
text = elem.text if elem.text else u''
for x in elem:
t = getattr(x, 'text', None)
if t:
text += u'\n\n' + force_unicode(t, u'utf-8')
t = getattr(x, 'tail', None)
if t:
text += u'\n\n' + force_unicode(t, u'utf-8')
if text:
text = XHTML_CSS_NAMESPACE + elem.text
text = oeb.css_preprocessor(text)
stylesheet = parser.parseString(text, href=cssname)
stylesheet.namespaces['h'] = XHTML_NS
stylesheets.append(stylesheet)
elif elem.tag == XHTML('link') and elem.get('href') \
and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
and elem.get('type', CSS_MIME).lower() in OEB_STYLES:

View File

@ -72,6 +72,7 @@ XMLFont::XMLFont(string* font_name, double size, GfxRGB rgb) :
size(size-1), line_size(-1.0), italic(false), bold(false), font_name(font_name),
font_family(NULL), color(rgb) {
if (!this->font_name) this->font_name = new string(DEFAULT_FONT_FAMILY);
this->font_family = family_name(this->font_name);
if (strcasestr(font_name->c_str(), "bold")) this->bold = true;
@ -134,7 +135,12 @@ Fonts::size_type Fonts::add_font(XMLFont *f) {
}
Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) {
XMLFont *f = new XMLFont(font_name, size, rgb);
XMLFont *f = NULL;
if (font_name == NULL)
font_name = new string("Unknown");
// font_name must not be deleted
f = new XMLFont(font_name, size, rgb);
return this->add_font(f);
}

View File

@ -7,15 +7,25 @@
<x>0</x>
<y>0</y>
<width>917</width>
<height>480</height>
<height>492</height>
</rect>
</property>
<property name="windowTitle">
<string>Dialog</string>
</property>
<property name="windowIcon">
<iconset resource="../../../../resources/images.qrc">
<normaloff>:/images/metadata.png</normaloff>:/images/metadata.png</iconset>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0" colspan="2">
<widget class="QLabel" name="title">
<property name="font">
<font>
<weight>75</weight>
<bold>true</bold>
</font>
</property>
<property name="text">
<string>TextLabel</string>
</property>
@ -24,86 +34,104 @@
</property>
</widget>
</item>
<item row="1" column="0">
<item row="1" column="0" rowspan="3">
<widget class="CoverView" name="cover"/>
</item>
<item row="1" column="1">
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QLabel" name="text">
<property name="text">
<string>TextLabel</string>
</property>
<property name="alignment">
<set>Qt::AlignLeading|Qt::AlignLeft|Qt::AlignTop</set>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Comments</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout_2">
<item>
<widget class="QWebView" name="comments">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Expanding">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="maximumSize">
<size>
<width>350</width>
<height>16777215</height>
</size>
</property>
<property name="url">
<url>
<string>about:blank</string>
</url>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item>
<widget class="QCheckBox" name="fit_cover">
<property name="text">
<string>Fit &amp;cover within view</string>
</property>
</widget>
</item>
<item>
<layout class="QHBoxLayout" name="horizontalLayout">
<widget class="QScrollArea" name="scrollArea">
<property name="frameShape">
<enum>QFrame::NoFrame</enum>
</property>
<property name="widgetResizable">
<bool>true</bool>
</property>
<widget class="QWidget" name="scrollAreaWidgetContents">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>435</width>
<height>670</height>
</rect>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QPushButton" name="previous_button">
<widget class="QLabel" name="text">
<property name="text">
<string>&amp;Previous</string>
<string>TextLabel</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<normaloff>:/images/previous.png</normaloff>:/images/previous.png</iconset>
<property name="alignment">
<set>Qt::AlignLeading|Qt::AlignLeft|Qt::AlignTop</set>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="next_button">
<property name="text">
<string>&amp;Next</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<normaloff>:/images/next.png</normaloff>:/images/next.png</iconset>
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Comments</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout_2">
<item>
<widget class="QWebView" name="comments">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Expanding">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="maximumSize">
<size>
<width>350</width>
<height>16777215</height>
</size>
</property>
<property name="url">
<url>
<string>about:blank</string>
</url>
</property>
</widget>
</item>
</layout>
</widget>
</item>
</layout>
</widget>
</widget>
</item>
<item row="2" column="1">
<widget class="QCheckBox" name="fit_cover">
<property name="text">
<string>Fit &amp;cover within view</string>
</property>
</widget>
</item>
<item row="3" column="1">
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QPushButton" name="previous_button">
<property name="text">
<string>&amp;Previous</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<normaloff>:/images/previous.png</normaloff>:/images/previous.png</iconset>
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="next_button">
<property name="text">
<string>&amp;Next</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<normaloff>:/images/next.png</normaloff>:/images/next.png</iconset>
</property>
</widget>
</item>
</layout>
</item>

View File

@ -38,10 +38,10 @@ class Browser(B):
self._clone_actions['set_handle_equiv'] = ('set_handle_equiv',
args, kwargs)
def set_handle_gzip(self, *args, **kwargs):
B.set_handle_gzip(self, *args, **kwargs)
def set_handle_gzip(self, handle):
B._set_handler(self, '_gzip', handle)
self._clone_actions['set_handle_gzip'] = ('set_handle_gzip',
args, kwargs)
(handle,), {})
def set_debug_redirect(self, *args, **kwargs):
B.set_debug_redirect(self, *args, **kwargs)