Merge from trunk

This commit is contained in:
Charles Haley 2011-03-16 11:37:56 +00:00
commit 0994495bec
13 changed files with 367 additions and 32 deletions

View File

@ -0,0 +1,15 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1295265555(BasicNewsRecipe):
    '''News recipe for Pro-Linux.de, driven by the site's RDF feed.'''

    title = u'Pro-Linux.de'
    __author__ = 'Bobus'
    language = 'de'

    oldest_article = 3
    max_articles_per_feed = 100
    feeds = [(u'Pro-Linux', u'http://www.pro-linux.de/backend/pro-linux.rdf')]

    remove_tags_after = [dict(name='div', attrs={'class':'print_links'})]

    def print_version(self, url):
        '''
        Return the URL of the print view of an article.

        News items (``/news/1/``) and articles (``/artikel/2/``) both get
        a ``print/`` path component appended to their section prefix; any
        other URL is returned unchanged.
        '''
        printable = url.replace('/news/1/', '/news/1/print/')
        return printable.replace('/artikel/2/', '/artikel/2/print/')

View File

@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

View File

@ -3,7 +3,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import uuid, sys, os, re, logging, time, \
import uuid, sys, os, re, logging, time, random, \
__builtin__, warnings, multiprocessing
from urllib import getproxies
__builtin__.__dict__['dynamic_property'] = lambda(func): func(None)
@ -268,6 +268,17 @@ def get_parsed_proxy(typ='http', debug=True):
prints('Using http proxy', str(ans))
return ans
def random_user_agent():
    '''
    Return a User-Agent string picked at random from a fixed pool of
    browser identification strings.
    '''
    # Every entry must end with a comma: adjacent string literals are
    # concatenated by the parser, which would silently collapse the whole
    # pool into a single, meaningless user agent string.
    choices = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
    ]
    return random.choice(choices)
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
'''

View File

@ -1031,7 +1031,8 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions,
# New metadata download plugins {{{
from calibre.ebooks.metadata.sources.google import GoogleBooks
from calibre.ebooks.metadata.sources.amazon import Amazon
plugins += [GoogleBooks]
plugins += [GoogleBooks, Amazon]
# }}}

View File

@ -1251,6 +1251,7 @@ def metadata_to_opf(mi, as_string=True):
from lxml import etree
import textwrap
from calibre.ebooks.oeb.base import OPF, DC
from calibre.utils.cleantext import clean_ascii_chars
if not mi.application_id:
mi.application_id = str(uuid.uuid4())
@ -1306,7 +1307,7 @@ def metadata_to_opf(mi, as_string=True):
if hasattr(mi, 'category') and mi.category:
factory(DC('type'), mi.category)
if mi.comments:
factory(DC('description'), mi.comments)
factory(DC('description'), clean_ascii_chars(mi.comments))
if mi.publisher:
factory(DC('publisher'), mi.publisher)
for key, val in mi.get_identifiers().iteritems():

View File

@ -7,16 +7,231 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import socket, time
from urllib import urlencode
from threading import Thread
from lxml.html import soupparser, tostring
from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode
class Worker(Thread):
    '''
    Thread that downloads one Amazon details page and hands the parsed
    document to :meth:`parse_details`.

    Every failure is reported through ``log``; nothing is raised out of
    :meth:`run`.
    '''

    def __init__(self, url, result_queue, browser, log, timeout=20):
        '''
        :param url: URL of the details page to fetch.
        :param result_queue: Stored on the instance; not used by the code
                             in this class yet (parse_details is a stub).
        :param browser: Object providing ``clone_browser()``; the worker
                        only uses the clone.
        :param log: Logger providing ``error()`` and ``exception()``.
        :param timeout: Passed as ``timeout`` to ``open_novisit``.
        '''
        # Without this call start() refuses to run the thread
        Thread.__init__(self)
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.browser = browser.clone_browser()
        self.cover_url = self.amazon_id = None

    def run(self):
        '''Thread entry point: run get_details(), logging any failure.'''
        try:
            self.get_details()
        except Exception:
            # exception() rather than error() so the traceback is kept
            self.log.exception('get_details failed for url: %r'%self.url)

    def get_details(self):
        '''
        Fetch ``self.url``, parse it and call :meth:`parse_details` with
        the document root. Returns ``None`` early (after logging) when the
        download fails, the page is a 404, the page cannot be parsed or it
        contains an ``errorMessage`` element.
        '''
        try:
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r'%self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Amazon timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                self.log.exception(msg)
            return

        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r'%self.url)
            return

        try:
            root = soupparser.fromstring(clean_ascii_chars(raw))
        except Exception:
            msg = 'Failed to parse amazon details page: %r'%self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse amazon details page: %r'%self.url
            # xpath() returns a list of elements; tostring() needs a
            # single element, so serialize the first match
            msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(root)

    def parse_details(self, root):
        '''Extract metadata from the parsed details page (not implemented yet).'''
        pass
class Amazon(Source):
name = 'Amazon'
description = _('Downloads metadata from Amazon')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate',
'comments', 'cover_data'])
capabilities = frozenset(['identify'])
touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
AMAZON_DOMAINS = {
'com': _('US'),
'fr' : _('France'),
'de' : _('Germany'),
}
def create_query(self, log, title=None, authors=None, identifiers={}):
    '''
    Build the Amazon search URL for the given metadata.

    An ``amazon`` identifier is preferred, then a valid ISBN, and only
    then a title plus first-author search restricted to books. Returns
    ``None`` when none of these can be constructed.
    '''
    domain = self.prefs.get('domain', 'com')

    # See the amazon detailed search page to get all options
    q = {'search-alias': 'aps', 'unfiltered': '1'}
    q['sort'] = 'relevanceexprank' if domain == 'com' else 'relevancerank'

    asin = identifiers.get('amazon', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    if asin is not None:
        q['field-keywords'] = asin
    elif isbn is not None:
        q['field-isbn'] = isbn
    else:
        # Only return book results
        q['search-alias'] = 'stripbooks'
        title_tokens = list(self.get_title_tokens(title)) if title else []
        if title_tokens:
            q['field-title'] = ' '.join(title_tokens)
        author_tokens = None
        if authors:
            author_tokens = self.get_author_tokens(authors,
                    only_first_author=True)
        if author_tokens:
            q['field-author'] = ' '.join(author_tokens)

    has_identifier = 'field-keywords' in q or 'field-isbn' in q
    has_title_and_author = 'field-title' in q and 'field-author' in q
    if not has_identifier and not has_title_and_author:
        # Insufficient metadata to make an identify query
        return None

    utf8q = {}
    for key, value in q.iteritems():
        utf8q[key.encode('utf-8')] = value.encode('utf-8')
    return 'http://www.amazon.%s/s/?'%domain + urlencode(utf8q)
def identify(self, log, result_queue, abort, title=None, authors=None,
        identifiers={}, timeout=20):
    '''
    Run an Amazon search for the given metadata and start one
    :class:`Worker` for each of the (at most five) best results.

    :param log: Logger that receives a message for every failure.
    :param result_queue: Passed unchanged to every worker.
    :param abort: Object with ``is_set()``; once it returns true this
                  method stops waiting for the workers.
    :param timeout: Timeout used for the search request and passed on
                    to the workers.
    :return: ``None`` when the search ran (even without matches) or ended
             in a 404; an error message when the request failed for
             another reason or the result page could not be parsed.
    '''
    query = self.create_query(log, title=title, authors=authors,
            identifiers=identifiers)
    if query is None:
        log.error('Insufficient metadata to construct query')
        return
    br = self.browser
    try:
        raw = br.open_novisit(query, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('Query malformed: %r'%query)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = _('Amazon timed out. Try again later.')
            log.error(msg)
        else:
            msg = 'Failed to make identify query: %r'%query
            log.exception(msg)
        return as_unicode(msg)

    raw = xml_to_unicode(raw, strip_encoding_pats=True,
            resolve_entities=True)[0]

    if '<title>404 - ' in raw:
        log.error('No matches found for query: %r'%query)
        return

    try:
        root = soupparser.fromstring(clean_ascii_chars(raw))
    except Exception:
        msg = 'Failed to parse amazon page for query: %r'%query
        log.exception(msg)
        return msg

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        # xpath() returns a list; serialize the first matching element
        msg = tostring(errmsg[0], method='text', encoding=unicode).strip()
        log.error(msg)
        # The error is almost always a not found error
        return

    matches = []
    for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
        for a in div.xpath(r'descendant::a[@class="title" and @href]'):
            link_text = tostring(a, method='text', encoding=unicode).lower()
            if 'bulk pack' not in link_text:
                matches.append(a.get('href'))
            # Only the first title link of a result is considered
            break

    # Keep only the top 5 matches as the matches are sorted by relevance by
    # Amazon so lower matches are not likely to be very relevant
    matches = matches[:5]

    if not matches:
        log.error('No matches found with query: %r'%query)
        return

    # Hand the caller's timeout to the workers instead of letting them
    # fall back to their own default
    workers = [Worker(url, result_queue, br, log, timeout=timeout)
            for url in matches]

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break

    return None
if __name__ == '__main__':
    # To run these test use: calibre-debug -e
    # src/calibre/ebooks/metadata/sources/amazon.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test)

    # Each case pairs identify() keyword arguments with the checks its
    # results are run against
    gatsby_by_isbn = (
        {'identifiers':{'isbn': '0743273567'}},
        [title_test('The great gatsby', exact=True)]
    )
    test_identify_plugin(Amazon.name, [gatsby_by_isbn])

View File

@ -9,8 +9,12 @@ __docformat__ = 'restructuredtext en'
import re, threading
from calibre import browser, random_user_agent
from calibre.customize import Plugin
from calibre.utils.logging import ThreadSafeLog, FileStream
from calibre.utils.config import JSONConfig
msprefs = JSONConfig('metadata_sources.json')
def create_log(ostream=None):
log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@ -24,8 +28,6 @@ class Source(Plugin):
supported_platforms = ['windows', 'osx', 'linux']
result_of_identify_is_complete = True
capabilities = frozenset()
touched_fields = frozenset()
@ -34,6 +36,27 @@ class Source(Plugin):
Plugin.__init__(self, *args, **kwargs)
self._isbn_to_identifier_cache = {}
self.cache_lock = threading.RLock()
self._config_obj = None
self._browser = None
# Configuration {{{
@property
def prefs(self):
    '''
    The JSONConfig object of this source, stored in
    ``metadata_sources/<name>.json``. Created on first access and
    reused afterwards.
    '''
    config = self._config_obj
    if config is None:
        config = JSONConfig('metadata_sources/%s.json'%self.name)
        self._config_obj = config
    return config
# }}}
# Browser {{{
@property
def browser(self):
    '''
    The browser object of this source, created on first access with a
    randomly chosen user agent and reused afterwards.
    '''
    if self._browser is not None:
        return self._browser
    # Inside the method the bare name ``browser`` is the factory imported
    # from calibre at module level, not this property
    agent = random_user_agent()
    self._browser = browser(user_agent=agent)
    return self._browser
# }}}
# Utility functions {{{

View File

@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars
from calibre import browser, as_unicode
from calibre import as_unicode
NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
@ -150,7 +150,7 @@ class GoogleBooks(Source):
def create_query(self, log, title=None, authors=None, identifiers={}):
BASE_URL = 'http://books.google.com/books/feeds/volumes?'
isbn = identifiers.get('isbn', None)
isbn = check_isbn(identifiers.get('isbn', None))
q = ''
if isbn is not None:
q += 'isbn:'+isbn
@ -209,10 +209,10 @@ class GoogleBooks(Source):
break
def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=5):
identifiers={}, timeout=20):
query = self.create_query(log, title=title, authors=authors,
identifiers=identifiers)
br = browser()
br = self.browser()
try:
raw = br.open_novisit(query, timeout=timeout).read()
except Exception, e:

View File

@ -81,6 +81,7 @@ class DetectStructure(object):
page_break_after = 'display: block; page-break-after: always'
for item, elem in self.detected_chapters:
text = xml2text(elem).strip()
text = re.sub(r'\s+', ' ', text.strip())
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
@ -137,7 +138,8 @@ class DetectStructure(object):
text = elem.get('title', '')
if not text:
text = elem.get('alt', '')
text = text[:100].strip()
text = re.sub(r'\s+', ' ', text.strip())
text = text[:1000].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
elem.set('id', id)
href = '#'.join((item.href, id))

View File

@ -53,7 +53,7 @@ if pictureflow is not None:
def __init__(self, model, buffer=20):
pictureflow.FlowImages.__init__(self)
self.model = model
self.model.modelReset.connect(self.reset)
self.model.modelReset.connect(self.reset, type=Qt.QueuedConnection)
def count(self):
return self.model.count()
@ -83,6 +83,8 @@ if pictureflow is not None:
class CoverFlow(pictureflow.PictureFlow):
dc_signal = pyqtSignal()
def __init__(self, parent=None):
pictureflow.PictureFlow.__init__(self, parent,
config['cover_flow_queue_length']+1)
@ -90,6 +92,8 @@ if pictureflow is not None:
self.setFocusPolicy(Qt.WheelFocus)
self.setSizePolicy(QSizePolicy(QSizePolicy.Expanding,
QSizePolicy.Expanding))
self.dc_signal.connect(self._data_changed,
type=Qt.QueuedConnection)
def sizeHint(self):
return self.minimumSize()
@ -101,6 +105,12 @@ if pictureflow is not None:
elif ev.delta() > 0:
self.showPrevious()
def dataChanged(self):
    '''Emit ``dc_signal`` instead of handling the data change directly.'''
    signal = self.dc_signal
    signal.emit()
def _data_changed(self):
    '''Run the base class ``PictureFlow.dataChanged`` handler on this widget.'''
    base_handler = pictureflow.PictureFlow.dataChanged
    base_handler(self)
else:
CoverFlow = None
@ -135,8 +145,7 @@ class CoverFlowMixin(object):
self.cover_flow = None
if CoverFlow is not None:
self.cf_last_updated_at = None
self.cover_flow_sync_timer = QTimer(self)
self.cover_flow_sync_timer.timeout.connect(self.cover_flow_do_sync)
self.cover_flow_syncing_enabled = False
self.cover_flow_sync_flag = True
self.cover_flow = CoverFlow(parent=self)
self.cover_flow.currentChanged.connect(self.sync_listview_to_cf)
@ -179,14 +188,15 @@ class CoverFlowMixin(object):
self.cover_flow.setFocus(Qt.OtherFocusReason)
if CoverFlow is not None:
self.cover_flow.setCurrentSlide(self.library_view.currentIndex().row())
self.cover_flow_sync_timer.start(500)
self.cover_flow_syncing_enabled = True
QTimer.singleShot(500, self.cover_flow_do_sync)
self.library_view.setCurrentIndex(
self.library_view.currentIndex())
self.library_view.scroll_to_row(self.library_view.currentIndex().row())
def cover_browser_hidden(self):
if CoverFlow is not None:
self.cover_flow_sync_timer.stop()
self.cover_flow_syncing_enabled = False
idx = self.library_view.model().index(self.cover_flow.currentSlide(), 0)
if idx.isValid():
sm = self.library_view.selectionModel()
@ -242,6 +252,8 @@ class CoverFlowMixin(object):
except:
import traceback
traceback.print_exc()
if self.cover_flow_syncing_enabled:
QTimer.singleShot(500, self.cover_flow_do_sync)
def sync_listview_to_cf(self, row):
self.cf_last_updated_at = time.time()

View File

@ -1,10 +1,14 @@
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import StringIO, traceback, sys
from PyQt4.Qt import QMainWindow, QString, Qt, QFont, QCoreApplication, SIGNAL,\
QAction, QMenu, QMenuBar, QIcon, pyqtSignal
import StringIO, traceback, sys, gc
from PyQt4.Qt import QMainWindow, QString, Qt, QFont, QTimer, \
QAction, QMenu, QMenuBar, QIcon, pyqtSignal, QObject
from calibre.gui2.dialogs.conversion_error import ConversionErrorDialog
from calibre.utils.config import OptionParser
from calibre.gui2 import error_dialog
@ -35,6 +39,53 @@ class DebugWindow(ConversionErrorDialog):
def flush(self):
pass
class GarbageCollector(QObject):

    '''
    Disable automatic garbage collection and instead collect manually
    every INTERVAL milliseconds.

    This is done to ensure that garbage collection only happens in the GUI
    thread, as otherwise Qt can crash.
    '''

    INTERVAL = 5000

    def __init__(self, parent, debug=False):
        '''
        Set up the periodic timer, remember the current collection
        thresholds and switch off the automatic collector.

        :param parent: Parent passed to ``QObject.__init__``.
        :param debug: If true, :meth:`check` prints what it does.
        '''
        QObject.__init__(self, parent)
        self.debug = debug

        timer = QTimer(self)
        timer.timeout.connect(self.check)
        self.timer = timer

        # Remember the thresholds so check() can apply them by hand
        self.threshold = gc.get_threshold()
        gc.disable()
        timer.start(self.INTERVAL)
        #gc.set_debug(gc.DEBUG_SAVEALL)

    def check(self):
        '''
        Collect every generation whose count exceeds its threshold,
        starting at generation 0. An older generation is only looked at
        when the one before it was collected in this same call.
        '''
        #return self.debug_cycles()
        counts = gc.get_count()
        if self.debug:
            print ('gc_check called:', counts[0], counts[1], counts[2])
        for generation, count in enumerate(counts):
            if count <= self.threshold[generation]:
                break
            num = gc.collect(generation)
            if self.debug:
                print ('collecting gen %d, found:' % generation, num,
                        'unreachable')

    def debug_cycles(self):
        '''Run a full collection and print every object in ``gc.garbage``.'''
        gc.collect()
        for leftover in gc.garbage:
            print (leftover, repr(leftover), type(leftover))
class MainWindow(QMainWindow):
___menu_bar = None
@ -64,19 +115,15 @@ class MainWindow(QMainWindow):
quit_action.setMenuRole(QAction.QuitRole)
return preferences_action, quit_action
def __init__(self, opts, parent=None):
def __init__(self, opts, parent=None, disable_automatic_gc=False):
QMainWindow.__init__(self, parent)
app = QCoreApplication.instance()
if app is not None:
self.connect(app, SIGNAL('unixSignal(int)'), self.unix_signal)
if disable_automatic_gc:
self._gc = GarbageCollector(self, debug=False)
if getattr(opts, 'redirect', False):
self.__console_redirect = DebugWindow(self)
sys.stdout = sys.stderr = self.__console_redirect
self.__console_redirect.show()
def unix_signal(self, signal):
print 'Received signal:', repr(signal)
def unhandled_exception(self, type, value, tb):
if type == KeyboardInterrupt:
self.keyboard_interrupt.emit()

View File

@ -439,7 +439,8 @@ void PictureFlowPrivate::setImages(FlowImages *images)
QObject::disconnect(slideImages, SIGNAL(dataChanged()), widget, SLOT(dataChanged()));
slideImages = images;
dataChanged();
QObject::connect(slideImages, SIGNAL(dataChanged()), widget, SLOT(dataChanged()));
QObject::connect(slideImages, SIGNAL(dataChanged()), widget, SLOT(dataChanged()),
Qt::QueuedConnection);
}
int PictureFlowPrivate::slideCount() const

View File

@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
'''The main GUI'''
import collections, os, sys, textwrap, time
import collections, os, sys, textwrap, time, gc
from Queue import Queue, Empty
from threading import Thread
from PyQt4.Qt import Qt, SIGNAL, QTimer, QHelpEvent, QAction, \
@ -95,7 +95,7 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
def __init__(self, opts, parent=None, gui_debug=None):
MainWindow.__init__(self, opts, parent)
MainWindow.__init__(self, opts, parent=parent, disable_automatic_gc=True)
self.opts = opts
self.device_connected = None
self.gui_debug = gui_debug
@ -298,6 +298,9 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
raise
self.device_manager.set_current_library_uuid(db.library_id)
# Collect cycles now
gc.collect()
if show_gui and self.gui_debug is not None:
info_dialog(self, _('Debug mode'), '<p>' +
_('You have started calibre in debug mode. After you '
@ -399,6 +402,7 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
elif msg.startswith('refreshdb:'):
self.library_view.model().refresh()
self.library_view.model().research()
self.tags_view.recount()
else:
print msg
@ -463,6 +467,9 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
self.card_a_view.reset()
self.card_b_view.reset()
self.device_manager.set_current_library_uuid(db.library_id)
# Run a garbage collection now so that it does not freeze the
# interface later
gc.collect()
def set_window_title(self):