From 7f85ac4e03e123a264b5c9c6475803971ebeb9b7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 11:38:43 -0600
Subject: [PATCH 1/9] Cover browser: Try harder to ensure that everything runs
 in the GUI thread

---
 src/calibre/gui2/cover_flow.py               | 22 +++++++++++++++-----
 src/calibre/gui2/pictureflow/pictureflow.cpp |  3 ++-
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py
index cb951b09be..1d79d93bb2 100644
--- a/src/calibre/gui2/cover_flow.py
+++ b/src/calibre/gui2/cover_flow.py
@@ -53,7 +53,7 @@ if pictureflow is not None:
         def __init__(self, model, buffer=20):
             pictureflow.FlowImages.__init__(self)
             self.model = model
-            self.model.modelReset.connect(self.reset)
+            self.model.modelReset.connect(self.reset, type=Qt.QueuedConnection)
 
         def count(self):
             return self.model.count()
@@ -83,6 +83,8 @@ if pictureflow is not None:
 
     class CoverFlow(pictureflow.PictureFlow):
 
+        dc_signal = pyqtSignal()
+
         def __init__(self, parent=None):
             pictureflow.PictureFlow.__init__(self, parent,
                                 config['cover_flow_queue_length']+1)
@@ -90,6 +92,8 @@ if pictureflow is not None:
             self.setFocusPolicy(Qt.WheelFocus)
             self.setSizePolicy(QSizePolicy(QSizePolicy.Expanding,
                 QSizePolicy.Expanding))
+            self.dc_signal.connect(self._data_changed,
+                    type=Qt.QueuedConnection)
 
         def sizeHint(self):
             return self.minimumSize()
@@ -101,6 +105,12 @@ if pictureflow is not None:
             elif ev.delta() > 0:
                 self.showPrevious()
 
+        def dataChanged(self):  # only emits; the actual refresh runs in the queued dc_signal slot
+            self.dc_signal.emit()
+
+        def _data_changed(self):  # slot for dc_signal: forwards to the base-class dataChanged
+            pictureflow.PictureFlow.dataChanged(self)
+
 
 else:
     CoverFlow = None
@@ -135,8 +145,7 @@ class CoverFlowMixin(object):
         self.cover_flow = None
         if CoverFlow is not None:
             self.cf_last_updated_at = None
-            self.cover_flow_sync_timer = QTimer(self)
-            self.cover_flow_sync_timer.timeout.connect(self.cover_flow_do_sync)
+            self.cover_flow_syncing_enabled = False
             self.cover_flow_sync_flag = True
             self.cover_flow = CoverFlow(parent=self)
             self.cover_flow.currentChanged.connect(self.sync_listview_to_cf)
@@ -179,14 +188,15 @@ class CoverFlowMixin(object):
         self.cover_flow.setFocus(Qt.OtherFocusReason)
         if CoverFlow is not None:
             self.cover_flow.setCurrentSlide(self.library_view.currentIndex().row())
-            self.cover_flow_sync_timer.start(500)
+            self.cover_flow_syncing_enabled = True
+            QTimer.singleShot(500, self.cover_flow_do_sync)
         self.library_view.setCurrentIndex(
                 self.library_view.currentIndex())
         self.library_view.scroll_to_row(self.library_view.currentIndex().row())
 
     def cover_browser_hidden(self):
         if CoverFlow is not None:
-            self.cover_flow_sync_timer.stop()
+            self.cover_flow_syncing_enabled = False
             idx = self.library_view.model().index(self.cover_flow.currentSlide(), 0)
             if idx.isValid():
                 sm = self.library_view.selectionModel()
@@ -242,6 +252,8 @@ class CoverFlowMixin(object):
         except:
             import traceback
             traceback.print_exc()
+        if self.cover_flow_syncing_enabled:
+            QTimer.singleShot(500, self.cover_flow_do_sync)
 
     def sync_listview_to_cf(self, row):
         self.cf_last_updated_at = time.time()
diff --git a/src/calibre/gui2/pictureflow/pictureflow.cpp b/src/calibre/gui2/pictureflow/pictureflow.cpp
index 1c63ec410c..1d671154ae 100644
--- a/src/calibre/gui2/pictureflow/pictureflow.cpp
+++ b/src/calibre/gui2/pictureflow/pictureflow.cpp
@@ -439,7 +439,8 @@ void PictureFlowPrivate::setImages(FlowImages *images)
 	QObject::disconnect(slideImages, SIGNAL(dataChanged()), widget, SLOT(dataChanged()));
 	slideImages = images;
 	dataChanged();
-	QObject::connect(slideImages, SIGNAL(dataChanged()), widget, SLOT(dataChanged()));
+	QObject::connect(slideImages, SIGNAL(dataChanged()), widget, SLOT(dataChanged()),
+            Qt::QueuedConnection);
 }
 
 int PictureFlowPrivate::slideCount() const

From 388859a20975399a9a2d9eb9ac439dc82c49cee6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 11:47:03 -0600
Subject: [PATCH 2/9] Fix #9409 (Fix for weird char in rbc_ru.recipe)

---
 resources/recipes/rbc_ru.recipe | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/recipes/rbc_ru.recipe b/resources/recipes/rbc_ru.recipe
index 2495a195dc..438cd73243 100644
--- a/resources/recipes/rbc_ru.recipe
+++ b/resources/recipes/rbc_ru.recipe
@@ -1,4 +1,4 @@
-﻿# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 
 from calibre.web.feeds.news import BasicNewsRecipe
 

From 9ed480327fdfb71c77183b8bc1b88fc128e86389 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 11:54:49 -0600
Subject: [PATCH 3/9] Pro-linux.de by Bobus

---
 resources/recipes/pro_linux_de.recipe | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 resources/recipes/pro_linux_de.recipe

diff --git a/resources/recipes/pro_linux_de.recipe b/resources/recipes/pro_linux_de.recipe
new file mode 100644
index 0000000000..c10c2ec047
--- /dev/null
+++ b/resources/recipes/pro_linux_de.recipe
@@ -0,0 +1,15 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1295265555(BasicNewsRecipe):  # news recipe for the Pro-Linux.de RDF feed
+    title          = u'Pro-Linux.de'
+    language = 'de'
+    __author__ = 'Bobus'
+    oldest_article = 3  # presumably days -- confirm against BasicNewsRecipe
+    max_articles_per_feed = 100
+
+    feeds          = [(u'Pro-Linux', u'http://www.pro-linux.de/backend/pro-linux.rdf')]
+
+    def print_version(self, url):  # rewrite news/article URLs to their '/print/' variants
+        return url.replace('/news/1/', '/news/1/print/').replace('/artikel/2/', '/artikel/2/print/')
+
+    remove_tags_after = [dict(name='div', attrs={'class':'print_links'})]

From 4511e1e178d393c08a7f86cd641c72ec7e94fa0e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 13:30:52 -0600
Subject: [PATCH 4/9] Explicitly run a garbage collection after switching a
 library to ensure that it does not freeze the interface at a later time

---
 src/calibre/gui2/ui.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py
index a2ec8c9846..6766635789 100644
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@@ -463,6 +463,10 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
             self.card_a_view.reset()
             self.card_b_view.reset()
         self.device_manager.set_current_library_uuid(db.library_id)
+        # Run a garbage collection now so that it does not freeze the
+        # interface later
+        import gc
+        gc.collect()
 
 
     def set_window_title(self):

From b7a92e7e3ebe94d0726319313a47158268abf556 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 13:31:14 -0600
Subject: [PATCH 5/9] Fix #9407 (Metadata read failure from particular Kindle
 (Mobipocket) ebook)

---
 src/calibre/ebooks/metadata/opf2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 9c59692628..846fdf1322 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -1251,6 +1251,7 @@ def metadata_to_opf(mi, as_string=True):
     from lxml import etree
     import textwrap
     from calibre.ebooks.oeb.base import OPF, DC
+    from calibre.utils.cleantext import clean_ascii_chars
 
     if not mi.application_id:
         mi.application_id = str(uuid.uuid4())
@@ -1306,7 +1307,7 @@ def metadata_to_opf(mi, as_string=True):
     if hasattr(mi, 'category') and mi.category:
         factory(DC('type'), mi.category)
     if mi.comments:
-        factory(DC('description'), mi.comments)
+        factory(DC('description'), clean_ascii_chars(mi.comments))
     if mi.publisher:
         factory(DC('publisher'), mi.publisher)
     for key, val in mi.get_identifiers().iteritems():

From 2f4876f4742a72095093d6220effef8a10739a2c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 14:39:03 -0600
Subject: [PATCH 6/9] Beginning of the new amazon metadata download plugin

---
 src/calibre/__init__.py                       |  13 +-
 src/calibre/customize/builtins.py             |   3 +-
 src/calibre/ebooks/metadata/sources/amazon.py | 221 +++++++++++++++++-
 src/calibre/ebooks/metadata/sources/base.py   |  27 ++-
 src/calibre/ebooks/metadata/sources/google.py |   6 +-
 5 files changed, 260 insertions(+), 10 deletions(-)

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index fa9a8f2404..ab578d8ae6 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -3,7 +3,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import uuid, sys, os, re, logging, time, \
+import uuid, sys, os, re, logging, time, random, \
        __builtin__, warnings, multiprocessing
 from urllib import getproxies
 __builtin__.__dict__['dynamic_property'] = lambda(func): func(None)
@@ -268,6 +268,17 @@ def get_parsed_proxy(typ='http', debug=True):
                     prints('Using http proxy', str(ans))
                 return ans
 
+def random_user_agent():
+    '''Return one of a fixed set of desktop browser User-Agent strings, chosen at random.'''
+    return random.choice([
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+    ])
+
 
 def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     '''
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index b24a5c9a17..b3d435165b 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -1031,7 +1031,8 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions,
 
 # New metadata download plugins {{{
 from calibre.ebooks.metadata.sources.google import GoogleBooks
+from calibre.ebooks.metadata.sources.amazon import Amazon
 
-plugins += [GoogleBooks]
+plugins += [GoogleBooks, Amazon]
 
 # }}}
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 88ac1213c5..cf09a88338 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -7,16 +7,231 @@ __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import socket, time
+from urllib import urlencode
+from threading import Thread
 
+from lxml.html import soupparser, tostring
+
+from calibre import as_unicode
+from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.ebooks.chardet import xml_to_unicode
+
+class Worker(Thread):
+
+    def __init__(self, url, result_queue, browser, log, timeout=10):
+        self.url, self.result_queue = url, result_queue
+        self.log, self.timeout = log, timeout
+        self.browser = browser.clone_browser()
+        self.cover_url = self.amazon_id = None
+        Thread.__init__(self)  # without this, start() raises RuntimeError
+
+    def run(self):
+        '''Thread entry point: fetch the details page, logging any failure.'''
+        try:
+            self.get_details()
+        except:
+            self.log.exception('get_details failed for url: %r'%self.url)
+
+    def get_details(self):
+        '''Download and parse self.url, then hand the tree to parse_details().'''
+        try:
+            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
+        except Exception, e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                self.log.error('URL malformed: %r'%self.url)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                self.log.error('Amazon timed out. Try again later.')
+            else:
+                self.log.exception('Failed to make details query: %r'%self.url)
+            return
+
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+
+        if '<title>404 - ' in raw:
+            self.log.error('URL malformed: %r'%self.url)
+            return
+
+        try:
+            root = soupparser.fromstring(clean_ascii_chars(raw))
+        except:
+            self.log.exception('Failed to parse amazon details page: %r'%self.url)
+            return
+
+        errmsg = root.xpath('//*[@id="errorMessage"]')
+        if errmsg:  # xpath() returns a list of matches; report the first one
+            msg = 'Failed to parse amazon details page: %r'%self.url
+            msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
+            self.log.error(msg)
+            return
+
+        self.parse_details(root)
+
+    def parse_details(self, root):
+        pass  # placeholder: nothing is extracted from the page yet
+
 
 class Amazon(Source):
 
     name = 'Amazon'
     description = _('Downloads metadata from Amazon')
 
-    capabilities = frozenset(['identify', 'cover'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate',
-        'comments', 'cover_data'])
+    capabilities = frozenset(['identify'])
+    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
+
+    AMAZON_DOMAINS = {
+            'com': _('US'),
+            'fr' : _('France'),
+            'de' : _('Germany'),
+    }
+
+    def create_query(self, log, title=None, authors=None, identifiers={}):
+        domain = self.prefs.get('domain', 'com')  # Amazon TLD to search; defaults to 'com'
+
+        # See the amazon detailed search page to get all options
+        q = {   'search-alias' : 'aps',
+                'unfiltered' : '1',
+            }
+
+        if domain == 'com':
+            q['sort'] = 'relevanceexprank'
+        else:
+            q['sort'] = 'relevancerank'
+
+        asin = identifiers.get('amazon', None)
+        isbn = check_isbn(identifiers.get('isbn', None))
+
+        if asin is not None:  # precedence: amazon id, then ISBN, then title/author tokens
+            q['field-keywords'] = asin
+        elif isbn is not None:
+            q['field-isbn'] = isbn
+        else:
+            # Only return book results
+            q['search-alias'] = 'stripbooks'
+            if title:
+                title_tokens = list(self.get_title_tokens(title))
+                if title_tokens:
+                    q['field-title'] = ' '.join(title_tokens)
+            if authors:
+                author_tokens = self.get_author_tokens(authors,
+                        only_first_author=True)
+                if author_tokens:
+                    q['field-author'] = ' '.join(author_tokens)
+
+        if not ('field-keywords' in q or 'field-isbn' in q or
+                ('field-title' in q and 'field-author' in q)):
+            # Insufficient metadata to make an identify query
+            return None
+
+        utf8q = dict([(x.encode('utf-8'), y.encode('utf-8')) for x, y in
+            q.iteritems()])  # urlencode() is given UTF-8 encoded keys and values
+        url = 'http://www.amazon.%s/s/?'%domain + urlencode(utf8q)
+        return url  # the search URL, or None (above) when there is too little metadata
+
+
+    def identify(self, log, result_queue, abort, title=None, authors=None,
+            identifiers={}, timeout=10):
+        query = self.create_query(log, title=title, authors=authors,
+                identifiers=identifiers)
+        if query is None:
+            log.error('Insufficient metadata to construct query')
+            return
+        br = self.browser
+        try:
+            raw = br.open_novisit(query, timeout=timeout).read().strip()
+        except Exception, e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                log.error('Query malformed: %r'%query)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                msg = _('Amazon timed out. Try again later.')
+                log.error(msg)
+            else:
+                msg = 'Failed to make identify query: %r'%query
+                log.exception(msg)
+            return as_unicode(msg)
+
+        # Parse the results page; a returned string is an error message, None otherwise
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+
+        if '<title>404 - ' in raw:
+            log.error('No matches found for query: %r'%query)
+            return
+
+        try:
+            root = soupparser.fromstring(clean_ascii_chars(raw))
+        except:
+            msg = 'Failed to parse amazon page for query: %r'%query
+            log.exception(msg)
+            return msg
+
+        errmsg = root.xpath('//*[@id="errorMessage"]')
+        if errmsg:
+            # xpath() returns a list of matches; serialize the first element
+            log.error(tostring(errmsg[0], method='text', encoding=unicode).strip())
+            # The error is almost always a not found error
+            return
+
+        matches = []
+        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                title = tostring(a, method='text', encoding=unicode).lower()
+                if 'bulk pack' not in title:
+                    matches.append(a.get('href'))
+                break
+
+        # Keep only the top 5 matches as the matches are sorted by relevance by
+        # Amazon so lower matches are not likely to be very relevant
+        matches = matches[:5]
+
+        if not matches:
+            log.error('No matches found with query: %r'%query)
+            return
+
+        workers = [Worker(url, result_queue, br, log) for url in matches]
+
+        for w in workers:
+            w.start()
+            # Don't send all requests at the same time
+            time.sleep(0.1)
+
+        while not abort.is_set():
+            a_worker_is_alive = False
+            for w in workers:
+                w.join(0.2)
+                if abort.is_set():
+                    break
+                if w.is_alive():
+                    a_worker_is_alive = True
+            if not a_worker_is_alive:
+                break
+
+        return None
+
+
+if __name__ == '__main__':
+    # To run these test use: calibre-debug -e
+    # src/calibre/ebooks/metadata/sources/amazon.py
+    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
+            title_test)
+    test_identify_plugin(Amazon.name,
+        [
+
+            (
+                {'identifiers':{'isbn': '0743273567'}},
+                [title_test('The great gatsby', exact=True)]
+            ),
+        ])
 
 
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 142224c599..523d012cd5 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -9,8 +9,12 @@ __docformat__ = 'restructuredtext en'
 
 import re, threading
 
+from calibre import browser, random_user_agent
 from calibre.customize import Plugin
 from calibre.utils.logging import ThreadSafeLog, FileStream
+from calibre.utils.config import JSONConfig
+
+msprefs = JSONConfig('metadata_sources.json')
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -24,8 +28,6 @@ class Source(Plugin):
 
     supported_platforms = ['windows', 'osx', 'linux']
 
-    result_of_identify_is_complete = True
-
     capabilities = frozenset()
 
     touched_fields = frozenset()
@@ -34,6 +36,27 @@ class Source(Plugin):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
         self.cache_lock = threading.RLock()
+        self._config_obj = None
+        self._browser = None
+
+    # Configuration {{{
+
+    @property
+    def prefs(self):  # per-plugin JSONConfig keyed on self.name, created on first access and cached
+        if self._config_obj is None:
+            self._config_obj = JSONConfig('metadata_sources/%s.json'%self.name)
+        return self._config_obj
+    # }}}
+
+    # Browser {{{
+
+    @property
+    def browser(self):  # lazily built and cached; a property, so use self.browser (no call)
+        if self._browser is None:
+            self._browser = browser(user_agent=random_user_agent())
+        return self._browser
+
+    # }}}
 
     # Utility functions {{{
 
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index 0720b21ded..b7c300e933 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.date import parse_date, utcnow
 from calibre.utils.cleantext import clean_ascii_chars
-from calibre import browser, as_unicode
+from calibre import as_unicode
 
 NAMESPACES = {
               'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
@@ -150,7 +150,7 @@ class GoogleBooks(Source):
 
     def create_query(self, log, title=None, authors=None, identifiers={}):
         BASE_URL = 'http://books.google.com/books/feeds/volumes?'
-        isbn = identifiers.get('isbn', None)
+        isbn = check_isbn(identifiers.get('isbn', None))
         q = ''
         if isbn is not None:
             q += 'isbn:'+isbn
@@ -212,7 +212,7 @@ class GoogleBooks(Source):
             identifiers={}, timeout=5):
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
-        br = browser()
+        br = self.browser()
         try:
             raw = br.open_novisit(query, timeout=timeout).read()
         except Exception, e:

From 428ed899fcd5edb70ffc11bdb3aa63154e87aa75 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 14:51:22 -0600
Subject: [PATCH 7/9] Conversion pipeline: When detecting chapters/toc links
 from HTML normalize spaces and increase maximum TOC title length to 1000
 characters from 100 characters. Fixes #9363 (Shortening text on generating
 TOC.)

---
 src/calibre/ebooks/oeb/transforms/structure.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py
index fc338da692..0d8bdcdf2e 100644
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@@ -81,6 +81,7 @@ class DetectStructure(object):
             page_break_after = 'display: block; page-break-after: always'
             for item, elem in self.detected_chapters:
                 text = xml2text(elem).strip()
+                text = re.sub(r'\s+', ' ', text.strip())
                 self.log('\tDetected chapter:', text[:50])
                 if chapter_mark == 'none':
                     continue
@@ -137,7 +138,8 @@ class DetectStructure(object):
             text = elem.get('title', '')
         if not text:
             text = elem.get('alt', '')
-        text = text[:100].strip()
+        text = re.sub(r'\s+', ' ', text.strip())
+        text = text[:1000].strip()
         id = elem.get('id', 'calibre_toc_%d'%counter)
         elem.set('id', id)
         href = '#'.join((item.href, id))

From e937dccaa37118165c09336e57909b3164af8ddc Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 18:39:14 -0600
Subject: [PATCH 8/9] Disable automatic garbage collection, instead ensure
 garbage collection runs only in the GUI thread

---
 src/calibre/gui2/main_window.py | 67 ++++++++++++++++++++++++++++-----
 src/calibre/gui2/ui.py          |  9 +++--
 2 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/src/calibre/gui2/main_window.py b/src/calibre/gui2/main_window.py
index e068e851c2..ec58dd3856 100644
--- a/src/calibre/gui2/main_window.py
+++ b/src/calibre/gui2/main_window.py
@@ -1,10 +1,14 @@
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import StringIO, traceback, sys
 
-from PyQt4.Qt import QMainWindow, QString, Qt, QFont, QCoreApplication, SIGNAL,\
-                     QAction, QMenu, QMenuBar, QIcon, pyqtSignal
+import StringIO, traceback, sys, gc
+
+from PyQt4.Qt import QMainWindow, QString, Qt, QFont, QTimer, \
+                     QAction, QMenu, QMenuBar, QIcon, pyqtSignal, QObject
 from calibre.gui2.dialogs.conversion_error import ConversionErrorDialog
 from calibre.utils.config import OptionParser
 from calibre.gui2 import error_dialog
@@ -35,6 +39,53 @@ class DebugWindow(ConversionErrorDialog):
     def flush(self):
         pass
 
+class GarbageCollector(QObject):
+
+    '''
+    Disable automatic garbage collection and instead collect manually
+    every INTERVAL milliseconds.
+
+    This is done to ensure that garbage collection only happens in the GUI
+    thread, as otherwise Qt can crash.
+    '''
+
+    INTERVAL = 5000  # milliseconds between check() calls
+
+    def __init__(self, parent, debug=False):
+        QObject.__init__(self, parent)
+        self.debug = debug  # when true, check() prints counts and collection results
+
+        self.timer = QTimer(self)
+        self.timer.timeout.connect(self.check)
+
+        self.threshold = gc.get_threshold()  # saved before disabling; check() compares against it
+        gc.disable()
+        self.timer.start(self.INTERVAL)
+        #gc.set_debug(gc.DEBUG_SAVEALL)
+
+    def check(self):  # timer slot: collect each generation whose count exceeds its saved threshold
+        #return self.debug_cycles()
+        l0, l1, l2 = gc.get_count()  # counts are sampled once, before any collection below
+        if self.debug:
+            print ('gc_check called:', l0, l1, l2)
+        if l0 > self.threshold[0]:
+            num = gc.collect(0)
+            if self.debug:
+                print ('collecting gen 0, found:', num, 'unreachable')
+            if l1 > self.threshold[1]:  # older generations are only considered after gen 0 ran
+                num = gc.collect(1)
+                if self.debug:
+                    print ('collecting gen 1, found:', num, 'unreachable')
+                if l2 > self.threshold[2]:
+                    num = gc.collect(2)
+                    if self.debug:
+                        print ('collecting gen 2, found:', num, 'unreachable')
+
+    def debug_cycles(self):  # debugging aid: full collection, then print everything in gc.garbage
+        gc.collect()
+        for obj in gc.garbage:
+            print (obj, repr(obj), type(obj))
+
 class MainWindow(QMainWindow):
 
     ___menu_bar = None
@@ -64,19 +115,15 @@ class MainWindow(QMainWindow):
         quit_action.setMenuRole(QAction.QuitRole)
         return preferences_action, quit_action
 
-    def __init__(self, opts, parent=None):
+    def __init__(self, opts, parent=None, disable_automatic_gc=False):
         QMainWindow.__init__(self, parent)
-        app = QCoreApplication.instance()
-        if app is not None:
-            self.connect(app, SIGNAL('unixSignal(int)'), self.unix_signal)
+        if disable_automatic_gc:
+            self._gc = GarbageCollector(self, debug=False)
         if getattr(opts, 'redirect', False):
             self.__console_redirect = DebugWindow(self)
             sys.stdout = sys.stderr = self.__console_redirect
             self.__console_redirect.show()
 
-    def unix_signal(self, signal):
-        print 'Received signal:', repr(signal)
-
     def unhandled_exception(self, type, value, tb):
         if type == KeyboardInterrupt:
             self.keyboard_interrupt.emit()
diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py
index 6766635789..4af8c1ea54 100644
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
 
 '''The main GUI'''
 
-import collections, os, sys, textwrap, time
+import collections, os, sys, textwrap, time, gc
 from Queue import Queue, Empty
 from threading import Thread
 from PyQt4.Qt import Qt, SIGNAL, QTimer, QHelpEvent, QAction, \
@@ -95,7 +95,7 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
 
 
     def __init__(self, opts, parent=None, gui_debug=None):
-        MainWindow.__init__(self, opts, parent)
+        MainWindow.__init__(self, opts, parent=parent, disable_automatic_gc=True)
         self.opts = opts
         self.device_connected = None
         self.gui_debug = gui_debug
@@ -298,6 +298,9 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
                     raise
         self.device_manager.set_current_library_uuid(db.library_id)
 
+        # Collect cycles now
+        gc.collect()
+
         if show_gui and self.gui_debug is not None:
             info_dialog(self, _('Debug mode'), '<p>' +
                     _('You have started calibre in debug mode. After you '
@@ -399,6 +402,7 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
         elif msg.startswith('refreshdb:'):
             self.library_view.model().refresh()
             self.library_view.model().research()
+            self.tags_view.recount()
         else:
             print msg
 
@@ -465,7 +469,6 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
         self.device_manager.set_current_library_uuid(db.library_id)
         # Run a garbage collection now so that it does not freeze the
         # interface later
-        import gc
         gc.collect()
 
 

From 19581fbfc6daca006ccd60b19a6eb8abaadf8e77 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Mar 2011 21:03:03 -0600
Subject: [PATCH 9/9] Metadata sources: raise default network timeouts to 20 seconds

---
 src/calibre/ebooks/metadata/sources/amazon.py | 4 ++--
 src/calibre/ebooks/metadata/sources/google.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index cf09a88338..a62a9683cb 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -21,7 +21,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 
 class Worker(Thread):
 
-    def __init__(self, url, result_queue, browser, log, timeout=10):
+    def __init__(self, url, result_queue, browser, log, timeout=20):
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
         self.browser = browser.clone_browser()
@@ -137,7 +137,7 @@ class Amazon(Source):
 
 
     def identify(self, log, result_queue, abort, title=None, authors=None,
-            identifiers={}, timeout=10):
+            identifiers={}, timeout=20):
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
         if query is None:
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index b7c300e933..923062379e 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -209,7 +209,7 @@ class GoogleBooks(Source):
                 break
 
     def identify(self, log, result_queue, abort, title=None, authors=None,
-            identifiers={}, timeout=5):
+            identifiers={}, timeout=20):
         query = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
         br = self.browser()