diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 82c2519e29..bc88d473fd 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -350,20 +350,20 @@ def get_proxy_info(proxy_scheme, proxy_string):
 USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
 USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
 
-def random_user_agent():
+def random_user_agent(choose=None): # choose: index into choices; None means pick one at random
     choices = [
         'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
         'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
-        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0',
+        'Mozilla/5.0 (Windows NT 6.2; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
         'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3',
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.78 Safari/532.5',
         'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
     ]
-    #return choices[-1]
-    return choices[random.randint(0, len(choices)-1)]
+    if choose is None:
+        choose = random.randint(0, len(choices)-1) # randint includes both endpoints, hence the -1
+    return choices[choose]
 
 def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     '''
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 3d08b96c5f..cb724765f5 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -13,7 +13,7 @@ from threading import Thread
 from Queue import Queue, Empty
 
 
-from calibre import as_unicode
+from calibre import as_unicode, random_user_agent
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
         fixauthors)
@@ -174,8 +174,8 @@ class Worker(Thread): # Get details {{{
 
     def get_details(self):
         from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.utils.soupparser import fromstring
         from calibre.ebooks.chardet import xml_to_unicode
+        import html5lib
 
         try:
             raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
@@ -202,7 +202,8 @@ class Worker(Thread): # Get details {{{
             return
 
         try:
-            root = fromstring(clean_ascii_chars(raw))
+            root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
+                    namespaceHTMLElements=False)
         except:
             msg = 'Failed to parse amazon details page: %r'%self.url
             self.log.exception(msg)
@@ -356,33 +357,46 @@ class Worker(Thread): # Get details {{{
                 if m is not None:
                     return float(m.group(1))/float(m.group(3)) * 5
 
-    def parse_comments(self, root):
+    def _render_comments(self, desc): # desc: one description element; returns sanitize_comments_html output
         from calibre.library.comments import sanitize_comments_html
 
+        for c in desc.xpath('descendant::noscript'):
+            c.getparent().remove(c)
+        for c in desc.xpath('descendant::*[@class="seeAll" or'
+                ' @class="emptyClear" or @id="collapsePS" or'
+                ' @id="expandPS"]'):
+            c.getparent().remove(c)
+
+        for a in desc.xpath('descendant::a[@href]'):
+            del a.attrib['href']
+            a.tag = 'span' # links are flattened into plain spans
+        desc = self.tostring(desc, method='html', encoding=unicode).strip()
+
+        # Work around an encoding bug in Amazon's data: U+fffd (the
+        # replacement char) sometimes appears where ' was intended
+        desc = desc.replace('\ufffd', "'")
+        # remove all attributes from tags
+        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
+        # Collapse whitespace (currently disabled)
+        #desc = re.sub('\n+', '\n', desc)
+        #desc = re.sub(' +', ' ', desc)
+        # Remove the notice about text referring to out of print editions
+        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
+        # Remove comments
+        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
+        return sanitize_comments_html(desc)
+
+
+    def parse_comments(self, root):
+        ans = '' # stays empty when neither description block matches
+        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
+        if desc:
+            ans = self._render_comments(desc[0])
+
         desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
         if desc:
-            desc = desc[0]
-            for c in desc.xpath('descendant::*[@class="seeAll" or'
-                    ' @class="emptyClear"]'):
-                c.getparent().remove(c)
-            for a in desc.xpath('descendant::a[@href]'):
-                del a.attrib['href']
-                a.tag = 'span'
-            desc = self.tostring(desc, method='html', encoding=unicode).strip()
-
-            # Encoding bug in Amazon data U+fffd (replacement char)
-            # in some examples it is present in place of '
-            desc = desc.replace('\ufffd', "'")
-            # remove all attributes from tags
-            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
-            # Collapse whitespace
-            #desc = re.sub('\n+', '\n', desc)
-            #desc = re.sub(' +', ' ', desc)
-            # Remove the notice about text referring to out of print editions
-            desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
-            # Remove comments
-            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
-            return sanitize_comments_html(desc)
+            ans += self._render_comments(desc[0]) # appended after any ps-content text
+        return ans
 
     def parse_cover(self, root):
         imgs = root.xpath('//img[@id="prodImage" and @src]')
@@ -467,6 +481,28 @@ class Amazon(Source):
         Source.__init__(self, *args, **kwargs)
         self.set_amazon_id_touched_fields()
 
+    def test_fields(self, mi):
+        '''
+        Return the first field from self.touched_fields that is missing or
+        null on the mi object; falls through to None when all are present
+        '''
+        for key in self.touched_fields:
+            if key.startswith('identifier:'):
+                key = key.partition(':')[-1] # keep only the identifier type
+                if key == 'amazon':
+                    if self.domain != 'com':
+                        key += '_' + self.domain # non-com domains check amazon_<domain>
+                if not mi.has_identifier(key):
+                    return 'identifier: ' + key
+            elif mi.is_null(key):
+                return key
+
+    @property
+    def user_agent(self):
+        # Re-randomized on every access. Pass an index to random_user_agent()
+        # here to test with a particular user agent
+        return random_user_agent()
+
     def save_settings(self, *args, **kwargs):
         Source.save_settings(self, *args, **kwargs)
         self.set_amazon_id_touched_fields()
@@ -507,6 +543,9 @@ class Amazon(Source):
 
     @property
     def domain(self):
+        x = getattr(self, 'testing_domain', None)
+        if x is not None:
+            return x
         domain = self.prefs['domain']
         if domain not in self.AMAZON_DOMAINS:
             domain = 'com'
@@ -599,16 +638,52 @@ class Amazon(Source):
         return url
     # }}}
 
+    def parse_results_page(self, root): # {{{
+        ''' Return up to 5 hrefs of acceptable titles found in the parsed results page root '''
+        from lxml.html import tostring
+
+        matches = []
+
+        def title_ok(title):
+            title = title.lower()
+            for x in ('bulk pack', '[audiobook]', '[audio cd]'):
+                if x in title:
+                    return False
+            return True
+
+        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                title = tostring(a, method='text', encoding=unicode)
+                if title_ok(title):
+                    matches.append(a.get('href'))
+                break # only the first title link of each result is examined
+
+        if not matches:
+            # This can happen for some user agents that Amazon thinks are
+            # mobile/less capable
+            for td in root.xpath(
+                r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
+                for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
+                    title = tostring(a, method='text', encoding=unicode)
+                    if title_ok(title):
+                        matches.append(a.get('href'))
+                    break # only the first title link of each cell is examined
+
+        # Keep only the top 5 matches as the matches are sorted by relevance by
+        # Amazon so lower matches are not likely to be very relevant
+        return matches[:5]
+    # }}}
+
     def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
             identifiers={}, timeout=30):
         '''
         Note this method will retry without identifiers automatically if no
         match is found with identifiers.
         '''
-        from lxml.html import tostring
         from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.utils.soupparser import fromstring
         from calibre.ebooks.chardet import xml_to_unicode
+        from lxml.html import tostring
+        import html5lib
 
         query, domain = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
@@ -616,6 +691,8 @@ class Amazon(Source):
             log.error('Insufficient metadata to construct query')
             return
         br = self.browser
+        if getattr(self, 'running_a_test', False): # self.user_agent re-randomizes per access, so report br's header
+            print ('Using user agent for amazon: %s'%dict((k.lower(), v) for k, v in br.addheaders).get('user-agent'))
         try:
             raw = br.open_novisit(query, timeout=timeout).read().strip()
         except Exception as e:
@@ -634,15 +711,23 @@ class Amazon(Source):
             return as_unicode(msg)
 
 
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-                resolve_entities=True)[0]
+        raw = clean_ascii_chars(xml_to_unicode(raw,
+            strip_encoding_pats=True, resolve_entities=True)[0])
+
+        if getattr(self, 'running_a_test', False):
+            import tempfile
+            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
+                    suffix='.html', delete=False) as f:
+                f.write(raw.encode('utf-8'))
+            print ('Downloaded html for results page saved in', f.name)
 
         matches = []
         found = '<title>404 - ' not in raw
 
         if found:
             try:
-                root = fromstring(clean_ascii_chars(raw))
+                root = html5lib.parse(raw, treebuilder='lxml',
+                        namespaceHTMLElements=False)
             except:
                 msg = 'Failed to parse amazon page for query: %r'%query
                 log.exception(msg)
@@ -655,30 +740,9 @@ class Amazon(Source):
                     # The error is almost always a not found error
                     found = False
 
+
         if found:
-            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
-                for a in div.xpath(r'descendant::a[@class="title" and @href]'):
-                    title = tostring(a, method='text', encoding=unicode).lower()
-                    if 'bulk pack' not in title:
-                        matches.append(a.get('href'))
-                    break
-            if not matches:
-                # This can happen for some user agents that Amazon thinks are
-                # mobile/less capable
-                log('Trying alternate results page markup')
-                for td in root.xpath(
-                    r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
-                    for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
-                        title = tostring(a, method='text', encoding=unicode).lower()
-                        if ('bulk pack' not in title and '[audiobook]' not in
-                                title and '[audio cd]' not in title):
-                            matches.append(a.get('href'))
-                        break
-
-
-        # Keep only the top 5 matches as the matches are sorted by relevance by
-        # Amazon so lower matches are not likely to be very relevant
-        matches = matches[:5]
+            matches = self.parse_results_page(root)
 
         if abort.is_set():
             return
@@ -686,7 +750,7 @@ class Amazon(Source):
         if not matches:
             if identifiers and title and authors:
                 log('No matches found with identifiers, retrying using only'
-                        ' title and authors')
+                        ' title and authors. Query: %r'%query)
                 return self.identify(log, result_queue, abort, title=title,
                         authors=authors, timeout=timeout)
             log.error('No matches found with query: %r'%query)
@@ -756,9 +820,18 @@ if __name__ == '__main__': # tests {{{
     # To run these test use: calibre-debug -e
     # src/calibre/ebooks/metadata/sources/amazon.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test, title_test, authors_test)
+            isbn_test, title_test, authors_test, comments_test)
     com_tests = [ # {{{
 
+            ( # Different comments markup, using Book Description section
+                {'identifiers':{'amazon':'0982514506'}},
+                [title_test(
+                "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy"
+                , exact=True),
+                comments_test('Jelena'), comments_test('Leslie'),
+                ]
+            ),
+
             ( # # in title
                 {'title':'Expert C# 2008 Business Objects',
                     'authors':['Lhotka']},
@@ -850,7 +923,17 @@ if __name__ == '__main__': # tests {{{
             ),
     ] # }}}
 
-    test_identify_plugin(Amazon.name, com_tests)
-    #test_identify_plugin(Amazon.name, de_tests)
+    def do_test(domain, start=0, stop=None): # run tests[start:stop] of one domain's test list
+        tests = globals().get(domain+'_tests') # module-level list named e.g. com_tests
+        if stop is None:
+            stop = len(tests)
+        tests = tests[start:stop]
+        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda
+                p:setattr(p, 'testing_domain', domain)) # testing_domain is read by Amazon.domain
+
+    do_test('com')
+
+    #do_test('de')
+
 # }}}
 
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 4c334f4e46..4408bff6c6 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -253,10 +253,16 @@ class Source(Plugin):
 
     # Browser {{{
 
+    @property
+    def user_agent(self):
+        # Re-randomized on every access. Pass an index to random_user_agent()
+        # here to test with a particular user agent
+        return random_user_agent()
+
     @property
     def browser(self):
         if self._browser is None:
-            self._browser = browser(user_agent=random_user_agent())
+            self._browser = browser(user_agent=self.user_agent)
             if self.supports_gzip_transfer_encoding:
                 self._browser.set_handle_gzip(True)
         return self._browser.clone_browser()
diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py
index bccce3dba2..4853035b27 100644
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@@ -84,6 +84,16 @@ def series_test(series, series_index):
 
     return test
 
+def comments_test(sentinel):
+    ''' Build a test that passes when sentinel occurs, ignoring case, in mi.comments '''
+    def test(mi):
+        comm = mi.comments.lower() if mi.comments else ''
+        if sentinel and sentinel.lower() in comm: # an empty sentinel never passes
+            return True
+        prints('comments test failed. %s not in comments'%sentinel)
+        return False
+    return test
+
 def init_test(tdir_name):
     tdir = tempfile.gettempdir()
     lf = os.path.join(tdir, tdir_name.replace(' ', '')+'_identify_test.txt')
@@ -157,7 +167,7 @@ def test_identify(tests): # {{{
 
 # }}}
 
-def test_identify_plugin(name, tests): # {{{
+def test_identify_plugin(name, tests, modify_plugin=lambda plugin:None): # {{{
     '''
     :param name: Plugin name
     :param tests: List of 2-tuples. Each two tuple is of the form (args,
@@ -171,6 +181,7 @@ def test_identify_plugin(name, tests): # {{{
         if x.name == name:
             plugin = x
             break
+    modify_plugin(plugin)
     prints('Testing the identify function of', plugin.name)
     prints('Using extra headers:', plugin.browser.addheaders)
 
diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 5d6c43f343..0f45a47032 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -136,6 +136,7 @@ def sanitize_comments_html(html):
     text = html2text(html)
     md = markdown.Markdown(safe_mode=True)
     cleansed = re.sub('\n+', '', md.convert(text))
+    cleansed = cleansed.replace(markdown.HTML_REMOVED_TEXT, '')
     return cleansed
 
 def test():