From 4e8b9c5c0cdc8b571f05e8f4cbea12148b75e5e5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 1 Mar 2017 23:19:02 +0530
Subject: [PATCH] A spot of refactoring

---
 src/calibre/ebooks/metadata/sources/amazon.py | 117 +++++++++---------
 1 file changed, 60 insertions(+), 57 deletions(-)
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index f3430ce4e7..d5234df60d 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -25,6 +25,10 @@ class CaptchaError(Exception):
     pass
 
 
+class SearchFailed(ValueError):
+    pass  # raised by Amazon.search_amazon() after logging; identify() catches it and returns
+
+
 ua_index = -1
 
 
@@ -1097,16 +1101,65 @@ class Amazon(Source):
         return matches[:3]
     # }}}
 
+    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):
+        '''
+        Run a single search against Amazon and parse the results page.
+
+        Returns (matches, query, domain); matches is empty for a 404 results
+        page. Raises SearchFailed, after logging the reason, when no query can
+        be built, the request fails or the page cannot be parsed.
+        '''
+        import html5lib
+        from calibre.utils.cleantext import clean_ascii_chars
+        from calibre.ebooks.chardet import xml_to_unicode
+        query, domain = self.create_query(log, title=title, authors=authors,
+                identifiers=identifiers)
+        if query is None:
+            log.error('Insufficient metadata to construct query')
+            raise SearchFailed()
+        try:
+            raw = br.open_novisit(query, timeout=timeout).read().strip()
+        except Exception as e:
+            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
+                log.error('Query malformed: %r'%query)
+                raise SearchFailed()
+            # A timeout is recognised by a socket.timeout in the first exception arg
+            attr = getattr(e, 'args', None) or [None]
+            if isinstance(attr[0], socket.timeout):
+                log.error(_('Amazon timed out. Try again later.'))
+            else:
+                log.exception('Failed to make identify query: %r'%query)
+            raise SearchFailed()
+
+        raw = clean_ascii_chars(xml_to_unicode(raw,
+            strip_encoding_pats=True, resolve_entities=True)[0])
+
+        if testing:
+            import tempfile
+            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
+                    suffix='.html', delete=False) as f:
+                f.write(raw.encode('utf-8'))
+            print ('Downloaded html for results page saved in', f.name)
+
+        if '<title>404 - ' in raw:
+            # Not-found page: no root is parsed, so report zero matches
+            return [], query, domain
+
+        try:
+            root = html5lib.parse(raw, treebuilder='lxml',
+                    namespaceHTMLElements=False)
+        except Exception:
+            log.exception('Failed to parse amazon page for query: %r'%query)
+            raise SearchFailed()
+
+        return self.parse_results_page(root, domain), query, domain
+
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
         '''
         Note this method will retry without identifiers automatically if no
         match is found with identifiers.
         '''
-        from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.ebooks.chardet import xml_to_unicode
-        from lxml.html import tostring
-        import html5lib
 
         testing = getattr(self, 'running_a_test', False)
 
@@ -1127,60 +1180,10 @@ class Amazon(Source):
                         return
                     except Exception:
                         log.exception('get_details failed for url: %r'%durl)
-
-        query, domain = self.create_query(log, title=title, authors=authors,
-                identifiers=identifiers)
-        if query is None:
-            log.error('Insufficient metadata to construct query')
-            return
         try:
-            raw = br.open_novisit(query, timeout=timeout).read().strip()
-        except Exception as e:
-            if callable(getattr(e, 'getcode', None)) and \
-                    e.getcode() == 404:
-                log.error('Query malformed: %r'%query)
-                return
-            attr = getattr(e, 'args', [None])
-            attr = attr if attr else [None]
-            if isinstance(attr[0], socket.timeout):
-                msg = _('Amazon timed out. Try again later.')
-                log.error(msg)
-            else:
-                msg = 'Failed to make identify query: %r'%query
-                log.exception(msg)
-            return as_unicode(msg)
-
-        raw = clean_ascii_chars(xml_to_unicode(raw,
-            strip_encoding_pats=True, resolve_entities=True)[0])
-
-        if testing:
-            import tempfile
-            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
-                    suffix='.html', delete=False) as f:
-                f.write(raw.encode('utf-8'))
-            print ('Downloaded html for results page saved in', f.name)
-
-        matches = []
-        found = '<title>404 - ' not in raw
-
-        if found:
-            try:
-                root = html5lib.parse(raw, treebuilder='lxml',
-                        namespaceHTMLElements=False)
-            except:
-                msg = 'Failed to parse amazon page for query: %r'%query
-                log.exception(msg)
-                return msg
-
-                errmsg = root.xpath('//*[@id="errorMessage"]')
-                if errmsg:
-                    msg = tostring(errmsg, method='text', encoding=unicode).strip()
-                    log.error(msg)
-                    # The error is almost always a not found error
-                    found = False
-
-        if found:
-            matches = self.parse_results_page(root, domain)
+            matches, query, domain = self.search_amazon(br, testing, log, abort, title, authors, identifiers, timeout)
+        except SearchFailed:
+            return
 
         if abort.is_set():
             return