Metadata compatibility

2025-07-09 03:04:10 -04:00 · 2011-03-09 22:21:02 +01:00 · 2011-03-09 22:21:02 +01:00 · 888aaec88f
commit 888aaec88f
parent e3ec837fd1
3 changed files with 52 additions and 234 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -580,12 +580,12 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
 from calibre.devices.kobo.driver import KOBO
 from calibre.devices.bambook.driver import BAMBOOK

-from calibre.ebooks.metadata.fetch import KentDistrictLibrary
+from calibre.ebooks.metadata.fetch import KentDistrictLibrary, Amazon
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.isbndb import ISBNDB
 from calibre.ebooks.metadata.google_books import GoogleBooks
 from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
-from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial
+# from calibre.ebooks.metadata.amazon import Amazon , AmazonSocial
 from calibre.ebooks.metadata.fictionwise import Fictionwise
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
        AmazonCovers, DoubanCovers, LibrarythingCovers
@ -593,7 +593,7 @@ from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck

-plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, AmazonSocial,
+plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, #AmazonSocial,
        KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
        Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers,
        NiceBooksCovers]
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@ -1,7 +1,11 @@
-from __future__ import with_statement
-__license__ = 'GPL 3'
-__copyright__ = '2010, sengian <sengian1@gmail.com>'
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'

+'''
+Fetch metadata using Amazon AWS
+'''
 import sys, re
 from threading import RLock

@ -12,10 +16,6 @@ from calibre import browser
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.metadata import MetaInformation, check_isbn, \
-    authors_to_sort_string
-from calibre.ebooks.metadata.fetch import MetadataSource
-from calibre.utils.config import OptionParser
 from calibre.library.comments import sanitize_comments_html

 asin_cache = {}
@ -160,229 +160,31 @@ def get_metadata(br, asin, mi):
            m = pat.match(t)
            if m is not None:
                try:
-                    default = utcnow().replace(day=15)
-                    if self.lang != 'all':
-                        d = replace_months(d, self.lang)
-                    d = parse_date(d, assume_utc=True, default=default)
-                    mi.pubdate = d
+                    mi.rating = float(m.group(1))/float(m.group(2)) * 5
+                    break
                except:
-                    report(verbose)
-        #ISBN
-        elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts)
-        if elt:
-            isbn = elt[0].find('b').tail.replace('-', '').strip()
-            if check_isbn(isbn):
-                    mi.isbn = unicode(isbn)
-            elif len(elt) > 1:
-                isbnone = elt[1].find('b').tail.replace('-', '').strip()
-                if check_isbn(isbnone):
-                    mi.isbn = unicode(isbnone)
-            else:
-                #assume ASIN-> find a check for asin
-                mi.isbn = unicode(isbn)
-        #Langue
-        elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
-        if elt:
-            langue = elt[0].find('b').tail.strip()
-            if langue:
-                mi.language = unicode(langue)
-        #ratings
-        elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts)
-        if elt:
-            ratings = elt[0].find_class('swSprite')
-            if ratings:
-                ratings = self.rerat.findall(ratings[0].get('title'))
-                if len(ratings) == 2:
-                    mi.rating = float(ratings[0])/float(ratings[1]) * 5
-        return mi
+                    pass

-    def fill_MI(self, entry, verbose):
-        try:
-            title = self.get_title(entry)
-            authors = self.get_authors(entry)
-        except Exception, e:
-            if verbose:
-                print _('Failed to get all details for an entry')
-                print e
-                print _('URL who failed: %s') % x
-                report(verbose)
-            return None
-        mi = MetaInformation(title, authors)
-        mi.author_sort = authors_to_sort_string(authors)
-        try:
-            mi.comments = self.get_description(entry, verbose)
-            mi = self.get_book_info(entry, mi, verbose)
-        except:
-            pass
-        return mi
+    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
+    if desc:
+        desc = desc[0]
+        for c in desc.xpath('descendant::*[@class="seeAll" or'
+                ' @class="emptyClear" or @href]'):
+            c.getparent().remove(c)
+        desc = html.tostring(desc, method='html', encoding=unicode).strip()
+        # remove all attributes from tags
+        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
+        # Collapse whitespace
+        #desc = re.sub('\n+', '\n', desc)
+        #desc = re.sub(' +', ' ', desc)
+        # Remove the notice about text referring to out of print editions
+        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
+        # Remove comments
+        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
+        mi.comments = sanitize_comments_html(desc)

-    def get_individual_metadata(self, url, br, verbose):
-        try:
-            raw = br.open_novisit(url).read()
-        except Exception, e:
-            import socket
-            report(verbose)
-            if callable(getattr(e, 'getcode', None)) and \
-                    e.getcode() == 404:
-                return None
-            attr = getattr(e, 'args', [None])
-            attr = attr if attr else [None]
-            if isinstance(attr[0], socket.timeout):
-                raise AmazonError(_('Amazon timed out. Try again later.'))
-            raise AmazonError(_('Amazon encountered an error.'))
-        if '<title>404 - ' in raw:
-            report(verbose)
-            return None
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-                resolve_entities=True)[0]
-        try:
-            return soupparser.fromstring(raw)
-        except:
-            try:
-                #remove ASCII invalid chars
-                return soupparser.fromstring(clean_ascii_chars(raw))
-            except:
-                report(verbose)
-                return None
+    return True

-    def fetchdatathread(self, qbr, qsync, nb, url, verbose):
-        try:
-            browser = qbr.get(True)
-            entry = self.get_individual_metadata(url, browser, verbose)
-        except:
-            report(verbose)
-            entry = None
-        finally:
-            qbr.put(browser, True)
-            qsync.put((nb, entry), True)
-
-    def producer(self, sync, urls, br, verbose=False):
-        for i in xrange(len(urls)):
-            thread = Thread(target=self.fetchdatathread, 
-                        args=(br, sync, i, urls[i], verbose))
-            thread.start()
-
-    def consumer(self, sync, syncbis, br, total_entries, verbose=False):
-        i=0
-        self.extend([None]*total_entries)
-        while i < total_entries:
-            rq = sync.get(True)
-            nb = int(rq[0])
-            entry = rq[1]
-            i+=1
-            if entry is not None:
-                mi = self.fill_MI(entry, verbose)
-                if mi is not None:
-                    mi.tags, atag = self.get_tags(entry, verbose)
-                    self[nb] = mi
-                    if atag:
-                        thread = Thread(target=self.fetchdatathread, 
-                                args=(br, syncbis, nb, mi.tags, verbose))
-                        thread.start()
-                    else:
-                        syncbis.put((nb, None), True)
-
-    def final(self, sync, total_entries, verbose):
-        i=0
-        while i < total_entries:
-            rq = sync.get(True)
-            nb = int(rq[0])
-            tags = rq[1]
-            i+=1
-            if tags is not None:
-                self[nb].tags = self.get_tags(tags, verbose)[0]
-
-    def populate(self, entries, ibr, verbose=False, brcall=3):
-        br = Queue(brcall)
-        cbr = Queue(brcall-1)
-        
-        syncp = Queue(1)
-        syncc = Queue(1)
-        
-        for i in xrange(brcall-1):
-            br.put(browser(), True)
-            cbr.put(browser(), True)
-        br.put(ibr, True)
-        
-        prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose))
-        cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose))
-        fin_thread = Thread(target=self.final, args=(syncc, len(entries), verbose))
-        prod_thread.start()
-        cons_thread.start()
-        fin_thread.start()
-        prod_thread.join()
-        cons_thread.join()
-        fin_thread.join()
-
-
-def search(title=None, author=None, publisher=None, isbn=None,
-           max_results=5, verbose=False, keywords=None, lang='all'):
-    br = browser()
-    entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher,
-        keywords=keywords, max_results=max_results,rlang=lang)(br, verbose)
-
-    if entries is None or len(entries) == 0:
-        return None
-
-    #List of entry
-    ans = ResultList(baseurl, lang)
-    ans.populate(entries, br, verbose)
-    return [x for x in ans if x is not None]
-
-def get_social_metadata(title, authors, publisher, isbn, verbose=False,
-        max_results=1, lang='all'):
-    mi = MetaInformation(title, authors)
-    if not isbn or not check_isbn(isbn):
-        return [mi]
-
-    amazresults = search(isbn=isbn, verbose=verbose,
-                max_results=max_results, lang=lang)
-    if amazresults is None or amazresults[0] is None:
-        from calibre.ebooks.metadata.xisbn import xisbn
-        for i in xisbn.get_associated_isbns(isbn):
-            amazresults = search(isbn=i, verbose=verbose,
-                max_results=max_results, lang=lang)
-            if amazresults is not None and amazresults[0] is not None:
-                break
-    if amazresults is None or amazresults[0] is None:
-        return [mi]
-    
-    miaz = amazresults[0]
-    if miaz.rating is not None:
-        mi.rating = miaz.rating
-    if miaz.comments is not None:
-        mi.comments = miaz.comments
-    if miaz.tags is not None:
-        mi.tags = miaz.tags
-    return [mi]
-
-def option_parser():
-    import textwrap
-    parser = OptionParser(textwrap.dedent(\
-    _('''\
-        %prog [options]
-
-        Fetch book metadata from Amazon. You must specify one of title, author,
-        ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
-        so you should make your query as specific as possible.
-        You can chose the language for metadata retrieval:
-        english & french & german
-    '''
-    )))
-    parser.add_option('-t', '--title', help=_('Book title'))
-    parser.add_option('-a', '--author', help=_('Book author(s)'))
-    parser.add_option('-p', '--publisher', help=_('Book publisher'))
-    parser.add_option('-i', '--isbn', help=_('Book ISBN'))
-    parser.add_option('-k', '--keywords', help=_('Keywords'))
-    parser.add_option('-s', '--social', default=0, action='count',
-                      help=_('Get social data only'))
-    parser.add_option('-m', '--max-results', default=10,
-                      help=_('Maximum number of results to fetch'))
-    parser.add_option('-l', '--lang', default='all',
-                      help=_('Chosen language for metadata search (en, fr, de)'))
-    parser.add_option('-v', '--verbose', default=0, action='count',
-                      help=_('Be more verbose about errors'))
-    return parser

 def main(args=sys.argv):
    import tempfile, os
@ -412,8 +214,3 @@ def main(args=sys.argv):

 if __name__ == '__main__':
    sys.exit(main())
-    # import cProfile
-    # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()"))
-    # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile"))
-
-# calibre-debug -e "D:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@ -212,6 +212,27 @@ class MetadataSource(Plugin): # {{{

    # }}}

+class Amazon(MetadataSource): # {{{
+    # Metadata source plugin that fetches social metadata from Amazon, keyed on the book's ISBN.
+    name = 'Amazon'
+    metadata_type = 'social'
+    description = _('Downloads social metadata from amazon.com')
+    # NOTE(review): presumably signals that returned comments are HTML -- confirm against MetadataSource.
+    has_html_comments = True
+
+    def fetch(self):
+        if not self.isbn:  # no ISBN: return without setting self.results
+            return
+        from calibre.ebooks.metadata.amazon import get_social_metadata  # function-level import, only reached when an ISBN is present
+        try:
+            self.results = get_social_metadata(self.title, self.book_author,
+                    self.publisher, self.isbn)
+        except Exception, e:  # failures are stored on the instance rather than raised
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+    # }}}
+
 class KentDistrictLibrary(MetadataSource): # {{{

    name = 'Kent District Library'