Metadata compatibility

2025-07-09 03:04:10 -04:00 · 2011-03-09 22:21:02 +01:00 · 2011-03-09 22:21:02 +01:00 · 888aaec88f
commit 888aaec88f
parent e3ec837fd1
3 changed files with 52 additions and 234 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -580,12 +580,12 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
 from calibre.devices.kobo.driver import KOBO
 from calibre.devices.bambook.driver import BAMBOOK
-from calibre.ebooks.metadata.fetch import KentDistrictLibrary
+from calibre.ebooks.metadata.fetch import KentDistrictLibrary, Amazon
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.isbndb import ISBNDB
 from calibre.ebooks.metadata.google_books import GoogleBooks
 from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
-from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial
+# from calibre.ebooks.metadata.amazon import Amazon , AmazonSocial
 from calibre.ebooks.metadata.fictionwise import Fictionwise
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
        AmazonCovers, DoubanCovers, LibrarythingCovers
@ -593,7 +593,7 @@ from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
-plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, AmazonSocial,
+plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, #AmazonSocial,
        KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
        Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers,
        NiceBooksCovers]
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@ -1,7 +1,11 @@
-from __future__ import with_statement
+#!/usr/bin/env  python
-__license__ = 'GPL 3'
+__license__   = 'GPL v3'
-__copyright__ = '2010, sengian <sengian1@gmail.com>'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 Fetch metadata using Amazon AWS
 '''
 import sys, re
 from threading import RLock
@ -12,10 +16,6 @@ from calibre import browser
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata import MetaInformation, check_isbn, \
    authors_to_sort_string
 from calibre.ebooks.metadata.fetch import MetadataSource
 from calibre.utils.config import OptionParser
 from calibre.library.comments import sanitize_comments_html
 asin_cache = {}
@ -160,229 +160,31 @@ def get_metadata(br, asin, mi):
            m = pat.match(t)
            if m is not None:
                try:
-                    default = utcnow().replace(day=15)
+                    mi.rating = float(m.group(1))/float(m.group(2)) * 5
-                    if self.lang != 'all':
+                    break
                        d = replace_months(d, self.lang)
                    d = parse_date(d, assume_utc=True, default=default)
                    mi.pubdate = d
                except:
                    report(verbose)
        #ISBN
        elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts)
        if elt:
            isbn = elt[0].find('b').tail.replace('-', '').strip()
            if check_isbn(isbn):
                    mi.isbn = unicode(isbn)
            elif len(elt) > 1:
                isbnone = elt[1].find('b').tail.replace('-', '').strip()
                if check_isbn(isbnone):
                    mi.isbn = unicode(isbnone)
            else:
                #assume ASIN-> find a check for asin
                mi.isbn = unicode(isbn)
        #Langue
        elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
        if elt:
            langue = elt[0].find('b').tail.strip()
            if langue:
                mi.language = unicode(langue)
        #ratings
        elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts)
        if elt:
            ratings = elt[0].find_class('swSprite')
            if ratings:
                ratings = self.rerat.findall(ratings[0].get('title'))
                if len(ratings) == 2:
                    mi.rating = float(ratings[0])/float(ratings[1]) * 5
        return mi
    def fill_MI(self, entry, verbose):
        try:
            title = self.get_title(entry)
            authors = self.get_authors(entry)
        except Exception, e:
            if verbose:
                print _('Failed to get all details for an entry')
                print e
                print _('URL who failed: %s') % x
                report(verbose)
            return None
        mi = MetaInformation(title, authors)
        mi.author_sort = authors_to_sort_string(authors)
        try:
            mi.comments = self.get_description(entry, verbose)
            mi = self.get_book_info(entry, mi, verbose)
                except:
                    pass
        return mi
-    def get_individual_metadata(self, url, br, verbose):
+    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
-        try:
+    if desc:
-            raw = br.open_novisit(url).read()
+        desc = desc[0]
-        except Exception, e:
+        for c in desc.xpath('descendant::*[@class="seeAll" or'
-            import socket
+                ' @class="emptyClear" or @href]'):
-            report(verbose)
+            c.getparent().remove(c)
-            if callable(getattr(e, 'getcode', None)) and \
+        desc = html.tostring(desc, method='html', encoding=unicode).strip()
-                    e.getcode() == 404:
+        # remove all attributes from tags
-                return None
+        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
-            attr = getattr(e, 'args', [None])
+        # Collapse whitespace
-            attr = attr if attr else [None]
+        #desc = re.sub('\n+', '\n', desc)
-            if isinstance(attr[0], socket.timeout):
+        #desc = re.sub(' +', ' ', desc)
-                raise AmazonError(_('Amazon timed out. Try again later.'))
+        # Remove the notice about text referring to out of print editions
-            raise AmazonError(_('Amazon encountered an error.'))
+        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
-        if '<title>404 - ' in raw:
+        # Remove comments
-            report(verbose)
+        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
-            return None
+        mi.comments = sanitize_comments_html(desc)
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            return soupparser.fromstring(raw)
        except:
            try:
                #remove ASCII invalid chars
                return soupparser.fromstring(clean_ascii_chars(raw))
            except:
                report(verbose)
                return None
-    def fetchdatathread(self, qbr, qsync, nb, url, verbose):
+    return True
        try:
            browser = qbr.get(True)
            entry = self.get_individual_metadata(url, browser, verbose)
        except:
            report(verbose)
            entry = None
        finally:
            qbr.put(browser, True)
            qsync.put((nb, entry), True)
    def producer(self, sync, urls, br, verbose=False):
        '''
        Start one worker thread per URL.

        Every thread runs ``self.fetchdatathread`` and receives ``br``,
        ``sync``, the position of the URL inside ``urls``, the URL itself and
        ``verbose``. The threads are started but not joined here.

        :param sync: passed to each worker as its second argument
        :param urls: sequence of URLs; one thread is started per item
        :param br: passed to each worker as its first argument
        :param verbose: forwarded unchanged to each worker
        '''
        for position, url in enumerate(urls):
            worker = Thread(target=self.fetchdatathread,
                        args=(br, sync, position, url, verbose))
            worker.start()
    def consumer(self, sync, syncbis, br, total_entries, verbose=False):
        '''
        Turn fetched entries into metadata objects stored in ``self``.

        ``self`` is first extended with ``total_entries`` ``None``
        placeholders. Then ``total_entries`` ``(index, entry)`` pairs are read
        from ``sync``. For each usable entry the result of ``self.fill_MI`` is
        stored at ``self[index]`` and its tags are taken from
        ``self.get_tags``.

        Exactly one message per entry ends up on ``syncbis``: either posted
        here as ``(index, None)`` or posted by the ``fetchdatathread`` worker
        started for the entry. Entries that could not be fetched
        (``entry is None``) or converted (``fill_MI`` returned ``None``) are
        reported as ``(index, None)`` too. ``final`` reads ``total_entries``
        messages from that queue and would block forever if one were missing.

        :param sync: queue delivering ``(index, entry)`` pairs
        :param syncbis: queue receiving one ``(index, tags-or-None)`` per entry
        :param br: passed to ``fetchdatathread`` as its first argument
        :param total_entries: number of pairs to read from ``sync``
        :param verbose: forwarded to the helper methods
        '''
        self.extend([None]*total_entries)
        for _count in range(total_entries):
            rq = sync.get(True)
            nb = int(rq[0])
            entry = rq[1]
            mi = None
            if entry is not None:
                mi = self.fill_MI(entry, verbose)
            if mi is None:
                # Nothing to store for this slot, but the reader of syncbis
                # still expects a message for it.
                syncbis.put((nb, None), True)
                continue
            mi.tags, atag = self.get_tags(entry, verbose)
            self[nb] = mi
            if atag:
                # mi.tags is handed to the worker as its url argument; the
                # worker is then responsible for the message on syncbis.
                thread = Thread(target=self.fetchdatathread,
                        args=(br, syncbis, nb, mi.tags, verbose))
                thread.start()
            else:
                syncbis.put((nb, None), True)
    def final(self, sync, total_entries, verbose):
        '''
        Read ``total_entries`` ``(index, tags)`` messages from ``sync``.

        For every message whose second item is not ``None``, the first value
        returned by ``self.get_tags`` for that item replaces
        ``self[index].tags``. Messages carrying ``None`` are only counted.
        '''
        for _pending in range(total_entries):
            message = sync.get(True)
            position, tag_source = int(message[0]), message[1]
            if tag_source is None:
                continue
            self[position].tags = self.get_tags(tag_source, verbose)[0]
    def populate(self, entries, ibr, verbose=False, brcall=3):
        '''
        Run the producer / consumer / final pipeline over ``entries``.

        Two browser queues are created: one of size ``brcall``, filled with
        ``brcall-1`` new browsers plus ``ibr``, and one of size ``brcall-1``
        holding only new browsers. Two hand-over queues of size 1 connect the
        three stages. Each stage runs in its own thread, and the call returns
        once all three threads have finished.
        '''
        fetch_browsers = Queue(brcall)
        tag_browsers = Queue(brcall-1)
        fetched = Queue(1)
        tagged = Queue(1)
        for _slot in range(brcall-1):
            fetch_browsers.put(browser(), True)
            tag_browsers.put(browser(), True)
        # The caller's browser takes the last free slot of the first queue.
        fetch_browsers.put(ibr, True)
        stages = [
            Thread(target=self.producer,
                args=(fetched, entries, fetch_browsers, verbose)),
            Thread(target=self.consumer,
                args=(fetched, tagged, tag_browsers, len(entries), verbose)),
            Thread(target=self.final,
                args=(tagged, len(entries), verbose)),
        ]
        for stage in stages:
            stage.start()
        for stage in stages:
            stage.join()
 def search(title=None, author=None, publisher=None, isbn=None,
           max_results=5, verbose=False, keywords=None, lang='all'):
    '''
    Run a query built from the given criteria and return the metadata found.

    :return: ``None`` when the query yields no entries, otherwise the list of
             populated results with ``None`` placeholders removed (which may
             be empty)
    '''
    session = browser()
    query = Query(title=title, author=author, isbn=isbn, publisher=publisher,
        keywords=keywords, max_results=max_results, rlang=lang)
    entries, baseurl = query(session, verbose)
    if entries is None or len(entries) < 1:
        return None
    # Container for the results, filled in place by populate()
    results = ResultList(baseurl, lang)
    results.populate(entries, session, verbose)
    return [found for found in results if found is not None]
 def get_social_metadata(title, authors, publisher, isbn, verbose=False,
        max_results=1, lang='all'):
    '''
    Return a one-element list holding a record for ``title``/``authors``,
    enriched with rating, comments and tags when a match is found.

    The lookup is done by ISBN only. If ``isbn`` is empty or fails
    ``check_isbn`` the bare record is returned. If the search for ``isbn``
    yields nothing, the ISBNs reported by ``xisbn.get_associated_isbns`` are
    tried in turn until one yields a result.

    ``publisher`` is accepted but not used by this function.

    :return: ``[mi]``; only the attributes that are not ``None`` on the first
             search result are copied onto ``mi``
    '''
    mi = MetaInformation(title, authors)
    if not isbn or not check_isbn(isbn):
        return [mi]

    def first_result(candidate):
        # search() returns None when nothing matches, but also an empty list
        # when every result was dropped; indexing [0] blindly would raise
        # IndexError in the second case.
        results = search(isbn=candidate, verbose=verbose,
                max_results=max_results, lang=lang)
        return results[0] if results else None

    miaz = first_result(isbn)
    if miaz is None:
        from calibre.ebooks.metadata.xisbn import xisbn
        for i in xisbn.get_associated_isbns(isbn):
            miaz = first_result(i)
            if miaz is not None:
                break
    if miaz is None:
        return [mi]
    if miaz.rating is not None:
        mi.rating = miaz.rating
    if miaz.comments is not None:
        mi.comments = miaz.comments
    if miaz.tags is not None:
        mi.tags = miaz.tags
    return [mi]
 def option_parser():
    '''
    Build the command line option parser for the Amazon metadata fetcher.

    :return: an ``OptionParser`` whose usage text is the dedented, translated
             message below, with the title, author, publisher, ISBN,
             keywords, social, max-results, lang and verbose options
    '''
    import textwrap
    usage = _('''\
        %prog [options]
        Fetch book metadata from Amazon. You must specify one of title, author,
        ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
        so you should make your query as specific as possible.
        You can chose the language for metadata retrieval:
        english & french & german
    '''
    )
    parser = OptionParser(textwrap.dedent(usage))
    # (flags, keyword arguments) in the order they are registered
    option_table = (
        (('-t', '--title'), {'help': _('Book title')}),
        (('-a', '--author'), {'help': _('Book author(s)')}),
        (('-p', '--publisher'), {'help': _('Book publisher')}),
        (('-i', '--isbn'), {'help': _('Book ISBN')}),
        (('-k', '--keywords'), {'help': _('Keywords')}),
        (('-s', '--social'), {'default': 0, 'action': 'count',
            'help': _('Get social data only')}),
        (('-m', '--max-results'), {'default': 10,
            'help': _('Maximum number of results to fetch')}),
        (('-l', '--lang'), {'default': 'all',
            'help': _('Chosen language for metadata search (en, fr, de)')}),
        (('-v', '--verbose'), {'default': 0, 'action': 'count',
            'help': _('Be more verbose about errors')}),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)
    return parser
 def main(args=sys.argv):
    import tempfile, os
@ -412,8 +214,3 @@ def main(args=sys.argv):
 if __name__ == '__main__':
    sys.exit(main())
    # import cProfile
    # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()"))
    # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile"))
 # calibre-debug -e "D:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@ -212,6 +212,27 @@ class MetadataSource(Plugin): # {{{
    # }}}
 class Amazon(MetadataSource): # {{{
    name = 'Amazon'
    metadata_type = 'social'
    description = _('Downloads social metadata from amazon.com')
    has_html_comments = True
    def fetch(self):
        if not self.isbn:
            return
        from calibre.ebooks.metadata.amazon import get_social_metadata
        try:
            self.results = get_social_metadata(self.title, self.book_author,
                    self.publisher, self.isbn)
        except Exception, e:
            self.exception = e
            self.tb = traceback.format_exc()
    # }}}
 class KentDistrictLibrary(MetadataSource): # {{{
    name = 'Kent District Library'