Remove not working Overdrive metadata source

Fixes #1927973 [overdrive search HTTP Error 404](https://bugs.launchpad.net/calibre/+bug/1927973)
This commit is contained in:
Kovid Goyal 2021-05-10 19:45:30 +05:30
parent d447613b2b
commit eb251e0976
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 1 additions and 488 deletions

View File

@ -832,11 +832,10 @@ from calibre.ebooks.metadata.sources.google import GoogleBooks
from calibre.ebooks.metadata.sources.amazon import Amazon from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.edelweiss import Edelweiss from calibre.ebooks.metadata.sources.edelweiss import Edelweiss
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.google_images import GoogleImages from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch
plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, OverDrive, BigBookSearch] plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, BigBookSearch]
# }}} # }}}

View File

@ -1,486 +0,0 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Fetch metadata using Overdrive Content Reserve
'''
import re, random, copy, json
from threading import RLock
try:
from queue import Empty, Queue
except ImportError:
from Queue import Empty, Queue
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source, Option
from calibre.ebooks.metadata.book.base import Metadata
ovrdrv_data_cache = {}
cache_lock = RLock()
base_url = 'https://search.overdrive.com/'
class OverDrive(Source):
name = 'Overdrive'
version = (1, 0, 1)
minimum_calibre_version = (2, 80, 0)
description = _('Downloads metadata and covers from Overdrive\'s Content Reserve')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn', 'series', 'series_index',
'languages', 'identifier:overdrive'])
has_html_comments = True
supports_gzip_transfer_encoding = False
cached_cover_url_is_reliable = True
options = (
Option('get_full_metadata', 'bool', True,
_('Download all metadata (slow)'),
_('Enable this option to gather all metadata available from Overdrive.')),
)
config_help_message = '<p>'+_('Additional metadata can be taken from Overdrive\'s book detail'
' page. This includes a limited set of tags used by libraries, comments, language,'
' and the e-book ISBN. Collecting this data is disabled by default due to the extra'
' time required. Check the download all metadata option below to'
' enable downloading this data.')
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
ovrdrv_id = identifiers.get('overdrive', None)
isbn = identifiers.get('isbn', None)
br = self.browser
ovrdrv_data = self.to_ovrdrv_data(br, log, title, authors, ovrdrv_id)
if ovrdrv_data:
title = ovrdrv_data[8]
authors = ovrdrv_data[6]
mi = Metadata(title, authors)
self.parse_search_results(ovrdrv_data, mi)
if ovrdrv_id is None:
ovrdrv_id = ovrdrv_data[7]
if self.prefs['get_full_metadata']:
self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log)
if isbn is not None:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
result_queue.put(mi)
return None
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
import mechanize
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
self.identify(log, rq, abort, title=title, authors=authors,
identifiers=identifiers)
if abort.is_set():
return
results = []
while True:
try:
results.append(rq.get_nowait())
except Empty:
break
results.sort(key=self.identify_results_keygen(
title=title, authors=authors, identifiers=identifiers))
for mi in results:
cached_url = self.get_cached_cover_url(mi.identifiers)
if cached_url is not None:
break
if cached_url is None:
log.info('No cover found')
return
if abort.is_set():
return
ovrdrv_id = identifiers.get('overdrive', None)
br = self.browser
req = mechanize.Request(cached_url)
if ovrdrv_id is not None:
referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id
req.add_header('referer', referer)
log('Downloading cover from:', cached_url)
try:
cdata = br.open_novisit(req, timeout=timeout).read()
result_queue.put((self, cdata))
except:
log.exception('Failed to download cover from:', cached_url)
# }}}
def get_cached_cover_url(self, identifiers): # {{{
url = None
ovrdrv_id = identifiers.get('overdrive', None)
if ovrdrv_id is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
ovrdrv_id = self.cached_isbn_to_identifier(isbn)
if ovrdrv_id is not None:
url = self.cached_identifier_to_cover_url(ovrdrv_id)
return url
# }}}
def get_base_referer(self): # to be used for passing referrer headers to cover download
choices = [
'https://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
'https://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
'https://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
'https://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
'https://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
]
return choices[random.randint(0, len(choices)-1)]
def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
fix_slashes = re.compile(r'\\/')
thumbimage = fix_slashes.sub('/', thumbimage)
worldcatlink = fix_slashes.sub('/', worldcatlink)
cover_url = re.sub(r'(?P<img>(Ima?g(eType-)?))200', r'\g<img>100', thumbimage)
social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
series_num = ''
if not series:
if subtitle:
title = od_title+': '+subtitle
else:
title = od_title
else:
title = od_title
m = re.search("([0-9]+$)", subtitle)
if m:
series_num = float(m.group(1))
return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
def safe_query(self, br, query_url, post=''):
'''
The query must be initialized by loading an empty search results page
this page attempts to set a cookie that Mechanize doesn't like
copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar
'''
import mechanize
goodcookies = br._ua_handlers['_cookies'].cookiejar
clean_cj = mechanize.CookieJar()
cookies_to_copy = []
for cookie in goodcookies:
copied_cookie = copy.deepcopy(cookie)
cookies_to_copy.append(copied_cookie)
for copied_cookie in cookies_to_copy:
clean_cj.set_cookie(copied_cookie)
if post:
br.open_novisit(query_url, post)
else:
br.open_novisit(query_url)
br.set_cookiejar(clean_cj)
def overdrive_search(self, br, log, q, title, author):
import mechanize
# re-initialize the cookiejar to so that it's clean
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
q_query = q+'default.aspx/SearchByKeyword'
q_init_search = q+'SearchResults.aspx'
# get first author as string - convert this to a proper cleanup function later
author_tokens = list(self.get_author_tokens(author,
only_first_author=True))
title_tokens = list(self.get_title_tokens(title,
strip_joiners=False, strip_subtitle=True))
xref_q = ''
if len(author_tokens) <= 1:
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
for token in title_tokens:
if len(xref_q) < len(token):
xref_q = token
log.error('Initial query is %s'%initial_q)
log.error('Cross reference query is %s'%xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
# main query, requires specific Content Type header
req = mechanize.Request(q_query)
req.add_header('Content-Type', 'application/json; charset=utf-8')
br.open_novisit(req, query)
# initiate the search without messing up the cookiejar
self.safe_query(br, q_init_search)
# get the search results object
results = False
iterations = 0
while results is False:
iterations += 1
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
for m in re.finditer(type('')(r'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)'), raw):
if int(m.group('totalrecords')) == 0:
return ''
elif int(m.group('displayrecords')) >= 1:
results = True
elif int(m.group('totalrecords')) >= 1 and iterations < 3:
if xref_q.find('+') != -1:
xref_tokens = xref_q.split('+')
xref_q = xref_tokens[0]
for token in xref_tokens:
if len(xref_q) < len(token):
xref_q = token
# log.error('rewrote xref_q, new query is '+xref_q)
else:
xref_q = ''
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
def sort_ovrdrv_results(self, raw, log, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
close_matches = []
raw = re.sub(r'.*?\[\[(?P<content>.*?)\]\].*', r'[[\g<content>]]', raw)
results = json.loads(raw)
# log.error('raw results are:'+type('')(results))
# The search results are either from a keyword search or a multi-format list from a single ID,
# sort through the results for closest match/format
if results:
for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
# log.error("this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series)
if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
# log.error('overdrive id is not None, searching based on format type priority')
return self.format_results(reserveid, od_title, subtitle, series, publisher,
creators, thumbimage, worldcatlink, formatid)
else:
if creators:
creators = creators.split(', ')
# if an exact match in a preferred format occurs
if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and \
od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
return self.format_results(reserveid, od_title, subtitle, series, publisher,
creators, thumbimage, worldcatlink, formatid)
else:
close_title_match = False
close_author_match = False
for token in title_tokens:
if od_title.lower().find(token.lower()) != -1:
close_title_match = True
else:
close_title_match = False
break
for author in creators:
for token in author_tokens:
if author.lower().find(token.lower()) != -1:
close_author_match = True
else:
close_author_match = False
break
if close_author_match:
break
if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900] and thumbimage:
if subtitle and series:
close_matches.insert(0, self.format_results(reserveid, od_title, subtitle, series,
publisher, creators, thumbimage, worldcatlink, formatid))
else:
close_matches.append(self.format_results(reserveid, od_title, subtitle, series,
publisher, creators, thumbimage, worldcatlink, formatid))
elif close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
close_matches.append(self.format_results(reserveid, od_title, subtitle, series,
publisher, creators, thumbimage, worldcatlink, formatid))
if close_matches:
return close_matches[0]
else:
return ''
else:
return ''
def overdrive_get_record(self, br, log, q, ovrdrv_id):
import mechanize
search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc' # noqa
# re-initialize the cookiejar to so that it's clean
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
# get the base url to set the proper session cookie
br.open_novisit(q)
# initialize the search
self.safe_query(br, search_url)
# get the results
req = mechanize.Request(results_url)
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', search_url)
req.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(req)
raw = type('')(list(raw))
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
return self.sort_ovrdrv_results(raw, log, None, None, None, ovrdrv_id)
def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
q = base_url
if ovrdrv_id is None:
return self.overdrive_search(br, log, q, title, author)
else:
return self.overdrive_get_record(br, log, q, ovrdrv_id)
def to_ovrdrv_data(self, br, log, title=None, author=None, ovrdrv_id=None):
'''
Takes either a title/author combo or an Overdrive ID. One of these
two must be passed to this function.
'''
if ovrdrv_id is not None:
with cache_lock:
ans = ovrdrv_data_cache.get(ovrdrv_id, None)
if ans:
return ans
elif ans is False:
return None
else:
ovrdrv_data = self.find_ovrdrv_data(br, log, title, author, ovrdrv_id)
else:
try:
ovrdrv_data = self.find_ovrdrv_data(br, log, title, author, ovrdrv_id)
except:
import traceback
traceback.print_exc()
ovrdrv_data = None
with cache_lock:
ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False
return ovrdrv_data if ovrdrv_data else False
def parse_search_results(self, ovrdrv_data, mi):
'''
Parse the formatted search results from the initial Overdrive query and
add the values to the metadta.
The list object has these values:
[cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4],
publisher[5], creators[6], reserveid[7], title[8]]
'''
ovrdrv_id = ovrdrv_data[7]
mi.set_identifier('overdrive', ovrdrv_id)
if len(ovrdrv_data[3]) > 1:
mi.series = ovrdrv_data[3]
if ovrdrv_data[4]:
try:
mi.series_index = float(ovrdrv_data[4])
except:
pass
mi.publisher = ovrdrv_data[5]
mi.authors = ovrdrv_data[6]
mi.title = ovrdrv_data[8]
cover_url = ovrdrv_data[0]
if cover_url:
self.cache_identifier_to_cover_url(ovrdrv_id,
cover_url)
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
from html5_parser import parse
from lxml import html
from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html
try:
raw = br.open_novisit(metadata_url).read()
except Exception as e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return False
raise
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
root = parse(raw, maybe_xhtml=False, sanitize_names=True)
except Exception:
return False
pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
if pub_date:
from calibre.utils.date import parse_date
try:
mi.pubdate = parse_date(pub_date[0].strip())
except:
pass
if lang:
lang = lang[0].strip().lower()
lang = {'english':'eng', 'french':'fra', 'german':'deu',
'spanish':'spa'}.get(lang, None)
if lang:
mi.language = lang
if ebook_isbn:
# print("ebook isbn is "+type('')(ebook_isbn[0]))
isbn = check_isbn(ebook_isbn[0].strip())
if isbn:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
mi.isbn = isbn
if subjects:
mi.tags = [tag.strip() for tag in subjects[0].split(',')]
if desc:
desc = desc[0]
desc = html.tostring(desc, method='html', encoding='unicode').strip()
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = sanitize_comments_html(desc)
return None
if __name__ == '__main__':
# To run these test use:
# calibre-debug -e src/calibre/ebooks/metadata/sources/overdrive.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test)
test_identify_plugin(OverDrive.name,
[
(
{'title':'The Sea Kings Daughter',
'authors':['Elizabeth Peters']},
[title_test('The Sea Kings Daughter', exact=False),
authors_test(['Elizabeth Peters'])]
),
(
{'title': 'Elephants', 'authors':['Agatha']},
[title_test('Elephants Can Remember', exact=False),
authors_test(['Agatha Christie'])]
),
])