Add Overdrive as a metadata download source

Kovid Goyal 2011-04-18 20:53:15 -06:00
commit 727ada1ac4
8 changed files with 503 additions and 31 deletions

View File

@ -19,10 +19,23 @@ class Slashdot(BasicNewsRecipe):
__author__ = 'floweros edited by Huan T'
no_stylesheets = True
# keep_only_tags = [
# dict(name='div',attrs={'class':'article'}),
# dict(name='div',attrs={'class':'commentTop'}),
# ]
keep_only_tags = [
dict(name='div',attrs={'id':'article'}),
dict(name='div',attrs={'class':['postBody', 'details']}),
dict(name='footer',attrs={'class':['clearfix meta article-foot']}),
dict(name='article',attrs={'class':['fhitem fhitem-story article usermode thumbs grid_24']}),
dict(name='dl',attrs={'class':'relatedPosts'}),
dict(name='h2',attrs={'class':'story'}),
dict(name='span',attrs={'class':'comments'}),
]
remove_tags = [
dict(name='aside',attrs={'id':'slashboxes'}),
dict(name='div',attrs={'class':'paginate'}),
dict(name='section',attrs={'id':'comments'}),
dict(name='span',attrs={'class':'topic'}),
]
feeds = [
(u'Slashdot',
@ -37,5 +50,3 @@ class Slashdot(BasicNewsRecipe):
u'http://rss.slashdot.org/Slashdot/slashdotYourRightsOnline')
]
def get_article_url(self, article):
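# prefer FeedBurner's pointer to the original article when it is present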
return article.get('feedburner_origlink', None)

View File

@ -626,8 +626,9 @@ if test_eight_code:
from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB]
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
# }}}
else:

View File

@ -399,7 +399,7 @@ class HTMLPreProcessor(object):
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
# If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines
(re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
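A quick check of the widened footer pattern (a minimal sketch; the sample line is invented). The old expression required three or four slashes after `file:`, while `file:/{2,4}` also catches the two-slash form some browsers emit:

    import re
    pat = re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE)
    print(pat.sub('', '</a> file://C:/Users/me/book.html<br>'))
    # -> '</a>'  (the two-slash form was missed by the old file:////? pattern)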

View File

@ -764,6 +764,7 @@ class HeuristicProcessor(object):
# Multiple sequential blank paragraphs are merged with appropriate margins
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False):
html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
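For illustration (a minimal sketch; the snippet is invented), the added substitution turns an empty spacer div into a plain paragraph before whitespace and soft-break detection run:

    import re
    html = '<p>He left.</p><div class="spacer"> <br /> </div><p>Years passed.</p>'
    html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
    # html == '<p>He left.</p><p></p><p>Years passed.</p>'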

View File

@ -274,26 +274,34 @@ class Source(Plugin):
if authors:
# Leave ' in there for Irish names
pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
replace_pat = re.compile(r'[-+.:;]')
if only_first_author:
authors = authors[:1]
for au in authors:
au = replace_pat.sub(' ', au)
parts = au.split()
if ',' in au:
# au probably in ln, fn form
parts = parts[1:] + parts[:1]
for tok in parts:
tok = pat.sub('', tok).strip()
tok = remove_pat.sub('', tok).strip()
if len(tok) > 2 and tok.lower() not in ('von', ):
yield tok
def get_title_tokens(self, title):
def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
'''
Take a title and return a list of tokens useful for an AND search query.
Excludes connectives and punctuation.
Excludes connectives (optionally) and punctuation.
'''
if title:
# strip sub-titles
if strip_subtitle:
subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
if len(subtitle.sub('', title)) > 1:
title = subtitle.sub('', title)
title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
[
# Remove things like: (2010) (Omnibus) etc.
@ -305,17 +313,20 @@ class Source(Plugin):
(r'(\d+),(\d+)', r'\1\2'),
# Remove hyphens only if they have whitespace before them
(r'(\s-)', ' '),
# Remove single quotes
(r"'", ''),
# Remove single quotes not followed by 's'
(r"'(?!s)", ''),
# Replace other special chars with a space
(r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
]]
for pat, repl in title_patterns:
title = pat.sub(repl, title)
tokens = title.split()
for token in tokens:
token = token.strip()
if token and token.lower() not in ('a', 'and', 'the'):
if token and (not strip_joiners or token.lower() not in ('a',
'and', 'the', '&')):
yield token
def split_jobs(self, jobs, num):
@ -363,7 +374,11 @@ class Source(Plugin):
def get_book_url(self, identifiers):
'''
Return the URL for the book identified by identifiers at this source.
If no URL is found, return None.
This URL must be browsable by a human using a browser. It is meant
to provide a clickable link for the user to easily visit the book's
page at this source.
If no URL is found, return None. This method must be quick; it
should either construct the URL using a known URL scheme or use a
cached URL.
'''
return None
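A rough usage sketch of the new keyword arguments (context invented; `self` stands for any Source subclass):

    # hypothetical plugin code
    title = 'The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us'
    print(list(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))
    # -> ['The', 'Invisible', 'Gorilla']  (subtitle stripped, joiners kept)
    print(list(self.get_author_tokens(['Chabris, Christopher'], only_first_author=True)))
    # -> ['Christopher', 'Chabris']  (the "ln, fn" form is swapped back)

And a hypothetical get_book_url override built on the SearchResults URL scheme used elsewhere in this commit:

    def get_book_url(self, identifiers):  # hypothetical override
        ovrdrv_id = identifiers.get('overdrive', None)
        if ovrdrv_id:
            return 'http://search.overdrive.com/SearchResults.aspx?ReserveID={%s}' % ovrdrv_id
        return None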

View File

@ -433,7 +433,7 @@ def urls_from_identifiers(identifiers): # {{{
pass
isbn = identifiers.get('isbn', None)
if isbn:
ans.append(('ISBN',
ans.append((isbn,
'http://www.worldcat.org/search?q=bn%%3A%s&qt=advanced'%isbn))
return ans
# }}}
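With this change the link text is the ISBN itself rather than the literal string 'ISBN'. Roughly (assuming no other identifiers are present):

    # urls_from_identifiers({'isbn': '9780307459671'})
    # -> [('9780307459671',
    #      'http://www.worldcat.org/search?q=bn%3A9780307459671&qt=advanced')]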
@ -444,13 +444,18 @@ if __name__ == '__main__': # tests {{{
from calibre.ebooks.metadata.sources.test import (test_identify,
title_test, authors_test)
tests = [
(
{'title':'Magykal Papers',
'authors':['Sage']},
[title_test('The Magykal Papers', exact=True)],
),
( # An e-book ISBN not on Amazon; one of the authors is
# unknown to Amazon
{'identifiers':{'isbn': '9780307459671'},
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
[title_test('The Invisible Gorilla',
exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
[title_test('The Invisible Gorilla', exact=True)]
),

View File

@ -0,0 +1,439 @@
#!/usr/bin/env python
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Fetch metadata using Overdrive Content Reserve
'''
import re, random, mechanize, copy
from threading import RLock
from Queue import Queue, Empty
from lxml import html
from lxml.html import soupparser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html
ovrdrv_data_cache = {}
cover_url_cache = {}
cache_lock = RLock()
base_url = 'http://search.overdrive.com/'
class OverDrive(Source):
name = 'Overdrive'
description = _('Downloads metadata from Overdrive\'s Content Reserve')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn', 'series', 'series_index',
'language', 'identifier:overdrive'])
has_html_comments = True
supports_gzip_transfer_encoding = False
cached_cover_url_is_reliable = True
def __init__(self, *args, **kwargs):
Source.__init__(self, *args, **kwargs)
self.prefs.defaults['ignore_fields'] = ['tags', 'pubdate', 'comments', 'identifier:isbn', 'language']
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
ovrdrv_id = identifiers.get('overdrive', None)
isbn = identifiers.get('isbn', None)
br = self.browser
ovrdrv_data = self.to_ovrdrv_data(br, title, authors, ovrdrv_id)
if ovrdrv_data:
title = ovrdrv_data[8]
authors = ovrdrv_data[6]
mi = Metadata(title, authors)
self.parse_search_results(ovrdrv_data, mi)
if ovrdrv_id is None:
ovrdrv_id = ovrdrv_data[7]
if isbn is not None:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log)
result_queue.put(mi)
return None
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
self.identify(log, rq, abort, title=title, authors=authors,
identifiers=identifiers)
if abort.is_set():
return
results = []
while True:
try:
results.append(rq.get_nowait())
except Empty:
break
results.sort(key=self.identify_results_keygen(
title=title, authors=authors, identifiers=identifiers))
for mi in results:
cached_url = self.get_cached_cover_url(mi.identifiers)
if cached_url is not None:
break
if cached_url is None:
log.info('No cover found')
return
if abort.is_set():
return
ovrdrv_id = identifiers.get('overdrive', None)
br = self.browser
referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id
req = mechanize.Request(cached_url)
req.add_header('referer', referer)
log('Downloading cover from:', cached_url)
try:
cdata = br.open_novisit(req, timeout=timeout).read()
result_queue.put((self, cdata))
except:
log.exception('Failed to download cover from:', cached_url)
# }}}
def get_cached_cover_url(self, identifiers): # {{{
url = None
ovrdrv_id = identifiers.get('overdrive', None)
if ovrdrv_id is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
ovrdrv_id = self.cached_isbn_to_identifier(isbn)
if ovrdrv_id is not None:
url = self.cached_identifier_to_cover_url(ovrdrv_id)
return url
# }}}
def get_base_referer(self): # to be used for passing referrer headers to cover download
choices = [
'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
]
return random.choice(choices)
def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
fix_slashes = re.compile(r'\\/')
thumbimage = fix_slashes.sub('/', thumbimage)
worldcatlink = fix_slashes.sub('/', worldcatlink)
cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage)
social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
series_num = ''
if not series:
if subtitle:
title = od_title+': '+subtitle
else:
title = od_title
else:
title = od_title
m = re.search("([0-9]+$)", subtitle)
if m:
series_num = float(m.group(1))
return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
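# Illustration of the 200 -> 100 rewrite above (URL invented):
#   '.../ImageType-200/0011-1/ABC/Img200.jpg'
# becomes
#   '.../ImageType-100/0011-1/ABC/Img100.jpg'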
def safe_query(self, br, query_url, post=''):
'''
The query must be initialized by loading an empty search results page;
this page attempts to set a cookie that Mechanize doesn't like. Copy
the cookiejar to a separate instance and make a one-off request with
the temporary cookiejar.
'''
goodcookies = br._ua_handlers['_cookies'].cookiejar
clean_cj = mechanize.CookieJar()
cookies_to_copy = []
for cookie in goodcookies:
copied_cookie = copy.deepcopy(cookie)
cookies_to_copy.append(copied_cookie)
for copied_cookie in cookies_to_copy:
clean_cj.set_cookie(copied_cookie)
if post:
br.open_novisit(query_url, post)
else:
br.open_novisit(query_url)
br.set_cookiejar(clean_cj)
def overdrive_search(self, br, q, title, author):
# re-initialize the cookiejar so that it's clean
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
q_query = q+'default.aspx/SearchByKeyword'
q_init_search = q+'SearchResults.aspx'
# get first author as string - convert this to a proper cleanup function later
author_tokens = list(self.get_author_tokens(author,
only_first_author=True))
title_tokens = list(self.get_title_tokens(title,
strip_joiners=False, strip_subtitle=True))
if len(title_tokens) >= len(author_tokens):
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
xref_q = '+'.join(title_tokens)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
# main query, requires specific Content Type header
req = mechanize.Request(q_query)
req.add_header('Content-Type', 'application/json; charset=utf-8')
br.open_novisit(req, query)
# initiate the search without messing up the cookiejar
self.safe_query(br, q_init_search)
# get the search results object
results = False
while results is False:
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
if int(m.group('displayrecords')) >= 1:
results = True
elif int(m.group('totalrecords')) >= 1:
xref_q = ''
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
elif int(m.group('totalrecords')) == 0:
return ''
return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
close_matches = []
raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
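# raw is now a bracketed list literal, e.g. '[[...], [...]]'; evaluate it
# into the result rows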
results = eval(raw)
#print results
# The search results are either from a keyword search or a multi-format
# list from a single ID; sort through the results for the closest match/format
if results:
for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
#print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series
if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
#print "overdrive id is not None, searching based on format type priority"
return self.format_results(reserveid, od_title, subtitle, series, publisher,
creators, thumbimage, worldcatlink, formatid)
else:
creators = creators.split(', ')
# if an exact match in a preferred format occurs
if (author and creators[0] == author[0]) and od_title == title and int(formatid) in [1, 50, 410, 900]:
return self.format_results(reserveid, od_title, subtitle, series, publisher,
creators, thumbimage, worldcatlink, formatid)
else:
close_title_match = False
close_author_match = False
for token in title_tokens:
if od_title.lower().find(token.lower()) != -1:
close_title_match = True
else:
close_title_match = False
break
for creator in creators:
for token in author_tokens:
if creator.lower().find(token.lower()) != -1:
close_author_match = True
else:
close_author_match = False
break
if close_author_match:
break
if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900] and thumbimage:
if subtitle and series:
close_matches.insert(0, self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
else:
close_matches.append(self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
if close_matches:
return close_matches[0]
else:
return ''
else:
return ''
def overdrive_get_record(self, br, q, ovrdrv_id):
search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
# re-initialize the cookiejar so that it's clean
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
# get the base url to set the proper session cookie
br.open_novisit(q)
# initialize the search
self.safe_query(br, search_url)
# get the results
req = mechanize.Request(results_url)
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', search_url)
req.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(req)
raw = str(list(raw))
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
return self.sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
def find_ovrdrv_data(self, br, title, author, isbn, ovrdrv_id=None):
q = base_url
if ovrdrv_id is None:
return self.overdrive_search(br, q, title, author)
else:
return self.overdrive_get_record(br, q, ovrdrv_id)
def to_ovrdrv_data(self, br, title=None, author=None, ovrdrv_id=None):
'''
Takes either a title/author combo or an Overdrive ID. One of these
two must be passed to this function.
'''
if ovrdrv_id is not None:
with cache_lock:
ans = ovrdrv_data_cache.get(ovrdrv_id, None)
if ans:
return ans
elif ans is False:
return None
else:
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
else:
try:
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
except:
import traceback
traceback.print_exc()
ovrdrv_data = None
with cache_lock:
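# cache False (rather than None) on failure so the miss is remembered
# and the same ovrdrv_id is not retried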
ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False
return ovrdrv_data if ovrdrv_data else False
def parse_search_results(self, ovrdrv_data, mi):
'''
Parse the formatted search results from the initial Overdrive query and
add the values to the metadata.
The list object has these values:
[cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4],
publisher[5], creators[6], reserveid[7], title[8]]
'''
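# A hypothetical ovrdrv_data list (all values invented):
# ['.../Img100.jpg',                          # [0] cover_url
#  'http://search.overdrive.com/TitleInfo.aspx?ReserveID=...&FormatID=410',
#  'http://www.worldcat.org/oclc/...',        # [2] worldcatlink
#  'Foundation', 2.0, 'Random House',         # [3]-[5] series, series_num, publisher
#  ['Isaac Asimov'], '{GUID}', 'Foundation and Empire']  # [6]-[8] creators, reserveid, title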
ovrdrv_id = ovrdrv_data[7]
mi.set_identifier('overdrive', ovrdrv_id)
if len(ovrdrv_data[3]) > 1:
mi.series = ovrdrv_data[3]
if ovrdrv_data[4]:
try:
mi.series_index = float(ovrdrv_data[4])
except:
pass
mi.publisher = ovrdrv_data[5]
mi.authors = ovrdrv_data[6]
mi.title = ovrdrv_data[8]
cover_url = ovrdrv_data[0]
if cover_url:
self.cache_identifier_to_cover_url(ovrdrv_id,
cover_url)
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
try:
raw = br.open_novisit(metadata_url).read()
except Exception, e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return False
raise
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
root = soupparser.fromstring(raw)
except:
return False
pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
if pub_date:
from calibre.utils.date import parse_date
mi.pubdate = parse_date(pub_date[0].strip())
if lang:
mi.language = lang[0].strip()
if ebook_isbn:
#print "ebook isbn is "+str(ebook_isbn[0])
isbn = check_isbn(ebook_isbn[0].strip())
if isbn:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
mi.isbn = isbn
if subjects:
mi.tags = [tag.strip() for tag in subjects[0].split(',')]
if desc:
desc = desc[0]
desc = html.tostring(desc, method='html', encoding=unicode).strip()
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = sanitize_comments_html(desc)
return None
if __name__ == '__main__':
# To run these tests, use:
# calibre-debug -e src/calibre/ebooks/metadata/sources/overdrive.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test)
test_identify_plugin(OverDrive.name,
[
(
{'title':'Foundation and Earth',
'authors':['Asimov']},
[title_test('Foundation and Earth', exact=True),
authors_test(['Isaac Asimov'])]
),
(
{'title': 'Elephants', 'authors':['Agatha']},
[title_test('Elephants Can Remember', exact=False),
authors_test(['Agatha Christie'])]
),
])

View File

@ -949,7 +949,7 @@ class CoverFetch(QDialog): # {{{
# }}}
if __name__ == '__main__':
DEBUG_DIALOG = True
#DEBUG_DIALOG = True
app = QApplication([])
d = FullFetch()
d.start(title='great gatsby', authors=['fitzgerald'])