Optimize metadata retrieval

This commit is contained in:
Sengian 2010-11-21 10:24:56 +01:00
parent 9c30a41612
commit 3a37d7e78f

View File

@ -181,13 +181,15 @@ class ResultList(list):
self.reautclean = re.compile(u'\s*\(.*\)\s*') self.reautclean = re.compile(u'\s*\(.*\)\s*')
def get_title(self, entry):
    """Return the title text of a book-info element as unicode.

    The text of every direct child of ``entry`` is joined with single
    spaces, skipping the first child matched by
    ``dl[@title='Informations sur le livre']``; newlines are then removed.

    entry: element exposing ``find`` and ``iterchildren`` (the caller
    passes the ``div[@id='book-info']`` element).
    """
    # Skip the details list while iterating instead of deep-copying the
    # whole subtree just to remove one child. This also copes with a page
    # that has no details list: find() then returns None and no child is
    # skipped, where remove(None) used to raise.
    info = entry.find("dl[@title='Informations sur le livre']")
    parts = [child.text_content() for child in entry.iterchildren()
             if child is not info]
    return unicode(' '.join(parts).replace('\n', ''))
def get_authors(self, entry): def get_authors(self, entry):
author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
author = entry.find("dl[@title='Informations sur le livre']")
authortext = [] authortext = []
for x in author.getiterator('dt'): for x in author.getiterator('dt'):
if self.reauteur.match(x.text): if self.reauteur.match(x.text):
@ -202,22 +204,46 @@ class ResultList(list):
def get_description(self, entry, verbose):
    """Return the book summary prefixed with u'RESUME:\\n', or None.

    entry: element whose document is searched for
        ``p[@id='book-description']``.
    verbose: forwarded to ``report`` when the lookup fails.

    Returns None when the paragraph is missing (after calling
    ``report(verbose)``) or when it has no text.
    """
    try:
        # The leading '//' makes the expression absolute, so it is evaluated
        # against the whole document whatever the context node is; going
        # through getparent() first added nothing and failed on a root
        # element.
        text = entry.xpath("//p[@id='book-description']")[0].text
        if text is None:
            # unicode(None) would produce the literal string u'None'.
            return None
        return u'RESUME:\n' + unicode(text)
    except Exception:
        report(verbose)
        return None
def get_book_info(self, entry, mi, verbose=False):
    """Fill ``mi`` from the book details list in a single pass and return it.

    entry: element containing ``dl[@title='Informations sur le livre']``
        as a direct child.
    mi: metadata object; ``isbn``, ``publisher``, ``language`` and
        ``pubdate`` are assigned when the matching row is found.
    verbose: forwarded to ``report`` when the publication date cannot be
        parsed. Optional so existing two-argument calls keep working.

    Each ``<dt>`` is treated as a label and its next sibling as the value.
    """
    details = entry.find("dl[@title='Informations sur le livre']")
    if details is None:
        # No details list on this page: nothing to add.
        return mi
    for dt in details.getiterator('dt'):
        label = dt.text
        value_node = dt.getnext()
        if label is None or value_node is None:
            # A label without text would make self.repub.match() raise, and
            # a label without a following sibling has no value to read.
            continue
        if label == 'ISBN':
            isbntext = value_node.text_content().replace('-', '')
            if check_isbn(isbntext):
                mi.isbn = unicode(isbntext)
        elif self.repub.match(label):
            mi.publisher = unicode(value_node.text_content())
        elif label == 'Langue':
            mi.language = unicode(value_node.text_content())
        elif label == 'Date de parution':
            d = value_node.text_content()
            try:
                default = utcnow().replace(day=15)
                d = replace_monthsfr(d)
                mi.pubdate = parse_date(d, assume_utc=True, default=default)
            except Exception:
                # 'verbose' used to be an undefined name here, so a bad date
                # raised NameError instead of being reported.
                report(verbose)
    return mi
def get_publisher(self, entry):
    """Return the publisher text as unicode, or None when no row matches.

    entry: element whose ``<dt>`` descendants are scanned; the first one
        whose text matches ``self.repub`` supplies the value through its
        next sibling.
    """
    for dt in entry.getiterator('dt'):
        label = dt.text
        # A <dt> without text would make the regex match raise TypeError.
        if label is not None and self.repub.match(label):
            return unicode(dt.getnext().text_content())
    # Previously unicode(None) leaked the literal string u'None' here.
    return None
def get_date(self, entry, verbose): def get_date(self, entry, verbose):
date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") # date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
date = entry
d = '' d = ''
for x in date.getiterator('dt'): for x in date.getiterator('dt'):
if x.text == 'Date de parution': if x.text == 'Date de parution':
@ -235,35 +261,37 @@ class ResultList(list):
return d return d
def get_ISBN(self, entry):
    """Return the ISBN without dashes as unicode, or None.

    entry: element whose ``<dt>`` descendants are scanned for the label
        'ISBN'; the value is read from that label's next sibling.

    Returns None when there is no ISBN row or when ``check_isbn`` rejects
    the value.
    """
    for dt in entry.getiterator('dt'):
        if dt.text == 'ISBN':
            isbntext = dt.getnext().text_content().replace('-', '')
            if not check_isbn(isbntext):
                return None
            return unicode(isbntext)
    # Previously unicode(None) leaked the literal string u'None' here.
    return None
def get_language(self, entry):
    """Return the language text as unicode, or None when there is no row.

    entry: element whose ``<dt>`` descendants are scanned for the label
        'Langue'; the value is read from that label's next sibling.
    """
    for dt in entry.getiterator('dt'):
        if dt.text == 'Langue':
            return unicode(dt.getnext().text_content())
    # Previously unicode(None) leaked the literal string u'None' here.
    return None
def fill_MI(self, entry, title, authors, verbose):
    """Build and return the metadata object for one book page.

    entry: element passed on to ``get_description`` and ``get_book_info``.
    title, authors: passed to ``MetaInformation``; ``authors`` is also
        used to compute ``author_sort``.
    verbose: forwarded to ``get_description``.
    """
    record = MetaInformation(title, authors)
    record.author_sort = authors_to_sort_string(authors)
    record.comments = self.get_description(entry, verbose)
    # Publisher, date, ISBN and language are all filled by get_book_info.
    record = self.get_book_info(entry, record)
    return record
def get_individual_metadata(self, browser, linkdata, verbose): def get_individual_metadata(self, browser, linkdata, verbose):
try: try:
@ -292,6 +320,7 @@ class ResultList(list):
if len(entries) ==1: if len(entries) ==1:
try: try:
entry = entries[0].xpath("//div[@id='container']")[0] entry = entries[0].xpath("//div[@id='container']")[0]
entry = entry.find("div[@id='book-info']")
title = self.get_title(entry) title = self.get_title(entry)
authors = self.get_authors(entry) authors = self.get_authors(entry)
except Exception, e: except Exception, e: