mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Optimize metadata retrieval
This commit is contained in:
parent
9c30a41612
commit
3a37d7e78f
@ -181,13 +181,15 @@ class ResultList(list):
|
|||||||
self.reautclean = re.compile(u'\s*\(.*\)\s*')
|
self.reautclean = re.compile(u'\s*\(.*\)\s*')
|
||||||
|
|
||||||
def get_title(self, entry):
    """Return the book title text found in *entry*.

    *entry* is deep-copied so that the "Informations sur le livre"
    definition list can be removed without altering the parsed page. The
    text of every remaining child is joined with single spaces and all
    newline characters are dropped.
    """
    node = deepcopy(entry)
    details = node.find("dl[@title='Informations sur le livre']")
    node.remove(details)
    pieces = [child.text_content() for child in node.iterchildren()]
    joined = ' '.join(pieces)
    return unicode(joined.replace('\n', ''))
|
||||||
|
|
||||||
def get_authors(self, entry):
|
def get_authors(self, entry):
|
||||||
author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
|
# author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
|
||||||
|
author = entry.find("dl[@title='Informations sur le livre']")
|
||||||
authortext = []
|
authortext = []
|
||||||
for x in author.getiterator('dt'):
|
for x in author.getiterator('dt'):
|
||||||
if self.reauteur.match(x.text):
|
if self.reauteur.match(x.text):
|
||||||
@ -202,22 +204,46 @@ class ResultList(list):
|
|||||||
|
|
||||||
def get_description(self, entry, verbose):
    """Return the book summary prefixed with ``RESUME:``, or None on failure.

    The summary is the text of the first ``<p id="book-description">``
    located through an absolute XPath evaluated from the parent of *entry*.
    Any failure is handed to ``report(verbose)`` and None is returned.
    """
    try:
        parent = entry.getparent()
        paragraph = parent.xpath("//p[@id='book-description']")[0]
        return u'RESUME:\n' + unicode(paragraph.text)
    except:
        # Deliberately broad: a missing description must not abort the fetch.
        report(verbose)
    return None
|
||||||
|
|
||||||
|
def get_book_info(self, entry, mi, verbose=False):
    """Fill *mi* with ISBN, publisher, language and publication date.

    Walks every ``<dt>`` inside the "Informations sur le livre" definition
    list of *entry* once and reads the value from the element that follows
    the matching ``<dt>``:

    * ``ISBN`` -> ``mi.isbn`` (dashes removed; set only if ``check_isbn``
      accepts the value)
    * a heading matched by ``self.repub`` -> ``mi.publisher``
    * ``Langue`` -> ``mi.language``
    * ``Date de parution`` -> ``mi.pubdate``

    :param entry: element that contains the definition list.
    :param mi: metadata object to update in place.
    :param verbose: forwarded to ``report`` when the date cannot be parsed.
        Defaults to False so existing two-argument callers keep working.
    :return: the same *mi* object.
    """
    details = entry.find("dl[@title='Informations sur le livre']")
    for label in details.getiterator('dt'):
        heading = label.text
        if heading is None:
            # A <dt> without leading text cannot match any heading; skip it
            # rather than hand None to self.repub.match().
            continue
        if heading == 'ISBN':
            isbntext = label.getnext().text_content().replace('-', '')
            if check_isbn(isbntext):
                mi.isbn = unicode(isbntext)
        elif self.repub.match(heading):
            mi.publisher = unicode(label.getnext().text_content())
        elif heading == 'Langue':
            mi.language = unicode(label.getnext().text_content())
        elif heading == 'Date de parution':
            raw_date = label.getnext().text_content()
            try:
                # Day 15 of the current UTC date is the fallback handed to
                # parse_date for fields the page does not provide.
                default = utcnow().replace(day=15)
                normalized = replace_monthsfr(raw_date)
                mi.pubdate = parse_date(normalized, assume_utc=True,
                                        default=default)
            except Exception:
                # Best effort: an unparseable date leaves mi.pubdate untouched.
                report(verbose)
    return mi
|
||||||
|
|
||||||
def get_publisher(self, entry):
    """Return the publisher name listed under *entry*, or None.

    Scans the ``<dt>`` elements below *entry* and returns the text content
    of the element following the first one whose text is matched by
    ``self.repub``.

    Returns None when no heading matches. Converting the missing value with
    ``unicode(None)`` would yield the literal string ``u'None'``.
    """
    for label in entry.getiterator('dt'):
        heading = label.text
        # Skip <dt> elements without leading text instead of matching None.
        if heading is not None and self.repub.match(heading):
            return unicode(label.getnext().text_content())
    return None
|
||||||
|
|
||||||
def get_date(self, entry, verbose):
|
def get_date(self, entry, verbose):
|
||||||
date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
|
# date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
|
||||||
|
date = entry
|
||||||
d = ''
|
d = ''
|
||||||
for x in date.getiterator('dt'):
|
for x in date.getiterator('dt'):
|
||||||
if x.text == 'Date de parution':
|
if x.text == 'Date de parution':
|
||||||
@ -235,35 +261,37 @@ class ResultList(list):
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
def get_ISBN(self, entry):
    """Return the ISBN listed under *entry* with dashes removed, or None.

    Looks for the first ``<dt>`` whose text is exactly ``ISBN`` and reads
    the text content of the element that follows it.

    Returns None when ``check_isbn`` rejects the value or when no ISBN row
    exists. Converting the missing value with ``unicode(None)`` would yield
    the literal string ``u'None'``.
    """
    for label in entry.getiterator('dt'):
        if label.text == 'ISBN':
            candidate = label.getnext().text_content().replace('-', '')
            if check_isbn(candidate):
                return unicode(candidate)
            # Only the first ISBN row is considered, valid or not.
            return None
    return None
|
||||||
|
|
||||||
def get_language(self, entry):
    """Return the language listed under *entry*, or None.

    Returns the text content of the element following the first ``<dt>``
    whose text is exactly ``Langue``.

    Returns None when no such row exists. Converting the missing value with
    ``unicode(None)`` would yield the literal string ``u'None'``.
    """
    for label in entry.getiterator('dt'):
        if label.text == 'Langue':
            return unicode(label.getnext().text_content())
    return None
|
||||||
|
|
||||||
def fill_MI(self, entry, title, authors, verbose):
    """Build the metadata object for one book page.

    Creates a ``MetaInformation`` from *title* and *authors*, sets the
    author sort string and the description, then lets ``get_book_info``
    fill the remaining fields from *entry* and returns its result.
    """
    record = MetaInformation(title, authors)
    record.author_sort = authors_to_sort_string(authors)
    record.comments = self.get_description(entry, verbose)
    # ISBN, publisher, language and date are read together by get_book_info.
    return self.get_book_info(entry, record)
|
||||||
|
|
||||||
def get_individual_metadata(self, browser, linkdata, verbose):
|
def get_individual_metadata(self, browser, linkdata, verbose):
|
||||||
try:
|
try:
|
||||||
@ -292,6 +320,7 @@ class ResultList(list):
|
|||||||
if len(entries) ==1:
|
if len(entries) ==1:
|
||||||
try:
|
try:
|
||||||
entry = entries[0].xpath("//div[@id='container']")[0]
|
entry = entries[0].xpath("//div[@id='container']")[0]
|
||||||
|
entry = entry.find("div[@id='book-info']")
|
||||||
title = self.get_title(entry)
|
title = self.get_title(entry)
|
||||||
authors = self.get_authors(entry)
|
authors = self.get_authors(entry)
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user