From 9f34a0c27017adcbb1b9126142ed304812729ddd Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 21 Nov 2010 20:09:42 +0100 Subject: [PATCH 01/24] Modification of single metadata_single.py interface to add columns saying if there is a summary and a cover, add an option to automatically download a cover --- src/calibre/gui2/__init__.py | 2 + src/calibre/gui2/dialogs/fetch_metadata.py | 16 +- src/calibre/gui2/dialogs/fetch_metadata.ui | 351 ++++++++++---------- src/calibre/gui2/dialogs/metadata_single.py | 2 + 4 files changed, 198 insertions(+), 173 deletions(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index f035c40cb4..6ea533d396 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -123,6 +123,8 @@ def _config(): help=_('Download social metadata (tags/rating/etc.)')) c.add_opt('overwrite_author_title_metadata', default=True, help=_('Overwrite author and title with new metadata')) + c.add_opt('overwrite_cover_image', default=False, + help=_('Overwrite cover with new new cover if existing')) c.add_opt('enforce_cpu_limit', default=True, help=_('Limit max simultaneous jobs to number of CPUs')) c.add_opt('tag_browser_hidden_categories', default=set(), diff --git a/src/calibre/gui2/dialogs/fetch_metadata.py b/src/calibre/gui2/dialogs/fetch_metadata.py index 2c64219464..f577632781 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.py +++ b/src/calibre/gui2/dialogs/fetch_metadata.py @@ -48,7 +48,7 @@ class Matches(QAbstractTableModel): return len(self.matches) def columnCount(self, *args): - return 6 + return 8 def headerData(self, section, orientation, role): if role != Qt.DisplayRole: @@ -61,6 +61,8 @@ class Matches(QAbstractTableModel): elif section == 3: text = _("Publisher") elif section == 4: text = _("ISBN") elif section == 5: text = _("Published") + elif section == 6: text = _("Cover?") + elif section == 7: text = _("Summary?") return QVariant(text) else: @@ -87,6 +89,10 @@ class Matches(QAbstractTableModel): elif col == 5: if hasattr(book.pubdate, 'timetuple'): res = strftime('%b %Y', book.pubdate.timetuple()) + elif col == 6 and book.has_cover: + res = 'OK' + elif col == 7 and book.comments: + res = 'OK' if not res: return NONE return QVariant(res) @@ -131,6 +137,7 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): self.fetch_metadata() self.opt_get_social_metadata.setChecked(config['get_social_metadata']) self.opt_overwrite_author_title_metadata.setChecked(config['overwrite_author_title_metadata']) + self.opt_overwrite_cover_image.setChecked(config['overwrite_cover_image']) def show_summary(self, current, *args): @@ -213,6 +220,13 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): _hung_fetchers.add(self.fetcher) if hasattr(self, '_hangcheck') and self._hangcheck.isActive(): self._hangcheck.stop() + #option configure + if self.opt_get_social_metadata.isChecked() != config['get_social_metadata']: + config.set('get_social_metadata', self.opt_get_social_metadata.isChecked()) + if self.opt_overwrite_author_title_metadata.isChecked() != config['overwrite_author_title_metadata']: + config.set('overwrite_author_title_metadata', self.opt_overwrite_author_title_metadata.isChecked()) + if self.opt_overwrite_cover_image.isChecked() != config['overwrite_cover_image']: + config.set('overwrite_cover_image', self.opt_overwrite_cover_image.isChecked()) def __enter__(self, *args): return self diff --git a/src/calibre/gui2/dialogs/fetch_metadata.ui b/src/calibre/gui2/dialogs/fetch_metadata.ui index 03a362096c..0b39089ee3 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.ui +++ b/src/calibre/gui2/dialogs/fetch_metadata.ui @@ -1,172 +1,179 @@ - - - FetchMetadata - - - Qt::WindowModal - - - - 0 - 0 - 830 - 642 - - - - Fetch metadata - - - - :/images/metadata.png:/images/metadata.png - - - - - - <p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below. - - - Qt::AlignCenter - - - true - - - true - - - - - - - - - &Access Key: - - - key - - - - - - - - - - Fetch - - - - - - - - - - - - true - - - - - - - Matches - - - - - - Select the book that most closely matches your copy from the list below - - - - - - - - 0 - 1 - - - - true - - - QAbstractItemView::SingleSelection - - - QAbstractItemView::SelectRows - - - - - - - - - - - - - Download &social metadata (tags/rating/etc.) for the selected book - - - - - - - Overwrite author and title with author and title of selected book - - - - - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - - - - - - - - - buttonBox - accepted() - FetchMetadata - accept() - - - 460 - 599 - - - 657 - 530 - - - - - buttonBox - rejected() - FetchMetadata - reject() - - - 417 - 599 - - - 0 - 491 - - - - - + + + FetchMetadata + + + Qt::WindowModal + + + + 0 + 0 + 890 + 642 + + + + Fetch metadata + + + + :/images/metadata.png:/images/metadata.png + + + + + + <p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below. + + + Qt::AlignCenter + + + true + + + true + + + + + + + + + &Access Key: + + + key + + + + + + + + + + Fetch + + + + + + + + + + + + true + + + + + + + Matches + + + + + + Select the book that most closely matches your copy from the list below + + + + + + + + 0 + 1 + + + + true + + + QAbstractItemView::SingleSelection + + + QAbstractItemView::SelectRows + + + + + + + + + + + + + Overwrite author and title with author and title of selected book + + + + + + + Download &social metadata (tags/rating/etc.) for the selected book + + + + + + + Overwrite cover image with downloaded cover if available for the selected book + + + + + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok + + + + + + + + + + + buttonBox + accepted() + FetchMetadata + accept() + + + 460 + 599 + + + 657 + 530 + + + + + buttonBox + rejected() + FetchMetadata + reject() + + + 417 + 599 + + + 0 + 491 + + + + + diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 2b951a7b2b..1eae761561 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -718,6 +718,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if book.author_sort: self.author_sort.setText(book.author_sort) if book.publisher: self.publisher.setEditText(book.publisher) if book.isbn: self.isbn.setText(book.isbn) + if d.opt_overwrite_cover_image.isChecked() and book.has_cover: + self.fetch_cover() if book.pubdate: d = book.pubdate self.pubdate.setDate(QDate(d.year, d.month, d.day)) From ded56f11dd62cf5eee39bebfa0890b5dcf788acc Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 21 Nov 2010 20:16:55 +0100 Subject: [PATCH 02/24] Cleaner handle of html_comments ie make the option appears only if it is needed --- src/calibre/ebooks/metadata/fetch.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index b6b3fb9c40..b797a477d6 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -145,18 +145,21 @@ class MetadataSource(Plugin): # {{{ setattr(w, '_'+x, cb) cb.setChecked(c.get(x, True)) w._layout.addWidget(cb) - - cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) - setattr(w, '_textcomments', cb) - cb.setChecked(c.get('textcomments', False)) - w._layout.addWidget(cb) + + if self.has_html_comments: + cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) + setattr(w, '_textcomments', cb) + cb.setChecked(c.get('textcomments', False)) + w._layout.addWidget(cb) return w def save_settings(self, w): dl_settings = {} - for x in ('rating', 'tags', 'comments', 'textcomments'): + for x in ('rating', 'tags', 'comments'): dl_settings[x] = getattr(w, '_'+x).isChecked() + if self.has_html_comments: + dl_settings['textcomments'] = getattr(w, '_textcomments').isChecked() c = self.config_store() c.set(self.name, dl_settings) if hasattr(w, '_sc'): From f6f96ae97c768c2b8e0a1167cc7d37b69d7b4eb5 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 21 Nov 2010 20:28:55 +0100 Subject: [PATCH 03/24] Switch Bibtex catalog to class use and mreplace for speed gain more efficiency --- src/calibre/library/catalog.py | 41 ++++++----- src/calibre/utils/bibtex.py | 125 ++++++++++++++++----------------- src/calibre/utils/mreplace.py | 32 +++++++++ 3 files changed, 117 insertions(+), 81 deletions(-) create mode 100644 src/calibre/utils/mreplace.py diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 19519d6d71..ffd7769f17 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -278,10 +278,10 @@ class BIBTEX(CatalogPlugin): from calibre.library.save_to_disk import preprocess_template #Bibtex functions - from calibre.utils.bibtex import bibtex_author_format, utf8ToBibtex, ValidateCitationKey + from calibre.utils.bibtex import BibTeX def create_bibtex_entry(entry, fields, mode, template_citation, - asccii_bibtex = True, citation_bibtex = True): + bibtexdict, citation_bibtex = True): #Bibtex doesn't like UTF-8 but keep unicode until writing #Define starting chain or if book valid strict and not book return a Fail string @@ -297,7 +297,8 @@ class BIBTEX(CatalogPlugin): if citation_bibtex : # Citation tag - bibtex_entry.append(make_bibtex_citation(entry, template_citation, asccii_bibtex)) + bibtex_entry.append(make_bibtex_citation(entry, template_citation, + bibtexdict)) bibtex_entry = [u' '.join(bibtex_entry)] for field in fields: @@ -312,11 +313,11 @@ class BIBTEX(CatalogPlugin): pass if field == 'authors' : - bibtex_entry.append(u'author = "%s"' % bibtex_author_format(item)) + bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item)) elif field in ['title', 'publisher', 'cover', 'uuid', 'author_sort', 'series'] : - bibtex_entry.append(u'%s = "%s"' % (field, utf8ToBibtex(item, asccii_bibtex))) + bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item))) elif field == 'id' : bibtex_entry.append(u'calibreid = "%s"' % int(item)) @@ -329,13 +330,13 @@ class BIBTEX(CatalogPlugin): elif field == 'tags' : #A list to flatten - bibtex_entry.append(u'tags = "%s"' % utf8ToBibtex(u', '.join(item), asccii_bibtex)) + bibtex_entry.append(u'tags = "%s"' % bibtexdict.utf8ToBibtex(u', '.join(item))) elif field == 'comments' : #\n removal item = item.replace(u'\r\n',u' ') item = item.replace(u'\n',u' ') - bibtex_entry.append(u'note = "%s"' % utf8ToBibtex(item, asccii_bibtex)) + bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item)) elif field == 'isbn' : # Could be 9, 10 or 13 digits @@ -353,8 +354,7 @@ class BIBTEX(CatalogPlugin): elif field == 'pubdate' : bibtex_entry.append(u'year = "%s"' % item.year) - bibtex_entry.append(u'month = "%s"' % utf8ToBibtex(strftime("%b", item), - asccii_bibtex)) + bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item))) bibtex_entry = u',\n '.join(bibtex_entry) bibtex_entry += u' }\n\n' @@ -371,7 +371,7 @@ class BIBTEX(CatalogPlugin): else : return True - def make_bibtex_citation(entry, template_citation, asccii_bibtex): + def make_bibtex_citation(entry, template_citation, bibtexclass): #define a function to replace the template entry by its value def tpl_replace(objtplname) : @@ -392,8 +392,9 @@ class BIBTEX(CatalogPlugin): return u'' if len(template_citation) >0 : - tpl_citation = utf8ToBibtex(ValidateCitationKey(re.sub(u'\{[^{}]*\}', - tpl_replace, template_citation)), asccii_bibtex) + tpl_citation = bibtexclass.utf8ToBibtex( + bibtexclass.ValidateCitationKey(re.sub(u'\{[^{}]*\}', + tpl_replace, template_citation))) if len(tpl_citation) >0 : return tpl_citation @@ -405,9 +406,9 @@ class BIBTEX(CatalogPlugin): template_citation = u'%s' % str(entry["id"]) if asccii_bibtex : - return ValidateCitationKey(template_citation.encode('ascii', 'replace')) + return bibtexclass.ValidateCitationKey(template_citation.encode('ascii', 'replace')) else : - return ValidateCitationKey(template_citation) + return bibtexclass.ValidateCitationKey(template_citation) self.fmt = path_to_output.rpartition('.')[2] self.notification = notification @@ -475,13 +476,16 @@ class BIBTEX(CatalogPlugin): if not len(data): log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text) + #Initialize BibTeX class + bibtexc = BibTeX() + #Entries writing after Bibtex formating (or not) if bibfile_enc != 'ascii' : - asccii_bibtex = False + bibtexc.ascii_bibtex = False else : - asccii_bibtex = True + bibtexc.ascii_bibtex = True - #Check and go to default in case of bad CLI + #Check citation choice and go to default in case of bad CLI if isinstance(opts.impcit, (StringType, UnicodeType)) : if opts.impcit == 'False' : citation_bibtex= False @@ -493,6 +497,7 @@ class BIBTEX(CatalogPlugin): else : citation_bibtex= opts.impcit + #Preprocess for error and light correction template_citation = preprocess_template(opts.bib_cit) #Open output and write entries @@ -514,7 +519,7 @@ class BIBTEX(CatalogPlugin): for entry in data: outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation, - asccii_bibtex, citation_bibtex)) + bibtexc, citation_bibtex)) outfile.close() diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index 1328aa9157..09868ccdb1 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """ Collection of python utility-methodes commonly used by other bibliograph packages. From http://pypi.python.org/pypi/bibliograph.core/ @@ -62,10 +60,14 @@ DAMAGE. """ -__docformat__ = 'reStructuredText' __author__ = 'sengian ' +__docformat__ = 'restructuredtext en' import re, string +from UserDict import UserDict + +from calibre.constants import preferred_encoding +from calibre.utils.mreplace import MReplace utf8enc2latex_mapping = { # This is a mapping of Unicode characters to LaTeX equivalents. @@ -2842,69 +2844,66 @@ entity_mapping = { '"':'{"}', } -def ValidateCitationKey(text): - """ - removes characters not allowed in BibTeX keys +class BibTeX: + def __init__(self): + self.rep_utf8 = MReplace(utf8enc2latex_mapping) + self.rep_ent = MReplace(entity_mapping) + #Set default conversion to ASCII BibTeX + self.ascii_bibtex = True + # This substitution is based on the description of cite key restrictions at + # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html + self.invalid_cit = re.compile(u'[ "@\',\\#}{~%&$^]') + self.upper = re.compile(u'[' + + string.uppercase.decode(preferred_encoding) + u']') + self.escape = re.compile(u'[~#&%_]') + + def ValidateCitationKey(self, text): + """ + removes characters not allowed in BibTeX keys + >>> ValidateCitationKey(DummyEntry('my@id')) + 'myid' + """ + return self.invalid_cit.sub(u'', text) - >>> from bibliograph.core.utils import _validKey - >>> _validKey(DummyEntry('Foo Bar')) - 'FooBar' + def braceUppercase(self, text): + """ Convert uppercase letters to bibtex encoded uppercase + >>> braceUppercase('Foo Bar') + '{F}oo {B}ar' + """ + return self.upper.sub(lambda m: u'{%s}' % m.group(), text) - >>> _validKey(DummyEntry('my@id')) - 'myid' + def resolveEntities(self, text): + #for entity, entity_map in entity_mapping.iteritems(): + # text = text.replace(entity, entity_map) + #return text + return self.rep_ent.mreplace(text) - """ - # This substitution is based on the description of cite key restrictions at - # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html - return re.sub(u'[ "@\',\\#}{~%&$^]', u'', text) + def resolveUnicode(self, text): + #UTF-8 text as entry + #for unichar, latexenc in utf8enc2latex_mapping.iteritems() : + # text = text.replace(unichar, latexenc) + text = self.rep_utf8.mreplace(text) + return text.replace(u'$}{$', u'') -def BraceUppercase(text): - """ Convert uppercase letters to bibtex encoded uppercase + def escapeSpecialCharacters(self, text): + """ + latex escaping some (not all) special characters + """ + text.replace('\\', '\\\\') + return self.escape.sub(lambda m: u'\\%s' % m.group(), text) - >>> from bibliograph.core.utils import _braceUppercase - >>> _braceUppercase('foo bar') - 'foo bar' + #Calibre functions + #Option to go to official ASCII Bibtex or unofficial UTF-8 + #Go from an unicode entry to ASCII Bibtex format without encoding + def utf8ToBibtex(self, text): + if len(text) == 0: + return '' + text.replace('\\', '\\\\') + text = self.resolveEntities(text) + if self.ascii_bibtex : + text = self.resolveUnicode(text) + return self.escapeSpecialCharacters(text) - >>> _braceUppercase('Foo Bar') - '{F}oo {B}ar' - """ - for uc in string.uppercase: - text = text.replace(uc, u'{%s}' % uc) - return text - -def resolveEntities(text): - for entity, entity_map in entity_mapping.iteritems(): - text = text.replace(entity, entity_map) - return text - -def resolveUnicode(text): - #UTF-8 text as entry - for unichar, latexenc in utf8enc2latex_mapping.iteritems() : - text = text.replace(unichar, latexenc) - return text.replace(u'$}{$', u'') - -def escapeSpecialCharacters(text): - """ - latex escaping some (not all) special characters - """ - text.replace('\\', '\\\\') - escape = ['~', '#', '&', '%', '_'] - for c in escape: - text = text.replace(c, '\\' + c ) - return text - -#Calibre functions -#Go from an unicode entry to ASCII Bibtex format without encoding -#Option to go to official ASCII Bibtex or unofficial UTF-8 -def utf8ToBibtex(text, asccii_bibtex = True): - if len(text) == 0: - return '' - text.replace('\\', '\\\\') - text = resolveEntities(text) - if asccii_bibtex : - text = resolveUnicode(text) - return escapeSpecialCharacters(text) - -def bibtex_author_format(item): - #Format authors for Bibtex compliance (get a list as input) - return utf8ToBibtex(u' and'.join([author for author in item])) + def bibtex_author_format(self, item): + #Format authors for Bibtex compliance (get a list as input) + return self.utf8ToBibtex(u' and'.join([author for author in item])) diff --git a/src/calibre/utils/mreplace.py b/src/calibre/utils/mreplace.py new file mode 100644 index 0000000000..b9fbc0bded --- /dev/null +++ b/src/calibre/utils/mreplace.py @@ -0,0 +1,32 @@ +#multiple replace from dictionnary : http://code.activestate.com/recipes/81330/ +__license__ = 'GPL v3' +__copyright__ = '2010, sengian ' +__docformat__ = 'restructuredtext en' + +import re +from UserDict import UserDict + +class MReplace(UserDict): + def __init__(self, dict = None): + UserDict.__init__(self, dict) + self.re = None + self.regex = None + self.compile_regex() + + def compile_regex(self): + if len(self.data) > 0: + keys = sorted(self.data.keys(), key=len) + keys.reverse() + tmp = "(%s)" % "|".join(map(re.escape, keys)) + if self.re != tmp: + self.re = tmp + self.regex = re.compile(self.re) + + def __call__(self, mo): + return self[mo.string[mo.start():mo.end()]] + + def mreplace(self, text): + #Replace without regex compile + if len(self.data) < 1 or self.re is None: + return text + return self.regex.sub(self, text) \ No newline at end of file From 3137b37b0122c47d1d3904614cc385207aa8953d Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 21 Nov 2010 20:34:29 +0100 Subject: [PATCH 04/24] Minor fix to isbndb.py --- src/calibre/ebooks/metadata/isbndb.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 83cf6ee0ed..615b4ab818 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -90,10 +90,8 @@ def build_isbn(base_url, opts): return base_url + 'index1=isbn&value1='+opts.isbn def build_combined(base_url, opts): - query = '' - for e in (opts.title, opts.author, opts.publisher): - if e is not None: - query += ' ' + e + query = ' '.join([e for e in (opts.title, opts.author, opts.publisher) \ + if e is not None ]) query = query.strip() if len(query) == 0: raise ISBNDBError('You must specify at least one of --author, --title or --publisher') @@ -139,17 +137,10 @@ def create_books(opts, args, timeout=5.): if opts.verbose: print ('ISBNDB query: '+url) - + tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] - ans = [] - for x in tans: - add = True - for y in ans: - if y.isbn == x.isbn: - add = False - if add: - ans.append(x) - return ans + #remove duplicates ISBN + return dict((book.isbn, book) for book in tans).values() def main(args=sys.argv): parser = option_parser() From 3c01ad453a8c0659bdee160d1742da478b940d54 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 28 Nov 2010 01:51:33 +0100 Subject: [PATCH 05/24] First draft of amazon plugin --- src/calibre/ebooks/metadata/amazonfr.py | 457 ++++++++++++++++++++++++ 1 file changed, 457 insertions(+) create mode 100644 src/calibre/ebooks/metadata/amazonfr.py diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py new file mode 100644 index 0000000000..7091719f30 --- /dev/null +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -0,0 +1,457 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian ' + +import sys, textwrap, re, traceback +from urllib import urlencode +from math import ceil + +from lxml import html +from lxml.html import soupparser + +from calibre.utils.date import parse_date, utcnow +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.utils.config import OptionParser +from calibre.library.comments import sanitize_comments_html + + +class AmazonFr(MetadataSource): + + name = 'Amazon french' + description = _('Downloads social metadata from amazon.fr') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose, lang='fr') + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class Amazon(MetadataSource): + + name = 'Amazon' + description = _('Downloads social metadata from amazon.com') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Kovid Goyal & Sengian' + version = (1, 1, 0) + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose, lang='en') + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + +def replace_monthsfr(datefr): + # Replace french months by english equivalent for parse_date + frtoen = { + u'[jJ]anvier': u'jan', + u'[fF].vrier': u'feb', + u'[mM]ars': u'mar', + u'[aA]vril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uin': u'jun', + u'[jJ]uillet': u'jul', + u'[aA]o.t': u'aug', + u'[sS]eptembre': u'sep', + u'[Oo]ctobre': u'oct', + u'[nN]ovembre': u'nov', + u'[dD].cembre': u'dec' } + for k in frtoen.iterkeys(): + tmp = re.sub(k, frtoen[k], datefr) + if tmp <> datefr: break + return tmp + +class Query(object): + + BASE_URL_FR = 'http://www.amazon.fr' + BASE_URL_EN = 'http://www.amazon.com' + + def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, + max_results=20, rlang='en'): + assert not(title is None and author is None and publisher is None \ + and isbn is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + self.renbres = re.compile(u'\s*(\d+)\s*') + + q = { 'search-alias' : 'stripbooks' , + 'unfiltered' : '1', + 'field-keywords' : '', + 'field-author' : '', + 'field-title' : '', + 'field-isbn' : '', + 'field-publisher' : '' + #get to amazon detailed search page to get all options + # 'node' : '', + # 'field-binding' : '', + #before, during, after + # 'field-dateop' : '', + #month as number + # 'field-datemod' : '', + # 'field-dateyear' : '', + #french only + # 'field-collection' : '', + #many options available + } + + if rlang =='en': + q['sort'] = 'relevanceexprank' + self.urldata = self.BASE_URL_EN + elif rlang =='fr': + q['sort'] = 'relevancerank' + self.urldata = self.BASE_URL_FR + self.baseurl = self.urldata + + if isbn is not None: + q['field-isbn'] = isbn.replace('-', '') + else: + if title is not None: + q['field-title'] = title + if author is not None: + q['field-author'] = author + if publisher is not None: + q['field-publisher'] = publisher + if keywords is not None: + q['field-keywords'] = keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q) + + def __call__(self, browser, verbose, timeout = 5.): + if verbose: + print 'Query:', self.urldata + + try: + raw = browser.open_novisit(self.urldata, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return None, self.urldata + + #nb of page + try: + nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text) + rpp = 0 + if len(nbresults) > 1: + rpp = int(nbresults[1]) + nbresults = int(nbresults[2]) + except: + return None, self.urldata + + pages =[feed] + if rpp: + nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/ rpp)) + for i in xrange(2, nbpagetoquery + 1): + try: + urldata = self.urldata + '&page=' + str(i) + raw = browser.open_novisit(urldata, timeout=timeout).read() + except Exception, e: + continue + if '<title>404 - ' in raw: + continue + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + continue + pages.append(feed) + + results = [] + for x in pages: + results.extend([i.getparent().get('href') \ + for i in x.xpath("//a/span[@class='srTitle']")]) + return results[:self.max_results], self.baseurl + +class ResultList(list): + + def __init__(self, baseurl, lang = 'en'): + self.baseurl = baseurl + self.lang = lang + self.repub = re.compile(u'\((.*)\)') + self.rerat = re.compile(u'([0-9.]+)') + self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') + self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>') + self.recom = re.compile(r'(?s)<!--.*?-->') + self.republi = re.compile(u'(Editeur|Publisher)', re.I) + self.reisbn = re.compile(u'(ISBN-10|ISBN-10|ASIN)', re.I) + self.relang = re.compile(u'(Language|Langue)', re.I) + self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client)', re.I) + self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit)', re.I) + + def strip_tags_etree(self, etreeobj, invalid_tags): + for (itag, rmv) in invalid_tags.iteritems(): + if rmv: + for elts in etreeobj.getiterator(itag): + elts.drop_tree() + else: + for elts in etreeobj.getiterator(itag): + elts.drop_tag() + + def clean_entry(self, entry, invalid_tags = {'script': True}, + invalid_id = (), invalid_class=()): + #invalid_tags: remove tag and keep content if False else remove + #remove tags + if invalid_tags: + self.strip_tags_etree(entry, invalid_tags) + #remove id + if invalid_id: + for eltid in invalid_id: + elt = entry.get_element_by_id(eltid) + if elt is not None: + elt.drop_tree() + #remove class + if invalid_class: + for eltclass in invalid_class: + elts = entry.find_class(eltclass) + if elts is not None: + for elt in elts: + elt.drop_tree() + + def get_title(self, entry): + title = entry.get_element_by_id('btAsinTitle') + if title is not None: + title = title.text + return unicode(title.replace('\n', '').strip()) + + def get_authors(self, entry): + author = entry.get_element_by_id('btAsinTitle') + while author.getparent().tag != 'div': + author = author.getparent() + author = author.getparent() + authortext = [] + for x in author.getiterator('a'): + authortext.append(unicode(x.text_content().strip())) + return authortext + + def get_description(self, entry, verbose): + try: + description = entry.get_element_by_id("productDescription").find("div[@class='content']") + inv_class = ('seeAll', 'emptyClear') + inv_tags ={'img': True, 'a': False} + self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class) + description = html.tostring(description, method='html', encoding=unicode).strip() + # remove all attributes from tags + description = self.reattr.sub(r'<\1>', description) + # Remove the notice about text referring to out of print editions + description = self.reoutp.sub('', description) + # Remove comments + description = self.recom.sub('', description) + return unicode(sanitize_comments_html(description)) + except: + report(verbose) + return None + + def get_tags(self, entry, browser, verbose): + try: + tags = entry.get_element_by_id('tagContentHolder') + testptag = tags.find_class('see-all') + if testptag: + for x in testptag: + alink = x.xpath('descendant-or-self::a') + if alink: + if alink[0].get('class') == 'tgJsActive': + continue + link = self.baseurl + alink[0].get('href') + entry = self.get_individual_metadata(browser, link, verbose) + tags = entry.get_element_by_id('tagContentHolder') + break + tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag'] + except: + report(verbose) + tags = [] + return tags + + def get_book_info(self, entry, mi, verbose): + try: + entry = entry.get_element_by_id('SalesRank').getparent() + except: + try: + for z in entry.getiterator('h2'): + if self.reprod.search(z.text_content()): + entry = z.getparent().find("div[@class='content']/ul") + break + except: + report(verbose) + return mi + elts = entry.findall('li') + #pub & date + elt = filter(lambda x: self.republi.search(x.find('b').text), elts) + if elt: + pub = elt[0].find('b').tail + mi.publisher = unicode(self.repub.sub('', pub).strip()) + d = self.repub.search(pub) + if d is not None: + d = d.group(1) + try: + default = utcnow().replace(day=15) + if self.lang == 'fr': + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + mi.pubdate = d + except: + report(verbose) + #ISBN + elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts) + if elt: + isbn = elt[0].find('b').tail.replace('-', '').strip() + if check_isbn(isbn): + mi.isbn = unicode(isbn) + elif len(elt) > 1: + isbn = elt[1].find('b').tail.replace('-', '').strip() + if check_isbn(isbn): + mi.isbn = unicode(isbn) + #Langue + elt = filter(lambda x: self.relang.search(x.find('b').text), elts) + if elt: + langue = elt[0].find('b').tail.strip() + if langue: + mi.language = unicode(langue) + #ratings + elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts) + if elt: + ratings = elt[0].find_class('swSprite') + if ratings: + ratings = self.rerat.findall(ratings[0].get('title')) + if len(ratings) == 2: + mi.rating = float(ratings[0])/float(ratings[1]) * 5 + return mi + + def fill_MI(self, entry, title, authors, browser, verbose): + mi = MetaInformation(title, authors) + mi.author_sort = authors_to_sort_string(authors) + mi.comments = self.get_description(entry, verbose) + mi = self.get_book_info(entry, mi, verbose) + mi.tags = self.get_tags(entry, browser, verbose) + return mi + + def get_individual_metadata(self, browser, linkdata, verbose): + try: + raw = browser.open_novisit(linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return soupparser.fromstring(raw) + except: + return + + def populate(self, entries, browser, verbose=False): + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + # clean results + # inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop') + # inv_class = ('buyingDetailsGrid', 'productImageGrid') + # inv_tags ={'script': True, 'style': True, 'form': False} + # self.clean_entry(entry, invalid_id=inv_ids) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, browser, verbose)) + + +def search(title=None, author=None, publisher=None, isbn=None, + max_results=5, verbose=False, keywords=None, lang='en'): + br = browser() + entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher, + keywords=keywords, max_results=max_results,rlang=lang)(br, verbose) + + if entries is None or len(entries) == 0: + return + + #List of entry + ans = ResultList(baseurl, lang) + ans.populate(entries, br, verbose) + return ans + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Amazon. You must specify one of title, author, + ISBN, publisher or keywords. Will fetch a maximum of 10 matches, + so you should make your query as specific as possible. + You can chose the language for metadata retrieval (french & american). + ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-i', '--isbn', help='Book ISBN') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-c', '--covers', default=0, + help='Covers: 1-Check/ 2-Download') + parser.add_option('-p', '--coverspath', default='', + help='Covers files path') + parser.add_option('-m', '--max-results', default=10, + help='Maximum number of results to fetch') + parser.add_option('-l', '--lang', default='en', + help='Chosen language for metadata search') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results, + lang=opts.lang) + except AssertionError: + report(True) + parser.print_help() + return 1 + if results is None or len(results) == 0: + print 'No result found for this search!' + return 0 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + print + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file From 07d93425da6b5800e76798ffbe1b3469846f8fa6 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 28 Nov 2010 14:02:30 +0100 Subject: [PATCH 06/24] Amazon modification & integration of german metadata --- src/calibre/ebooks/metadata/amazonfr.py | 73 +++++++++++++++++++------ 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index 7091719f30..9512fc0c3f 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -36,6 +36,23 @@ class AmazonFr(MetadataSource): self.exception = e self.tb = traceback.format_exc() +class AmazonDe(MetadataSource): + + name = 'Amazon german' + description = _('Downloads social metadata from amazon.de') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose, lang='de') + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + class Amazon(MetadataSource): name = 'Amazon' @@ -59,8 +76,8 @@ def report(verbose): import traceback traceback.print_exc() -def replace_monthsfr(datefr): - # Replace french months by english equivalent for parse_date +def replace_months(datez, clang): + # Replace months by english equivalent for parse_date frtoen = { u'[jJ]anvier': u'jan', u'[fF].vrier': u'feb', @@ -74,15 +91,38 @@ def replace_monthsfr(datefr): u'[Oo]ctobre': u'oct', u'[nN]ovembre': u'nov', u'[dD].cembre': u'dec' } - for k in frtoen.iterkeys(): - tmp = re.sub(k, frtoen[k], datefr) - if tmp <> datefr: break + detoen = { + u'[jJ]anuar': u'jan', + u'[fF]ebruar': u'feb', + u'[mM].rz': u'mar', + u'[aA]pril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uni': u'jun', + u'[jJ]uli': u'jul', + u'[aA]ugust': u'aug', + u'[sS]eptember': u'sep', + u'[Oo]ktober': u'oct', + u'[nN]ovember': u'nov', + u'[dD]ezember': u'dec' } + + if clang == 'fr': + dictoen = frtoen + elif clang == 'de': + dictoen = detoen + else: + return datez + + for k in dictoen.iterkeys(): + tmp = re.sub(k, dictoen[k], datez) + if tmp != datez: break return tmp + class Query(object): BASE_URL_FR = 'http://www.amazon.fr' BASE_URL_EN = 'http://www.amazon.com' + BASE_URL_DE = 'http://www.amazon.de' def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20, rlang='en'): @@ -119,6 +159,9 @@ class Query(object): elif rlang =='fr': q['sort'] = 'relevancerank' self.urldata = self.BASE_URL_FR + elif rlang =='de': + q['sort'] = 'relevancerank' + self.urldata = self.BASE_URL_DE self.baseurl = self.urldata if isbn is not None: @@ -203,11 +246,11 @@ class ResultList(list): self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>') self.recom = re.compile(r'(?s)<!--.*?-->') - self.republi = re.compile(u'(Editeur|Publisher)', re.I) + self.republi = re.compile(u'(Editeur|Publisher|Verlag)', re.I) self.reisbn = re.compile(u'(ISBN-10|ISBN-10|ASIN)', re.I) - self.relang = re.compile(u'(Language|Langue)', re.I) - self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client)', re.I) - self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit)', re.I) + self.relang = re.compile(u'(Language|Langue|Sprache)', re.I) + self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client|Durchschnittliche\s*Kundenbewertung)', re.I) + self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit|Produktinformation)', re.I) def strip_tags_etree(self, etreeobj, invalid_tags): for (itag, rmv) in invalid_tags.iteritems(): @@ -315,8 +358,8 @@ class ResultList(list): d = d.group(1) try: default = utcnow().replace(day=15) - if self.lang == 'fr': - d = replace_monthsfr(d) + if self.lang != 'en': + d = replace_months(d, self.lang) d = parse_date(d, assume_utc=True, default=default) mi.pubdate = d except: @@ -415,7 +458,7 @@ def option_parser(): Fetch book metadata from Amazon. You must specify one of title, author, ISBN, publisher or keywords. Will fetch a maximum of 10 matches, so you should make your query as specific as possible. - You can chose the language for metadata retrieval (french & american). + You can chose the language for metadata retrieval (french & american & german). ''' )) parser.add_option('-t', '--title', help='Book title') @@ -423,14 +466,10 @@ def option_parser(): parser.add_option('-p', '--publisher', help='Book publisher') parser.add_option('-i', '--isbn', help='Book ISBN') parser.add_option('-k', '--keywords', help='Keywords') - parser.add_option('-c', '--covers', default=0, - help='Covers: 1-Check/ 2-Download') - parser.add_option('-p', '--coverspath', default='', - help='Covers files path') parser.add_option('-m', '--max-results', default=10, help='Maximum number of results to fetch') parser.add_option('-l', '--lang', default='en', - help='Chosen language for metadata search') + help='Chosen language for metadata search (fr, en , de)') parser.add_option('-v', '--verbose', default=0, action='count', help='Be more verbose about errors') return parser From 61db7b02b6a065a8727955ac5de8d2235d121e27 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 28 Nov 2010 15:11:55 +0100 Subject: [PATCH 07/24] Add language options to amazon & move replace_months to utils.date.py --- src/calibre/ebooks/metadata/amazonfr.py | 108 ++++++++++++------------ src/calibre/utils/date.py | 41 +++++++++ 2 files changed, 96 insertions(+), 53 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index 9512fc0c3f..a77ea16d9c 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -9,7 +9,7 @@ from math import ceil from lxml import html from lxml.html import soupparser -from calibre.utils.date import parse_date, utcnow +from calibre.utils.date import parse_date, utcnow, replace_months from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ @@ -36,6 +36,40 @@ class AmazonFr(MetadataSource): self.exception = e self.tb = traceback.format_exc() +class AmazonEs(MetadataSource): + + name = 'Amazon spanish' + description = _('Downloads social metadata from amazon.com in spanish') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose, lang='es') + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class AmazonUS(MetadataSource): + + name = 'Amazon US english' + description = _('Downloads social metadata from amazon.com in english') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose, lang='us') + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + class AmazonDe(MetadataSource): name = 'Amazon german' @@ -65,7 +99,7 @@ class Amazon(MetadataSource): def fetch(self): try: self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose, lang='en') + self.isbn, max_results=10, verbose=self.verbose, lang='all') except Exception, e: self.exception = e self.tb = traceback.format_exc() @@ -76,56 +110,15 @@ def report(verbose): import traceback traceback.print_exc() -def replace_months(datez, clang): - # Replace months by english equivalent for parse_date - frtoen = { - u'[jJ]anvier': u'jan', - u'[fF].vrier': u'feb', - u'[mM]ars': u'mar', - u'[aA]vril': u'apr', - u'[mM]ai': u'may', - u'[jJ]uin': u'jun', - u'[jJ]uillet': u'jul', - u'[aA]o.t': u'aug', - u'[sS]eptembre': u'sep', - u'[Oo]ctobre': u'oct', - u'[nN]ovembre': u'nov', - u'[dD].cembre': u'dec' } - detoen = { - u'[jJ]anuar': u'jan', - u'[fF]ebruar': u'feb', - u'[mM].rz': u'mar', - u'[aA]pril': u'apr', - u'[mM]ai': u'may', - u'[jJ]uni': u'jun', - u'[jJ]uli': u'jul', - u'[aA]ugust': u'aug', - u'[sS]eptember': u'sep', - u'[Oo]ktober': u'oct', - u'[nN]ovember': u'nov', - u'[dD]ezember': u'dec' } - - if clang == 'fr': - dictoen = frtoen - elif clang == 'de': - dictoen = detoen - else: - return datez - - for k in dictoen.iterkeys(): - tmp = re.sub(k, dictoen[k], datez) - if tmp != datez: break - return tmp - class Query(object): BASE_URL_FR = 'http://www.amazon.fr' - BASE_URL_EN = 'http://www.amazon.com' + BASE_URL_ALL = 'http://www.amazon.com' BASE_URL_DE = 'http://www.amazon.de' def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, - max_results=20, rlang='en'): + max_results=20, rlang='all'): assert not(title is None and author is None and publisher is None \ and isbn is None and keywords is None) assert (max_results < 21) @@ -153,9 +146,17 @@ class Query(object): #many options available } - if rlang =='en': + if rlang =='all': q['sort'] = 'relevanceexprank' - self.urldata = self.BASE_URL_EN + self.urldata = self.BASE_URL_ALL + elif rlang =='es': + q['sort'] = 'relevanceexprank' + q['field-language'] = 'Spanish' + self.urldata = self.BASE_URL_ALL + elif rlang =='us': + q['sort'] = 'relevanceexprank' + q['field-language'] = 'English' + self.urldata = self.BASE_URL_ALL elif rlang =='fr': q['sort'] = 'relevancerank' self.urldata = self.BASE_URL_FR @@ -238,7 +239,7 @@ class Query(object): class ResultList(list): - def __init__(self, baseurl, lang = 'en'): + def __init__(self, baseurl, lang = 'all'): self.baseurl = baseurl self.lang = lang self.repub = re.compile(u'\((.*)\)') @@ -358,7 +359,7 @@ class ResultList(list): d = d.group(1) try: default = utcnow().replace(day=15) - if self.lang != 'en': + if self.lang != 'all': d = replace_months(d, self.lang) d = parse_date(d, assume_utc=True, default=default) mi.pubdate = d @@ -437,7 +438,7 @@ class ResultList(list): def search(title=None, author=None, publisher=None, isbn=None, - max_results=5, verbose=False, keywords=None, lang='en'): + max_results=5, verbose=False, keywords=None, lang='all'): br = browser() entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher, keywords=keywords, max_results=max_results,rlang=lang)(br, verbose) @@ -458,7 +459,8 @@ def option_parser(): Fetch book metadata from Amazon. You must specify one of title, author, ISBN, publisher or keywords. Will fetch a maximum of 10 matches, so you should make your query as specific as possible. - You can chose the language for metadata retrieval (french & american & german). + You can chose the language for metadata retrieval: + All & US english & french & german & spanish ''' )) parser.add_option('-t', '--title', help='Book title') @@ -468,8 +470,8 @@ def option_parser(): parser.add_option('-k', '--keywords', help='Keywords') parser.add_option('-m', '--max-results', default=10, help='Maximum number of results to fetch') - parser.add_option('-l', '--lang', default='en', - help='Chosen language for metadata search (fr, en , de)') + parser.add_option('-l', '--lang', default='all', + help='Chosen language for metadata search (all, us, fr, es , de)') parser.add_option('-v', '--verbose', default=0, action='count', help='Be more verbose about errors') return parser diff --git a/src/calibre/utils/date.py b/src/calibre/utils/date.py index ec58c49628..1ea8a2c4a0 100644 --- a/src/calibre/utils/date.py +++ b/src/calibre/utils/date.py @@ -151,3 +151,44 @@ def format_date(dt, format, assume_utc=False, as_utc=False): format = re.sub('d{1,4}', format_day, format) format = re.sub('M{1,4}', format_month, format) return re.sub('yyyy|yy', format_year, format) + +def replace_months(datez, clang): + # Replace months by english equivalent for parse_date + frtoen = { + u'[jJ]anvier': u'jan', + u'[fF].vrier': u'feb', + u'[mM]ars': u'mar', + u'[aA]vril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uin': u'jun', + u'[jJ]uillet': u'jul', + u'[aA]o.t': u'aug', + u'[sS]eptembre': u'sep', + u'[Oo]ctobre': u'oct', + u'[nN]ovembre': u'nov', + u'[dD].cembre': u'dec' } + detoen = { + u'[jJ]anuar': u'jan', + u'[fF]ebruar': u'feb', + u'[mM].rz': u'mar', + u'[aA]pril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uni': u'jun', + u'[jJ]uli': u'jul', + u'[aA]ugust': u'aug', + u'[sS]eptember': u'sep', + u'[Oo]ktober': u'oct', + u'[nN]ovember': u'nov', + u'[dD]ezember': u'dec' } + + if clang == 'fr': + dictoen = frtoen + elif clang == 'de': + dictoen = detoen + else: + return datez + + for k in dictoen.iterkeys(): + tmp = re.sub(k, dictoen[k], datez) + if tmp != datez: break + return tmp From 8af48a9d0678c533b98158877a912422460d68f5 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 28 Nov 2010 23:07:33 +0100 Subject: [PATCH 08/24] Various fix in amazon metadata, add german site for german users Add a clean ascii trial: this should be corrected everytime soupparser is used from lxml.html due to problems with xml_to_unicode output --- src/calibre/ebooks/metadata/amazonfr.py | 56 ++++++++++++++++--------- src/calibre/utils/cleantext.py | 15 +++++++ 2 files changed, 52 insertions(+), 19 deletions(-) create mode 100644 src/calibre/utils/cleantext.py diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index a77ea16d9c..5df962d8f5 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -10,6 +10,7 @@ from lxml import html from lxml.html import soupparser from calibre.utils.date import parse_date, utcnow, replace_months +from calibre.utils.cleantext import clean_ascii_char from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ @@ -53,9 +54,9 @@ class AmazonEs(MetadataSource): self.exception = e self.tb = traceback.format_exc() -class AmazonUS(MetadataSource): +class AmazonEn(MetadataSource): - name = 'Amazon US english' + name = 'Amazon english' description = _('Downloads social metadata from amazon.com in english') supported_platforms = ['windows', 'osx', 'linux'] author = 'Sengian' @@ -65,7 +66,7 @@ class AmazonUS(MetadataSource): def fetch(self): try: self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose, lang='us') + self.isbn, max_results=10, verbose=self.verbose, lang='en') except Exception, e: self.exception = e self.tb = traceback.format_exc() @@ -97,24 +98,29 @@ class Amazon(MetadataSource): has_html_comments = True def fetch(self): + # if not self.site_customization: + # return try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose, lang='all') except Exception, e: self.exception = e self.tb = traceback.format_exc() + + # @property + # def string_customization_help(self): + # return _('You can select here the language for metadata search with amazon.com') def report(verbose): if verbose: - import traceback traceback.print_exc() class Query(object): - BASE_URL_FR = 'http://www.amazon.fr' BASE_URL_ALL = 'http://www.amazon.com' + BASE_URL_FR = 'http://www.amazon.fr' BASE_URL_DE = 'http://www.amazon.de' def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, @@ -153,7 +159,7 @@ class Query(object): q['sort'] = 'relevanceexprank' q['field-language'] = 'Spanish' self.urldata = self.BASE_URL_ALL - elif rlang =='us': + elif rlang =='en': q['sort'] = 'relevanceexprank' q['field-language'] = 'English' self.urldata = self.BASE_URL_ALL @@ -197,24 +203,25 @@ class Query(object): return raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] + try: feed = soupparser.fromstring(raw) except: - return None, self.urldata + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_char(raw)) + except: + return None, self.urldata #nb of page try: nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text) - rpp = 0 - if len(nbresults) > 1: - rpp = int(nbresults[1]) - nbresults = int(nbresults[2]) except: return None, self.urldata pages =[feed] - if rpp: - nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/ rpp)) + if len(nbresults) > 1: + nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1]))) for i in xrange(2, nbpagetoquery + 1): try: urldata = self.urldata + '&page=' + str(i) @@ -228,7 +235,11 @@ class Query(object): try: feed = soupparser.fromstring(raw) except: - continue + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_char(raw)) + except: + continue pages.append(feed) results = [] @@ -416,7 +427,12 @@ class ResultList(list): try: return soupparser.fromstring(raw) except: - return + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_char(raw)) + except: + report(verbose) + return def populate(self, entries, browser, verbose=False): for x in entries: @@ -433,6 +449,8 @@ class ResultList(list): if verbose: print 'Failed to get all details for an entry' print e + print 'URL who failed:', x + report(verbose) continue self.append(self.fill_MI(entry, title, authors, browser, verbose)) @@ -453,16 +471,16 @@ def search(title=None, author=None, publisher=None, isbn=None, def option_parser(): parser = OptionParser(textwrap.dedent(\ - '''\ + _('''\ %prog [options] Fetch book metadata from Amazon. You must specify one of title, author, ISBN, publisher or keywords. Will fetch a maximum of 10 matches, so you should make your query as specific as possible. You can chose the language for metadata retrieval: - All & US english & french & german & spanish + All & english & french & german & spanish ''' - )) + ))) parser.add_option('-t', '--title', help='Book title') parser.add_option('-a', '--author', help='Book author(s)') parser.add_option('-p', '--publisher', help='Book publisher') @@ -471,7 +489,7 @@ def option_parser(): parser.add_option('-m', '--max-results', default=10, help='Maximum number of results to fetch') parser.add_option('-l', '--lang', default='all', - help='Chosen language for metadata search (all, us, fr, es , de)') + help='Chosen language for metadata search (all, en, fr, es, de)') parser.add_option('-v', '--verbose', default=0, action='count', help='Be more verbose about errors') return parser diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py new file mode 100644 index 0000000000..6655129c15 --- /dev/null +++ b/src/calibre/utils/cleantext.py @@ -0,0 +1,15 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian <sengian1@gmail.com>' +__docformat__ = 'restructuredtext en' + +import re + +def clean_ascii_char(txt, charlist = None): + #remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default + chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \ + + [0x1A, 0x1B] + if charlist is not None: + chars = charlist + illegal_chars = re.compile(u'|'.join(map(unichr, chars))) + return illegal_chars.sub('', txt) \ No newline at end of file From 9c025f4736d477b8478ae0684dff8aeeca41fe4c Mon Sep 17 00:00:00 2001 From: Hiroshi Miura <miurahr@linux.com> Date: Fri, 3 Dec 2010 00:43:59 +0900 Subject: [PATCH 09/24] recipe: fix japanese recipes - mainichi news: handle ad pages - yomiuri: remove table tag - nikkei life: reduce feeds --- resources/recipes/mainichi.recipe | 16 ++++++++++++++++ resources/recipes/mainichi_it_news.recipe | 16 +++++++++++++++- resources/recipes/nikkei_sub_life.recipe | 5 +---- resources/recipes/yomiuri.recipe | 2 +- resources/recipes/yomiuri_world.recipe | 2 +- 5 files changed, 34 insertions(+), 7 deletions(-) diff --git a/resources/recipes/mainichi.recipe b/resources/recipes/mainichi.recipe index 2a44fa0980..baa7f409ec 100644 --- a/resources/recipes/mainichi.recipe +++ b/resources/recipes/mainichi.recipe @@ -4,6 +4,7 @@ __copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>' www.mainichi.jp ''' +import re from calibre.web.feeds.news import BasicNewsRecipe class MainichiDailyNews(BasicNewsRecipe): @@ -22,3 +23,18 @@ class MainichiDailyNews(BasicNewsRecipe): remove_tags = [{'class':"RelatedArticle"}] remove_tags_after = {'class':"Credit"} + def parse_feeds(self): + + feeds = BasicNewsRecipe.parse_feeds(self) + + for curfeed in feeds: + delList = [] + for a,curarticle in enumerate(curfeed.articles): + if re.search(r'pheedo.jp', curarticle.url): + delList.append(curarticle) + if len(delList)>0: + for d in delList: + index = curfeed.articles.index(d) + curfeed.articles[index:index+1] = [] + + return feeds diff --git a/resources/recipes/mainichi_it_news.recipe b/resources/recipes/mainichi_it_news.recipe index 8e15496e57..4c285a2c01 100644 --- a/resources/recipes/mainichi_it_news.recipe +++ b/resources/recipes/mainichi_it_news.recipe @@ -14,5 +14,19 @@ class MainichiDailyITNews(BasicNewsRecipe): remove_tags_before = {'class':"NewsTitle"} remove_tags = [{'class':"RelatedArticle"}] - remove_tags_after = {'class':"Credit"} + def parse_feeds(self): + + feeds = BasicNewsRecipe.parse_feeds(self) + + for curfeed in feeds: + delList = [] + for a,curarticle in enumerate(curfeed.articles): + if re.search(r'pheedo.jp', curarticle.url): + delList.append(curarticle) + if len(delList)>0: + for d in delList: + index = curfeed.articles.index(d) + curfeed.articles[index:index+1] = [] + + return feeds remove_tags_after = {'class':"Credit"} diff --git a/resources/recipes/nikkei_sub_life.recipe b/resources/recipes/nikkei_sub_life.recipe index 1bfa08a55f..60e5b170ca 100644 --- a/resources/recipes/nikkei_sub_life.recipe +++ b/resources/recipes/nikkei_sub_life.recipe @@ -32,12 +32,9 @@ class NikkeiNet_sub_life(BasicNewsRecipe): remove_tags_after = {'class':"cmn-pr_list"} feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), - (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), - (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), - (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), - (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking') + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special') ] def get_browser(self): diff --git a/resources/recipes/yomiuri.recipe b/resources/recipes/yomiuri.recipe index d30aa9066f..fb17bb1210 100644 --- a/resources/recipes/yomiuri.recipe +++ b/resources/recipes/yomiuri.recipe @@ -21,7 +21,7 @@ class YOLNews(BasicNewsRecipe): remove_javascript = True masthead_title = u'YOMIURI ONLINE' - remove_tags_before = {'class':"article-def"} + keep_only_tags = [{'class':"article-def"}] remove_tags = [{'class':"RelatedArticle"}, {'class':"sbtns"} ] diff --git a/resources/recipes/yomiuri_world.recipe b/resources/recipes/yomiuri_world.recipe index f5f21c4aab..41ee4fd23d 100644 --- a/resources/recipes/yomiuri_world.recipe +++ b/resources/recipes/yomiuri_world.recipe @@ -21,7 +21,7 @@ class YOLNews(BasicNewsRecipe): remove_javascript = True masthead_title = u"YOMIURI ONLINE" - remove_tags_before = {'class':"article-def"} + keep_only_tags = [{'class':"article-def"}] remove_tags = [{'class':"RelatedArticle"}, {'class':"sbtns"} ] From 6173ff8c591f5205555758a5cc94dc0b8a44eb94 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura <miurahr@linux.com> Date: Fri, 3 Dec 2010 00:46:25 +0900 Subject: [PATCH 10/24] recipe: add nikkei news about social --- resources/recipes/nikkei_sub_shakai.recipe | 102 +++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 resources/recipes/nikkei_sub_shakai.recipe diff --git a/resources/recipes/nikkei_sub_shakai.recipe b/resources/recipes/nikkei_sub_shakai.recipe new file mode 100644 index 0000000000..ed86493265 --- /dev/null +++ b/resources/recipes/nikkei_sub_shakai.recipe @@ -0,0 +1,102 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>' +''' +www.nikkei.com +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_sub_life(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + feeds = [ + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai') + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- ")) + response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + From ca0df54bf3cd4dc9a2e8342685251e947a4653f6 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 3 Dec 2010 19:37:08 +0100 Subject: [PATCH 11/24] Add provisory fictionwise metadata --- src/calibre/ebooks/metadata/fictionwise.py | 370 +++++++++++++++++++++ 1 file changed, 370 insertions(+) create mode 100644 src/calibre/ebooks/metadata/fictionwise.py diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py new file mode 100644 index 0000000000..828ea31c3a --- /dev/null +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -0,0 +1,370 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian <sengian1@gmail.com>' +__docformat__ = 'restructuredtext en' + +import sys, textwrap, re +from urllib import urlencode + +from lxml import html, etree +from lxml.html import soupparser +from lxml.etree import tostring + +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.library.comments import sanitize_comments_html +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.utils.config import OptionParser +from calibre.utils.date import parse_date, utcnow + +class Fictionwise(MetadataSource): # {{{ + + author = 'Sengian' + name = 'Fictionwise' + description = _('Downloads metadata from Fictionwise') + + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} + + +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + +class Query(object): + + BASE_URL = 'http://www.fictionwise.com/servlet/mw' + + def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20): + assert not(title is None and author is None and publisher is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + + q = { 'template' : 'searchresults_adv.htm' , + 'searchtitle' : '', + 'searchauthor' : '', + 'searchpublisher' : '', + 'searchkeyword' : '', + #possibilities startoflast, fullname, lastfirst + 'searchauthortype' : 'startoflast', + 'searchcategory' : '', + 'searchcategory2' : '', + 'searchprice_s' : '0', + 'searchprice_e' : 'ANY', + 'searchformat' : '', + 'searchgeo' : 'US', + 'searchfwdatetype' : '', + #maybe use dates fields if needed? + #'sortorder' : 'DESC', + #many options available: b.SortTitle, a.SortName, + #b.DateFirstPublished, b.FWPublishDate + 'sortby' : 'b.SortTitle' + } + if title is not None: + q['searchtitle'] = title + if author is not None: + q['searchauthor'] = author + if publisher is not None: + q['searchpublisher'] = publisher + if keywords is not None: + q['searchkeyword'] = keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = urlencode(q) + + def __call__(self, browser, verbose): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL, self.urldata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get list of results as links + results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]") + results = results[:self.max_results] + results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results] + #return feed if no links ie normally a single book or nothing + if not results: + results = [feed] + return results + +class ResultList(list): + + BASE_URL = 'http://www.fictionwise.com' + COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0} + + def __init__(self): + self.retitle = re.compile(r'\[[^\[\]]+\]') + self.rechkauth = re.compile(r'.*book\s*by', re.I) + self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)' \ + + '<br[^>]+>.{,15}publisher\s*:', re.I) + self.repub = re.compile(r'.*publisher\s*:\s*', re.I) + self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I) + self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I) + self.resplitbr = re.compile(r'<br[^>]+>', re.I) + self.recomment = re.compile(r'(?s)<!--.*?-->') + self.reimg = re.compile(r'<img[^>]*>', re.I) + self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I) + self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:') + self.recolor = re.compile('(?P<ncolor>[^/]+).gif') + self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I) + self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I) + + def strip_tags_etree(self, etreeobj, invalid_tags): + for itag in invalid_tags: + for elt in etreeobj.getiterator(itag): + elt.drop_tag() + return etreeobj + + def clean_entry(self, entry, + invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'), + remove_tags_trees = ('script',)): + for it in entry[0].iterchildren(tag='table'): + entry[0].remove(it) + entry[0].remove(entry[0].xpath( 'descendant-or-self::p[1]')[0]) + entry = entry[0] + cleantree = self.strip_tags_etree(entry, invalid_tags) + for itag in remove_tags_trees: + for elts in cleantree.getiterator(itag): + elts.drop_tree() + return cleantree + + def output_entry(self, entry, prettyout = True, htmlrm="\d+"): + out = tostring(entry, pretty_print=prettyout) + reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)') + return reclean.sub('', out) + + def get_title(self, entry): + title = entry.findtext('./') + return self.retitle.sub('', title).strip() + + def get_authors(self, entry): + authortext = entry.find('./br').tail + if not self.rechkauth.search(authortext): + return [] + #TODO: parse all tag if necessary + authortext = self.rechkauth.sub('', authortext) + return [a.strip() for a in authortext.split('&')] + + def get_rating(self, entrytable, verbose): + nbcomment = tostring(entrytable.getprevious()) + try: + nbcomment = self.renbcom.search(nbcomment).group("nbcom") + except: + report(verbose) + return None + hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")], + float(image.get('height', default=0))) \ + for image in entrytable.getiterator('img')) + #ratings as x/5 + return 1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) + + def get_description(self, entry): + description = self.output_entry(entry.find('./p'),htmlrm="") + description = self.redesc.search(description) + if not description and not description.group("desc"): + return None + #remove invalid tags + description = self.reimg.sub('', description.group("desc")) + description = self.recomment.sub('', description) + description = self.resanitize.sub('', sanitize_comments_html(description)) + return 'SUMMARY:\n' + re.sub(r'\n\s+</p>','\n</p>', description) + + def get_publisher(self, entry): + publisher = self.output_entry(entry.find('./p')) + publisher = filter(lambda x: self.repub.search(x) is not None, + self.resplitbr.split(publisher)) + if not len(publisher): + return None + publisher = self.repub.sub('', publisher[0]) + return publisher.split(',')[0].strip() + + def get_tags(self, entry): + tag = self.output_entry(entry.find('./p')) + tag = filter(lambda x: self.retag.search(x) is not None, + self.resplitbr.split(tag)) + if not len(tag): + return [] + return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/')) + + def get_date(self, entry, verbose): + date = self.output_entry(entry.find('./p')) + date = filter(lambda x: self.redate.search(x) is not None, + self.resplitbr.split(date)) + if not len(date): + return None + #TODO: parse all tag if necessary + try: + d = self.redate.sub('', date[0]) + if d: + default = utcnow().replace(day=15) + d = parse_date(d, assume_utc=True, default=default) + else: + d = None + except: + report(verbose) + d = None + return d + + def get_ISBN(self, entry): + isbns = self.output_entry(entry.getchildren()[2]) + isbns = filter(lambda x: self.reisbn.search(x) is not None, + self.resplitbrdiv.split(isbns)) + if not len(isbns): + return None + #TODO: parse all tag if necessary + isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))] + return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] + + def fill_MI(self, entry, title, authors, ratings, verbose): + mi = MetaInformation(title, authors) + mi.rating = ratings + mi.comments = self.get_description(entry) + mi.publisher = self.get_publisher(entry) + mi.tags = self.get_tags(entry) + mi.pubdate = self.get_date(entry, verbose) + mi.isbn = self.get_ISBN(entry) + mi.author_sort = authors_to_sort_string(authors) + # mi.language = self.get_language(x, verbose) + return mi + + def get_individual_metadata(self, browser, linkdata, verbose): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + + def populate(self, entries, browser, verbose=False): + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + entry = self.clean_entry(entry) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + + def populate_single(self, feed, verbose=False): + try: + entry = feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + entry = self.clean_entry(entry) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + + +def search(title=None, author=None, publisher=None, isbn=None, + min_viewability='none', verbose=False, max_results=5, + keywords=None): + br = browser() + entries = Query(title=title, author=author, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose) + + #List of entry + ans = ResultList() + if len(entries) > 1: + ans.populate(entries, br, verbose) + else: + ans.populate_single(entries[0], verbose) + return ans + + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Fictionwise. You must specify one of title, author, + or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. + ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-m', '--max-results', default=20, + help='Maximum number of results to fetch') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + print + +if __name__ == '__main__': + sys.exit(main()) From 37d51495d227b2ce2689477ff45109bbbd12c987 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 5 Dec 2010 15:28:41 +0100 Subject: [PATCH 12/24] Update of nicebooks --- src/calibre/ebooks/metadata/nicebooks.py | 113 ++++++++++------------- 1 file changed, 50 insertions(+), 63 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 4d19e9611b..01e20261b3 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -10,7 +10,8 @@ from copy import deepcopy from lxml.html import soupparser -from calibre.utils.date import parse_date, utcnow +from calibre.utils.date import parse_date, utcnow, replace_months +from calibre.utils.cleantext import clean_ascii_char from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ @@ -71,31 +72,16 @@ class NiceBooksCovers(CoverDownload): traceback.format_exc(), self.name)) +class NiceBooksError(Exception): + pass + +class ISBNNotFound(NiceBooksError): + pass + def report(verbose): if verbose: - import traceback traceback.print_exc() -def replace_monthsfr(datefr): - # Replace french months by english equivalent for parse_date - frtoen = { - u'[jJ]anvier': u'jan', - u'[fF].vrier': u'feb', - u'[mM]ars': u'mar', - u'[aA]vril': u'apr', - u'[mM]ai': u'may', - u'[jJ]uin': u'jun', - u'[jJ]uillet': u'jul', - u'[aA]o.t': u'aug', - u'[sS]eptembre': u'sep', - u'[Oo]ctobre': u'oct', - u'[nN]ovembre': u'nov', - u'[dD].cembre': u'dec' } - for k in frtoen.iterkeys(): - tmp = re.sub(k, frtoen[k], datefr) - if tmp <> datefr: break - return tmp - class Query(object): BASE_URL = 'http://fr.nicebooks.com/' @@ -119,7 +105,7 @@ class Query(object): def __call__(self, browser, verbose, timeout = 5.): if verbose: - print 'Query:', self.BASE_URL+self.urldata + print _('Query: %s') % self.BASE_URL+self.urldata try: raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() @@ -128,7 +114,9 @@ class Query(object): if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return - raise + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise NiceBooksError(_('Nicebooks timed out. Try again later.')) + raise NiceBooksError(_('Nicebooks encountered an error.')) if '<title>404 - ' in raw: return raw = xml_to_unicode(raw, strip_encoding_pats=True, @@ -136,7 +124,11 @@ class Query(object): try: feed = soupparser.fromstring(raw) except: - return + try: + #remove ASCII invalid chars + feed = soupparser.fromstring(clean_ascii_char(raw)) + except: + return None #nb of page to call try: @@ -161,7 +153,11 @@ class Query(object): try: feed = soupparser.fromstring(raw) except: - continue + try: + #remove ASCII invalid chars + feed = soupparser.fromstring(clean_ascii_char(raw)) + except: + continue pages.append(feed) results = [] @@ -180,14 +176,12 @@ class ResultList(list): self.reautclean = re.compile(u'\s*\(.*\)\s*') def get_title(self, entry): - # title = deepcopy(entry.find("div[@id='book-info']")) title = deepcopy(entry) title.remove(title.find("dl[@title='Informations sur le livre']")) title = ' '.join([i.text_content() for i in title.iterchildren()]) return unicode(title.replace('\n', '')) def get_authors(self, entry): - # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") author = entry.find("dl[@title='Informations sur le livre']") authortext = [] for x in author.getiterator('dt'): @@ -223,7 +217,7 @@ class ResultList(list): d = x.getnext().text_content() try: default = utcnow().replace(day=15) - d = replace_monthsfr(d) + d = replace_months(d, 'fr') d = parse_date(d, assume_utc=True, default=default) mi.pubdate = d except: @@ -234,11 +228,6 @@ class ResultList(list): mi = MetaInformation(title, authors) mi.author_sort = authors_to_sort_string(authors) mi.comments = self.get_description(entry, verbose) - # entry = entry.find("dl[@title='Informations sur le livre']") - # mi.publisher = self.get_publisher(entry) - # mi.pubdate = self.get_date(entry, verbose) - # mi.isbn = self.get_ISBN(entry) - # mi.language = self.get_language(entry) return self.get_book_info(entry, mi, verbose) def get_individual_metadata(self, browser, linkdata, verbose): @@ -249,7 +238,9 @@ class ResultList(list): if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return - raise + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise NiceBooksError(_('Nicebooks timed out. Try again later.')) + raise NiceBooksError(_('Nicebooks encountered an error.')) if '<title>404 - ' in raw: report(verbose) return @@ -258,7 +249,11 @@ class ResultList(list): try: feed = soupparser.fromstring(raw) except: - return + try: + #remove ASCII invalid chars + feed = soupparser.fromstring(clean_ascii_char(raw)) + except: + return None # get results return feed.xpath("//div[@id='container']")[0] @@ -292,13 +287,6 @@ class ResultList(list): continue self.append(self.fill_MI(entry, title, authors, verbose)) - -class NiceBooksError(Exception): - pass - -class ISBNNotFound(NiceBooksError): - pass - class Covers(object): def __init__(self, isbn = None): @@ -329,11 +317,10 @@ class Covers(object): return cover, ext if ext else 'jpg' except Exception, err: if isinstance(getattr(err, 'args', [None])[0], socket.timeout): - err = NiceBooksError(_('Nicebooks timed out. Try again later.')) - raise err + raise NiceBooksError(_('Nicebooks timed out. Try again later.')) if not len(self.urlimg): if not self.isbnf: - raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.')) + raise ISBNNotFound(_('ISBN: %s not found.') % self.isbn) raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher')) @@ -341,10 +328,10 @@ def search(title=None, author=None, publisher=None, isbn=None, max_results=5, verbose=False, keywords=None): br = browser() entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, - keywords=keywords, max_results=max_results)(br, verbose) + keywords=keywords, max_results=max_results)(br, verbose,timeout = 10.) if entries is None or len(entries) == 0: - return + return None #List of entry ans = ResultList() @@ -364,28 +351,28 @@ def cover_from_isbn(isbn, timeout = 5.): def option_parser(): parser = OptionParser(textwrap.dedent(\ - '''\ + _('''\ %prog [options] Fetch book metadata from Nicebooks. You must specify one of title, author, ISBN, publisher or keywords. Will fetch a maximum of 20 matches, so you should make your query as specific as possible. It can also get covers if the option is activated. - ''' + ''') )) - parser.add_option('-t', '--title', help='Book title') - parser.add_option('-a', '--author', help='Book author(s)') - parser.add_option('-p', '--publisher', help='Book publisher') - parser.add_option('-i', '--isbn', help='Book ISBN') - parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-t', '--title', help=_('Book title')) + parser.add_option('-a', '--author', help=_('Book author(s)')) + parser.add_option('-p', '--publisher', help=_('Book publisher')) + parser.add_option('-i', '--isbn', help=_('Book ISBN')) + parser.add_option('-k', '--keywords', help=_('Keywords')) parser.add_option('-c', '--covers', default=0, - help='Covers: 1-Check/ 2-Download') + help=_('Covers: 1-Check/ 2-Download')) parser.add_option('-p', '--coverspath', default='', - help='Covers files path') + help=_('Covers files path')) parser.add_option('-m', '--max-results', default=20, - help='Maximum number of results to fetch') + help=_('Maximum number of results to fetch')) parser.add_option('-v', '--verbose', default=0, action='count', - help='Be more verbose about errors') + help=_('Be more verbose about errors')) return parser def main(args=sys.argv): @@ -400,15 +387,15 @@ def main(args=sys.argv): parser.print_help() return 1 if results is None or len(results) == 0: - print 'No result found for this search!' + print _('No result found for this search!') return 0 for result in results: print unicode(result).encode(preferred_encoding, 'replace') covact = int(opts.covers) if covact == 1: - textcover = 'No cover found!' + textcover = _('No cover found!') if check_for_cover(result.isbn): - textcover = 'A cover was found for this book' + textcover = _('A cover was found for this book') print textcover elif covact == 2: cover_data, ext = cover_from_isbn(result.isbn) @@ -417,7 +404,7 @@ def main(args=sys.argv): cpath = os.path.normpath(opts.coverspath + '/' + result.isbn) oname = os.path.abspath(cpath+'.'+ext) open(oname, 'wb').write(cover_data) - print 'Cover saved to file ', oname + print _('Cover saved to file '), oname print if __name__ == '__main__': From e610f16ca073fc0a4960143484c56031e8ac9069 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 5 Dec 2010 20:09:17 +0100 Subject: [PATCH 13/24] Update fictionwise.py (broken) --- src/calibre/ebooks/metadata/fictionwise.py | 146 +++++++++++++-------- 1 file changed, 93 insertions(+), 53 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 828ea31c3a..e56c697e3c 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -3,12 +3,11 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' -import sys, textwrap, re +import sys, textwrap, re, traceback, socket from urllib import urlencode -from lxml import html, etree -from lxml.html import soupparser -from lxml.etree import tostring +from lxml import html +from lxml.html import soupparser, tostring from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode @@ -18,6 +17,7 @@ from calibre.library.comments import sanitize_comments_html from calibre.ebooks.metadata.fetch import MetadataSource from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow +from calibre.utils.cleantext import clean_ascii_char class Fictionwise(MetadataSource): # {{{ @@ -37,10 +37,11 @@ class Fictionwise(MetadataSource): # {{{ # }}} +class FictionwiseError(Exception): + pass def report(verbose): if verbose: - import traceback traceback.print_exc() class Query(object): @@ -86,18 +87,20 @@ class Query(object): q = q.encode('utf-8') self.urldata = urlencode(q) - def __call__(self, browser, verbose): + def __call__(self, browser, verbose, timeout = 5.): if verbose: - print 'Query:', self.BASE_URL+self.urldata + print _('Query: %s') % self.BASE_URL+self.urldata try: - raw = browser.open_novisit(self.BASE_URL, self.urldata).read() + raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read() except Exception, e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return - raise + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise FictionwiseError(_('Fictionwise timed out. Try again later.')) + raise FictionwiseError(_('Fictionwise encountered an error.')) if '<title>404 - ' in raw: return raw = xml_to_unicode(raw, strip_encoding_pats=True, @@ -105,7 +108,11 @@ class Query(object): try: feed = soupparser.fromstring(raw) except: - return + try: + #remove ASCII invalid chars + feed = soupparser.fromstring(clean_ascii_char(raw)) + except: + return None # get list of results as links results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]") @@ -139,12 +146,41 @@ class ResultList(list): self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I) def strip_tags_etree(self, etreeobj, invalid_tags): - for itag in invalid_tags: - for elt in etreeobj.getiterator(itag): - elt.drop_tag() - return etreeobj + for (itag, rmv) in invalid_tags.iteritems(): + if rmv: + for elts in etreeobj.getiterator(itag): + elts.drop_tree() + else: + for elts in etreeobj.getiterator(itag): + elts.drop_tag() - def clean_entry(self, entry, + def clean_entry(self, entry, invalid_tags = {'script': True}, + invalid_id = (), invalid_class=(), invalid_xpath = ()): + #invalid_tags: remove tag and keep content if False else remove + #remove tags + if invalid_tags: + self.strip_tags_etree(entry, invalid_tags) + #remove xpath + if invalid_xpath: + for eltid in invalid_xpath: + elt = entry.xpath(eltid) + for el in elt: + el.drop_tree() + #remove id + if invalid_id: + for eltid in invalid_id: + elt = entry.get_element_by_id(eltid) + if elt is not None: + elt.drop_tree() + #remove class + if invalid_class: + for eltclass in invalid_class: + elts = entry.find_class(eltclass) + if elts is not None: + for elt in elts: + elt.drop_tree() + + def clean_entry_dffdfbdjbf(self, entry, invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'), remove_tags_trees = ('script',)): for it in entry[0].iterchildren(tag='table'): @@ -170,7 +206,6 @@ class ResultList(list): authortext = entry.find('./br').tail if not self.rechkauth.search(authortext): return [] - #TODO: parse all tag if necessary authortext = self.rechkauth.sub('', authortext) return [a.strip() for a in authortext.split('&')] @@ -185,7 +220,7 @@ class ResultList(list): float(image.get('height', default=0))) \ for image in entrytable.getiterator('img')) #ratings as x/5 - return 1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) + return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues())) def get_description(self, entry): description = self.output_entry(entry.find('./p'),htmlrm="") @@ -221,7 +256,6 @@ class ResultList(list): self.resplitbr.split(date)) if not len(date): return None - #TODO: parse all tag if necessary try: d = self.redate.sub('', date[0]) if d: @@ -279,9 +313,14 @@ class ResultList(list): return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") def populate(self, entries, browser, verbose=False): - for x in entries: + inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False, + 'ul': False, 'span': False, 'table': True} + inv_xpath =('descendant-or-self::p[1]',) + #single entry + if len(entries) == 1 and not isinstance(entries[0], str): try: - entry = self.get_individual_metadata(browser, x, verbose) + entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) entry = self.clean_entry(entry) title = self.get_title(entry) #ratings: get table for rating then drop @@ -292,28 +331,29 @@ class ResultList(list): authors = self.get_authors(entry) except Exception, e: if verbose: - print 'Failed to get all details for an entry' + print _('Failed to get all details for an entry') print e - continue + return self.append(self.fill_MI(entry, title, authors, ratings, verbose)) - - def populate_single(self, feed, verbose=False): - try: - entry = feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") - entry = self.clean_entry(entry) - title = self.get_title(entry) - #ratings: get table for rating then drop - for elt in entry.getiterator('table'): - ratings = self.get_rating(elt, verbose) - elt.getprevious().drop_tree() - elt.drop_tree() - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - return - self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + else: + #multiple entries + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + continue + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) def search(title=None, author=None, publisher=None, isbn=None, @@ -321,35 +361,32 @@ def search(title=None, author=None, publisher=None, isbn=None, keywords=None): br = browser() entries = Query(title=title, author=author, publisher=publisher, - keywords=keywords, max_results=max_results)(br, verbose) + keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.) #List of entry ans = ResultList() - if len(entries) > 1: - ans.populate(entries, br, verbose) - else: - ans.populate_single(entries[0], verbose) + ans.populate(entries, br, verbose) return ans def option_parser(): parser = OptionParser(textwrap.dedent(\ - '''\ + _('''\ %prog [options] Fetch book metadata from Fictionwise. You must specify one of title, author, or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches, so you should make your query as specific as possible. - ''' + ''') )) - parser.add_option('-t', '--title', help='Book title') - parser.add_option('-a', '--author', help='Book author(s)') - parser.add_option('-p', '--publisher', help='Book publisher') - parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-t', '--title', help=_('Book title')) + parser.add_option('-a', '--author', help=_('Book author(s)')) + parser.add_option('-p', '--publisher', help=_('Book publisher')) + parser.add_option('-k', '--keywords', help=_('Keywords')) parser.add_option('-m', '--max-results', default=20, - help='Maximum number of results to fetch') + help=_('Maximum number of results to fetch')) parser.add_option('-v', '--verbose', default=0, action='count', - help='Be more verbose about errors') + help=_('Be more verbose about errors')) return parser def main(args=sys.argv): @@ -362,6 +399,9 @@ def main(args=sys.argv): report(True) parser.print_help() return 1 + if results is None or len(results) == 0: + print _('No result found for this search!') + return 0 for result in results: print unicode(result).encode(preferred_encoding, 'replace') print From 57e0e1820a65af5a273bfe5080c8bd6ef17e871d Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 6 Dec 2010 01:57:07 +0100 Subject: [PATCH 14/24] Update of fictionwise.py --- src/calibre/ebooks/metadata/fictionwise.py | 73 ++++++++-------------- 1 file changed, 27 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index e56c697e3c..c4a8597dde 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -53,7 +53,6 @@ class Query(object): assert (max_results < 21) self.max_results = int(max_results) - q = { 'template' : 'searchresults_adv.htm' , 'searchtitle' : '', 'searchauthor' : '', @@ -131,12 +130,11 @@ class ResultList(list): def __init__(self): self.retitle = re.compile(r'\[[^\[\]]+\]') self.rechkauth = re.compile(r'.*book\s*by', re.I) - self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)' \ - + '<br[^>]+>.{,15}publisher\s*:', re.I) + self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I) self.repub = re.compile(r'.*publisher\s*:\s*', re.I) self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I) self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I) - self.resplitbr = re.compile(r'<br[^>]+>', re.I) + self.resplitbr = re.compile(r'<br[^>]*>', re.I) self.recomment = re.compile(r'(?s)<!--.*?-->') self.reimg = re.compile(r'<img[^>]*>', re.I) self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I) @@ -180,21 +178,9 @@ class ResultList(list): for elt in elts: elt.drop_tree() - def clean_entry_dffdfbdjbf(self, entry, - invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'), - remove_tags_trees = ('script',)): - for it in entry[0].iterchildren(tag='table'): - entry[0].remove(it) - entry[0].remove(entry[0].xpath( 'descendant-or-self::p[1]')[0]) - entry = entry[0] - cleantree = self.strip_tags_etree(entry, invalid_tags) - for itag in remove_tags_trees: - for elts in cleantree.getiterator(itag): - elts.drop_tree() - return cleantree - def output_entry(self, entry, prettyout = True, htmlrm="\d+"): out = tostring(entry, pretty_print=prettyout) + #try to work around tostring to remove this encoding for exemle reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)') return reclean.sub('', out) @@ -223,18 +209,18 @@ class ResultList(list): return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues())) def get_description(self, entry): - description = self.output_entry(entry.find('./p'),htmlrm="") + description = self.output_entry(entry.xpath('./p')[1],htmlrm="") description = self.redesc.search(description) - if not description and not description.group("desc"): + if not description or not description.group("desc"): return None #remove invalid tags description = self.reimg.sub('', description.group("desc")) description = self.recomment.sub('', description) description = self.resanitize.sub('', sanitize_comments_html(description)) - return 'SUMMARY:\n' + re.sub(r'\n\s+</p>','\n</p>', description) + return _('SUMMARY:\n %s') % re.sub(r'\n\s+</p>','\n</p>', description) def get_publisher(self, entry): - publisher = self.output_entry(entry.find('./p')) + publisher = self.output_entry(entry.xpath('./p')[1]) publisher = filter(lambda x: self.repub.search(x) is not None, self.resplitbr.split(publisher)) if not len(publisher): @@ -243,7 +229,7 @@ class ResultList(list): return publisher.split(',')[0].strip() def get_tags(self, entry): - tag = self.output_entry(entry.find('./p')) + tag = self.output_entry(entry.xpath('./p')[1]) tag = filter(lambda x: self.retag.search(x) is not None, self.resplitbr.split(tag)) if not len(tag): @@ -251,7 +237,7 @@ class ResultList(list): return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/')) def get_date(self, entry, verbose): - date = self.output_entry(entry.find('./p')) + date = self.output_entry(entry.xpath('./p')[1]) date = filter(lambda x: self.redate.search(x) is not None, self.resplitbr.split(date)) if not len(date): @@ -269,12 +255,11 @@ class ResultList(list): return d def get_ISBN(self, entry): - isbns = self.output_entry(entry.getchildren()[2]) + isbns = self.output_entry(entry.xpath('./p')[2]) isbns = filter(lambda x: self.reisbn.search(x) is not None, self.resplitbrdiv.split(isbns)) if not len(isbns): return None - #TODO: parse all tag if necessary isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))] return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] @@ -287,7 +272,6 @@ class ResultList(list): mi.pubdate = self.get_date(entry, verbose) mi.isbn = self.get_ISBN(entry) mi.author_sort = authors_to_sort_string(authors) - # mi.language = self.get_language(x, verbose) return mi def get_individual_metadata(self, browser, linkdata, verbose): @@ -298,36 +282,35 @@ class ResultList(list): if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return - raise + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise FictionwiseError(_('Fictionwise timed out. Try again later.')) + raise FictionwiseError(_('Fictionwise encountered an error.')) if '<title>404 - ' in raw: report(verbose) return raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: - feed = soupparser.fromstring(raw) + return soupparser.fromstring(raw) except: - return - - # get results - return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_char(raw)) + except: + return None def populate(self, entries, browser, verbose=False): inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False, - 'ul': False, 'span': False, 'table': True} - inv_xpath =('descendant-or-self::p[1]',) + 'ul': False, 'span': False} + inv_xpath =('./table',) #single entry if len(entries) == 1 and not isinstance(entries[0], str): try: entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) - entry = self.clean_entry(entry) title = self.get_title(entry) - #ratings: get table for rating then drop - for elt in entry.getiterator('table'): - ratings = self.get_rating(elt, verbose) - elt.getprevious().drop_tree() - elt.drop_tree() + #maybe strenghten the search + ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) authors = self.get_authors(entry) except Exception, e: if verbose: @@ -340,13 +323,11 @@ class ResultList(list): for x in entries: try: entry = self.get_individual_metadata(browser, x, verbose) + entry = entry.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0] self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) title = self.get_title(entry) - #ratings: get table for rating then drop - for elt in entry.getiterator('table'): - ratings = self.get_rating(elt, verbose) - elt.getprevious().drop_tree() - elt.drop_tree() + #maybe strenghten the search + ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) authors = self.get_authors(entry) except Exception, e: if verbose: @@ -361,7 +342,7 @@ def search(title=None, author=None, publisher=None, isbn=None, keywords=None): br = browser() entries = Query(title=title, author=author, publisher=publisher, - keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.) + keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.) #List of entry ans = ResultList() From e31735960bf42e443c59e1f5fec52d809dadb363 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 00:33:37 -0700 Subject: [PATCH 15/24] Save to disk: Refactor to not open a database connection in the worker process. Also fix a bug that could lead to save failures not being reported. --- src/calibre/ebooks/metadata/worker.py | 91 ++++++++++++++++----- src/calibre/gui2/add.py | 13 +++ src/calibre/library/save_to_disk.py | 111 +++++++++++++++++--------- 3 files changed, 160 insertions(+), 55 deletions(-) diff --git a/src/calibre/ebooks/metadata/worker.py b/src/calibre/ebooks/metadata/worker.py index 247050856d..7dff988679 100644 --- a/src/calibre/ebooks/metadata/worker.py +++ b/src/calibre/ebooks/metadata/worker.py @@ -8,12 +8,12 @@ __docformat__ = 'restructuredtext en' from threading import Thread from Queue import Empty -import os, time, sys, shutil +import os, time, sys, shutil, json from calibre.utils.ipc.job import ParallelJob from calibre.utils.ipc.server import Server from calibre.ptempfile import PersistentTemporaryDirectory, TemporaryDirectory -from calibre import prints +from calibre import prints, isbytestring from calibre.constants import filesystem_encoding @@ -194,14 +194,44 @@ class SaveWorker(Thread): self.daemon = True self.path, self.opts = path, opts self.ids = ids - self.library_path = db.library_path + self.db = db self.canceled = False self.result_queue = result_queue self.error = None self.spare_server = spare_server self.start() + def collect_data(self, ids): + from calibre.ebooks.metadata.opf2 import metadata_to_opf + data = {} + for i in set(ids): + mi = self.db.get_metadata(i, index_is_id=True, get_cover=True) + opf = metadata_to_opf(mi) + if isbytestring(opf): + opf = opf.decode('utf-8') + cpath = None + if mi.cover: + cpath = mi.cover + if isbytestring(cpath): + cpath = cpath.decode(filesystem_encoding) + formats = {} + fmts = self.db.formats(i, index_is_id=True, verify_formats=False) + if fmts: + fmts = fmts.split(',') + for fmt in fmts: + fpath = self.db.format_abspath(i, fmt, index_is_id=True) + if fpath is not None: + if isbytestring(fpath): + fpath = fpath.decode(filesystem_encoding) + formats[fmt.lower()] = fpath + data[i] = [opf, cpath, formats] + return data + def run(self): + with TemporaryDirectory('save_to_disk_data') as tdir: + self._run(tdir) + + def _run(self, tdir): from calibre.library.save_to_disk import config server = Server() if self.spare_server is None else self.spare_server ids = set(self.ids) @@ -212,12 +242,19 @@ class SaveWorker(Thread): for pref in c.preferences: recs[pref.name] = getattr(self.opts, pref.name) + plugboards = self.db.prefs.get('plugboards', {}) + for i, task in enumerate(tasks): tids = [x[-1] for x in task] + data = self.collect_data(tids) + dpath = os.path.join(tdir, '%d.json'%i) + with open(dpath, 'wb') as f: + f.write(json.dumps(data, ensure_ascii=False).encode('utf-8')) + job = ParallelJob('save_book', 'Save books (%d of %d)'%(i, len(tasks)), lambda x,y:x, - args=[tids, self.library_path, self.path, recs]) + args=[tids, dpath, plugboards, self.path, recs]) jobs.add(job) server.add_job(job) @@ -226,21 +263,19 @@ class SaveWorker(Thread): time.sleep(0.2) running = False for job in jobs: - job.update(consume_notifications=False) - while True: - try: - id, title, ok, tb = job.notifications.get_nowait()[0] - if id in ids: - self.result_queue.put((id, title, ok, tb)) - ids.remove(id) - except Empty: - break + self.get_notifications(job, ids) if not job.is_finished: running = True if not running: break + for job in jobs: + for id_, title, ok, tb in job.result: + if id_ in ids: + self.result_queue.put((id_, title, ok, tb)) + ids.remove(id_) + server.close() time.sleep(1) @@ -257,21 +292,39 @@ class SaveWorker(Thread): except: pass + def get_notifications(self, job, ids): + job.update(consume_notifications=False) + while True: + try: + id, title, ok, tb = job.notifications.get_nowait()[0] + if id in ids: + self.result_queue.put((id, title, ok, tb)) + ids.remove(id) + except Empty: + break -def save_book(task, library_path, path, recs, notification=lambda x,y:x): - from calibre.library.database2 import LibraryDatabase2 - db = LibraryDatabase2(library_path) - from calibre.library.save_to_disk import config, save_to_disk + +def save_book(ids, dpath, plugboards, path, recs, notification=lambda x,y:x): + from calibre.library.save_to_disk import config, save_serialized_to_disk from calibre.customize.ui import apply_null_metadata opts = config().parse() for name in recs: setattr(opts, name, recs[name]) + results = [] def callback(id, title, failed, tb): + results.append((id, title, not failed, tb)) notification((id, title, not failed, tb)) return True - with apply_null_metadata: - save_to_disk(db, task, path, opts, callback) + data_ = json.loads(open(dpath, 'rb').read().decode('utf-8')) + data = {} + for k, v in data_.iteritems(): + data[int(k)] = v + + with apply_null_metadata: + save_serialized_to_disk(ids, data, plugboards, path, opts, callback) + + return results diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py index 1339070446..d1af2a6f0c 100644 --- a/src/calibre/gui2/add.py +++ b/src/calibre/gui2/add.py @@ -427,11 +427,23 @@ class Saver(QObject): # {{{ if not self.ids or not self.worker.is_alive(): self.timer.stop() self.pd.hide() + while self.ids: + before = len(self.ids) + self.get_result() + if before == len(self.ids): + for i in list(self.ids): + self.failures.add(('id:%d'%i, 'Unknown error')) + self.ids.remove(i) + break if not self.callback_called: self.callback(self.worker.path, self.failures, self.worker.error) self.callback_called = True return + self.get_result() + + + def get_result(self): try: id, title, ok, tb = self.rq.get_nowait() except Empty: @@ -441,6 +453,7 @@ class Saver(QObject): # {{{ if not isinstance(title, unicode): title = str(title).decode(preferred_encoding, 'replace') self.pd.set_msg(_('Saved')+' '+title) + if not ok: self.failures.add((title, tb)) # }}} diff --git a/src/calibre/library/save_to_disk.py b/src/calibre/library/save_to_disk.py index c6cc12a978..af57d563ac 100644 --- a/src/calibre/library/save_to_disk.py +++ b/src/calibre/library/save_to_disk.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import os, traceback, cStringIO, re +import os, traceback, cStringIO, re, shutil from calibre.constants import DEBUG from calibre.utils.config import Config, StringConfig, tweaks @@ -203,31 +203,49 @@ def get_components(template, mi, id, timefmt='%b %Y', length=250, return shorten_components_to(length, components) -def save_book_to_disk(id, db, root, opts, length): - mi = db.get_metadata(id, index_is_id=True) +def save_book_to_disk(id_, db, root, opts, length): + mi = db.get_metadata(id_, index_is_id=True) + cover = db.cover(id_, index_is_id=True, as_path=True) + plugboards = db.prefs.get('plugboards', {}) - available_formats = db.formats(id, index_is_id=True) + available_formats = db.formats(id_, index_is_id=True) if not available_formats: available_formats = [] else: available_formats = [x.lower().strip() for x in available_formats.split(',')] + formats = {} + fmts = db.formats(id_, index_is_id=True, verify_formats=False) + if fmts: + fmts = fmts.split(',') + for fmt in fmts: + fpath = db.format_abspath(id_, fmt, index_is_id=True) + if fpath is not None: + formats[fmt.lower()] = fpath + + return do_save_book_to_disk(id_, mi, cover, plugboards, + formats, root, opts, length) + + +def do_save_book_to_disk(id_, mi, cover, plugboards, + format_map, root, opts, length): + available_formats = [x.lower().strip() for x in format_map.keys()] if opts.formats == 'all': asked_formats = available_formats else: asked_formats = [x.lower().strip() for x in opts.formats.split(',')] formats = set(available_formats).intersection(set(asked_formats)) if not formats: - return True, id, mi.title + return True, id_, mi.title - components = get_components(opts.template, mi, id, opts.timefmt, length, + components = get_components(opts.template, mi, id_, opts.timefmt, length, ascii_filename if opts.asciiize else sanitize_file_name, to_lowercase=opts.to_lowercase, replace_whitespace=opts.replace_whitespace) base_path = os.path.join(root, *components) base_name = os.path.basename(base_path) dirpath = os.path.dirname(base_path) - # Don't test for existence first are the test could fail but + # Don't test for existence first as the test could fail but # another worker process could create the directory before # the call to makedirs try: @@ -236,29 +254,23 @@ def save_book_to_disk(id, db, root, opts, length): if not os.path.exists(dirpath): raise - cdata = db.cover(id, index_is_id=True) - if opts.save_cover: - if cdata is not None: - with open(base_path+'.jpg', 'wb') as f: - f.write(cdata) - mi.cover = base_name+'.jpg' - else: - mi.cover = None + if opts.save_cover and cover and os.access(cover, os.R_OK): + with open(base_path+'.jpg', 'wb') as f: + with open(cover, 'rb') as s: + shutil.copyfileobj(s, f) + mi.cover = base_name+'.jpg' + else: + mi.cover = None if opts.write_opf: opf = metadata_to_opf(mi) with open(base_path+'.opf', 'wb') as f: f.write(opf) - if cdata is not None: - mi.cover_data = ('jpg', cdata) - mi.cover = None - written = False for fmt in formats: global plugboard_save_to_disk_value, plugboard_any_format_value dev_name = plugboard_save_to_disk_value - plugboards = db.prefs.get('plugboards', {}) cpb = None if fmt in plugboards: cpb = plugboards[fmt] @@ -275,11 +287,12 @@ def save_book_to_disk(id, db, root, opts, length): # Leave this here for a while, in case problems arise. if cpb is not None: prints('Save-to-disk using plugboard:', fmt, cpb) - data = db.format(id, fmt, index_is_id=True) - if data is None: + fp = format_map.get(fmt, None) + if fp is None: continue - else: - written = True + with open(fp, 'rb') as f: + data = f.read() + written = True if opts.update_metadata: stream = cStringIO.StringIO() stream.write(data) @@ -300,9 +313,21 @@ def save_book_to_disk(id, db, root, opts, length): with open(fmt_path, 'wb') as f: f.write(data) - return not written, id, mi.title + return not written, id_, mi.title +def _sanitize_args(root, opts): + if opts is None: + opts = config().parse() + if isinstance(root, unicode): + root = root.encode(filesystem_encoding) + root = os.path.abspath(root) + opts.template = preprocess_template(opts.template) + length = 1000 if supports_long_names(root) else 250 + length -= len(root) + if length < 5: + raise ValueError('%r is too long.'%root) + return root, opts, length def save_to_disk(db, ids, root, opts=None, callback=None): ''' @@ -316,17 +341,7 @@ def save_to_disk(db, ids, root, opts=None, callback=None): :return: A list of failures. Each element of the list is a tuple (id, title, traceback) ''' - if opts is None: - opts = config().parse() - if isinstance(root, unicode): - root = root.encode(filesystem_encoding) - root = os.path.abspath(root) - - opts.template = preprocess_template(opts.template) - length = 1000 if supports_long_names(root) else 250 - length -= len(root) - if length < 5: - raise ValueError('%r is too long.'%root) + root, opts, length = _sanitize_args(root, opts) failures = [] for x in ids: tb = '' @@ -343,4 +358,28 @@ def save_to_disk(db, ids, root, opts=None, callback=None): break return failures +def save_serialized_to_disk(ids, data, plugboards, root, opts, callback): + from calibre.ebooks.metadata.opf2 import OPF + root, opts, length = _sanitize_args(root, opts) + failures = [] + for x in ids: + opf, cover, format_map = data[x] + if isinstance(opf, unicode): + opf = opf.encode('utf-8') + mi = OPF(cStringIO.StringIO(opf)).to_book_metadata() + tb = '' + try: + failed, id, title = do_save_book_to_disk(x, mi, cover, plugboards, + format_map, root, opts, length) + tb = _('Requested formats not available') + except: + failed, id, title = True, x, mi.title + tb = traceback.format_exc() + if failed: + failures.append((id, title, tb)) + if callable(callback): + if not callback(int(id), title, failed, tb): + break + + return failures From f9a861b3c884bcf1a4a940d38acd1efbf2ac6d47 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 09:09:36 -0700 Subject: [PATCH 16/24] ... --- src/calibre/ebooks/metadata/worker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/metadata/worker.py b/src/calibre/ebooks/metadata/worker.py index 7dff988679..a7a8177176 100644 --- a/src/calibre/ebooks/metadata/worker.py +++ b/src/calibre/ebooks/metadata/worker.py @@ -271,6 +271,8 @@ class SaveWorker(Thread): break for job in jobs: + if not job.result: + continue for id_, title, ok, tb in job.result: if id_ in ids: self.result_queue.put((id_, title, ok, tb)) From f5b4029751afc18267b20794d14e3aa03c74aa08 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 09:44:45 -0700 Subject: [PATCH 17/24] ... --- src/calibre/ebooks/metadata/worker.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/metadata/worker.py b/src/calibre/ebooks/metadata/worker.py index a7a8177176..d059d7e34c 100644 --- a/src/calibre/ebooks/metadata/worker.py +++ b/src/calibre/ebooks/metadata/worker.py @@ -215,10 +215,8 @@ class SaveWorker(Thread): if isbytestring(cpath): cpath = cpath.decode(filesystem_encoding) formats = {} - fmts = self.db.formats(i, index_is_id=True, verify_formats=False) - if fmts: - fmts = fmts.split(',') - for fmt in fmts: + if mi.formats: + for fmt in mi.formats: fpath = self.db.format_abspath(i, fmt, index_is_id=True) if fpath is not None: if isbytestring(fpath): From 54c3ecced6c4fc86f656c18c183ad1d1464a4bf2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 10:00:25 -0700 Subject: [PATCH 18/24] ... --- src/calibre/gui2/add.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py index d1af2a6f0c..5f41f3a8e0 100644 --- a/src/calibre/gui2/add.py +++ b/src/calibre/gui2/add.py @@ -436,6 +436,10 @@ class Saver(QObject): # {{{ self.ids.remove(i) break if not self.callback_called: + try: + self.worker.join(1.5) + except: + pass # The worker was not yet started self.callback(self.worker.path, self.failures, self.worker.error) self.callback_called = True return From 2ee84bad3ad6e646a08a9bbca5712fa45d2b11a9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 10:08:18 -0700 Subject: [PATCH 19/24] ... --- src/calibre/utils/icu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index 6ae7398fb4..4b0f6d4821 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -56,7 +56,7 @@ def py_sort_key(obj): def icu_sort_key(collator, obj): if not obj: return _none2 - return collator.sort_key(obj.lower()) + return collator.sort_key(lower(obj)) def py_case_sensitive_sort_key(obj): if not obj: From 73f54f5e9dc2a6238f52aa352b2e198c5ed68cb4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 10:16:46 -0700 Subject: [PATCH 20/24] Fix #7812 (Bookmarks don't work.) --- src/calibre/utils/zipfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/utils/zipfile.py b/src/calibre/utils/zipfile.py index dbcc125274..5c19444bd6 100644 --- a/src/calibre/utils/zipfile.py +++ b/src/calibre/utils/zipfile.py @@ -1227,7 +1227,7 @@ class ZipFile: self.fp.flush() if zinfo.flag_bits & 0x08: # Write CRC and file sizes after the file data - self.fp.write(struct.pack("<lLL", zinfo.CRC, zinfo.compress_size, + self.fp.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size)) self.filelist.append(zinfo) self.NameToInfo[zinfo.filename] = zinfo From 10dc583d4bfce7ceea71cb707257484e5981db94 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 10:18:53 -0700 Subject: [PATCH 21/24] Fix #7815 (Device support for HTC Legend with Wordplayer) --- src/calibre/devices/android/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 0deef5eb92..46fad13a2a 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -21,7 +21,7 @@ class ANDROID(USBMS): # HTC 0x0bb4 : { 0x0c02 : [0x100, 0x0227, 0x0226], 0x0c01 : [0x100, 0x0227], 0x0ff9 : [0x0100, 0x0227, 0x0226], 0x0c87: [0x0100, 0x0227, 0x0226], - 0xc92 : [0x100]}, + 0xc92 : [0x100], 0xc97: [0x226]}, # Eken 0x040d : { 0x8510 : [0x0001] }, From c9510ba730224cc41f2acdd1dc343a40770d90c8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 10:20:41 -0700 Subject: [PATCH 22/24] Fix #7802 (Driver for Samsumg Epic?) --- src/calibre/devices/android/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 46fad13a2a..9c37b6ff59 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -63,7 +63,7 @@ class ANDROID(USBMS): WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE', '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897', 'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', - 'SCH-I500_CARD'] + 'SCH-I500_CARD', 'SPH-D700_CARD'] WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID'] From 51f48f0cb2923b41d6a69c46d803aabbd5b36cfc Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 11:00:25 -0700 Subject: [PATCH 23/24] ... --- src/calibre/utils/icu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index 51d9ac25ba..38542a44c6 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -237,8 +237,6 @@ static PyTypeObject icu_CollatorType = { // {{{ // }} -// }}} - // }}} // Module initialization {{{ From 2ed1365eb16d028e58218635cc838fb3617452ea Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 6 Dec 2010 12:20:15 -0700 Subject: [PATCH 24/24] Edit metadata dialog: When trying to download metadata, if there are multiple matches indicate which matches have a cover and summary in the list. Also add an option to automatically download the cover of the selected match. --- src/calibre/gui2/__init__.py | 2 + src/calibre/gui2/dialogs/fetch_metadata.py | 22 +- src/calibre/gui2/dialogs/fetch_metadata.ui | 351 ++++++++++---------- src/calibre/gui2/dialogs/metadata_single.py | 11 +- 4 files changed, 206 insertions(+), 180 deletions(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 57b914877d..57ca2a1880 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -123,6 +123,8 @@ def _config(): help=_('Download social metadata (tags/rating/etc.)')) c.add_opt('overwrite_author_title_metadata', default=True, help=_('Overwrite author and title with new metadata')) + c.add_opt('auto_download_cover', default=False, + help=_('Automatically download the cover, if available')) c.add_opt('enforce_cpu_limit', default=True, help=_('Limit max simultaneous jobs to number of CPUs')) c.add_opt('tag_browser_hidden_categories', default=set(), diff --git a/src/calibre/gui2/dialogs/fetch_metadata.py b/src/calibre/gui2/dialogs/fetch_metadata.py index 2c64219464..3da0e67e3d 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.py +++ b/src/calibre/gui2/dialogs/fetch_metadata.py @@ -9,7 +9,7 @@ from threading import Thread from PyQt4.QtCore import Qt, QObject, SIGNAL, QVariant, pyqtSignal, \ QAbstractTableModel, QCoreApplication, QTimer -from PyQt4.QtGui import QDialog, QItemSelectionModel +from PyQt4.QtGui import QDialog, QItemSelectionModel, QIcon from calibre.gui2.dialogs.fetch_metadata_ui import Ui_FetchMetadata from calibre.gui2 import error_dialog, NONE, info_dialog, config @@ -42,13 +42,14 @@ class Matches(QAbstractTableModel): def __init__(self, matches): self.matches = matches + self.yes_icon = QVariant(QIcon(I('ok.png'))) QAbstractTableModel.__init__(self) def rowCount(self, *args): return len(self.matches) def columnCount(self, *args): - return 6 + return 8 def headerData(self, section, orientation, role): if role != Qt.DisplayRole: @@ -61,6 +62,8 @@ class Matches(QAbstractTableModel): elif section == 3: text = _("Publisher") elif section == 4: text = _("ISBN") elif section == 5: text = _("Published") + elif section == 6: text = _("Has Cover") + elif section == 7: text = _("Has Summary") return QVariant(text) else: @@ -71,8 +74,8 @@ class Matches(QAbstractTableModel): def data(self, index, role): row, col = index.row(), index.column() + book = self.matches[row] if role == Qt.DisplayRole: - book = self.matches[row] res = None if col == 0: res = book.title @@ -90,6 +93,11 @@ class Matches(QAbstractTableModel): if not res: return NONE return QVariant(res) + elif role == Qt.DecorationRole: + if col == 6 and book.has_cover: + return self.yes_icon + if col == 7 and book.comments: + return self.yes_icon return NONE class FetchMetadata(QDialog, Ui_FetchMetadata): @@ -131,7 +139,7 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): self.fetch_metadata() self.opt_get_social_metadata.setChecked(config['get_social_metadata']) self.opt_overwrite_author_title_metadata.setChecked(config['overwrite_author_title_metadata']) - + self.opt_auto_download_cover.setChecked(config['auto_download_cover']) def show_summary(self, current, *args): row = current.row() @@ -213,6 +221,12 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): _hung_fetchers.add(self.fetcher) if hasattr(self, '_hangcheck') and self._hangcheck.isActive(): self._hangcheck.stop() + # Save value of auto_download_cover, since this is the only place it can + # be set. The values of the other options can be set in + # Preferences->Behavior and should not be set here as they affect bulk + # downloading as well. + if self.opt_auto_download_cover.isChecked() != config['auto_download_cover']: + config.set('auto_download_cover', self.opt_auto_download_cover.isChecked()) def __enter__(self, *args): return self diff --git a/src/calibre/gui2/dialogs/fetch_metadata.ui b/src/calibre/gui2/dialogs/fetch_metadata.ui index 03a362096c..b140fa158d 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.ui +++ b/src/calibre/gui2/dialogs/fetch_metadata.ui @@ -1,172 +1,179 @@ -<?xml version="1.0" encoding="UTF-8"?> -<ui version="4.0"> - <class>FetchMetadata</class> - <widget class="QDialog" name="FetchMetadata"> - <property name="windowModality"> - <enum>Qt::WindowModal</enum> - </property> - <property name="geometry"> - <rect> - <x>0</x> - <y>0</y> - <width>830</width> - <height>642</height> - </rect> - </property> - <property name="windowTitle"> - <string>Fetch metadata</string> - </property> - <property name="windowIcon"> - <iconset resource="../../../../resources/images.qrc"> - <normaloff>:/images/metadata.png</normaloff>:/images/metadata.png</iconset> - </property> - <layout class="QVBoxLayout"> - <item> - <widget class="QLabel" name="tlabel"> - <property name="text"> - <string><p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below.</string> - </property> - <property name="alignment"> - <set>Qt::AlignCenter</set> - </property> - <property name="wordWrap"> - <bool>true</bool> - </property> - <property name="openExternalLinks"> - <bool>true</bool> - </property> - </widget> - </item> - <item> - <layout class="QHBoxLayout"> - <item> - <widget class="QLabel" name="label_2"> - <property name="text"> - <string>&Access Key:</string> - </property> - <property name="buddy"> - <cstring>key</cstring> - </property> - </widget> - </item> - <item> - <widget class="QLineEdit" name="key"/> - </item> - <item> - <widget class="QPushButton" name="fetch"> - <property name="text"> - <string>Fetch</string> - </property> - </widget> - </item> - </layout> - </item> - <item> - <widget class="QLabel" name="warning"> - <property name="text"> - <string/> - </property> - <property name="wordWrap"> - <bool>true</bool> - </property> - </widget> - </item> - <item> - <widget class="QGroupBox" name="groupBox"> - <property name="title"> - <string>Matches</string> - </property> - <layout class="QVBoxLayout"> - <item> - <widget class="QLabel" name="label_3"> - <property name="text"> - <string>Select the book that most closely matches your copy from the list below</string> - </property> - </widget> - </item> - <item> - <widget class="QTableView" name="matches"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Expanding" vsizetype="Expanding"> - <horstretch>0</horstretch> - <verstretch>1</verstretch> - </sizepolicy> - </property> - <property name="alternatingRowColors"> - <bool>true</bool> - </property> - <property name="selectionMode"> - <enum>QAbstractItemView::SingleSelection</enum> - </property> - <property name="selectionBehavior"> - <enum>QAbstractItemView::SelectRows</enum> - </property> - </widget> - </item> - <item> - <widget class="QTextBrowser" name="summary"/> - </item> - </layout> - </widget> - </item> - <item> - <widget class="QCheckBox" name="opt_get_social_metadata"> - <property name="text"> - <string>Download &social metadata (tags/rating/etc.) for the selected book</string> - </property> - </widget> - </item> - <item> - <widget class="QCheckBox" name="opt_overwrite_author_title_metadata"> - <property name="text"> - <string>Overwrite author and title with author and title of selected book</string> - </property> - </widget> - </item> - <item> - <widget class="QDialogButtonBox" name="buttonBox"> - <property name="standardButtons"> - <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set> - </property> - </widget> - </item> - </layout> - </widget> - <resources> - <include location="../../../../resources/images.qrc"/> - </resources> - <connections> - <connection> - <sender>buttonBox</sender> - <signal>accepted()</signal> - <receiver>FetchMetadata</receiver> - <slot>accept()</slot> - <hints> - <hint type="sourcelabel"> - <x>460</x> - <y>599</y> - </hint> - <hint type="destinationlabel"> - <x>657</x> - <y>530</y> - </hint> - </hints> - </connection> - <connection> - <sender>buttonBox</sender> - <signal>rejected()</signal> - <receiver>FetchMetadata</receiver> - <slot>reject()</slot> - <hints> - <hint type="sourcelabel"> - <x>417</x> - <y>599</y> - </hint> - <hint type="destinationlabel"> - <x>0</x> - <y>491</y> - </hint> - </hints> - </connection> - </connections> -</ui> +<?xml version="1.0" encoding="UTF-8"?> +<ui version="4.0"> + <class>FetchMetadata</class> + <widget class="QDialog" name="FetchMetadata"> + <property name="windowModality"> + <enum>Qt::WindowModal</enum> + </property> + <property name="geometry"> + <rect> + <x>0</x> + <y>0</y> + <width>890</width> + <height>642</height> + </rect> + </property> + <property name="windowTitle"> + <string>Fetch metadata</string> + </property> + <property name="windowIcon"> + <iconset resource="../../../../resources/images.qrc"> + <normaloff>:/images/metadata.png</normaloff>:/images/metadata.png</iconset> + </property> + <layout class="QVBoxLayout"> + <item> + <widget class="QLabel" name="tlabel"> + <property name="text"> + <string><p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below.</string> + </property> + <property name="alignment"> + <set>Qt::AlignCenter</set> + </property> + <property name="wordWrap"> + <bool>true</bool> + </property> + <property name="openExternalLinks"> + <bool>true</bool> + </property> + </widget> + </item> + <item> + <layout class="QHBoxLayout"> + <item> + <widget class="QLabel" name="label_2"> + <property name="text"> + <string>&Access Key:</string> + </property> + <property name="buddy"> + <cstring>key</cstring> + </property> + </widget> + </item> + <item> + <widget class="QLineEdit" name="key"/> + </item> + <item> + <widget class="QPushButton" name="fetch"> + <property name="text"> + <string>Fetch</string> + </property> + </widget> + </item> + </layout> + </item> + <item> + <widget class="QLabel" name="warning"> + <property name="text"> + <string/> + </property> + <property name="wordWrap"> + <bool>true</bool> + </property> + </widget> + </item> + <item> + <widget class="QGroupBox" name="groupBox"> + <property name="title"> + <string>Matches</string> + </property> + <layout class="QVBoxLayout"> + <item> + <widget class="QLabel" name="label_3"> + <property name="text"> + <string>Select the book that most closely matches your copy from the list below</string> + </property> + </widget> + </item> + <item> + <widget class="QTableView" name="matches"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Expanding" vsizetype="Expanding"> + <horstretch>0</horstretch> + <verstretch>1</verstretch> + </sizepolicy> + </property> + <property name="alternatingRowColors"> + <bool>true</bool> + </property> + <property name="selectionMode"> + <enum>QAbstractItemView::SingleSelection</enum> + </property> + <property name="selectionBehavior"> + <enum>QAbstractItemView::SelectRows</enum> + </property> + </widget> + </item> + <item> + <widget class="QTextBrowser" name="summary"/> + </item> + </layout> + </widget> + </item> + <item> + <widget class="QCheckBox" name="opt_overwrite_author_title_metadata"> + <property name="text"> + <string>Overwrite author and title with author and title of selected book</string> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="opt_get_social_metadata"> + <property name="text"> + <string>Download &social metadata (tags/rating/etc.) for the selected book</string> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="opt_auto_download_cover"> + <property name="text"> + <string>Automatically download the cover, if available</string> + </property> + </widget> + </item> + <item> + <widget class="QDialogButtonBox" name="buttonBox"> + <property name="standardButtons"> + <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set> + </property> + </widget> + </item> + </layout> + </widget> + <resources> + <include location="../../../../resources/images.qrc"/> + </resources> + <connections> + <connection> + <sender>buttonBox</sender> + <signal>accepted()</signal> + <receiver>FetchMetadata</receiver> + <slot>accept()</slot> + <hints> + <hint type="sourcelabel"> + <x>460</x> + <y>599</y> + </hint> + <hint type="destinationlabel"> + <x>657</x> + <y>530</y> + </hint> + </hints> + </connection> + <connection> + <sender>buttonBox</sender> + <signal>rejected()</signal> + <receiver>FetchMetadata</receiver> + <slot>reject()</slot> + <hints> + <hint type="sourcelabel"> + <x>417</x> + <y>599</y> + </hint> + <hint type="destinationlabel"> + <x>0</x> + <y>491</y> + </hint> + </hints> + </connection> + </connections> +</ui> diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 8f068075cf..fec58a74f6 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -760,8 +760,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if book.publisher: self.publisher.setEditText(book.publisher) if book.isbn: self.isbn.setText(book.isbn) if book.pubdate: - d = book.pubdate - self.pubdate.setDate(QDate(d.year, d.month, d.day)) + dt = book.pubdate + self.pubdate.setDate(QDate(dt.year, dt.month, dt.day)) summ = book.comments if summ: prefix = unicode(self.comments.toPlainText()) @@ -777,8 +777,11 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.series.setText(book.series) if book.series_index is not None: self.series_index.setValue(book.series_index) - # Needed because of Qt focus bug on OS X - self.fetch_cover_button.setFocus(Qt.OtherFocusReason) + if book.has_cover: + if d.opt_auto_download_cover.isChecked() and book.has_cover: + self.fetch_cover() + else: + self.fetch_cover_button.setFocus(Qt.OtherFocusReason) else: error_dialog(self, _('Cannot fetch metadata'), _('You must specify at least one of ISBN, Title, '