From 7ff09a5d0c1357614bad3dbb7caf844bccfc3b24 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Nov 2010 12:40:56 -0700 Subject: [PATCH] Speedup for bibtex catalog generation. Only show HTML comment customization if the download plugin says it gets HTML comments --- src/calibre/ebooks/metadata/fetch.py | 15 ++-- src/calibre/ebooks/metadata/isbndb.py | 17 +--- src/calibre/library/catalog.py | 42 ++++----- src/calibre/utils/bibtex.py | 124 +++++++++++++------------- src/calibre/utils/mreplace.py | 32 +++++++ 5 files changed, 128 insertions(+), 102 deletions(-) create mode 100644 src/calibre/utils/mreplace.py diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index b6b3fb9c40..b797a477d6 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -145,18 +145,21 @@ class MetadataSource(Plugin): # {{{ setattr(w, '_'+x, cb) cb.setChecked(c.get(x, True)) w._layout.addWidget(cb) - - cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) - setattr(w, '_textcomments', cb) - cb.setChecked(c.get('textcomments', False)) - w._layout.addWidget(cb) + + if self.has_html_comments: + cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) + setattr(w, '_textcomments', cb) + cb.setChecked(c.get('textcomments', False)) + w._layout.addWidget(cb) return w def save_settings(self, w): dl_settings = {} - for x in ('rating', 'tags', 'comments', 'textcomments'): + for x in ('rating', 'tags', 'comments'): dl_settings[x] = getattr(w, '_'+x).isChecked() + if self.has_html_comments: + dl_settings['textcomments'] = getattr(w, '_textcomments').isChecked() c = self.config_store() c.set(self.name, dl_settings) if hasattr(w, '_sc'): diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 83cf6ee0ed..9169227326 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -90,10 +90,8 @@ def build_isbn(base_url, opts): return base_url + 'index1=isbn&value1='+opts.isbn def build_combined(base_url, opts): - query = '' - for e in (opts.title, opts.author, opts.publisher): - if e is not None: - query += ' ' + e + query = ' '.join([e for e in (opts.title, opts.author, opts.publisher) \ + if e is not None ]) query = query.strip() if len(query) == 0: raise ISBNDBError('You must specify at least one of --author, --title or --publisher') @@ -141,15 +139,8 @@ def create_books(opts, args, timeout=5.): print ('ISBNDB query: '+url) tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] - ans = [] - for x in tans: - add = True - for y in ans: - if y.isbn == x.isbn: - add = False - if add: - ans.append(x) - return ans + #remove duplicates ISBN + return list(dict((book.isbn, book) for book in tans).values()) def main(args=sys.argv): parser = option_parser() diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 19519d6d71..33525f6540 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -278,10 +278,10 @@ class BIBTEX(CatalogPlugin): from calibre.library.save_to_disk import preprocess_template #Bibtex functions - from calibre.utils.bibtex import bibtex_author_format, utf8ToBibtex, ValidateCitationKey + from calibre.utils.bibtex import BibTeX def create_bibtex_entry(entry, fields, mode, template_citation, - asccii_bibtex = True, citation_bibtex = True): + bibtexdict, citation_bibtex = True): #Bibtex doesn't like UTF-8 but keep unicode until writing #Define starting chain or if book valid strict and not book return a Fail string @@ -297,7 +297,8 @@ class BIBTEX(CatalogPlugin): if citation_bibtex : # Citation tag - bibtex_entry.append(make_bibtex_citation(entry, template_citation, asccii_bibtex)) + bibtex_entry.append(make_bibtex_citation(entry, template_citation, + bibtexdict)) bibtex_entry = [u' '.join(bibtex_entry)] for field in fields: @@ -312,11 +313,11 @@ class BIBTEX(CatalogPlugin): pass if field == 'authors' : - bibtex_entry.append(u'author = "%s"' % bibtex_author_format(item)) + bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item)) elif field in ['title', 'publisher', 'cover', 'uuid', 'author_sort', 'series'] : - bibtex_entry.append(u'%s = "%s"' % (field, utf8ToBibtex(item, asccii_bibtex))) + bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item))) elif field == 'id' : bibtex_entry.append(u'calibreid = "%s"' % int(item)) @@ -329,13 +330,13 @@ class BIBTEX(CatalogPlugin): elif field == 'tags' : #A list to flatten - bibtex_entry.append(u'tags = "%s"' % utf8ToBibtex(u', '.join(item), asccii_bibtex)) + bibtex_entry.append(u'tags = "%s"' % bibtexdict.utf8ToBibtex(u', '.join(item))) elif field == 'comments' : #\n removal item = item.replace(u'\r\n',u' ') item = item.replace(u'\n',u' ') - bibtex_entry.append(u'note = "%s"' % utf8ToBibtex(item, asccii_bibtex)) + bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item)) elif field == 'isbn' : # Could be 9, 10 or 13 digits @@ -353,8 +354,7 @@ class BIBTEX(CatalogPlugin): elif field == 'pubdate' : bibtex_entry.append(u'year = "%s"' % item.year) - bibtex_entry.append(u'month = "%s"' % utf8ToBibtex(strftime("%b", item), - asccii_bibtex)) + bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item))) bibtex_entry = u',\n '.join(bibtex_entry) bibtex_entry += u' }\n\n' @@ -371,7 +371,7 @@ class BIBTEX(CatalogPlugin): else : return True - def make_bibtex_citation(entry, template_citation, asccii_bibtex): + def make_bibtex_citation(entry, template_citation, bibtexclass): #define a function to replace the template entry by its value def tpl_replace(objtplname) : @@ -392,8 +392,9 @@ class BIBTEX(CatalogPlugin): return u'' if len(template_citation) >0 : - tpl_citation = utf8ToBibtex(ValidateCitationKey(re.sub(u'\{[^{}]*\}', - tpl_replace, template_citation)), asccii_bibtex) + tpl_citation = bibtexclass.utf8ToBibtex( + bibtexclass.ValidateCitationKey(re.sub(u'\{[^{}]*\}', + tpl_replace, template_citation))) if len(tpl_citation) >0 : return tpl_citation @@ -404,10 +405,7 @@ class BIBTEX(CatalogPlugin): else : template_citation = u'%s' % str(entry["id"]) - if asccii_bibtex : - return ValidateCitationKey(template_citation.encode('ascii', 'replace')) - else : - return ValidateCitationKey(template_citation) + return bibtexclass.ValidateCitationKey(template_citation) self.fmt = path_to_output.rpartition('.')[2] self.notification = notification @@ -475,13 +473,16 @@ class BIBTEX(CatalogPlugin): if not len(data): log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text) + #Initialize BibTeX class + bibtexc = BibTeX() + #Entries writing after Bibtex formating (or not) if bibfile_enc != 'ascii' : - asccii_bibtex = False + bibtexc.ascii_bibtex = False else : - asccii_bibtex = True + bibtexc.ascii_bibtex = True - #Check and go to default in case of bad CLI + #Check citation choice and go to default in case of bad CLI if isinstance(opts.impcit, (StringType, UnicodeType)) : if opts.impcit == 'False' : citation_bibtex= False @@ -493,6 +494,7 @@ class BIBTEX(CatalogPlugin): else : citation_bibtex= opts.impcit + #Preprocess for error and light correction template_citation = preprocess_template(opts.bib_cit) #Open output and write entries @@ -514,7 +516,7 @@ class BIBTEX(CatalogPlugin): for entry in data: outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation, - asccii_bibtex, citation_bibtex)) + bibtexc, citation_bibtex)) outfile.close() diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index 1328aa9157..d19a6b05fe 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """ Collection of python utility-methodes commonly used by other bibliograph packages. From http://pypi.python.org/pypi/bibliograph.core/ @@ -62,11 +60,14 @@ DAMAGE. """ -__docformat__ = 'reStructuredText' __author__ = 'sengian ' +__docformat__ = 'restructuredtext en' import re, string +from calibre.constants import preferred_encoding +from calibre.utils.mreplace import MReplace + utf8enc2latex_mapping = { # This is a mapping of Unicode characters to LaTeX equivalents. # The information has been extracted from @@ -2463,7 +2464,7 @@ utf8enc2latex_mapping = { u'\U0001d7fd': '$\\mathtt{7}$', u'\U0001d7fe': '$\\mathtt{8}$', u'\U0001d7ff': '$\\mathtt{9}$', - + #Items from simple list u'\u0106': "{\\a\\'C}", u'\u0408': '{\\CYRJE}', @@ -2842,69 +2843,66 @@ entity_mapping = { '"':'{"}', } -def ValidateCitationKey(text): - """ - removes characters not allowed in BibTeX keys +class BibTeX: + def __init__(self): + self.rep_utf8 = MReplace(utf8enc2latex_mapping) + self.rep_ent = MReplace(entity_mapping) + #Set default conversion to ASCII BibTeX + self.ascii_bibtex = True + # This substitution is based on the description of cite key restrictions at + # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html + self.invalid_cit = re.compile(u'[ "@\',\\#}{~%&$^]') + self.upper = re.compile(u'[' + + string.uppercase.decode(preferred_encoding) + u']') + self.escape = re.compile(u'[~#&%_]') - >>> from bibliograph.core.utils import _validKey - >>> _validKey(DummyEntry('Foo Bar')) - 'FooBar' + def ValidateCitationKey(self, text): + """ + removes characters not allowed in BibTeX keys + >>> ValidateCitationKey(DummyEntry('my@id')) + 'myid' + """ + return self.invalid_cit.sub(u'', text) - >>> _validKey(DummyEntry('my@id')) - 'myid' + def braceUppercase(self, text): + """ Convert uppercase letters to bibtex encoded uppercase + >>> braceUppercase('Foo Bar') + '{F}oo {B}ar' + """ + return self.upper.sub(lambda m: u'{%s}' % m.group(), text) - """ - # This substitution is based on the description of cite key restrictions at - # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html - return re.sub(u'[ "@\',\\#}{~%&$^]', u'', text) + def resolveEntities(self, text): + #for entity, entity_map in entity_mapping.iteritems(): + # text = text.replace(entity, entity_map) + #return text + return self.rep_ent.mreplace(text) -def BraceUppercase(text): - """ Convert uppercase letters to bibtex encoded uppercase + def resolveUnicode(self, text): + #UTF-8 text as entry + #for unichar, latexenc in utf8enc2latex_mapping.iteritems() : + # text = text.replace(unichar, latexenc) + text = self.rep_utf8.mreplace(text) + return text.replace(u'$}{$', u'') - >>> from bibliograph.core.utils import _braceUppercase - >>> _braceUppercase('foo bar') - 'foo bar' + def escapeSpecialCharacters(self, text): + """ + latex escaping some (not all) special characters + """ + text.replace('\\', '\\\\') + return self.escape.sub(lambda m: u'\\%s' % m.group(), text) - >>> _braceUppercase('Foo Bar') - '{F}oo {B}ar' - """ - for uc in string.uppercase: - text = text.replace(uc, u'{%s}' % uc) - return text + #Calibre functions + #Option to go to official ASCII Bibtex or unofficial UTF-8 + #Go from an unicode entry to ASCII Bibtex format without encoding + def utf8ToBibtex(self, text): + if len(text) == 0: + return '' + text.replace('\\', '\\\\') + text = self.resolveEntities(text) + if self.ascii_bibtex : + text = self.resolveUnicode(text) + return self.escapeSpecialCharacters(text) -def resolveEntities(text): - for entity, entity_map in entity_mapping.iteritems(): - text = text.replace(entity, entity_map) - return text - -def resolveUnicode(text): - #UTF-8 text as entry - for unichar, latexenc in utf8enc2latex_mapping.iteritems() : - text = text.replace(unichar, latexenc) - return text.replace(u'$}{$', u'') - -def escapeSpecialCharacters(text): - """ - latex escaping some (not all) special characters - """ - text.replace('\\', '\\\\') - escape = ['~', '#', '&', '%', '_'] - for c in escape: - text = text.replace(c, '\\' + c ) - return text - -#Calibre functions -#Go from an unicode entry to ASCII Bibtex format without encoding -#Option to go to official ASCII Bibtex or unofficial UTF-8 -def utf8ToBibtex(text, asccii_bibtex = True): - if len(text) == 0: - return '' - text.replace('\\', '\\\\') - text = resolveEntities(text) - if asccii_bibtex : - text = resolveUnicode(text) - return escapeSpecialCharacters(text) - -def bibtex_author_format(item): - #Format authors for Bibtex compliance (get a list as input) - return utf8ToBibtex(u' and'.join([author for author in item])) + def bibtex_author_format(self, item): + #Format authors for Bibtex compliance (get a list as input) + return self.utf8ToBibtex(u' and'.join([author for author in item])) diff --git a/src/calibre/utils/mreplace.py b/src/calibre/utils/mreplace.py new file mode 100644 index 0000000000..b9fbc0bded --- /dev/null +++ b/src/calibre/utils/mreplace.py @@ -0,0 +1,32 @@ +#multiple replace from dictionnary : http://code.activestate.com/recipes/81330/ +__license__ = 'GPL v3' +__copyright__ = '2010, sengian ' +__docformat__ = 'restructuredtext en' + +import re +from UserDict import UserDict + +class MReplace(UserDict): + def __init__(self, dict = None): + UserDict.__init__(self, dict) + self.re = None + self.regex = None + self.compile_regex() + + def compile_regex(self): + if len(self.data) > 0: + keys = sorted(self.data.keys(), key=len) + keys.reverse() + tmp = "(%s)" % "|".join(map(re.escape, keys)) + if self.re != tmp: + self.re = tmp + self.regex = re.compile(self.re) + + def __call__(self, mo): + return self[mo.string[mo.start():mo.end()]] + + def mreplace(self, text): + #Replace without regex compile + if len(self.data) < 1 or self.re is None: + return text + return self.regex.sub(self, text) \ No newline at end of file