From 2f5f2a9d335a77abaa97fe34ef86592c3acab5e3 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 26 Jul 2010 22:43:11 +0200 Subject: [PATCH 001/132] Bug correction: negative values of first line indent where converted to positive values causing a lot of formatting problems --- src/calibre/ebooks/rtf2xml/process_tokens.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 19a7d38135..9cb7c3c6a4 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -680,7 +680,7 @@ class ProcessTokens: return the_string def divide_num(self, numerator, denominator): try: - numerator = float(re.search('[0-9.]+', numerator).group()) + numerator = float(re.search('[0-9.\-]+', numerator).group()) #calibre why ignore negative number? Wrong in case of \fi except TypeError, msg: if self.__run_level > 3: msg = 'no number to process?\n' From a2702d99c29c2a2eb86c1f957141544f2e11399b Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 27 Jul 2010 19:33:12 +0200 Subject: [PATCH 002/132] Formatting --- resources/templates/rtf.xsl | 4 ---- src/calibre/ebooks/rtf/input.py | 7 ------- 2 files changed, 11 deletions(-) diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index bf016efaaf..ae054186d4 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -81,7 +81,6 @@ - @@ -182,14 +181,12 @@ - - unnamed @@ -386,7 +383,6 @@ - true true false diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 50f5571d58..df74a7b3cb 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -141,7 +141,6 @@ class RTFInput(InputFormatPlugin): return name - def write_inline_css(self, ic): font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in enumerate(ic.font_sizes)] @@ -152,17 +151,11 @@ class RTFInput(InputFormatPlugin): text-decoration: 
none; font-weight: normal; font-style: normal; font-variant: normal } - span.italics { font-style: italic } - span.bold { font-weight: bold } - span.small-caps { font-variant: small-caps } - span.underlined { text-decoration: underline } - span.strike-through { text-decoration: line-through } - ''') css += '\n'+'\n'.join(font_size_classes) css += '\n' +'\n'.join(color_classes) From 3cf9f7986a174a4404764790800272f2ecdf787d Mon Sep 17 00:00:00 2001 From: Sengian Date: Wed, 28 Jul 2010 00:47:31 +0200 Subject: [PATCH 003/132] Implementation of a multiple replace class based on Dict substitutions. Very fast for large dictionnaries. --- src/calibre/utils/mreplace.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 src/calibre/utils/mreplace.py diff --git a/src/calibre/utils/mreplace.py b/src/calibre/utils/mreplace.py new file mode 100644 index 0000000000..dff5fab578 --- /dev/null +++ b/src/calibre/utils/mreplace.py @@ -0,0 +1,32 @@ +#multiple replace from dictionnary : http://code.activestate.com/recipes/81330/ +__license__ = 'GPL v3' +__copyright__ = '2010, sengian ' +__docformat__ = 'restructuredtext en' + +import re +from UserDict import UserDict + +class MReplace(UserDict): + def __init__(self, dict = None): + UserDict.__init__(self, dict) + self.re = None + self.regex = None + self.compile_regex() + + def compile_regex(self): + if len(self.data) > 0: + keys = sorted(self.data.keys(), key=len) + keys.reverse() + tmp = "(%s)" % "|".join([re.escape(item) for item in keys]) + if self.re != tmp: + self.re = tmp + self.regex = re.compile(self.re) + + def __call__(self, mo): + return self[mo.string[mo.start():mo.end()]] + + def mreplace(self, text): + #Replace without regex compile + if len(self.data) < 1 or self.re is None: + return text + return self.regex.sub(self, text) \ No newline at end of file From 7ebf416513125cee88fc487aa3306a25e4ac6681 Mon Sep 17 00:00:00 2001 From: Sengian Date: Wed, 28 Jul 2010 00:49:37 +0200 Subject: 
[PATCH 004/132] Modifications of BIBTEX catalog generation: create a class for bibtex fonctions, use the new Mreplace fonction as the dictionnary is very large. Divide by 10 the total execution time. --- src/calibre/library/catalog.py | 41 ++++++----- src/calibre/utils/bibtex.py | 125 ++++++++++++++++----------------- 2 files changed, 85 insertions(+), 81 deletions(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index a540a8a660..5ee0683b87 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -270,10 +270,10 @@ class BIBTEX(CatalogPlugin): from calibre.library.save_to_disk import preprocess_template #Bibtex functions - from calibre.utils.bibtex import bibtex_author_format, utf8ToBibtex, ValidateCitationKey + from calibre.utils.bibtex import BibTeX def create_bibtex_entry(entry, fields, mode, template_citation, - asccii_bibtex = True, citation_bibtex = True): + bibtexdict, citation_bibtex = True): #Bibtex doesn't like UTF-8 but keep unicode until writing #Define starting chain or if book valid strict and not book return a Fail string @@ -289,7 +289,8 @@ class BIBTEX(CatalogPlugin): if citation_bibtex : # Citation tag - bibtex_entry.append(make_bibtex_citation(entry, template_citation, asccii_bibtex)) + bibtex_entry.append(make_bibtex_citation(entry, template_citation, + bibtexdict)) bibtex_entry = [u' '.join(bibtex_entry)] for field in fields: @@ -304,11 +305,11 @@ class BIBTEX(CatalogPlugin): pass if field == 'authors' : - bibtex_entry.append(u'author = "%s"' % bibtex_author_format(item)) + bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item)) elif field in ['title', 'publisher', 'cover', 'uuid', 'author_sort', 'series'] : - bibtex_entry.append(u'%s = "%s"' % (field, utf8ToBibtex(item, asccii_bibtex))) + bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item))) elif field == 'id' : bibtex_entry.append(u'calibreid = "%s"' % int(item)) @@ -321,13 +322,13 @@ class 
BIBTEX(CatalogPlugin): elif field == 'tags' : #A list to flatten - bibtex_entry.append(u'tags = "%s"' % utf8ToBibtex(u', '.join(item), asccii_bibtex)) + bibtex_entry.append(u'tags = "%s"' % bibtexdict.utf8ToBibtex(u', '.join(item))) elif field == 'comments' : #\n removal item = item.replace(u'\r\n',u' ') item = item.replace(u'\n',u' ') - bibtex_entry.append(u'note = "%s"' % utf8ToBibtex(item, asccii_bibtex)) + bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item)) elif field == 'isbn' : # Could be 9, 10 or 13 digits @@ -345,8 +346,7 @@ class BIBTEX(CatalogPlugin): elif field == 'pubdate' : bibtex_entry.append(u'year = "%s"' % item.year) - bibtex_entry.append(u'month = "%s"' % utf8ToBibtex(strftime("%b", item), - asccii_bibtex)) + bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item))) bibtex_entry = u',\n '.join(bibtex_entry) bibtex_entry += u' }\n\n' @@ -363,7 +363,7 @@ class BIBTEX(CatalogPlugin): else : return True - def make_bibtex_citation(entry, template_citation, asccii_bibtex): + def make_bibtex_citation(entry, template_citation, bibtexclass): #define a function to replace the template entry by its value def tpl_replace(objtplname) : @@ -384,8 +384,9 @@ class BIBTEX(CatalogPlugin): return u'' if len(template_citation) >0 : - tpl_citation = utf8ToBibtex(ValidateCitationKey(re.sub(u'\{[^{}]*\}', - tpl_replace, template_citation)), asccii_bibtex) + tpl_citation = bibtexclass.utf8ToBibtex( + bibtexclass.ValidateCitationKey(re.sub(u'\{[^{}]*\}', + tpl_replace, template_citation))) if len(tpl_citation) >0 : return tpl_citation @@ -397,9 +398,9 @@ class BIBTEX(CatalogPlugin): template_citation = u'%s' % str(entry["id"]) if asccii_bibtex : - return ValidateCitationKey(template_citation.encode('ascii', 'replace')) + return bibtexclass.ValidateCitationKey(template_citation.encode('ascii', 'replace')) else : - return ValidateCitationKey(template_citation) + return bibtexclass.ValidateCitationKey(template_citation) self.fmt = 
path_to_output.rpartition('.')[2] self.notification = notification @@ -467,13 +468,16 @@ class BIBTEX(CatalogPlugin): if not len(data): log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text) + #Initialize BibTeX class + bibtexc = BibTeX() + #Entries writing after Bibtex formating (or not) if bibfile_enc != 'ascii' : - asccii_bibtex = False + bibtexc.ascii_bibtex = False else : - asccii_bibtex = True + bibtexc.ascii_bibtex = True - #Check and go to default in case of bad CLI + #Check citation choice and go to default in case of bad CLI if isinstance(opts.impcit, (StringType, UnicodeType)) : if opts.impcit == 'False' : citation_bibtex= False @@ -485,6 +489,7 @@ class BIBTEX(CatalogPlugin): else : citation_bibtex= opts.impcit + #Preprocess for error and light correction template_citation = preprocess_template(opts.bib_cit) #Open output and write entries @@ -506,7 +511,7 @@ class BIBTEX(CatalogPlugin): for entry in data: outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation, - asccii_bibtex, citation_bibtex)) + bibtexc, citation_bibtex)) outfile.close() diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index f6e596e8f0..5b9193d16d 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """ Collection of python utility-methodes commonly used by other bibliograph packages. From http://pypi.python.org/pypi/bibliograph.core/ @@ -62,10 +60,14 @@ DAMAGE. """ -__docformat__ = 'reStructuredText' __author__ = 'sengian ' +__docformat__ = 'restructuredtext en' import re, string +from UserDict import UserDict + +from calibre.constants import preferred_encoding +from calibre.utils.mreplace import MReplace utf8enc2latex_mapping = { # This is a mapping of Unicode characters to LaTeX equivalents. 
@@ -2842,69 +2844,66 @@ entity_mapping = { '"':'{"}', } -def ValidateCitationKey(text): - """ - removes characters not allowed in BibTeX keys +class BibTeX: + def __init__(self): + self.rep_utf8 = MReplace(utf8enc2latex_mapping) + self.rep_ent = MReplace(entity_mapping) + #Set default conversion to ASCII BibTeX + self.ascii_bibtex = True + # This substitution is based on the description of cite key restrictions at + # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html + self.invalid_cit = re.compile(u'[ "@\',\\#}{~%&$^]') + self.upper = re.compile(u'[' + + string.uppercase.decode(preferred_encoding) + u']') + self.escape = re.compile(u'[~#&%_]') + + def ValidateCitationKey(self, text): + """ + removes characters not allowed in BibTeX keys + >>> ValidateCitationKey(DummyEntry('my@id')) + 'myid' + """ + return self.invalid_cit.sub(u'', text) - >>> from bibliograph.core.utils import _validKey - >>> _validKey(DummyEntry('Foo Bar')) - 'FooBar' + def braceUppercase(self, text): + """ Convert uppercase letters to bibtex encoded uppercase + >>> braceUppercase('Foo Bar') + '{F}oo {B}ar' + """ + return self.upper.sub(lambda m: u'{%s}' % m.group(), text) - >>> _validKey(DummyEntry('my@id')) - 'myid' + def resolveEntities(self, text): + #for entity, entity_map in entity_mapping.iteritems(): + # text = text.replace(entity, entity_map) + #return text + return self.rep_ent.mreplace(text) - """ - # This substitution is based on the description of cite key restrictions at - # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html - return re.sub(u'[ "@\',\\#}{~%&$^]', u'', text) + def resolveUnicode(self, text): + #UTF-8 text as entry + #for unichar, latexenc in utf8enc2latex_mapping.iteritems() : + # text = text.replace(unichar, latexenc) + text = self.rep_utf8.mreplace(text) + return text.replace(u'$}{$', u'') -def BraceUppercase(text): - """ Convert uppercase letters to bibtex encoded uppercase + def escapeSpecialCharacters(self, text): + """ + latex escaping some 
(not all) special characters + """ + text.replace('\\', '\\\\') + return self.escape.sub(lambda m: u'\\%s' % m.group(), text) - >>> from bibliograph.core.utils import _braceUppercase - >>> _braceUppercase('foo bar') - 'foo bar' + #Calibre functions + #Option to go to official ASCII Bibtex or unofficial UTF-8 + #Go from an unicode entry to ASCII Bibtex format without encoding + def utf8ToBibtex(self, text): + if len(text) == 0: + return '' + text.replace('\\', '\\\\') + text = self.resolveEntities(text) + if self.ascii_bibtex : + text = self.resolveUnicode(text) + return self.escapeSpecialCharacters(text) - >>> _braceUppercase('Foo Bar') - '{F}oo {B}ar' - """ - for uc in string.uppercase: - text = text.replace(uc, u'{%s}' % uc) - return text - -def resolveEntities(text): - for entity, entity_map in entity_mapping.iteritems(): - text = text.replace(entity, entity_map) - return text - -def resolveUnicode(text): - #UTF-8 text as entry - for unichar, latexenc in utf8enc2latex_mapping.iteritems() : - text = text.replace(unichar, latexenc) - return text.replace(u'$}{$', u'') - -def escapeSpecialCharacters(text): - """ - latex escaping some (not all) special characters - """ - text.replace('\\', '\\\\') - escape = ['~', '#', '&', '%', '_'] - for c in escape: - text = text.replace(c, '\\' + c ) - return text - -#Calibre functions -#Go from an unicode entry to ASCII Bibtex format without encoding -#Option to go to official ASCII Bibtex or unofficial UTF-8 -def utf8ToBibtex(text, asccii_bibtex = True): - if len(text) == 0: - return '' - text.replace('\\', '\\\\') - text = resolveEntities(text) - if asccii_bibtex : - text = resolveUnicode(text) - return escapeSpecialCharacters(text) - -def bibtex_author_format(item): - #Format authors for Bibtex compliance (get a list as input) - return utf8ToBibtex(u' and'.join([author for author in item])) + def bibtex_author_format(self, item): + #Format authors for Bibtex compliance (get a list as input) + return self.utf8ToBibtex(u' 
and'.join([author for author in item])) From 8512f57866262b66f4cd542ac96cccf2b9c05737 Mon Sep 17 00:00:00 2001 From: Sengian Date: Wed, 28 Jul 2010 23:08:02 +0200 Subject: [PATCH 005/132] Check if RTF is asccii early. Will be effactive after preprocess integration in rtf2xml. --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 7b89407f79..f494b7a9c1 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -17,7 +17,8 @@ ######################################################################### # $Revision: 1.41 $ # $Date: 2006/03/24 23:50:07 $ -import sys,os +import sys, os, codecs + from calibre.ebooks.rtf2xml import headings_to_sections, \ line_endings, footnote, fields_small, default_encoding, \ make_lists, preamble_div, header, colors, group_borders, \ @@ -90,7 +91,6 @@ class ParseRtf: out_file = '', out_dir = None, dtd = '', - #debug = 0, #why? calibre deb_dir = None, convert_symbol = None, convert_wingdings = None, @@ -107,6 +107,7 @@ class ParseRtf: no_dtd = 0, char_data = '', ): + """ Requires: 'file' --file to parse @@ -125,14 +126,16 @@ class ParseRtf: through a file. Only for debugging. 
Returns: Nothing """ + self.__file = in_file self.__out_file = out_file self.__out_dir = out_dir self.__temp_dir = out_dir self.__dtd_path = dtd self.__check_file(in_file,"file_to_parse") + self.__check_ascii(in_file) self.__char_data = char_data - self.__debug_dir = deb_dir #self.__debug_dir = debug calibre + self.__debug_dir = deb_dir self.__check_dir(self.__temp_dir) self.__copy = self.__check_dir(self.__debug_dir) self.__convert_caps = convert_caps @@ -149,19 +152,17 @@ class ParseRtf: self.__group_borders = group_borders self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd - def __check_file(self, the_file, type): """Check to see if files exist""" if hasattr(the_file, 'read'): return if the_file == None: if type == "file_to_parse": - message = "You must provide a file for the script to work" - msg = message + msg = "\nYou must provide a file for the script to work" raise RtfInvalidCodeException, msg elif os.path.exists(the_file): pass # do nothing else: - message = "The file '%s' cannot be found" % the_file + message = "\nThe file '%s' cannot be found" % the_file msg = message raise RtfInvalidCodeException, msg def __check_dir(self, the_dir): @@ -170,7 +171,16 @@ class ParseRtf: return dir_exists = os.path.isdir(the_dir) if not dir_exists: - message = "%s is not a directory" % the_dir + msg = "\n%s is not a directory" % the_dir + raise RtfInvalidCodeException, msg + return 1 + def __check_ascii(self, the_file): + """Check to see if the file is correct ascii""" + try: + test = codecs.open(the_file, 'r', 'ascii', 'strict') + test.close() + except UnicodeError: + message= "\n%s is not a correct ascii file" % the_file msg = message raise RtfInvalidCodeException, msg return 1 From 09c8f13a1f17c869d06ace0d6cf76f0ff9b3fdc7 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 31 Jul 2010 10:47:12 +0200 Subject: [PATCH 006/132] Global overhaul of rtf2xml : RTF fixes (1) --- src/calibre/ebooks/rtf/input.py | 1 + src/calibre/ebooks/rtf2xml/ParseRtf.py | 53 
++++++++------------ src/calibre/ebooks/rtf2xml/check_brackets.py | 10 ++-- src/calibre/ebooks/rtf2xml/line_endings.py | 52 ++++++++----------- src/calibre/ebooks/rtf2xml/process_tokens.py | 2 - src/calibre/ebooks/rtf2xml/tokenize.py | 6 +-- src/calibre/ebooks/txt/processor.py | 5 +- 7 files changed, 52 insertions(+), 77 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index df74a7b3cb..2622d82d99 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -50,6 +50,7 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, + #deb_dir = 'I:\\Calibre\\rtfdebug', # Convert symbol fonts to unicode equivalents. Default # is 1 convert_symbol = 1, diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index f494b7a9c1..3a804792c5 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -143,7 +143,7 @@ class ParseRtf: self.__convert_wingdings = convert_wingdings self.__convert_zapf = convert_zapf self.__run_level = run_level - self.__exit_level = 0 + #self.__exit_level = 0 self.__indent = indent self.__replace_illegals = replace_illegals self.__form_lists = form_lists @@ -162,8 +162,7 @@ class ParseRtf: elif os.path.exists(the_file): pass # do nothing else: - message = "\nThe file '%s' cannot be found" % the_file - msg = message + msg = "\nThe file '%s' cannot be found" % the_file raise RtfInvalidCodeException, msg def __check_dir(self, the_dir): """Check to see if directory exists""" @@ -180,8 +179,7 @@ class ParseRtf: test = codecs.open(the_file, 'r', 'ascii', 'strict') test.close() except UnicodeError: - message= "\n%s is not a correct ascii file" % the_file - msg = message + msg = "\n%s is not a correct ascii file" % the_file raise RtfInvalidCodeException, msg return 1 def parse_rtf(self): @@ -204,27 +202,29 @@ class ParseRtf: copy_obj.set_dir(self.__debug_dir) copy_obj.remove_files() 
copy_obj.copy_file(self.__temp_file, "original_file") - # new as of 2005-08-02. Do I want this? + # Function to check if bracket are well handled if self.__debug_dir or self.__run_level > 2: self.__check_brack_obj = check_brackets.CheckBrackets\ (file = self.__temp_file, bug_handler = RtfInvalidCodeException, ) - # convert Macintosh line endings to Unix line endings + # convert Macintosh and Windows line endings to Unix line endings + #why do this if you don't wb after? line_obj = line_endings.FixLineEndings( in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, copy = self.__copy, - run_level = self.__run_level, + #run_level = self.__run_level, replace_illegals = self.__replace_illegals, ) - return_value = line_obj.fix_endings() - self.__return_code(return_value) + line_obj.fix_endings() + #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it? + #self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, in_file = self.__temp_file, - copy = self.__copy, - run_level = self.__run_level,) + copy = self.__copy,) + #run_level = self.__run_level,) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file = self.__temp_file, @@ -529,36 +529,27 @@ class ParseRtf: ) output_obj.output() os.remove(self.__temp_file) - return self.__exit_level + #return self.__exit_level def __bracket_match(self, file_name): if self.__run_level > 2: good_br, msg = self.__check_brack_obj.check_brackets() if good_br: pass - # sys.stderr.write( msg + ' in ' + file_name + "\n") + #sys.stderr.write( msg + ' in ' + file_name + "\n") else: msg += msg + " in file '" + file_name + "'\n" raise RtfInvalidCodeException, msg - def __return_code(self, num): - if num == None: - return - if int(num) > self.__exit_level: - self.__exit_level = num + #def __return_code(self, num): calibre not used + # if num == None: + # return + # if int(num) > self.__exit_level: + # self.__exit_level = num 
def __make_temp_file(self,file): """Make a temporary file to parse""" write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') write_obj = open(write_file, 'w') - line = "dummy" - while line: - line = read_obj.read(1000) - write_obj.write(line ) + for line in read_obj: + write_obj.write(line) write_obj.close() - return write_file - """ -mi1\n -mi33\n -mi 0: length_byte = len(txt.encode('utf-8')) From 3405615e54da2f2aa7345d1f51525acd250cbd91 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 31 Jul 2010 13:15:47 +0200 Subject: [PATCH 007/132] Remove invalid ASCII characters from plain text files --- src/calibre/ebooks/txt/input.py | 3 ++- src/calibre/ebooks/txt/processor.py | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index b444bf1cf4..935a187d5d 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -57,6 +57,7 @@ class TXTInput(InputFormatPlugin): txt = preserve_spaces(txt) txt = _ent_pat.sub(xml_entity_to_unicode, txt) + txt = txt.encode('utf-8') if options.markdown: log.debug('Running text though markdown conversion...') @@ -79,7 +80,7 @@ class TXTInput(InputFormatPlugin): base = os.path.dirname(stream.name) htmlfile = open(os.path.join(base, 'temp_calibre_txt_input_to_html.html'), 'wb') - htmlfile.write(html.encode('utf-8')) + htmlfile.write(html) #html.encode('utf-8') htmlfile.close() cwd = os.getcwdu() odi = options.debug_pipeline diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 91c274a7b1..6bd635b6df 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -19,7 +19,7 @@ HTML_TEMPLATE = u' ] - + @@ -294,7 +294,7 @@ - + From 1f237c99bfe5bb875f4dc384b4b80938967d7ae9 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 31 Jul 2010 20:01:54 +0200 Subject: [PATCH 010/132] Change in the convert to bibtex 
reference for euro symbol --- src/calibre/utils/bibtex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index 5b9193d16d..09868ccdb1 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -80,7 +80,7 @@ utf8enc2latex_mapping = { #Fix some encoding problem between cp1252 and latin1 # from http://www.microsoft.com/typography/unicode/1252.htm - u'\x80': '{\\mbox{\\texteuro}}', # EURO SIGN + u'\x80': '{\\texteuro}', # EURO SIGN u'\x82': '{,}', # SINGLE LOW-9 QUOTATION MARK u'\x83': '$f$', # LATIN SMALL LETTER F WITH HOOK u'\x84': '{,,}', # DOUBLE LOW-9 QUOTATION MARK @@ -746,7 +746,7 @@ utf8enc2latex_mapping = { u'\u205f': '{\\mkern4mu}', u'\u2060': '{\\nolinebreak}', u'\u20a7': '{\\ensuremath{\\Elzpes}}', - u'\u20ac': '{\\mbox{\\texteuro}}', + u'\u20ac': '{\\texteuro}', u'\u20db': '$\\dddot$', u'\u20dc': '$\\ddddot$', u'\u2102': '$\\mathbb{C}$', From 2eb20249319e551f41d4d721c831e3e64abaf72c Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 10 Aug 2010 12:38:59 +0200 Subject: [PATCH 011/132] Merge from trunk --- resources/catalog/stylesheet.css | 142 +++---- resources/content_server/gui.css | 163 ++++---- resources/content_server/index.html | 103 ++--- resources/content_server/mobile.css | 91 ++--- resources/templates/html.css | 361 ++++++++--------- setup/installer/windows/en-us.xml | 19 +- setup/installer/windows/wix-template.xml | 267 ++++++------- src/calibre/ebooks/lrf/html/demo/demo.html | 440 +++++++++++++-------- src/calibre/manual/templates/layout.html | 24 +- src/calibre/manual/xpath.xhtml | 30 +- 10 files changed, 871 insertions(+), 769 deletions(-) diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index 4f9ca9ac41..ea01aeb43b 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -1,102 +1,104 @@ -body { background-color: white; } +body { + background-color: white; +} -p.title { - 
margin-top:0em; - margin-bottom:1em; - text-align:center; - font-style:italic; - font-size:xx-large; - border-bottom: solid black 4px; - } +p.title { + margin-top: 0em; + margin-bottom: 1em; + text-align: center; + font-style: italic; + font-size: xx-large; + border-bottom: solid black 4px; +} p.author { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:large; - } + font-size: large; +} p.tags { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:small; - } + font-size: small; +} p.description { - text-align:left; - font-style:normal; + text-align: left; + font-style: normal; margin-top: 0em; - } +} p.date_index { - font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.letter_index { - font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.author_index { - font-size:large; - text-align:left; - margin-top:0px; - margin-bottom:0px; + font-size: large; + text-align: left; + margin-top: 0px; + margin-bottom: 0px; text-indent: 0em; - } +} p.series { text-align: left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.read_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.unread_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + 
margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.date_read { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:6em; - text-indent:-6em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 6em; + text-indent: -6em; +} hr.series_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} hr.annotations_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} \ No newline at end of file diff --git a/resources/content_server/gui.css b/resources/content_server/gui.css index 1bcc4e1eb0..d7a3eda51e 100644 --- a/resources/content_server/gui.css +++ b/resources/content_server/gui.css @@ -1,142 +1,157 @@ body { - background-color: white; + background-color: white; } #banner { - position: absolute; - left: 5px; top: 0px; + position: absolute; + left: 5px; + top: 0px; } /* Search bar */ #search_box { - width: 201px; - height: 31px; - background: url(bg_search_box.png); - top: 5px; right: 20px; - position: absolute; + width: 201px; + height: 31px; + background: url(bg_search_box.png); + top: 5px; + right: 20px; + position: absolute; } + #search_box #s { - float: left; - padding: 0; - margin: 6px 0 0 6px; - border-width: 0px; - font-size: 16px; - width: 159px; - background: transparent; + float: left; + padding: 0; + margin: 6px 0 0 6px; + border-width: 0px; + font-size: 16px; + width: 159px; + background: transparent; } + #search_box #go { - float: right; - margin: 3px 4px 0 0; + float: right; + margin: 3px 4px 0 0; } /* Count bar */ #count_bar { - position: absolute; - right: 30px; - top: 80px; - font-size:smaller; - padding-bottom: 5px; + position: absolute; + right: 30px; + top: 80px; + font-size: smaller; + padding-bottom: 5px; } #count_bar * img { - cursor: pointer; + cursor: pointer; } -#count { 
cursor: default;} +#count { + cursor: default; +} /* Styles for the book list */ #main { - width:95%; - overflow: auto; - border: solid thin black; - position: absolute; - top: 115px; left: 10px; - z-index: 1; + width: 95%; + overflow: auto; + border: solid thin black; + position: absolute; + top: 115px; + left: 10px; + z-index: 1; } table#book_list thead tr td { - width: 100%; - padding-right: 1em; padding-left: 1em; - text-align: center; - font-weight: bold; - font-size: 130%; - border-bottom: thick solid black; - border-top: thick solid black; - cursor: pointer; - font-family: serif; - padding-top: 0.5ex; padding-bottom: 0.5ex; + width: 100%; + padding-right: 1em; + padding-left: 1em; + text-align: center; + font-weight: bold; + font-size: 130%; + border-bottom: thick solid black; + border-top: thick solid black; + cursor: pointer; + font-family: serif; + padding-top: 0.5ex; + padding-bottom: 0.5ex; } table#book_list tbody tr td { - padding-right: 1em; padding-left: 1em; - /*border-bottom: thin solid black;*/ - padding-bottom: 0.7ex; padding-top: 0.7ex; - margin: 0pt; - cursor: pointer; - + padding-right: 1em; + padding-left: 1em; + /*border-bottom: thin solid black;*/ + padding-bottom: 0.7ex; + padding-top: 0.7ex; + margin: 0pt; + cursor: pointer; } table#book_list * .sort_indicator { - visibility:hidden; - color: #9f9f9f; + visibility: hidden; + color: #9f9f9f; } table#book_list * .rating { - color: #3fbbe4; + color: #3fbbe4; } table#book_list * span.subtitle { - font-size: smaller; + font-size: smaller; } table#book_list * a.format { - text-decoration: none; - color: blue; - font-family: monospace; + text-decoration: none; + color: blue; + font-family: monospace; } table#book_list * a.format:hover { - color: red; + color: red; } table#book_list * a.format:visited { - color: blue; + color: blue; } table#book_list * .comments { - font-size: smaller; - display: none; + font-size: smaller; + display: none; } + /* Loading message */ #loading { - top: 10px; left: 
10px; - position: absolute; - font-size: 160%; font-family: monospace; - text-align: center; - visibility: hidden; - z-index: 10000; - background-color: #aaaaaa; - opacity: 0.8; - + top: 10px; + left: 10px; + position: absolute; + font-size: 160%; + font-family: monospace; + text-align: center; + visibility: hidden; + z-index: 10000; + background-color: #aaaaaa; + opacity: 0.8; } #loading div { - top: 50%; position: relative; + top: 50%; + position: relative; } #cover_pane { - overflow: auto; - position: absolute; - visibility: hidden; - text-align: right; - z-index: 2; - margin: 0pt; padding: 0pt; border-width: 0pt; -} + overflow: auto; + position: absolute; + visibility: hidden; + text-align: right; + z-index: 2; + margin: 0pt; + padding: 0pt; + border-width: 0pt; +} \ No newline at end of file diff --git a/resources/content_server/index.html b/resources/content_server/index.html index f9f0aff491..ff11acc719 100644 --- a/resources/content_server/index.html +++ b/resources/content_server/index.html @@ -1,49 +1,60 @@ - - - calibre library - - - - - - - - - - - -
- Show first set of books Show previous set of books              Show next set of books Show last set of books -
- -
- - - - - - - -
-
- -
-
- Loading... Loading… -
-
- -
- -
- + + +calibre library + + + + + + + + + + + +
Show first set of books Show previous set of books              Show next set of books Show last set of books
+ +
+ + + + + + + +
+
+ +
+
Loading... Loading… +
+
+ +
+ diff --git a/resources/content_server/mobile.css b/resources/content_server/mobile.css index 9be755b954..e3a4b58422 100644 --- a/resources/content_server/mobile.css +++ b/resources/content_server/mobile.css @@ -1,83 +1,78 @@ /* CSS for the mobile version of the content server webpage */ - .navigation table.buttons { - width: 100%; + width: 100%; } .navigation .button { - width: 50%; + width: 50%; } -.button a, .button:visited a { - padding: 0.5em; - font-size: 1.25em; - border: 1px solid black; - text-color: black; - background-color: #ddd; - border-top: 1px solid ThreeDLightShadow; - border-right: 1px solid ButtonShadow; - border-bottom: 1px solid ButtonShadow; - border-left: 1 px solid ThreeDLightShadow; - -moz-border-radius: 0.25em; - -webkit-border-radius: 0.25em; +.button a,.button:visited a { + padding: 0.5em; + font-size: 1.25em; + border: 1px solid black; + text-color: black; + background-color: #ddd; + border-top: 1px solid ThreeDLightShadow; + border-right: 1px solid ButtonShadow; + border-bottom: 1px solid ButtonShadow; + border-left: 1 px solid ThreeDLightShadow; + -moz-border-radius: 0.25em; + -webkit-border-radius: 0.25em; } .button:hover a { - border-top: 1px solid #666; - border-right: 1px solid #CCC; - border-bottom: 1 px solid #CCC; - border-left: 1 px solid #666; - - + border-top: 1px solid #666; + border-right: 1px solid #CCC; + border-bottom: 1 px solid #CCC; + border-left: 1 px solid #666; } div.navigation { - padding-bottom: 1em; - clear: both; + padding-bottom: 1em; + clear: both; } #search_box { - border: 1px solid #393; - -moz-border-radius: 0.5em; - -webkit-border-radius: 0.5em; - padding: 1em; - margin-bottom: 0.5em; - float: right; + border: 1px solid #393; + -moz-border-radius: 0.5em; + -webkit-border-radius: 0.5em; + padding: 1em; + margin-bottom: 0.5em; + float: right; } #listing { - width: 100%; - border-collapse: collapse; + width: 100%; + border-collapse: collapse; } + #listing td { - padding: 0.25em; + padding: 0.25em; } 
#listing td.thumbnail { - height: 60px; - width: 60px; + height: 60px; + width: 60px; } #listing tr:nth-child(even) { - - background: #eee; + background: #eee; } -#listing .button a{ - display: inline-block; - width: 2.5em; - padding-left: 0em; - padding-right: 0em; - overflow: hidden; - text-align: center; +#listing .button a { + display: inline-block; + width: 2.5em; + padding-left: 0em; + padding-right: 0em; + overflow: hidden; + text-align: center; } #logo { - float: left; + float: left; } #spacer { - clear: both; -} - - + clear: both; +} \ No newline at end of file diff --git a/resources/templates/html.css b/resources/templates/html.css index e9b683ca34..448ec596b9 100644 --- a/resources/templates/html.css +++ b/resources/templates/html.css @@ -34,380 +34,367 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ +@ +namespace url (http: //www.w3.org /1999/xhtml); + @namespace svg url (http: //www.w3.org /2000/svg); + /* blocks */ -@namespace url(http://www.w3.org/1999/xhtml); -@namespace svg url(http://www.w3.org/2000/svg); - -/* blocks */ - -html, div, map, dt, isindex, form { - display: block; +html,div,map,dt,isindex,form { + display: block; } body { - display: block; + display: block; } -p, dl, multicol { - display: block; - margin: 1em 0; +p,dl,multicol { + display: block; + margin: 1em 0; } dd { - display: block; - margin-left: 40px; + display: block; + margin-left: 40px; } blockquote { - display: block; - margin: 1em; + display: block; + margin: 1em; } address { - display: block; - font-style: italic; + display: block; + font-style: italic; } center { - display: block; - text-align: center; + display: block; + text-align: center; } blockquote[type=cite] { - display: block; - margin: 1em 0em; - border-color: blue; - border-width: thin; + display: block; + margin: 1em 0em; + border-color: blue; + border-width: thin; } span[_moz_quote=true] { - color: blue; + color: blue; } pre[_moz_quote=true] { - color: blue; + 
color: blue; } h1 { - display: block; - font-size: 2em; - font-weight: bold; - margin: .67em 0; + display: block; + font-size: 2em; + font-weight: bold; + margin: .67em 0; } h2 { - display: block; - font-size: 1.5em; - font-weight: bold; - margin: .83em 0; + display: block; + font-size: 1.5em; + font-weight: bold; + margin: .83em 0; } h3 { - display: block; - font-size: 1.17em; - font-weight: bold; - margin: 1em 0; + display: block; + font-size: 1.17em; + font-weight: bold; + margin: 1em 0; } h4 { - display: block; - font-weight: bold; - margin: 1.33em 0; + display: block; + font-weight: bold; + margin: 1.33em 0; } h5 { - display: block; - font-size: 0.83em; - font-weight: bold; - margin: 1.67em 0; + display: block; + font-size: 0.83em; + font-weight: bold; + margin: 1.67em 0; } h6 { - display: block; - font-size: 0.67em; - font-weight: bold; - margin: 2.33em 0; + display: block; + font-size: 0.67em; + font-weight: bold; + margin: 2.33em 0; } listing { - display: block; - font-family: monospace; - font-size: medium; - white-space: pre; - margin: 1em 0; + display: block; + font-family: monospace; + font-size: medium; + white-space: pre; + margin: 1em 0; } -xmp, pre, plaintext { - display: block; - font-family: monospace; - white-space: pre; - margin: 1em 0; +xmp,pre,plaintext { + display: block; + font-family: monospace; + white-space: pre; + margin: 1em 0; } /* tables */ - table { - display: table; - border-spacing: 2px; - border-collapse: separate; - margin-top: 0; - margin-bottom: 0; - text-indent: 0; + display: table; + border-spacing: 2px; + border-collapse: separate; + margin-top: 0; + margin-bottom: 0; + text-indent: 0; } table[align="left"] { - float: left; + float: left; } table[align="right"] { - float: right; + float: right; } -table[rules]:not([rules="none"]) { - border-collapse: collapse; +table[rules]:not ([rules="none"] ) { + border-collapse: collapse; } - -/* caption inherits from table not table-outer */ + +/* caption inherits from table not 
table-outer */ caption { - display: table-caption; - text-align: center; + display: table-caption; + text-align: center; } -table[align="center"] > caption { - margin-left: auto; - margin-right: auto; +table[align="center"]>caption { + margin-left: auto; + margin-right: auto; } -table[align="center"] > caption[align="left"] { - margin-right: 0; +table[align="center"]>caption[align="left"] { + margin-right: 0; } -table[align="center"] > caption[align="right"] { - margin-left: 0; +table[align="center"]>caption[align="right"] { + margin-left: 0; } tr { - display: table-row; - vertical-align: inherit; + display: table-row; + vertical-align: inherit; } col { - display: table-column; + display: table-column; } colgroup { - display: table-column-group; + display: table-column-group; } tbody { - display: table-row-group; - vertical-align: middle; + display: table-row-group; + vertical-align: middle; } thead { - display: table-header-group; - vertical-align: middle; + display: table-header-group; + vertical-align: middle; } tfoot { - display: table-footer-group; - vertical-align: middle; + display: table-footer-group; + vertical-align: middle; } /* for XHTML tables without tbody */ -table > tr { - vertical-align: middle; +table>tr { + vertical-align: middle; } -td { - display: table-cell; - vertical-align: inherit; - text-align: inherit; - padding: 1px; +td { + display: table-cell; + vertical-align: inherit; + text-align: inherit; + padding: 1px; } th { - display: table-cell; - vertical-align: inherit; - font-weight: bold; - padding: 1px; + display: table-cell; + vertical-align: inherit; + font-weight: bold; + padding: 1px; } /* inlines */ - -b, strong { - font-weight: bolder; +b,strong { + font-weight: bolder; } -i, cite, em, var, dfn { - font-style: italic; +i,cite,em,var,dfn { + font-style: italic; } -tt, code, kbd, samp { - font-family: monospace; +tt,code,kbd,samp { + font-family: monospace; } -u, ins { - text-decoration: underline; +u,ins { + text-decoration: 
underline; } -s, strike, del { - text-decoration: line-through; +s,strike,del { + text-decoration: line-through; } blink { - text-decoration: blink; + text-decoration: blink; } big { - font-size: larger; + font-size: larger; } small { - font-size: smaller; + font-size: smaller; } sub { - vertical-align: sub; - font-size: smaller; - line-height: normal; + vertical-align: sub; + font-size: smaller; + line-height: normal; } sup { - vertical-align: super; - font-size: smaller; - line-height: normal; + vertical-align: super; + font-size: smaller; + line-height: normal; } nobr { - white-space: nowrap; + white-space: nowrap; } /* titles */ -abbr[title], acronym[title] { - border-bottom: dotted 1px; +abbr[title],acronym[title] { + border-bottom: dotted 1px; } /* lists */ - -ul, menu, dir { - display: block; - list-style-type: disc; - margin: 1em 0; +ul,menu,dir { + display: block; + list-style-type: disc; + margin: 1em 0; } ol { - display: block; - list-style-type: decimal; - margin: 1em 0; + display: block; + list-style-type: decimal; + margin: 1em 0; } li { - display: list-item; + display: list-item; } /* nested lists have no top/bottom margins */ -ul ul, ul ol, ul dir, ul menu, ul dl, -ol ul, ol ol, ol dir, ol menu, ol dl, -dir ul, dir ol, dir dir, dir menu, dir dl, -menu ul, menu ol, menu dir, menu menu, menu dl, -dl ul, dl ol, dl dir, dl menu, dl dl { - margin-top: 0; - margin-bottom: 0; +ul ul,ul ol,ul dir,ul menu,ul dl,ol ul,ol ol,ol dir,ol menu,ol dl,dir ul,dir ol,dir dir,dir menu,dir dl,menu ul,menu ol,menu dir,menu menu,menu dl,dl ul,dl ol,dl dir,dl menu,dl dl + { + margin-top: 0; + margin-bottom: 0; } /* 2 deep unordered lists use a circle */ -ol ul, ul ul, menu ul, dir ul, -ol menu, ul menu, menu menu, dir menu, -ol dir, ul dir, menu dir, dir dir { - list-style-type: circle; +ol ul,ul ul,menu ul,dir ul,ol menu,ul menu,menu menu,dir menu,ol dir,ul dir,menu dir,dir dir + { + list-style-type: circle; } /* 3 deep (or more) unordered lists use a square */ -ol ol ul, 
ol ul ul, ol menu ul, ol dir ul, -ol ol menu, ol ul menu, ol menu menu, ol dir menu, -ol ol dir, ol ul dir, ol menu dir, ol dir dir, -ul ol ul, ul ul ul, ul menu ul, ul dir ul, -ul ol menu, ul ul menu, ul menu menu, ul dir menu, -ul ol dir, ul ul dir, ul menu dir, ul dir dir, -menu ol ul, menu ul ul, menu menu ul, menu dir ul, -menu ol menu, menu ul menu, menu menu menu, menu dir menu, -menu ol dir, menu ul dir, menu menu dir, menu dir dir, -dir ol ul, dir ul ul, dir menu ul, dir dir ul, -dir ol menu, dir ul menu, dir menu menu, dir dir menu, -dir ol dir, dir ul dir, dir menu dir, dir dir dir { - list-style-type: square; +ol ol ul,ol ul ul,ol menu ul,ol dir ul,ol ol menu,ol ul menu,ol menu menu,ol dir menu,ol ol dir,ol ul dir,ol menu dir,ol dir dir,ul ol ul,ul ul ul,ul menu ul,ul dir ul,ul ol menu,ul ul menu,ul menu menu,ul dir menu,ul ol dir,ul ul dir,ul menu dir,ul dir dir,menu ol ul,menu ul ul,menu menu ul,menu dir ul,menu ol menu,menu ul menu,menu menu menu,menu dir menu,menu ol dir,menu ul dir,menu menu dir,menu dir dir,dir ol ul,dir ul ul,dir menu ul,dir dir ul,dir ol menu,dir ul menu,dir menu menu,dir dir menu,dir ol dir,dir ul dir,dir menu dir,dir dir dir + { + list-style-type: square; } - /* leafs */ - -/*
noshade and color attributes are handled completely by + /*
noshade and color attributes are handled completely by * the nsHTMLHRElement attribute mapping code */ hr { - display: block; - height: 2px; - border: 1px inset; - margin: 0.5em auto 0.5em auto; - color: gray; + display: block; + height: 2px; + border: 1px inset; + margin: 0.5em auto 0.5em auto; + color: gray; } hr[size="1"] { - border-style: solid none none none; + border-style: solid none none none; } -img[usemap], object[usemap] { - color: blue; +img[usemap],object[usemap] { + color: blue; } frameset { - display: block ! important; - position: static ! important; - float: none ! important; - border: none ! important; + display: block ! important; + position: static ! important; + float: none ! important; + border: none ! important; } frame { - border: none ! important; + border: none ! important; } iframe { - border: 2px inset; + border: 2px inset; } noframes { - display: none; + display: none; } spacer { - position: static ! important; - float: none ! important; + position: static ! important; + float: none ! important; } /* hidden elements */ -area, base, basefont, head, meta, script, style, title, -noembed, param, link { - display: none; +area,base,basefont,head,meta,script,style,title,noembed,param,link { + display: none; } /* Page breaks at body tags, to help out with LIT-generation */ body { - page-break-before: always; + page-break-before: always; } /* Explicit line-breaks are blocks, sure... 
*/ br { - display: block; + display: block; } /* Images, embedded object, and SVG size defaults */ -img, object, svg|svg { - width: auto; - height: auto; +img,object,svg |svg { + width: auto; + height: auto; } /* These are needed because ADE renders anchors the same as links */ +a { + text-decoration: inherit; + color: inherit; + cursor: inherit +} -a { text-decoration: inherit; color: inherit; cursor: inherit } -a[href] { text-decoration: underline; color: blue; cursor: pointer } +a[href] { + text-decoration: underline; + color: blue; + cursor: pointer +} \ No newline at end of file diff --git a/setup/installer/windows/en-us.xml b/setup/installer/windows/en-us.xml index 89cc25f0a2..ed181c524b 100644 --- a/setup/installer/windows/en-us.xml +++ b/setup/installer/windows/en-us.xml @@ -1,9 +1,16 @@ - - If you are upgrading from a {app} version older than 0.6.17, please uninstall {app} first. Click Advanced to change installation settings. - Computing space requirements, this may take upto five minutes... - Computing space requirements, this may take upto five minutes... - Computing space requirements, this may take upto five minutes... - Please wait while the installer finishes determining your disk space requirements, this may take upto five minutes... + + If you are upgrading from a {app} version older than + 0.6.17, please uninstall {app} first. Click Advanced to change + installation settings. + Computing space requirements, this may take upto five + minutes... + Computing space requirements, this may take upto five + minutes... + Computing space requirements, this may take upto five + minutes... + Please wait while the installer finishes determining + your disk space requirements, this may take upto five minutes... 
diff --git a/setup/installer/windows/wix-template.xml b/setup/installer/windows/wix-template.xml index 37dd8b25a8..1300eba956 100644 --- a/setup/installer/windows/wix-template.xml +++ b/setup/installer/windows/wix-template.xml @@ -1,164 +1,157 @@ - + - - - + - - - - - - - + - - - - - - - - - - + - - {app_components} - - - - - + + + + + - - - - - - - + + + + + + + + + + - - - - + + {app_components} + + + + + - - - - - - - + + + + + + + + + + + + + + + + + + + - - - + - - - + + - - - + + + - - - + + + - - - - + + + - - - + + + + - + + + + + = 501)]]> - - - NEWPRODUCTFOUND - - - - NEWPRODUCTFOUND - + + + NEWPRODUCTFOUND + + + + NEWPRODUCTFOUND + - - - WIXUI_EXITDIALOGOPTIONALCHECKBOX = 1 and NOT Installed + + + WIXUI_EXITDIALOGOPTIONALCHECKBOX = 1 and NOT Installed - + - - - - + + + + - - - - - + + + + + - - - + + + - + diff --git a/src/calibre/ebooks/lrf/html/demo/demo.html b/src/calibre/ebooks/lrf/html/demo/demo.html index 7d2f783ccc..37bed69b88 100644 --- a/src/calibre/ebooks/lrf/html/demo/demo.html +++ b/src/calibre/ebooks/lrf/html/demo/demo.html @@ -1,187 +1,279 @@ -

Demo of html2lrf

-

- This document contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from calibre. To obtain calibre visit
http://calibre-ebook.com -

-
-

Table of Contents

- +

Demo of html2lrf

+

This document contains a demonstration of the capabilities of html2lrf, the HTML to LRF +converter from calibre. To obtain calibre visit
+http://calibre-ebook.com

+
+

Table of Contents

+ -

Lists

- -

Nested lists

-
    -
  1. Item 1
  2. -
      -
    • Sub item 1
    • -
    • Sub item 2
    • -
        -
      1. Sub sub item 1. This is a multiline item with almost correct blocking.
      2. -
      3. Sub sub item 2
      4. -
      -
    -
  3. Item 2
  4. -
-

-

Definition Lists

-
-
Term 1
-
Definition of Term 1. A multi line definition showing correct blocking.
-
Term 2
-
Definition of Term 2
-
-

-


- Table of Contents -

+

Lists

-

Tables

- - - - - - -

A matrix

Column 1Column 2Column 3
Row 1

(1, 1)

Row 2

(2, 2)

Row 3

(3, 3)

-
-

- html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. -

-

- Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables. -

-

- On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan. -

-

Sample Complex Table of Contents

- - - - - - - - - - - - - - -
 PAGE
Prefacev
List of Works of Referencevii
List of Illustrationsxi
ChapterI.History of the Foundation3
II.Exterior of the Church25
III.Interior of the Church33
IV.St. Bartholomew-the-Less and the Hospital63
AppendixI.The Priory Seals73
II.The Priors and Rectors77
III.Inventory of Vestments, etc.79
IV.The Organ80
Index83
- -

-


- Table of Contents -

- -

Text formatting

-

- A simple paragraph of formatted - text, with a ruled line following it. - Superscripts and Subscripts. -

-
-
-

A - similar - paragraph, but now using - CSS - to perform the text formatting.

-
-
A centered phrase
- A right aligned phrase - A normal phrase -
-

A paragraph containing a <blockquote> -

This is blockquoted text. It is rendered in a separate block with margins.
The above text should be distinct from the rest of the paragraph. -

-
-

A very indented paragraph

-

An unindented paragraph

-

A default indented paragraph

-

-


- Table of Contents -

- - -

Inline images

-

- Here I demonstrate the use of inline images in the midst of text. Here is a small image embedded in a sentence. Now we have a slightly larger image that is automatically put in its own block and finally we have a large image which is put on a page by itself. Try changing sizes from S to M to L and see how the images behave. -

+

Nested lists

+
    +
  1. Item 1
  2. +
      +
    • Sub item 1
    • +
    • Sub item 2
    • +
        +
      1. Sub sub item 1. This is a multiline item with almost + correct blocking.
      2. +
      3. Sub sub item 2
      4. +
      +
    +
  3. Item 2
  4. +
+

+

Definition Lists

+
+
Term 1
+
Definition of Term 1. A multi line definition showing correct + blocking.
+
Term 2
+
Definition of Term 2
+

-


- Table of Contents -

+
+Table of Contents

-

Embedded fonts

-

This LRF file has been prepared by embedding Times New Roman and Andale Mono - as the default serif and monospace fonts. This allows it to correctly display - non English characters such as:

-
    -
  • mouse in German: mÅ«s
  • -
  • mouse in Russian: мышь
  • -
-

- Note that embedding fonts in LRF files slows down page turns slightly. -
-

- -

-


- Table of Contents -

- -

Paragraph Emphasis

-
-

beautiful image based dropcaps to emphasize this - paragraph. Image based dropcaps are specified by adding the class = 'libprs500_dropcaps' - attribute to an <img> tag.

-
- -

This is a plain text based dropcaps. It - is not nearly as dramatic, but easier to code ;-) -

-
- -

This is an Example of small-caps. - It can also be used to highlight the start of a paragraph very effectively. -

-
-

A paragraph with a hanging indent. This is especially - useful for highly structured text like verse, or dialogue.

-

-


- Table of Contents -

+

Tables

+ + + + + + + + + + + + + + + + + + + + + + + + + +
+

A matrix

+
Column 1Column 2Column 3
Row 1 +

(1, 1)

+
Row 2 +

(2, 2)

+
Row 3 +

(3, 3)

+
+
+

html2lrf supports both rowspan and colspan, but no other HTML +table attributes, as it uses its own algorithm to determine optimal +placement of cells.

+

Note that if you have custom fonts on your reader, the table may +not be properly aligned. Also html2lrf does not support nested tables.

+

On the next page you'll see a +real life example taken from a Project Gutenberg text with no +modifications. It shows off html2lrf's handling of rowspan and colspan. +

+

Sample Complex Table of Contents

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 PAGE
Prefacev
List of Works of Referencevii
List of Illustrationsxi
ChapterI.History of the Foundation3
II.Exterior of the Church25
III.Interior of the Church33
IV.St. Bartholomew-the-Less and the Hospital63
AppendixI.The Priory Seals73
II.The Priors and Rectors77
III.Inventory of Vestments, etc.79
IV.The Organ80
Index83
-

Recursive link following

-

- html2lrf follows links in HTML files that point to other files, recursively. Thus it can be used to convert a whole tree of HTML files into a single LRF file. -
-

-


- Table of Contents -

+
+Table of Contents

+

Text formatting

+

A simple paragraph of formatted text, with a +ruled line following it. Superscripts and Subscripts. +

+
+
+

A similar paragraph, but +now using CSS to perform the text +formatting.

+
+
A centered phrase
+A right aligned phrase +A normal phrase +
+

A paragraph containing a <blockquote> +

This is blockquoted text. It is rendered in a +separate block with margins.
+The above text should be distinct from the rest of the paragraph.

+
+

A very indented paragraph

+

An unindented paragraph

+

A default indented paragraph

+

+


+Table of Contents

+ + +

Inline images

+

Here I demonstrate the use of inline images in the midst of text. +Here is a small image embedded in a sentence. +Now we have a slightly larger image that is automatically put in its own +block and finally +we have a large image which is put on a page by itself. Try changing +sizes from S to M to L and see how the images behave.

+

+


+Table of Contents

+ +

Embedded fonts

+

This LRF file has been prepared by embedding Times New Roman and +Andale Mono as the default serif and monospace fonts. This allows it to +correctly display non English characters such as:

+
    +
  • mouse in German: mūs
  • +
  • mouse in Russian: мышь
  • +
+

Note that embedding fonts in LRF files slows down page turns +slightly.
+

+ +

+


+Table of Contents

+ +

Paragraph Emphasis

+
+

beautiful image +based dropcaps to emphasize this paragraph. Image based dropcaps are +specified by adding the class = 'libprs500_dropcaps' +attribute to an <img> tag. +


+ +

This is a plain text based dropcaps. It is not +nearly as dramatic, but easier to code ;-)

+
+ +

This is an Example +of small-caps. It can also be used to highlight the start of a paragraph +very effectively.

+
+

A paragraph with a hanging indent. This is +especially useful for highly structured text like verse, or dialogue.
+

+

+


+Table of Contents

+ +

Recursive link following

+

html2lrf follows links in +HTML files that point to other files, recursively. Thus it can be used +to convert a whole tree of HTML files into a single LRF file.
+

+

+


+Table of Contents

diff --git a/src/calibre/manual/templates/layout.html b/src/calibre/manual/templates/layout.html index c5a857650f..8ec8c949e8 100644 --- a/src/calibre/manual/templates/layout.html +++ b/src/calibre/manual/templates/layout.html @@ -1,14 +1,14 @@ -{% extends "!layout.html" %} -{% block sidebarlogo %} - -
- - - - -
-
+{% extends "!layout.html" %} {% block sidebarlogo %} + +
+ +
+
{% endblock %} diff --git a/src/calibre/manual/xpath.xhtml b/src/calibre/manual/xpath.xhtml index 7468e3d856..3a78863236 100644 --- a/src/calibre/manual/xpath.xhtml +++ b/src/calibre/manual/xpath.xhtml @@ -1,19 +1,19 @@ - - A very short ebook - - - -

A very short ebook

-

Written by Kovid Goyal

-
-

A very short ebook to demonstrate the use of XPath.

-
+ +A very short ebook + + + +

A very short ebook

+

Written by Kovid Goyal

+
+

A very short ebook to demonstrate the use of XPath.

+
-

Chapter One

-

This is a truly fascinating chapter.

+

Chapter One

+

This is a truly fascinating chapter.

-

Chapter Two

-

A worthy continuation of a fine tradition.

- +

Chapter Two

+

A worthy continuation of a fine tradition.

+ From ae8fcb1fd4579026c55f8ee6686fcc096b861b30 Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 10 Aug 2010 13:07:29 +0200 Subject: [PATCH 012/132] Correct error with setup.py --- setup.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000..d8bd0267ee --- /dev/null +++ b/setup.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os, optparse + +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +import setup.commands as commands +from setup import prints, get_warnings + +def check_version_info(): + vi = sys.version_info + if vi[0] == 2 and vi[1] > 5: + return None + return 'calibre requires python >= 2.6' + +def option_parser(): + parser = optparse.OptionParser() + parser.add_option('-c', '--clean', default=False, action='store_true', + help=('Instead of running the command delete all files generated ' + 'by the command')) + parser.add_option('--clean-backups', default=False, action='store_true', + help='Delete all backup files from the source tree') + parser.add_option('--clean-all', default=False, action='store_true', + help='Delete all machine generated files from the source tree') + return parser + +def clean_backups(): + for root, _, files in os.walk('.'): + for name in files: + for t in ('.pyc', '.pyo', '~', '.swp', '.swo'): + if name.endswith(t): + os.remove(os.path.join(root, name)) + + +def main(args=sys.argv): + if len(args) == 1 or args[1] in ('-h', '--help'): + print 'Usage: python', args[0], 'command', '[options]' + print '\nWhere command is one of:' + print + for x in sorted(commands.__all__): + print '%-20s -'%x, + c = getattr(commands, x) + desc = getattr(c, 'short_description', c.description) + 
print desc + + print '\nTo get help on a particular command, run:' + print '\tpython', args[0], 'command -h' + return 1 + + command = args[1] + if command not in commands.__all__: + print command, 'is not a recognized command.' + print 'Valid commands:', ', '.join(commands.__all__) + return 1 + + command = getattr(commands, command) + + parser = option_parser() + command.add_all_options(parser) + parser.set_usage('Usage: python setup.py %s [options]\n\n'%args[1]+\ + command.description) + + opts, args = parser.parse_args(args) + + if opts.clean_backups: + clean_backups() + + if opts.clean: + prints('Cleaning', args[1]) + command.clean() + return 0 + + if opts.clean_all: + for cmd in commands.__all__: + prints('Cleaning', cmd) + getattr(commands, cmd).clean() + return 0 + + command.run_all(opts) + + warnings = get_warnings() + if warnings: + print + prints('There were', len(warnings), 'warning(s):') + print + for args, kwargs in warnings: + prints('*', *args, **kwargs) + print + + return 0 + +if __name__ == '__main__': + sys.exit(main()) From 7c70914ad30fc358bfcd7c099494b0a43682ba27 Mon Sep 17 00:00:00 2001 From: Sengian Date: Thu, 12 Aug 2010 16:25:09 +0200 Subject: [PATCH 013/132] Global overhaul of rtf2xml: RTFfixes (3) ->removal of preprocessing, first draft of tokenize finished, introduction of \ud:\upr for unicode --- src/calibre/ebooks/rtf2xml/tokenize.py | 104 +++++++++++++++---------- 1 file changed, 64 insertions(+), 40 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 3aa2079fb3..e594fed80d 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -26,7 +26,7 @@ class Tokenize: in_file, bug_handler, copy = None, - #run_level = 1, + run_level = 1, ): self.__file = in_file self.__bug_handler = bug_handler @@ -37,17 +37,22 @@ class Tokenize: self.__uc_char = 0 self.__uc_bin = False self.__uc_value = [1] - - def __from_ms_to_utf8(self,match_obj): - uni_char 
= int(match_obj.group(1)) - if uni_char < 0: - uni_char += 65536 - return '&#x' + str('%X' % uni_char) + ';' - + def __reini_utf8_counters(self): self.__uc_char = 0 self.__uc_bin = False + def __remove_uc_chars(self, startchar, token): + for i in xrange(startchar, len(token)): + if token[i] == " ": + continue + elif self.__uc_char: + self.__uc_char -= 1 + else: + return token[i:] + #if only " " and char to skip + return '' + def __unicode_process(self, token): #change scope in if token == '\{': @@ -55,9 +60,9 @@ class Tokenize: #basic error handling self.__reini_utf8_counters() return token - #change scope out: evaluate dict and rebuild + #change scope out elif token == '\}': - #self.__uc_value.pop() + self.__uc_value.pop() self.__reini_utf8_counters() return token #add a uc control @@ -65,58 +70,65 @@ class Tokenize: self.__uc_value[-1] = int(token[3:]) self.__reini_utf8_counters() return token - #handle uc skippable char + #bin data to slip + elif self.__uc_bin: + self.__uc_bin = False + return '' + #uc char to remove elif self.__uc_char: - #if token[:1] == "\" and token[:1] == "\" - pass + #handle \bin tag in case of uc char to skip + if token[:4] == '\bin': + self.__uc_char -=1 + self.__uc_bin = True + return '' + elif token[:1] == "\\" : + self.__uc_char -=1 + return '' + else: + return self.__remove_uc_chars(0, token) #go for real \u token match_obj = self.__utf_exp.match(token) if match_obj is not None: + self.__reini_utf8_counters() #get value and handle negative case uni_char = int(match_obj.group(1)) uni_len = len(match_obj.group(1)) + 2 if uni_char < 0: uni_char += 65536 uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace') - #if not uc0 - if self.__uc_value[-1]: - self.__uc_char = self.__uc_value[-1] + self.__uc_char = self.__uc_value[-1] #there is only an unicode char if len(token)<= uni_len: return uni_char #an unicode char and something else #must be after as it is splited on \ - elif not self.__uc_value[-1]: - print('not only token uc0 
token: ' + uni_char + token[uni_len:]) + #necessary? maybe for \bin? + elif not self.__uc_char: return uni_char + token[uni_len:] #if not uc0 and chars else: - for i in xrange(uni_len, len(token)): - if token[i] == " ": - continue - elif self.__uc_char > 0: - self.__uc_char -= 1 - else: - return uni_char + token[i:] - #print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token) + return uni_char + self.__remove_uc_chars(uni_len, token) #default return token - + def __sub_reg_split(self,input_file): input_file = self.__replace_spchar.mreplace(input_file) - #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) - # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) - # this is for older RTF - #line = re.sub(self.__par_exp, '\\par ', line) - input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file) + input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) + input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) + #remove \n in bin data + input_file = self.__bin_exp.sub(lambda x: \ + x.group().replace('\n', '') +'\n', input_file) #split tokens = re.split(self.__splitexp, input_file) #remove empty tokens and \n return filter(lambda x: len(x) > 0 and x != '\n', tokens) + #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) + # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) + # this is for older RTF + #line = re.sub(self.__par_exp, '\\par ', line) #return filter(lambda x: len(x) > 0, \ #(self.__remove_line.sub('', x) for x in tokens)) - - + def __compile_expressions(self): SIMPLE_RPL = { "\\\\": "\\backslash ", @@ -145,18 +157,25 @@ class Tokenize: r'\\$': '\\par ', } self.__replace_spchar = MReplace(SIMPLE_RPL) + #add ;? 
in case of char following \u self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)" - self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this - #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") + self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?") + self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+") + #manage upr/ud situations + self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \ + r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}") #add \n in split for whole file reading - #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #why keep backslash whereas \is replaced before? + #remove \n from endline char self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") + #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") + #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") + #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #self.__par_exp = re.compile(r'\\$') #self.__remove_line = re.compile(r'\n+') #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") - + def tokenize(self): """Main class for handling other methods. 
Reads the file \ , uses method self.sub_reg to make basic substitutions,\ @@ -170,9 +189,9 @@ class Tokenize: #remove '' and \n in the process tokens = self.__sub_reg_split(input_file) #correct unicode - #tokens = map(self.__unicode_process, tokens) + tokens = map(self.__unicode_process, tokens) #remove empty items created by removing \uc - #tokens = filter(lambda x: len(x) > 0, tokens) + tokens = filter(lambda x: len(x) > 0, tokens) #write write_obj = open(self.__write_to, 'wb') @@ -241,4 +260,9 @@ class Tokenize: neg_uni_char = int(match_obj.group(1)) * -1 # sys.stderr.write(str( neg_uni_char)) uni_char = neg_uni_char + 65536 + return '&#x' + str('%X' % uni_char) + ';''' + '''def __from_ms_to_utf8(self,match_obj): + uni_char = int(match_obj.group(1)) + if uni_char < 0: + uni_char += 65536 return '&#x' + str('%X' % uni_char) + ';''' \ No newline at end of file From b9ed0c6b3d579f1dc2e2c5b94df5e2e8f9ec75d4 Mon Sep 17 00:00:00 2001 From: Sengian Date: Thu, 12 Aug 2010 17:16:37 +0200 Subject: [PATCH 014/132] Global overhaul of rtf2xml: RTFfixes (4) ->minors corrections in line endings and check brackets, move check encoding first to eliminate non ascii RTF --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 40 ++++++++++---------- src/calibre/ebooks/rtf2xml/check_brackets.py | 1 - src/calibre/ebooks/rtf2xml/check_encoding.py | 10 +++-- src/calibre/ebooks/rtf2xml/line_endings.py | 11 ++++-- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 3a804792c5..76bdcc08af 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -133,7 +133,6 @@ class ParseRtf: self.__temp_dir = out_dir self.__dtd_path = dtd self.__check_file(in_file,"file_to_parse") - self.__check_ascii(in_file) self.__char_data = char_data self.__debug_dir = deb_dir self.__check_dir(self.__temp_dir) @@ -152,6 +151,7 @@ class ParseRtf: self.__group_borders = group_borders 
self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd + def __check_file(self, the_file, type): """Check to see if files exist""" if hasattr(the_file, 'read'): return @@ -164,6 +164,7 @@ class ParseRtf: else: msg = "\nThe file '%s' cannot be found" % the_file raise RtfInvalidCodeException, msg + def __check_dir(self, the_dir): """Check to see if directory exists""" if not the_dir : @@ -173,15 +174,7 @@ class ParseRtf: msg = "\n%s is not a directory" % the_dir raise RtfInvalidCodeException, msg return 1 - def __check_ascii(self, the_file): - """Check to see if the file is correct ascii""" - try: - test = codecs.open(the_file, 'r', 'ascii', 'strict') - test.close() - except UnicodeError: - msg = "\n%s is not a correct ascii file" % the_file - raise RtfInvalidCodeException, msg - return 1 + def parse_rtf(self): """ Parse the file by calling on other classes. @@ -192,6 +185,18 @@ class ParseRtf: depending on the value of 'output' when the instance was created. """ self.__temp_file = self.__make_temp_file(self.__file) + #Check to see if the file is correct ascii first + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler = RtfInvalidCodeException, + ) + if check_encoding_obj.check_encoding(self.__file): + try: + os.remove(self.__temp_file) + except OSError: + pass + sys.stderr.write('File "%s" does not appear to be ascii.\n' \ + % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) + raise InvalidRtfException # if the self.__deb_dir is true, then create a copy object, # set the directory to write to, remove files, and copy # the new temporary file to this directory @@ -214,7 +219,7 @@ class ParseRtf: in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, copy = self.__copy, - #run_level = self.__run_level, + run_level = self.__run_level, replace_illegals = self.__replace_illegals, ) line_obj.fix_endings() @@ -223,8 +228,8 @@ class ParseRtf: tokenize_obj = tokenize.Tokenize( bug_handler = 
RtfInvalidCodeException, in_file = self.__temp_file, - copy = self.__copy,) - #run_level = self.__run_level,) + copy = self.__copy, + run_level = self.__run_level) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file = self.__temp_file, @@ -240,10 +245,6 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass - check_encoding_obj = check_encoding.CheckEncoding( - bug_handler = RtfInvalidCodeException, - ) - check_encoding_obj.check_encoding(self.__file) sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( @@ -548,8 +549,7 @@ class ParseRtf: """Make a temporary file to parse""" write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') - write_obj = open(write_file, 'w') - for line in read_obj: - write_obj.write(line) + write_obj = open(write_file, 'wb') + write_obj.write(read_obj.read()) write_obj.close() return write_file \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/check_brackets.py b/src/calibre/ebooks/rtf2xml/check_brackets.py index 53f9363d63..8917780746 100755 --- a/src/calibre/ebooks/rtf2xml/check_brackets.py +++ b/src/calibre/ebooks/rtf2xml/check_brackets.py @@ -30,7 +30,6 @@ class CheckBrackets: self.__bracket_count += 1 def close_brack(self, line): num = line[-5:-1] - ##self.__open_bracket_num.append(num) try: last_num = self.__open_bracket_num.pop() except: diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index f6810e4909..1f8645bb0c 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -14,12 +14,11 @@ class CheckEncoding: sys.stderr.write(str(msg) + '\n') def check_encoding(self, path, encoding='us-ascii'): read_obj = open(path, 'r') - line_to_read = 1 + input_file = read_obj.read() + 
read_obj.close() line_num = 0 - while line_to_read: + for line in input_file: line_num += 1 - line_to_read = read_obj.readline() - line = line_to_read try: line.decode(encoding) except UnicodeError: @@ -27,6 +26,9 @@ class CheckEncoding: self.__get_position_error(line, encoding, line_num) else: sys.stderr.write('line: %d has bad encoding\n'%line_num) + return True + return False + if __name__ == '__main__': check_encoding_obj = CheckEncoding() check_encoding_obj.check_encoding(sys.argv[1]) diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py index e77e5d747c..86546967a7 100755 --- a/src/calibre/ebooks/rtf2xml/line_endings.py +++ b/src/calibre/ebooks/rtf2xml/line_endings.py @@ -23,7 +23,7 @@ class FixLineEndings: bug_handler, in_file = None, copy = None, - #run_level = 1, calibre why keep it? + run_level = 1, replace_illegals = 1, ): self.__file = in_file @@ -32,8 +32,11 @@ class FixLineEndings: self.__write_to = tempfile.mktemp() self.__replace_illegals = replace_illegals def fix_endings(self): - illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') - # always check since I have to get rid of illegal characters + #remove ASCII invalid chars : 0 to 8 and 11-14 to 24 + #always check since I have to get rid of illegal characters + chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) + illegal_regx = re.compile(u'|'.join(map(unichr, chars))) + #illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') #read read_obj = open(self.__file, 'r') input_file = read_obj.read() @@ -42,7 +45,7 @@ class FixLineEndings: input_file = input_file.replace ('\r\n', '\n') input_file = input_file.replace ('\r', '\n') if self.__replace_illegals: - input_file = re.sub(illegal_regx, '', input_file) + input_file = illegal_regx.sub('', input_file) #write write_obj = open(self.__write_to, 'wb') write_obj.write(input_file) 
From a9fd0ad4ba9acdcc07d5bfcae503c378c25a7303 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 16 Aug 2010 10:08:59 +0200 Subject: [PATCH 015/132] Global overhaul of rtf2xml: RTFfixes (5) ->minors corrections and regression correction --- src/calibre/ebooks/rtf/input.py | 2 +- src/calibre/ebooks/rtf2xml/ParseRtf.py | 14 +- src/calibre/ebooks/rtf2xml/check_encoding.py | 11 +- src/calibre/ebooks/rtf2xml/copy.py | 14 +- src/calibre/ebooks/rtf2xml/process_tokens.py | 163 ++++++++++--------- 5 files changed, 104 insertions(+), 100 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 824da7d6f1..f4fbdf411c 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, - deb_dir = 'I:\\Calibre\\rtfdebug', + deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug', # Convert symbol fonts to unicode equivalents. Default # is 1 convert_symbol = 1, diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 76bdcc08af..1230ae150e 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -120,8 +120,6 @@ class ParseRtf: script tries to output to directory where is script is exectued.) 'deb_dir' --debug directory. If a debug_dir is provided, the script will copy each run through as a file to examine in the debug_dir - 'perl_script'--use perl to make tokens. This runs just a bit faster. - (I will probably phase this out.) 'check_brackets' -- make sure the brackets match up after each run through a file. Only for debugging. 
Returns: Nothing @@ -142,7 +140,7 @@ class ParseRtf: self.__convert_wingdings = convert_wingdings self.__convert_zapf = convert_zapf self.__run_level = run_level - #self.__exit_level = 0 + #self.__exit_level = 0 See what this means and if it is consistent self.__indent = indent self.__replace_illegals = replace_illegals self.__form_lists = form_lists @@ -184,19 +182,15 @@ class ParseRtf: A parsed file in XML, either to standard output or to a file, depending on the value of 'output' when the instance was created. """ - self.__temp_file = self.__make_temp_file(self.__file) #Check to see if the file is correct ascii first check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) if check_encoding_obj.check_encoding(self.__file): - try: - os.remove(self.__temp_file) - except OSError: - pass sys.stderr.write('File "%s" does not appear to be ascii.\n' \ % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) raise InvalidRtfException + self.__temp_file = self.__make_temp_file(self.__file) # if the self.__deb_dir is true, then create a copy object, # set the directory to write to, remove files, and copy # the new temporary file to this directory @@ -223,7 +217,6 @@ class ParseRtf: replace_illegals = self.__replace_illegals, ) line_obj.fix_endings() - #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it? 
#self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, @@ -550,6 +543,7 @@ class ParseRtf: write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') write_obj = open(write_file, 'wb') - write_obj.write(read_obj.read()) + for line in read_obj: + write_obj.write(line) write_obj.close() return write_file \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index 1f8645bb0c..444fd373e4 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -14,10 +14,10 @@ class CheckEncoding: sys.stderr.write(str(msg) + '\n') def check_encoding(self, path, encoding='us-ascii'): read_obj = open(path, 'r') - input_file = read_obj.read() - read_obj.close() + line_num = 0 - for line in input_file: + error_found = False + for line in read_obj: line_num += 1 try: line.decode(encoding) @@ -26,8 +26,9 @@ class CheckEncoding: self.__get_position_error(line, encoding, line_num) else: sys.stderr.write('line: %d has bad encoding\n'%line_num) - return True - return False + error_found = True + read_obj.close() + return error_found if __name__ == '__main__': check_encoding_obj = CheckEncoding() diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py index ff029c1841..1b620b9fbf 100755 --- a/src/calibre/ebooks/rtf2xml/copy.py +++ b/src/calibre/ebooks/rtf2xml/copy.py @@ -23,6 +23,7 @@ class Copy: def __init__(self, bug_handler, file = None, deb_dir = None, ): self.__file = file self.__bug_handler = bug_handler + def set_dir(self, deb_dir): """Set the temporary directory to write files to""" if deb_dir is None: @@ -33,19 +34,11 @@ class Copy: message = "%(deb_dir)s is not a directory" % vars() raise self.__bug_handler , message Copy.__dir = deb_dir + def remove_files(self ): """Remove files from directory""" self.__remove_the_files(Copy.__dir) - """ - 
list_of_files = os.listdir(Copy.__dir) - list_of_files = os.listdir(the_dir) - for file in list_of_files: - rem_file = os.path.join(Copy.__dir,file) - if os.path.isdir(rem_file): - self.remove_files(rem_file) - else: - os.remove(rem_file) - """ + def __remove_the_files(self, the_dir): """Remove files from directory""" list_of_files = os.listdir(the_dir) @@ -58,6 +51,7 @@ class Copy: os.remove(rem_file) except OSError: pass + def copy_file(self, file, new_file): """ Copy the file to a new name diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 072d8b02e4..2c5c0c7df0 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -735,8 +735,94 @@ class ProcessTokens: pre, token, action = self.dict_token.get(token, (None, None, None)) if action: return action(pre, token, num) - # unused function - def initiate_token_actions(self): + + def __check_brackets(self, in_file): + self.__check_brack_obj = check_brackets.CheckBrackets\ + (file = in_file) + good_br = self.__check_brack_obj.check_brackets()[0] + if not good_br: + return 1 + def process_tokens(self): + """Main method for handling other methods. """ + + read_obj= open(self.__file, 'r') + write_obj = open(self.__write_to, 'wb') + + '''first_token = 0 + second_token = 0''' + line_count = 0 + + for line in read_obj: + token = line.replace("\n","") + #calibre not necessary normaly, fixed in tokenize + '''if not token: + continue''' + line_count += 1 + #calibre not necessary, encoding checked before + """try: + token.decode('us-ascii') + except UnicodeError, msg: + msg = str(msg) + msg += 'Invalid RTF: File not ascii encoded.\n' + raise self.__exception_handler, msg""" + #calibre: with tokenize, should be first and second line, why bother? 
+ """if not first_token: + if token != '\\{': + msg = 'Invalid RTF: document doesn\'t start with {\n' + raise self.__exception_handler, msg + first_token = 1 + elif line_count == and not second_token: + if token[0:4] != '\\rtf': + msg ='Invalid RTF: document doesn\'t start with \\rtf \n' + raise self.__exception_handler, msg + second_token = 1""" + if line_count == 1 and token != '\\{': + msg = 'Invalid RTF: document doesn\'t start with {\n' + raise self.__exception_handler, msg + elif line_count == 2 and token[0:4] != '\\rtf': + msg ='Invalid RTF: document doesn\'t start with \\rtf \n' + raise self.__exception_handler, msg + + ##token = self.evaluate_token(token) + the_index = token.find('\\ ') + if token is not None and the_index > -1: + msg ='Invalid RTF: token "\\ " not valid.\n' + raise self.__exception_handler, msg + elif token[:1] == "\\": + line = self.process_cw(token) + if line is not None: + write_obj.write(line) + else: + fields = re.split(self.__utf_exp, token) + for field in fields: + if not field: + continue + if field[0:1] == '&': + write_obj.write('tx -1: - msg ='Invalid RTF: token "\\ " not valid. 
\n' - raise self.__exception_handler, msg - elif token[0:1] == "\\": - line = self.process_cw(token) - if line != None: - write_obj.write(line) - else: - fields = re.split(self.__utf_exp, token) - for field in fields: - if not field: - continue - if field[0:1] == '&': - write_obj.write('tx Date: Sun, 26 Sep 2010 17:49:59 +0200 Subject: [PATCH 016/132] Modif debug --- src/calibre/ebooks/rtf/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 1de064df5c..4c7dfd9260 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -51,7 +51,7 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, - deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug', + deb_dir = 'H:\\Temp\\Calibre\\rtfdebug', # Convert symbol fonts to unicode equivalents. Default # is 1 convert_symbol = 1, From 9590ba62348930d93c496e507549a8c97d43ef16 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 11 Oct 2010 00:35:07 +0200 Subject: [PATCH 017/132] isbndb.py minor changes --- src/calibre/ebooks/metadata/isbndb.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 221cfc13d1..2bbffc2c8b 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -74,14 +74,14 @@ class ISBNDBMetadata(Metadata): if authors: self.authors = authors try: - self.author_sort = self.tostring(book.find('authors').find('person')) + self.author_sort = tostring(book.find('authors').find('person')) if self.authors and self.author_sort == self.authors[0]: self.author_sort = None except: pass - self.publisher = self.tostring(book.find('publishertext')) + self.publisher = tostring(book.find('publishertext')) - summ = self.tostring(book.find('summary')) + summ = tostring(book.find('summary')) if summ: self.comments = 'SUMMARY:\n'+summ @@ 
-141,7 +141,7 @@ def create_books(opts, args, timeout=5.): print ('ISBNDB query: '+url) tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] - ans = [] + '''ans = [] for x in tans: add = True for y in ans: @@ -149,7 +149,9 @@ def create_books(opts, args, timeout=5.): add = False if add: ans.append(x) - return ans + return ans''' + #remove duplicates ISBN + return dict((book.isbn, book) for book in tans).values() def main(args=sys.argv): parser = option_parser() From 19288b38acd138e4e3702845e1b1b61ef82c0d2d Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 11 Oct 2010 00:36:26 +0200 Subject: [PATCH 018/132] Merge from trunk --- resources/catalog/stylesheet.css | 142 ++++++++++++++-------------- resources/content_server/index.html | 6 +- resources/templates/fb2.xsl | 97 ++++++++++--------- resources/templates/html.css | 35 +++++-- 4 files changed, 154 insertions(+), 126 deletions(-) diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index 4f9ca9ac41..ea01aeb43b 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -1,102 +1,104 @@ -body { background-color: white; } +body { + background-color: white; +} -p.title { - margin-top:0em; - margin-bottom:1em; - text-align:center; - font-style:italic; - font-size:xx-large; - border-bottom: solid black 4px; - } +p.title { + margin-top: 0em; + margin-bottom: 1em; + text-align: center; + font-style: italic; + font-size: xx-large; + border-bottom: solid black 4px; +} p.author { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:large; - } + font-size: large; +} p.tags { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:small; - } + font-size: small; +} p.description { - text-align:left; - font-style:normal; + text-align: left; + font-style: normal; margin-top: 0em; - } +} p.date_index { - 
font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.letter_index { - font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.author_index { - font-size:large; - text-align:left; - margin-top:0px; - margin-bottom:0px; + font-size: large; + text-align: left; + margin-top: 0px; + margin-bottom: 0px; text-indent: 0em; - } +} p.series { text-align: left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.read_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.unread_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.date_read { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:6em; - text-indent:-6em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 6em; + text-indent: -6em; +} hr.series_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} hr.annotations_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} \ No newline at end of file diff --git a/resources/content_server/index.html b/resources/content_server/index.html index ff11acc719..1bc13096d5 
100644 --- a/resources/content_server/index.html +++ b/resources/content_server/index.html @@ -29,9 +29,9 @@
Show first set of books Show previous set of books               Show previous set of books              Show next set of books Show last set of books - - - + + + + - + - <xsl:value-of select="fb:description/fb:title-info/fb:book-title"/> + <xsl:value-of select="fb:description/fb:title-info/fb:book-title" /> @@ -51,37 +58,37 @@
- +
-
+
    - +
-
- +
+ -
+

- +

- +
- + - +
diff --git a/resources/templates/html.css b/resources/templates/html.css index 448ec596b9..bfbb646afb 100644 --- a/resources/templates/html.css +++ b/resources/templates/html.css @@ -35,9 +35,9 @@ * * ***** END LICENSE BLOCK ***** */ @ -namespace url (http: //www.w3.org /1999/xhtml); - @namespace svg url (http: //www.w3.org /2000/svg); - /* blocks */ +namespace url (http: //www.w3.org /1999/xhtml); + @namespace svg url (http: //www.w3.org /2000/svg); + /* blocks */ html,div,map,dt,isindex,form { display: block; @@ -161,10 +161,29 @@ table[align="right"] { float: right; } -table[rules]:not ([rules="none"] ) { - border-collapse: collapse; -} +table +[ +rules +] +:not + +( +[ +rules += +"none" +] + +) +{ +border-collapse +: + +collapse +; + +} /* caption inherits from table not table-outer */ caption { display: table-caption; @@ -322,7 +341,7 @@ ol ol ul,ol ul ul,ol menu ul,ol dir ul,ol ol menu,ol ul menu,ol menu menu,ol dir } /* leafs */ - /*
noshade and color attributes are handled completely by +/*
noshade and color attributes are handled completely by * the nsHTMLHRElement attribute mapping code */ hr { @@ -381,7 +400,7 @@ br { } /* Images, embedded object, and SVG size defaults */ -img,object,svg |svg { +img,object,svg |svg { width: auto; height: auto; } From 282c6aaa49006086c0887115edd3da1381d663e9 Mon Sep 17 00:00:00 2001 From: Sengian Date: Fri, 15 Oct 2010 08:45:09 +0200 Subject: [PATCH 019/132] Minor modification to isbndb.py --- src/calibre/ebooks/metadata/isbndb.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 2bbffc2c8b..615b4ab818 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -90,10 +90,8 @@ def build_isbn(base_url, opts): return base_url + 'index1=isbn&value1='+opts.isbn def build_combined(base_url, opts): - query = '' - for e in (opts.title, opts.author, opts.publisher): - if e is not None: - query += ' ' + e + query = ' '.join([e for e in (opts.title, opts.author, opts.publisher) \ + if e is not None ]) query = query.strip() if len(query) == 0: raise ISBNDBError('You must specify at least one of --author, --title or --publisher') @@ -141,15 +139,6 @@ def create_books(opts, args, timeout=5.): print ('ISBNDB query: '+url) tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] - '''ans = [] - for x in tans: - add = True - for y in ans: - if y.isbn == x.isbn: - add = False - if add: - ans.append(x) - return ans''' #remove duplicates ISBN return dict((book.isbn, book) for book in tans).values() From 18d2c55d4bccfaff1b32416a7fe7c7507dcaee0b Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 19 Oct 2010 23:10:34 +0200 Subject: [PATCH 020/132] Modify single metadata display to include summary and covers check --- src/calibre/gui2/dialogs/fetch_metadata.py | 8 +- src/calibre/gui2/dialogs/fetch_metadata.ui | 344 ++++++++++----------- 2 files changed, 179 
insertions(+), 173 deletions(-) diff --git a/src/calibre/gui2/dialogs/fetch_metadata.py b/src/calibre/gui2/dialogs/fetch_metadata.py index eb6edce75d..950f014442 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.py +++ b/src/calibre/gui2/dialogs/fetch_metadata.py @@ -48,7 +48,7 @@ class Matches(QAbstractTableModel): return len(self.matches) def columnCount(self, *args): - return 6 + return 8 def headerData(self, section, orientation, role): if role != Qt.DisplayRole: @@ -61,6 +61,8 @@ class Matches(QAbstractTableModel): elif section == 3: text = _("Publisher") elif section == 4: text = _("ISBN") elif section == 5: text = _("Published") + elif section == 6: text = _("Cover?") + elif section == 7: text = _("Summary?") return QVariant(text) else: @@ -87,6 +89,10 @@ class Matches(QAbstractTableModel): elif col == 5: if hasattr(book.pubdate, 'timetuple'): res = strftime('%b %Y', book.pubdate.timetuple()) + elif col == 6 and book.has_cover: + res = 'OK' + elif col == 7 and book.comments: + res = 'OK' if not res: return NONE return QVariant(res) diff --git a/src/calibre/gui2/dialogs/fetch_metadata.ui b/src/calibre/gui2/dialogs/fetch_metadata.ui index 03a362096c..c54ee66044 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.ui +++ b/src/calibre/gui2/dialogs/fetch_metadata.ui @@ -1,172 +1,172 @@ - - - FetchMetadata - - - Qt::WindowModal - - - - 0 - 0 - 830 - 642 - - - - Fetch metadata - - - - :/images/metadata.png:/images/metadata.png - - - - - - <p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below. 
- - - Qt::AlignCenter - - - true - - - true - - - - - - - - - &Access Key: - - - key - - - - - - - - - - Fetch - - - - - - - - - - - - true - - - - - - - Matches - - - - - - Select the book that most closely matches your copy from the list below - - - - - - - - 0 - 1 - - - - true - - - QAbstractItemView::SingleSelection - - - QAbstractItemView::SelectRows - - - - - - - - - - - - - Download &social metadata (tags/rating/etc.) for the selected book - - - - - - - Overwrite author and title with author and title of selected book - - - - - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - - - - - - - - - buttonBox - accepted() - FetchMetadata - accept() - - - 460 - 599 - - - 657 - 530 - - - - - buttonBox - rejected() - FetchMetadata - reject() - - - 417 - 599 - - - 0 - 491 - - - - - + + + FetchMetadata + + + Qt::WindowModal + + + + 0 + 0 + 890 + 642 + + + + Fetch metadata + + + + :/images/metadata.png:/images/metadata.png + + + + + + <p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below. + + + Qt::AlignCenter + + + true + + + true + + + + + + + + + &Access Key: + + + key + + + + + + + + + + Fetch + + + + + + + + + + + + true + + + + + + + Matches + + + + + + Select the book that most closely matches your copy from the list below + + + + + + + + 0 + 1 + + + + true + + + QAbstractItemView::SingleSelection + + + QAbstractItemView::SelectRows + + + + + + + + + + + + + Download &social metadata (tags/rating/etc.) 
for the selected book + + + + + + + Overwrite author and title with author and title of selected book + + + + + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok + + + + + + + + + + + buttonBox + accepted() + FetchMetadata + accept() + + + 460 + 599 + + + 657 + 530 + + + + + buttonBox + rejected() + FetchMetadata + reject() + + + 417 + 599 + + + 0 + 491 + + + + + From b59631db5f348c2cba069ffc725251afc87a3a1c Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 24 Oct 2010 23:26:17 +0200 Subject: [PATCH 021/132] Add a get cover option which overwrite the cover if one is available to metadata_single.py but needs to be modified to remember the option --- src/calibre/gui2/dialogs/fetch_metadata.ui | 11 +++++++++-- src/calibre/gui2/dialogs/metadata_single.py | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/dialogs/fetch_metadata.ui b/src/calibre/gui2/dialogs/fetch_metadata.ui index c54ee66044..0b39089ee3 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.ui +++ b/src/calibre/gui2/dialogs/fetch_metadata.ui @@ -109,6 +109,13 @@ + + + + Overwrite author and title with author and title of selected book + + + @@ -117,9 +124,9 @@ - + - Overwrite author and title with author and title of selected book + Overwrite cover image with downloaded cover if available for the selected book diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index ef1bddca0c..65cfdf57d4 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -709,6 +709,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.title.setText(book.title) self.authors.setText(authors_to_string(book.authors)) if book.author_sort: self.author_sort.setText(book.author_sort) + if d.opt_overwrite_cover_image.isChecked() and book.has_cover: + self.fetch_cover() if book.publisher: self.publisher.setEditText(book.publisher) if book.isbn: self.isbn.setText(book.isbn) if 
book.pubdate: From c7995f136f839c2719f5aada74c59239916bfd7f Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 30 Oct 2010 18:11:50 +0200 Subject: [PATCH 022/132] Finishing the option of downloading cover in single metadata and correcting a bug concerning option saving --- src/calibre/gui2/__init__.py | 2 ++ src/calibre/gui2/dialogs/fetch_metadata.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 4820bd251c..712c6b8a04 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -123,6 +123,8 @@ def _config(): help=_('Download social metadata (tags/rating/etc.)')) c.add_opt('overwrite_author_title_metadata', default=True, help=_('Overwrite author and title with new metadata')) + c.add_opt('overwrite_cover_image', default=False, + help=_('Overwrite cover with new new cover if existing')) c.add_opt('enforce_cpu_limit', default=True, help=_('Limit max simultaneous jobs to number of CPUs')) c.add_opt('tag_browser_hidden_categories', default=set(), diff --git a/src/calibre/gui2/dialogs/fetch_metadata.py b/src/calibre/gui2/dialogs/fetch_metadata.py index 35b5e576e6..a0ee250457 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.py +++ b/src/calibre/gui2/dialogs/fetch_metadata.py @@ -137,6 +137,7 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): self.fetch_metadata() self.opt_get_social_metadata.setChecked(config['get_social_metadata']) self.opt_overwrite_author_title_metadata.setChecked(config['overwrite_author_title_metadata']) + self.opt_overwrite_cover_image.setChecked(config['overwrite_cover_image']) def show_summary(self, current, *args): @@ -219,6 +220,13 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): _hung_fetchers.add(self.fetcher) if hasattr(self, '_hangcheck') and self._hangcheck.isActive(): self._hangcheck.stop() + #option configure + if self.opt_get_social_metadata.isChecked() != config['get_social_metadata']: + config.set('get_social_metadata', 
self.opt_get_social_metadata.isChecked()) + if self.opt_overwrite_author_title_metadata.isChecked() != config['overwrite_author_title_metadata']: + config.set('overwrite_author_title_metadata', self.opt_overwrite_author_title_metadata.isChecked()) + if self.opt_overwrite_cover_image.isChecked() != config['overwrite_cover_image']: + config.set('overwrite_cover_image', self.opt_overwrite_cover_image.isChecked()) def __enter__(self, *args): return self From c369ff9534d597bda6b7b8910278adaed9b359e9 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 30 Oct 2010 21:59:03 +0200 Subject: [PATCH 023/132] Modify for html correct display --- src/calibre/gui2/dialogs/metadata_single.ui | 1626 +++++++++---------- 1 file changed, 813 insertions(+), 813 deletions(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.ui b/src/calibre/gui2/dialogs/metadata_single.ui index 18bcf2dc4c..29f5d48a11 100644 --- a/src/calibre/gui2/dialogs/metadata_single.ui +++ b/src/calibre/gui2/dialogs/metadata_single.ui @@ -1,813 +1,813 @@ - - - MetadataSingleDialog - - - - 0 - 0 - 887 - 750 - - - - - 0 - 0 - - - - Edit Meta Information - - - - :/images/edit_input.png:/images/edit_input.png - - - true - - - true - - - - - - QFrame::NoFrame - - - true - - - - - 0 - 0 - 879 - 711 - - - - - 0 - - - - - - 800 - 665 - - - - 0 - - - - &Basic metadata - - - - - - Qt::Horizontal - - - - - - - Meta information - - - - - - &Title: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - title - - - - - - - Change the title of this book - - - - - - - Swap the author and title - - - ... - - - - :/images/swap.png:/images/swap.png - - - - 16 - 16 - - - - - - - - &Author(s): - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - authors - - - - - - - Author S&ort: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - author_sort - - - - - - - - - Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles. 
-If the box is colored green, then text matches the individual author's sort strings. If it is colored red, then the authors and this text do not match. - - - - - - - Automatically create the author sort entry based on the current author entry. -Using this button to create author sort will change author sort from red to green. - - - ... - - - - :/images/auto_author_sort.png:/images/auto_author_sort.png - - - - - - - - - &Rating: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - rating - - - - - - - Rating of this book. 0-5 stars - - - Rating of this book. 0-5 stars - - - QAbstractSpinBox::PlusMinus - - - stars - - - 5 - - - - - - - &Publisher: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - publisher - - - - - - - Ta&gs: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - tags - - - - - - - - - Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. - - - - - - - Open Tag Editor - - - Open Tag Editor - - - - :/images/chapters.png:/images/chapters.png - - - - - - - - - &Series: - - - Qt::PlainText - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - series - - - - - - - 5 - - - - - List of known series. You can add new series. - - - List of known series. You can add new series. - - - true - - - QComboBox::InsertAlphabetically - - - - - - - Remove unused series (Series that have no books) - - - ... 
- - - - :/images/trash.png:/images/trash.png - - - - - - - - - IS&BN: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - isbn - - - - - - - - - - Publishe&d: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - pubdate - - - - - - - true - - - - - - - false - - - Book - - - 9999.989999999999782 - - - - - - - MMM yyyy - - - true - - - - - - - true - - - - - - - dd MMM yyyy - - - true - - - - - - - &Date: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - date - - - - - - - - - - &Comments - - - - - - true - - - false - - - - - - - - - - &Fetch metadata from server - - - - - - - - - - - - 0 - 0 - - - - Available Formats - - - - - - - - - 0 - 0 - - - - - 16777215 - 130 - - - - QAbstractItemView::DropOnly - - - - 64 - 64 - - - - - - - - Add a new format for this book to the database - - - ... - - - - :/images/add_book.png:/images/add_book.png - - - - 32 - 32 - - - - - - - - Remove the selected formats for this book from the database. - - - ... - - - - :/images/trash.png:/images/trash.png - - - - 32 - 32 - - - - - - - - Set the cover for the book from the selected format - - - ... - - - - :/images/book.png:/images/book.png - - - - 32 - 32 - - - - - - - - Update metadata from the metadata in the selected format - - - - - - - :/images/edit_input.png:/images/edit_input.png - - - - 32 - 32 - - - - - - - - - - - - - - - 0 - 10 - - - - Book Cover - - - - - - - 0 - 100 - - - - - - - - 6 - - - QLayout::SetMaximumSize - - - 0 - - - - - Change &cover image: - - - cover_path - - - - - - - 6 - - - 0 - - - - - true - - - - - - - &Browse - - - - :/images/document_open.png:/images/document_open.png - - - - - - - Remove border (if any) from cover - - - T&rim - - - - :/images/trim.png:/images/trim.png - - - Qt::ToolButtonTextBesideIcon - - - - - - - Reset cover to default - - - ... 
- - - - :/images/trash.png:/images/trash.png - - - - - - - - - - - - - Download co&ver - - - - - - - Generate a default cover based on the title and author - - - &Generate cover - - - - - - - - - - - - - - - - - &Custom metadata - - - - - - - - - - - - - Qt::Horizontal - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - - - - - - EnLineEdit - QLineEdit -
widgets.h
-
- - EnComboBox - QComboBox -
widgets.h
-
- - TagsLineEdit - QLineEdit -
widgets.h
-
- - FormatList - QListWidget -
calibre/gui2/widgets.h
-
- - ImageView - QWidget -
calibre/gui2/widgets.h
- 1 -
-
- - title - swap_button - authors - author_sort - auto_author_sort - rating - publisher - tags - tag_editor_button - series - remove_series_button - series_index - isbn - date - pubdate - comments - fetch_metadata_button - add_format_button - remove_format_button - button_set_cover - button_set_metadata - formats - cover_path - reset_cover - fetch_cover_button - generate_cover_button - scrollArea - central_widget - button_box - - - - - - - button_box - accepted() - MetadataSingleDialog - accept() - - - 261 - 710 - - - 157 - 274 - - - - - button_box - rejected() - MetadataSingleDialog - reject() - - - 329 - 710 - - - 286 - 274 - - - - -
+ + + MetadataSingleDialog + + + + 0 + 0 + 887 + 750 + + + + + 0 + 0 + + + + Edit Meta Information + + + + :/images/edit_input.png:/images/edit_input.png + + + true + + + true + + + + + + QFrame::NoFrame + + + true + + + + + 0 + 0 + 879 + 711 + + + + + 0 + + + + + + 800 + 665 + + + + 0 + + + + &Basic metadata + + + + + + Qt::Horizontal + + + + + + + Meta information + + + + + + &Title: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + title + + + + + + + Change the title of this book + + + + + + + Swap the author and title + + + ... + + + + :/images/swap.png:/images/swap.png + + + + 16 + 16 + + + + + + + + &Author(s): + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + authors + + + + + + + Author S&ort: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + author_sort + + + + + + + + + Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles. +If the box is colored green, then text matches the individual author's sort strings. If it is colored red, then the authors and this text do not match. + + + + + + + Automatically create the author sort entry based on the current author entry. +Using this button to create author sort will change author sort from red to green. + + + ... + + + + :/images/auto_author_sort.png:/images/auto_author_sort.png + + + + + + + + + &Rating: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + rating + + + + + + + Rating of this book. 0-5 stars + + + Rating of this book. 0-5 stars + + + QAbstractSpinBox::PlusMinus + + + stars + + + 5 + + + + + + + &Publisher: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + publisher + + + + + + + Ta&gs: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + tags + + + + + + + + + Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. 
+ + + + + + + Open Tag Editor + + + Open Tag Editor + + + + :/images/chapters.png:/images/chapters.png + + + + + + + + + &Series: + + + Qt::PlainText + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + series + + + + + + + 5 + + + + + List of known series. You can add new series. + + + List of known series. You can add new series. + + + true + + + QComboBox::InsertAlphabetically + + + + + + + Remove unused series (Series that have no books) + + + ... + + + + :/images/trash.png:/images/trash.png + + + + + + + + + IS&BN: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + isbn + + + + + + + + + + Publishe&d: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + pubdate + + + + + + + true + + + + + + + false + + + Book + + + 9999.989999999999782 + + + + + + + MMM yyyy + + + true + + + + + + + true + + + + + + + dd MMM yyyy + + + true + + + + + + + &Date: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + date + + + + + + + + + + &Comments + + + + + + true + + + true + + + + + + + + + + &Fetch metadata from server + + + + + + + + + + + + 0 + 0 + + + + Available Formats + + + + + + + + + 0 + 0 + + + + + 16777215 + 130 + + + + QAbstractItemView::DropOnly + + + + 64 + 64 + + + + + + + + Add a new format for this book to the database + + + ... + + + + :/images/add_book.png:/images/add_book.png + + + + 32 + 32 + + + + + + + + Remove the selected formats for this book from the database. + + + ... + + + + :/images/trash.png:/images/trash.png + + + + 32 + 32 + + + + + + + + Set the cover for the book from the selected format + + + ... 
+ + + + :/images/book.png:/images/book.png + + + + 32 + 32 + + + + + + + + Update metadata from the metadata in the selected format + + + + + + + :/images/edit_input.png:/images/edit_input.png + + + + 32 + 32 + + + + + + + + + + + + + + + 0 + 10 + + + + Book Cover + + + + + + + 0 + 100 + + + + + + + + 6 + + + QLayout::SetMaximumSize + + + 0 + + + + + Change &cover image: + + + cover_path + + + + + + + 6 + + + 0 + + + + + true + + + + + + + &Browse + + + + :/images/document_open.png:/images/document_open.png + + + + + + + Remove border (if any) from cover + + + T&rim + + + + :/images/trim.png:/images/trim.png + + + Qt::ToolButtonTextBesideIcon + + + + + + + Reset cover to default + + + ... + + + + :/images/trash.png:/images/trash.png + + + + + + + + + + + + + Download co&ver + + + + + + + Generate a default cover based on the title and author + + + &Generate cover + + + + + + + + + + + + + + + + + &Custom metadata + + + + + + + + + + + + + Qt::Horizontal + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok + + + + + + + + EnLineEdit + QLineEdit +
widgets.h
+
+ + EnComboBox + QComboBox +
widgets.h
+
+ + TagsLineEdit + QLineEdit +
widgets.h
+
+ + FormatList + QListWidget +
calibre/gui2/widgets.h
+
+ + ImageView + QWidget +
calibre/gui2/widgets.h
+ 1 +
+
+ + title + swap_button + authors + author_sort + auto_author_sort + rating + publisher + tags + tag_editor_button + series + remove_series_button + series_index + isbn + date + pubdate + comments + fetch_metadata_button + add_format_button + remove_format_button + button_set_cover + button_set_metadata + formats + cover_path + reset_cover + fetch_cover_button + generate_cover_button + scrollArea + central_widget + button_box + + + + + + + button_box + accepted() + MetadataSingleDialog + accept() + + + 261 + 710 + + + 157 + 274 + + + + + button_box + rejected() + MetadataSingleDialog + reject() + + + 329 + 710 + + + 286 + 274 + + + + +
From dd522b051e85ccef7e153a873510ae681988e89c Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 31 Oct 2010 23:37:19 +0100 Subject: [PATCH 024/132] Add a choice to get text instead of html in metadata plugins --- src/calibre/ebooks/metadata/fetch.py | 11 +- src/calibre/utils/html2text.py | 451 +++++++++++++++++++++++++++ 2 files changed, 461 insertions(+), 1 deletion(-) create mode 100644 src/calibre/utils/html2text.py diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 9b8a42e482..87989a4d42 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -10,6 +10,7 @@ from calibre import prints from calibre.utils.config import OptionParser from calibre.utils.logging import default_log from calibre.utils.titlecase import titlecase +from calibre.utils.html2text import html2text from calibre.customize import Plugin from calibre.ebooks.metadata.covers import check_for_cover @@ -79,6 +80,8 @@ class MetadataSource(Plugin): # {{{ mi.comments = None if not c.get('tags', True): mi.tags = [] + if c.get('textconvert', True) and mi.comments is not None: + mi.comments = html2text(mi.comments) except Exception, e: self.exception = e @@ -132,11 +135,17 @@ class MetadataSource(Plugin): # {{{ setattr(w, '_'+x, cb) cb.setChecked(c.get(x, True)) w._layout.addWidget(cb) + #textconvert for comments + cb = QCheckBox(_('Convert comments from %s to text')%(self.name)) + setattr(w, '_textconvert', cb) + cb.setChecked(c.get('textconvert', False)) + w._layout.addWidget(cb) + return w def save_settings(self, w): dl_settings = {} - for x in ('rating', 'tags', 'comments'): + for x in ('rating', 'tags', 'comments', 'textconvert'): dl_settings[x] = getattr(w, '_'+x).isChecked() c = self.config_store() c.set(self.name, dl_settings) diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py new file mode 100644 index 0000000000..b271def4bb --- /dev/null +++ b/src/calibre/utils/html2text.py @@ -0,0 +1,451 
@@ +#!/usr/bin/env python +"""html2text: Turn HTML into equivalent Markdown-structured text.""" +__version__ = "2.39" +__author__ = "Aaron Swartz (me@aaronsw.com)" +__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." +__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] + +# TODO: +# Support decoded entities with unifiable. + +if not hasattr(__builtins__, 'True'): True, False = 1, 0 +import re, sys, urllib, htmlentitydefs, codecs, StringIO, types +import sgmllib +import urlparse +sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') + +try: from textwrap import wrap +except: pass + +# Use Unicode characters instead of their ascii pseudo-replacements +UNICODE_SNOB = 0 + +# Put the links after each paragraph instead of at the end. +LINKS_EACH_PARAGRAPH = 0 + +# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) +BODY_WIDTH = 0 + +# Don't show internal links (href="#local-anchor") -- corresponding link targets +# won't be visible in the plain text file anyway. 
+SKIP_INTERNAL_LINKS = True + +### Entity Nonsense ### + +def name2cp(k): + if k == 'apos': return ord("'") + if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 + return htmlentitydefs.name2codepoint[k] + else: + k = htmlentitydefs.entitydefs[k] + if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 + return ord(codecs.latin_1_decode(k)[0]) + +unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', +'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', +'ndash':'-', 'oelig':'oe', 'aelig':'ae', +'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', +'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', +'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', +'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', +'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} + +unifiable_n = {} + +for k in unifiable.keys(): + unifiable_n[name2cp(k)] = unifiable[k] + +def charref(name): + if name[0] in ['x','X']: + c = int(name[1:], 16) + else: + c = int(name) + + if not UNICODE_SNOB and c in unifiable_n.keys(): + return unifiable_n[c] + else: + return unichr(c) + +def entityref(c): + if not UNICODE_SNOB and c in unifiable.keys(): + return unifiable[c] + else: + try: name2cp(c) + except KeyError: return "&" + c + else: return unichr(name2cp(c)) + +def replaceEntities(s): + s = s.group(1) + if s[0] == "#": + return charref(s[1:]) + else: return entityref(s) + +r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") +def unescape(s): + return r_unescape.sub(replaceEntities, s) + +def fixattrs(attrs): + # Fix bug in sgmllib.py + if not attrs: return attrs + newattrs = [] + for attr in attrs: + newattrs.append((attr[0], unescape(attr[1]))) + return newattrs + +### End Entity Nonsense ### + +def onlywhite(line): + """Return true if the line does only consist of whitespace characters.""" + for c in line: + if c is not ' ' and c is not ' ': + return c is ' ' + 
return line + +def optwrap(text): + """Wrap all paragraphs in the provided text.""" + if not BODY_WIDTH: + return text + + assert wrap, "Requires Python 2.3." + result = '' + newlines = 0 + for para in text.split("\n"): + if len(para) > 0: + if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': + for line in wrap(para, BODY_WIDTH): + result += line + "\n" + result += "\n" + newlines = 2 + else: + if not onlywhite(para): + result += para + "\n" + newlines = 1 + else: + if newlines < 2: + result += "\n" + newlines += 1 + return result + +def hn(tag): + if tag[0] == 'h' and len(tag) == 2: + try: + n = int(tag[1]) + if n in range(1, 10): return n + except ValueError: return 0 + +class _html2text(sgmllib.SGMLParser): + def __init__(self, out=None, baseurl=''): + sgmllib.SGMLParser.__init__(self) + + if out is None: self.out = self.outtextf + else: self.out = out + self.outtext = u'' + self.quiet = 0 + self.p_p = 0 + self.outcount = 0 + self.start = 1 + self.space = 0 + self.a = [] + self.astack = [] + self.acount = 0 + self.list = [] + self.blockquote = 0 + self.pre = 0 + self.startpre = 0 + self.lastWasNL = 0 + self.abbr_title = None # current abbreviation definition + self.abbr_data = None # last inner HTML (for abbr being defined) + self.abbr_list = {} # stack of abbreviations to write later + self.baseurl = baseurl + + def outtextf(self, s): + self.outtext += s + + def close(self): + sgmllib.SGMLParser.close(self) + + self.pbr() + self.o('', 0, 'end') + + return self.outtext + + def handle_charref(self, c): + self.o(charref(c)) + + def handle_entityref(self, c): + self.o(entityref(c)) + + def unknown_starttag(self, tag, attrs): + self.handle_tag(tag, attrs, 1) + + def unknown_endtag(self, tag): + self.handle_tag(tag, None, 0) + + def previousIndex(self, attrs): + """ returns the index of certain set of attributes (of a link) in the + self.a list + + If the set of attributes is not found, returns None + """ + if not attrs.has_key('href'): return None + 
+ i = -1 + for a in self.a: + i += 1 + match = 0 + + if a.has_key('href') and a['href'] == attrs['href']: + if a.has_key('title') or attrs.has_key('title'): + if (a.has_key('title') and attrs.has_key('title') and + a['title'] == attrs['title']): + match = True + else: + match = True + + if match: return i + + def handle_tag(self, tag, attrs, start): + attrs = fixattrs(attrs) + + if hn(tag): + self.p() + if start: self.o(hn(tag)*"#" + ' ') + + if tag in ['p', 'div']: self.p() + + if tag == "br" and start: self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", 'script']: + if start: self.quiet += 1 + else: self.quiet -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p(); self.o('> ', 0, 1); self.start = 1 + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ['em', 'i', 'u']: self.o("_") + if tag in ['strong', 'b']: self.o("**") + if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` + if tag == "abbr": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + + self.abbr_title = None + self.abbr_data = '' + if attrs.has_key('title'): + self.abbr_title = attrs['title'] + else: + if self.abbr_title != None: + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = '' + + if tag == "a": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): + self.astack.append(attrs) + self.o("[") + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if a: + i = self.previousIndex(a) + if i is not None: + a = self.a[i] + else: + self.acount += 1 + a['count'] = self.acount + a['outcount'] = self.outcount + self.a.append(a) + self.o("][" + `a['count']` + "]") + + if tag == "img" and start: + attrsD = {} + 
for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('src'): + attrs['href'] = attrs['src'] + alt = attrs.get('alt', '') + i = self.previousIndex(attrs) + if i is not None: + attrs = self.a[i] + else: + self.acount += 1 + attrs['count'] = self.acount + attrs['outcount'] = self.outcount + self.a.append(attrs) + self.o("![") + self.o(alt) + self.o("]["+`attrs['count']`+"]") + + if tag == 'dl' and start: self.p() + if tag == 'dt' and not start: self.pbr() + if tag == 'dd' and start: self.o(' ') + if tag == 'dd' and not start: self.pbr() + + if tag in ["ol", "ul"]: + if start: + self.list.append({'name':tag, 'num':0}) + else: + if self.list: self.list.pop() + + self.p() + + if tag == 'li': + if start: + self.pbr() + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + self.o(" "*len(self.list)) #TODO: line up
  1. s > 9 correctly. + if li['name'] == "ul": self.o("* ") + elif li['name'] == "ol": + li['num'] += 1 + self.o(`li['num']`+". ") + self.start = 1 + else: + self.pbr() + + if tag in ["table", "tr"] and start: self.p() + if tag == 'td': self.pbr() + + if tag == "pre": + if start: + self.startpre = 1 + self.pre = 1 + else: + self.pre = 0 + self.p() + + def pbr(self): + if self.p_p == 0: self.p_p = 1 + + def p(self): self.p_p = 2 + + def o(self, data, puredata=0, force=0): + if self.abbr_data is not None: self.abbr_data += data + + if not self.quiet: + if puredata and not self.pre: + data = re.sub('\s+', ' ', data) + if data and data[0] == ' ': + self.space = 1 + data = data[1:] + if not data and not force: return + + if self.startpre: + #self.out(" :") #TODO: not output when already one there + self.startpre = 0 + + bq = (">" * self.blockquote) + if not (force and data and data[0] == ">") and self.blockquote: bq += " " + + if self.pre: + bq += " " + data = data.replace("\n", "\n"+bq) + + if self.start: + self.space = 0 + self.p_p = 0 + self.start = 0 + + if force == 'end': + # It's the end. + self.p_p = 0 + self.out("\n") + self.space = 0 + + + if self.p_p: + self.out(('\n'+bq)*self.p_p) + self.space = 0 + + if self.space: + if not self.lastWasNL: self.out(' ') + self.space = 0 + + if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): + if force == "end": self.out("\n") + + newa = [] + for link in self.a: + if self.outcount > link['outcount']: + self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) + if link.has_key('title'): self.out(" ("+link['title']+")") + self.out("\n") + else: + newa.append(link) + + if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. 
+ + self.a = newa + + if self.abbr_list and force == "end": + for abbr, definition in self.abbr_list.items(): + self.out(" *[" + abbr + "]: " + definition + "\n") + + self.p_p = 0 + self.out(data) + self.lastWasNL = data and data[-1] == '\n' + self.outcount += 1 + + def handle_data(self, data): + if r'\/script>' in data: self.quiet -= 1 + self.o(data, 1) + + def unknown_decl(self, data): pass + +def wrapwrite(text): sys.stdout.write(text.encode('utf8')) + +def html2text_file(html, out=wrapwrite, baseurl=''): + h = _html2text(out, baseurl) + h.feed(html) + h.feed("") + return h.close() + +def html2text(html, baseurl=''): + return optwrap(html2text_file(html, None, baseurl)) + +if __name__ == "__main__": + baseurl = '' + if sys.argv[1:]: + arg = sys.argv[1] + if arg.startswith('http://') or arg.startswith('https://'): + baseurl = arg + j = urllib.urlopen(baseurl) + try: + from feedparser import _getCharacterEncoding as enc + except ImportError: + enc = lambda x, y: ('utf-8', 1) + text = j.read() + encoding = enc(j.headers, text)[0] + if encoding == 'us-ascii': encoding = 'utf-8' + data = text.decode(encoding) + + else: + encoding = 'utf8' + if len(sys.argv) > 2: + encoding = sys.argv[2] + data = open(arg, 'r').read().decode(encoding) + else: + data = sys.stdin.read().decode('utf8') + wrapwrite(html2text(data, baseurl)) + From 9aefafc74506ac60fbc0e0ffbe1c53d48edbc0a5 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 1 Nov 2010 01:22:47 +0100 Subject: [PATCH 025/132] Implemented basic html check and none check to avoid problems with html2text --- src/calibre/ebooks/metadata/fetch.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 87989a4d42..d45a299e39 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -16,6 +16,8 @@ from calibre.ebooks.metadata.covers import check_for_cover metadata_config = None +html_check = 
re.compile("([\<])([^\>]{1,})*([\>])", re.I) + class MetadataSource(Plugin): # {{{ ''' Represents a source to query for metadata. Subclasses must implement @@ -78,10 +80,11 @@ class MetadataSource(Plugin): # {{{ mi.rating = None if not c.get('comments', True): mi.comments = None + if c.get('textconvert', True) and mi.comments is not None \ + and html_check.search(mi.comments) is not None: + mi.comments = html2text(mi.comments) if not c.get('tags', True): mi.tags = [] - if c.get('textconvert', True) and mi.comments is not None: - mi.comments = html2text(mi.comments) except Exception, e: self.exception = e From a8578eee2d4008a547f0cf2ac9c880ef02cf0a37 Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 2 Nov 2010 00:05:20 +0100 Subject: [PATCH 026/132] minor corrections linked to bug 7345 --- src/calibre/ebooks/metadata/fetch.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 36a1af9c07..dedd251640 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -17,8 +17,6 @@ from calibre.utils.html2text import html2text metadata_config = None -html_check = re.compile("([\<])([^\>]{1,})*([\>])", re.I) - class MetadataSource(Plugin): # {{{ ''' Represents a source to query for metadata. 
Subclasses must implement @@ -86,9 +84,6 @@ class MetadataSource(Plugin): # {{{ mi.rating = None if not c.get('comments', True): mi.comments = None - if c.get('textconvert', True) and mi.comments is not None \ - and html_check.search(mi.comments) is not None: - mi.comments = html2text(mi.comments) if not c.get('tags', True): mi.tags = [] if self.has_html_comments and mi.comments and \ @@ -151,18 +146,21 @@ class MetadataSource(Plugin): # {{{ setattr(w, '_'+x, cb) cb.setChecked(c.get(x, True)) w._layout.addWidget(cb) - - cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) - setattr(w, '_textcomments', cb) - cb.setChecked(c.get('textcomments', False)) - w._layout.addWidget(cb) + + if self.has_html_comments: + cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) + setattr(w, '_textcomments', cb) + cb.setChecked(c.get('textcomments', False)) + w._layout.addWidget(cb) return w def save_settings(self, w): dl_settings = {} - for x in ('rating', 'tags', 'comments', 'textcomments'): + for x in ('rating', 'tags', 'comments'): dl_settings[x] = getattr(w, '_'+x).isChecked() + if self.has_html_comments: + dl_settings['textcomments'] = getattr(w, '_textcomments').isChecked() c = self.config_store() c.set(self.name, dl_settings) if hasattr(w, '_sc'): From a0fc1086364cab8d744530274ac5149ecfdda2f1 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 13 Nov 2010 15:22:18 +0100 Subject: [PATCH 027/132] Adding Fictionwise metadata source --- src/calibre/customize/builtins.py | 4 +- src/calibre/ebooks/metadata/fetch.py | 18 ++ src/calibre/ebooks/metadata/fictionwise.py | 351 +++++++++++++++++++++ 3 files changed, 371 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/fictionwise.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index bd766827a5..04364b6b28 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -481,7 +481,7 @@ from 
calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ - LibraryThing + LibraryThing, Fictionwise from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers @@ -490,7 +490,7 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, + LibraryThing, Fictionwise, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] plugins += [ ComicInput, diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index dedd251640..c9d6a74cb2 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -267,6 +267,24 @@ class LibraryThing(MetadataSource): # {{{ # }}} +class Fictionwise(MetadataSource): # {{{ + + author = 'Sengian' + name = 'Fictionwise' + description = _('Downloads metadata from Fictionwise') + + has_html_comments = True + + def fetch(self): + from calibre.ebooks.metadata.fictionwise import search + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} def result_index(source, result): if not result.isbn: diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py new file mode 100644 index 0000000000..2fa9a1bcee --- /dev/null +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -0,0 +1,351 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian ' +__docformat__ = 
'restructuredtext en' + +import sys, textwrap, re +from urllib import urlencode + +from lxml import html, etree +from lxml.html import soupparser +from lxml.etree import tostring + +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.library.comments import sanitize_comments_html +from calibre.utils.config import OptionParser +from calibre.utils.date import parse_date, utcnow + + +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + +class Query(object): + + BASE_URL = 'http://www.fictionwise.com/servlet/mw' + + def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20): + assert not(title is None and author is None and keywords is None) + assert (max_results < 21) + + self.max_results = max_results + + q = { 'template' : 'searchresults_adv.htm' , + 'searchtitle' : '', + 'searchauthor' : '', + 'searchpublisher' : '', + 'searchkeyword' : '', + #possibilities startoflast, fullname, lastfirst + 'searchauthortype' : 'startoflast', + 'searchcategory' : '', + 'searchcategory2' : '', + 'searchprice_s' : '0', + 'searchprice_e' : 'ANY', + 'searchformat' : '', + 'searchgeo' : 'US', + 'searchfwdatetype' : '', + #maybe use dates fields if needed? 
+ #'sortorder' : 'DESC', + #many options available: b.SortTitle, a.SortName, + #b.DateFirstPublished, b.FWPublishDate + 'sortby' : 'b.SortTitle' + } + if title is not None: + q['searchtitle'] = title + if author is not None: + q['searchauthor'] = author + if publisher is not None: + q['searchpublisher'] = publisher + if keywords is not None: + q['searchkeyword'] = keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = urlencode(q) + + def __call__(self, browser, verbose): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL, self.urldata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get list of results as links + results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]") + results = results[:self.max_results] + results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results] + #return feed if no links ie normally a single book or nothing + if not results: + results = [feed] + return results + +class ResultList(list): + + BASE_URL = 'http://www.fictionwise.com' + COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0} + + def __init__(self): + self.retitle = re.compile(r'\[[^\[\]]+\]') + self.rechkauth = re.compile(r'.*book\s*by', re.I) + self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)' \ + + '<br[^>]+>.{,15}publisher\s*:', re.I) + self.repub = re.compile(r'.*publisher\s*:\s*', re.I) + self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I) + self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I) + self.resplitbr = re.compile(r'<br[^>]+>', re.I) + self.recomment = re.compile(r'(?s)<!--.*?-->') + self.reimg = re.compile(r'<img[^>]*>', 
re.I) + self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I) + self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:') + self.recolor = re.compile('(?P<ncolor>[^/]+).gif') + self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I) + self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I) + + def strip_tags_etree(self, etreeobj, invalid_tags): + for itag in invalid_tags: + for elt in etreeobj.getiterator(itag): + elt.drop_tag() + return etreeobj + + def clean_entry(self, entry, + invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'), + remove_tags_trees = ('script',)): + for it in entry[0].iterchildren(tag='table'): + entry[0].remove(it) + entry[0].remove(entry[0].xpath( 'descendant-or-self::p[1]')[0]) + entry = entry[0] + cleantree = self.strip_tags_etree(entry, invalid_tags) + for itag in remove_tags_trees: + for elts in cleantree.getiterator(itag): + elts.drop_tree() + return cleantree + + def output_entry(self, entry, prettyout = True, htmlrm="\d+"): + out = tostring(entry, pretty_print=prettyout) + reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)') + return reclean.sub('', out) + + def get_title(self, entry): + title = entry.findtext('./') + return self.retitle.sub('', title).strip() + + def get_authors(self, entry): + authortext = entry.find('./br').tail + if not self.rechkauth.search(authortext): + return [] + #TODO: parse all tag if necessary + authortext = self.rechkauth.sub('', authortext) + return [a.strip() for a in authortext.split('&')] + + def get_rating(self, entrytable, verbose): + nbcomment = tostring(entrytable.getprevious()) + try: + nbcomment = self.renbcom.search(nbcomment).group("nbcom") + except: + report(verbose) + return None + hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")], + float(image.get('height', default=0))) \ + for image in entrytable.getiterator('img')) + #ratings as x/20, not sure + return 5*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) + 
+ def get_description(self, entry): + description = self.output_entry(entry.find('./p'),htmlrm="") + description = self.redesc.search(description) + if not description and not description.group("desc"): + return None + #remove invalid tags + description = self.reimg.sub('', description.group("desc")) + description = self.recomment.sub('', description) + description = self.resanitize.sub('', sanitize_comments_html(description)) + return 'SUMMARY:\n' + re.sub(r'\n\s+</p>','\n</p>', description) + + def get_publisher(self, entry): + publisher = self.output_entry(entry.find('./p')) + publisher = filter(lambda x: self.repub.search(x) is not None, + self.resplitbr.split(publisher)) + if not len(publisher): + return None + publisher = self.repub.sub('', publisher[0]) + return publisher.split(',')[0].strip() + + def get_tags(self, entry): + tag = self.output_entry(entry.find('./p')) + tag = filter(lambda x: self.retag.search(x) is not None, + self.resplitbr.split(tag)) + if not len(tag): + return [] + return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/')) + + def get_date(self, entry, verbose): + date = self.output_entry(entry.find('./p')) + date = filter(lambda x: self.redate.search(x) is not None, + self.resplitbr.split(date)) + if not len(date): + return None + #TODO: parse all tag if necessary + try: + d = self.redate.sub('', date[0]) + if d: + default = utcnow().replace(day=15) + d = parse_date(d, assume_utc=True, default=default) + else: + d = None + except: + report(verbose) + d = None + return d + + def get_ISBN(self, entry): + isbns = self.output_entry(entry.getchildren()[2]) + isbns = filter(lambda x: self.reisbn.search(x) is not None, + self.resplitbrdiv.split(isbns)) + if not len(isbns): + return None + #TODO: parse all tag if necessary + isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))] + return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] + + def fill_MI(self, entry, title, authors, ratings, 
verbose): + mi = MetaInformation(title, authors) + mi.rating = ratings + mi.comments = self.get_description(entry) + mi.publisher = self.get_publisher(entry) + mi.tags = self.get_tags(entry) + mi.pubdate = self.get_date(entry, verbose) + mi.isbn = self.get_ISBN(entry) + mi.author_sort = authors_to_sort_string(authors) + # mi.language = self.get_language(x, verbose) + return mi + + def get_individual_metadata(self, browser, linkdata): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + + def populate(self, entries, browser, verbose=False): + for x in entries: + try: + entry = self.get_individual_metadata(browser, x) + entry = self.clean_entry(entry) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + + def populate_single(self, feed, verbose=False): + try: + entry = feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + entry = self.clean_entry(entry) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all 
details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + + +def search(title=None, author=None, publisher=None, isbn=None, + min_viewability='none', verbose=False, max_results=5, + keywords=None): + br = browser() + entries = Query(title=title, author=author, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose) + + #List of entry + ans = ResultList() + if len(entries) > 1: + ans.populate(entries, br, verbose) + else: + ans.populate_single(entries[0], verbose) + return ans + + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Fictionwise. You must specify one of title, author, + or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. + ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-m', '--max-results', default=5, + help='Maximum number of results to fetch') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + print + +if __name__ == '__main__': + sys.exit(main()) From 041fbd293227dbc52dc9d823e37512d4ed441c0e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 13 Nov 2010 22:35:32 +0100 Subject: [PATCH 028/132] Correct rating scale 
for fictionwise.py --- src/calibre/ebooks/metadata/fictionwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 2fa9a1bcee..ca438805ea 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -165,8 +165,8 @@ class ResultList(list): hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")], float(image.get('height', default=0))) \ for image in entrytable.getiterator('img')) - #ratings as x/20, not sure - return 5*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) + #ratings as x/5 + return 1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) def get_description(self, entry): description = self.output_entry(entry.find('./p'),htmlrm="") From c92271dc2d8b71a01e6484d611ec0b28d1d9a6ae Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 18 Nov 2010 21:22:21 +0100 Subject: [PATCH 029/132] minor revisions finctionwise plugin --- src/calibre/ebooks/metadata/fictionwise.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index ca438805ea..de60cd9dca 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -29,10 +29,10 @@ class Query(object): BASE_URL = 'http://www.fictionwise.com/servlet/mw' def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20): - assert not(title is None and author is None and keywords is None) + assert not(title is None and author is None and publisher is None and keywords is None) assert (max_results < 21) - self.max_results = max_results + self.max_results = int(max_results) q = { 'template' : 'searchresults_adv.htm' , 'searchtitle' : '', @@ -327,7 +327,7 @@ def option_parser(): parser.add_option('-a', 
'--author', help='Book author(s)') parser.add_option('-p', '--publisher', help='Book publisher') parser.add_option('-k', '--keywords', help='Keywords') - parser.add_option('-m', '--max-results', default=5, + parser.add_option('-m', '--max-results', default=20, help='Maximum number of results to fetch') parser.add_option('-v', '--verbose', default=0, action='count', help='Be more verbose about errors') From 78e4aba18ce8cd86f2e91834a866029c0f3ab476 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 18 Nov 2010 21:37:51 +0100 Subject: [PATCH 030/132] Revert --- resources/catalog/stylesheet.css | 198 +++++++++++++++++-------------- src/calibre/ebooks/rtf/input.py | 74 ++++++++++-- 2 files changed, 177 insertions(+), 95 deletions(-) diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index afda6ffc05..057c6c9f42 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -1,87 +1,98 @@ -body { - background-color: white; -} +body { background-color: white; } -p.title { - margin-top: 0em; - margin-bottom: 1em; - text-align: center; - font-style: italic; - font-size: xx-large; - border-bottom: solid black 4px; -} +p.title { + margin-top:0em; + margin-bottom:1em; + text-align:center; + font-style:italic; + font-size:xx-large; + border-bottom: solid black 2px; + } p.author { - margin-top: 0em; - margin-bottom: 0em; - text-align: left; - text-indent: 1em; - font-size: large; -} - -p.tags { - margin-top: 0em; - margin-bottom: 0em; - text-align: left; - text-indent: 1em; - font-size: small; -} - -p.description { - text-align: left; - font-style: normal; - margin-top: 0em; -} - -p.date_index { - font-size: x-large; + margin-top:0em; + margin-bottom:0em; text-align: center; - font-weight: bold; - margin-top: 1em; - margin-bottom: 0px; -} - -p.letter_index { - font-size: x-large; - text-align: center; - font-weight: bold; - margin-top: 1em; - margin-bottom: 0px; -} + text-indent: 0em; + font-size:large; + 
} p.author_index { - font-size: large; - text-align: left; - margin-top: 0px; - margin-bottom: 0px; + font-size:large; + font-weight:bold; + text-align:left; + margin-top:0px; + margin-bottom:-2px; text-indent: 0em; -} + } + +p.tags { + margin-top:0.5em; + margin-bottom:0em; + text-align: left; + text-indent: 0.0in; + } + +p.formats { + font-size:90%; + margin-top:0em; + margin-bottom:0.5em; + text-align: left; + text-indent: 0.0in; + } + +div.description > p:first-child { + margin: 0 0 0 0; + text-indent: 0em; + } + +div.description { + margin: 0 0 0 0; + text-indent: 1em; + } + +p.date_index { + font-size:x-large; + text-align:center; + font-weight:bold; + margin-top:1em; + margin-bottom:0px; + } + +p.letter_index { + font-size:x-large; + text-align:center; + font-weight:bold; + margin-top:1em; + margin-bottom:0px; + } p.series { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 2em; - text-indent: -2em; -} + font-style:italic; + margin-top:2px; + margin-bottom:0px; + margin-left:2em; + text-align:left; + text-indent:-2em; + } p.read_book { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 2em; - text-indent: -2em; -} + text-align:left; + margin-top:0px; + margin-bottom:0px; + margin-left:2em; + text-indent:-2em; + } p.unread_book { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 2em; - text-indent: -2em; -} + text-align:left; + margin-top:0px; + margin-bottom:0px; + margin-left:2em; + text-indent:-2em; + } -p.missing_book { +p.wishlist_item { text-align:left; margin-top:0px; margin-bottom:0px; @@ -90,23 +101,36 @@ p.missing_book { } p.date_read { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 6em; - text-indent: -6em; -} + text-align:left; + margin-top:0px; + margin-bottom:0px; + margin-left:6em; + text-indent:-6em; + } -hr.series_divider { - width: 50%; - margin-left: 1em; - margin-top: 0em; - margin-bottom: 0em; -} +hr.description_divider { + width:90%; 
+ margin-left:5%; + border-top: solid white 0px; + border-right: solid white 0px; + border-bottom: solid black 1px; + border-left: solid white 0px; + } hr.annotations_divider { - width: 50%; - margin-left: 1em; - margin-top: 0em; - margin-bottom: 0em; -} \ No newline at end of file + width:50%; + margin-left:1em; + margin-top:0em; + margin-bottom:0em; + } + +td.publisher, td.date { + font-weight:bold; + text-align:center; + } +td.rating { + text-align: center; + } +td.thumbnail img { + -webkit-box-shadow: 4px 4px 12px #999; + } \ No newline at end of file diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index ec6f9a04d3..32de91c011 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -9,6 +9,36 @@ from lxml import etree from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.utils import PreProcessor +border_style_map = { + 'single' : 'solid', + 'double-thickness-border' : 'double', + 'shadowed-border': 'outset', + 'double-border': 'double', + 'dotted-border': 'dotted', + 'dashed': 'dashed', + 'hairline': 'solid', + 'inset': 'inset', + 'dash-small': 'dashed', + 'dot-dash': 'dotted', + 'dot-dot-dash': 'dotted', + 'outset': 'outset', + 'tripple': 'double', + 'thick-thin-small': 'solid', + 'thin-thick-small': 'solid', + 'thin-thick-thin-small': 'solid', + 'thick-thin-medium': 'solid', + 'thin-thick-medium': 'solid', + 'thin-thick-thin-medium': 'solid', + 'thick-thin-large': 'solid', + 'thin-thick-thin-large': 'solid', + 'wavy': 'ridge', + 'double-wavy': 'ridge', + 'striped': 'ridge', + 'emboss': 'inset', + 'engrave': 'inset', + 'frame': 'ridge', +} + class InlineClass(etree.XSLTExtension): FMTS = ('italics', 'bold', 'underlined', 'strike-through', 'small-caps') @@ -51,7 +81,6 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, - deb_dir = 'H:\\Temp\\Calibre\\rtfdebug', # Convert symbol fonts to unicode equivalents. 
Default # is 1 convert_symbol = 1, @@ -138,8 +167,7 @@ class RTFInput(InputFormatPlugin): return name - - def write_inline_css(self, ic): + def write_inline_css(self, ic, border_styles): font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in enumerate(ic.font_sizes)] color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in @@ -163,6 +191,10 @@ class RTFInput(InputFormatPlugin): ''') css += '\n'+'\n'.join(font_size_classes) css += '\n' +'\n'.join(color_classes) + + for cls, val in border_styles.items(): + css += '\n\n.%s {\n%s\n}'%(cls, val) + with open('styles.css', 'ab') as f: f.write(css) @@ -182,6 +214,32 @@ class RTFInput(InputFormatPlugin): 'Failed to preprocess RTF to convert unicode sequences, ignoring...') return fname + def convert_borders(self, doc): + border_styles = [] + style_map = {} + for elem in doc.xpath(r'//*[local-name()="cell"]'): + style = ['border-style: hidden', 'border-width: 1px', + 'border-color: black'] + for x in ('bottom', 'top', 'left', 'right'): + bs = elem.get('border-cell-%s-style'%x, None) + if bs: + cbs = border_style_map.get(bs, 'solid') + style.append('border-%s-style: %s'%(x, cbs)) + bw = elem.get('border-cell-%s-line-width'%x, None) + if bw: + style.append('border-%s-width: %spt'%(x, bw)) + bc = elem.get('border-cell-%s-color'%x, None) + if bc: + style.append('border-%s-color: %s'%(x, bc)) + style = ';\n'.join(style) + if style not in border_styles: + border_styles.append(style) + idx = border_styles.index(style) + cls = 'border_style%d'%idx + style_map[cls] = style + elem.set('class', cls) + return style_map + def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.metadata.meta import get_metadata @@ -191,17 +249,16 @@ class RTFInput(InputFormatPlugin): self.log = log self.log('Converting RTF to XML...') #Name of the preprocesssed RTF file - #fname = self.preprocess(stream.name) - fname = stream.name + fname = self.preprocess(stream.name) try: xml = self.generate_xml(fname) 
except RtfInvalidCodeException, e: raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) - dataxml = open('dataxml.xml', 'w') + '''dataxml = open('dataxml.xml', 'w') dataxml.write(xml) - dataxml.close + dataxml.close''' d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: @@ -214,6 +271,7 @@ class RTFInput(InputFormatPlugin): self.log('Parsing XML...') parser = etree.XMLParser(recover=True, no_network=True) doc = etree.fromstring(xml, parser=parser) + border_styles = self.convert_borders(doc) for pict in doc.xpath('//rtf:pict[@num]', namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): num = int(pict.get('num')) @@ -235,7 +293,7 @@ class RTFInput(InputFormatPlugin): preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) res = preprocessor(res) f.write(res) - self.write_inline_css(inline_class) + self.write_inline_css(inline_class, border_styles) stream.seek(0) mi = get_metadata(stream, 'rtf') if not mi.title: From 8f6cc227cd46db8f008720ef7f50250152a5788e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 00:08:23 +0100 Subject: [PATCH 031/132] Minor modification mreplace.py --- src/calibre/utils/mreplace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/utils/mreplace.py b/src/calibre/utils/mreplace.py index dff5fab578..b9fbc0bded 100644 --- a/src/calibre/utils/mreplace.py +++ b/src/calibre/utils/mreplace.py @@ -17,7 +17,7 @@ class MReplace(UserDict): if len(self.data) > 0: keys = sorted(self.data.keys(), key=len) keys.reverse() - tmp = "(%s)" % "|".join([re.escape(item) for item in keys]) + tmp = "(%s)" % "|".join(map(re.escape, keys)) if self.re != tmp: self.re = tmp self.regex = re.compile(self.re) From 229f511202b408f0627685e4eeab39022604b450 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 00:08:53 +0100 Subject: [PATCH 032/132] Minor modif fictionwise.py --- 
src/calibre/ebooks/metadata/fictionwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index de60cd9dca..706d38b559 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -237,7 +237,7 @@ class ResultList(list): # mi.language = self.get_language(x, verbose) return mi - def get_individual_metadata(self, browser, linkdata): + def get_individual_metadata(self, browser, linkdata, verbose): try: raw = browser.open_novisit(self.BASE_URL + linkdata).read() except Exception, e: @@ -262,7 +262,7 @@ class ResultList(list): def populate(self, entries, browser, verbose=False): for x in entries: try: - entry = self.get_individual_metadata(browser, x) + entry = self.get_individual_metadata(browser, x, verbose) entry = self.clean_entry(entry) title = self.get_title(entry) #ratings: get table for rating then drop From eb4e7154dbcb63863ee70bb8dcc14c508631272f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 00:16:24 +0100 Subject: [PATCH 033/132] Plugin nicebooks for metadatas and cover. Should be disable by default. 
--- src/calibre/customize/builtins.py | 6 +- src/calibre/ebooks/metadata/nicebooks.py | 458 +++++++++++++++++++++++ 2 files changed, 462 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/nicebooks.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 5723da34a8..ce5275d35e 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -483,15 +483,17 @@ from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing, Fictionwise from calibre.ebooks.metadata.douban import DoubanBooks +from calibre.ebooks.metadata.nicebooks import NiceBooks from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers +from calibre.ebooks.metadata.nicebooks import NiceBooksCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, Fictionwise, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] + LibraryThing, Fictionwise, DoubanBooks, NiceBooks,CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, + Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers] plugins += [ ComicInput, EPUBInput, diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py new file mode 100644 index 0000000000..28fb2de562 --- /dev/null +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -0,0 +1,458 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian <sengian1@gmail.com>' +__docformat__ = 'restructuredtext en' + +import sys, textwrap, re, traceback, socket +from urllib import urlencode +from functools import partial +from math import ceil 
+from copy import deepcopy + +from lxml import html +from lxml.html import soupparser + +from calibre.utils.date import parse_date, utcnow +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.ebooks.metadata.covers import CoverDownload +from calibre.utils.config import OptionParser + +class NiceBooks(MetadataSource): + + name = 'Nicebooks' + description = _('Downloads metadata from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class NiceBooksCovers(CoverDownload): + + name = 'Nicebooks covers' + description = _('Downloads covers from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + type = _('Cover download') + version = (1, 0, 0) + + def has_cover(self, mi, ans, timeout=5.): + if not mi.isbn: + return False + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + if Covers(isbn)(entry).check_cover(): + self.debug('cover for', mi.isbn, 'found') + ans.set() + except Exception, e: + self.debug(e) + + def get_covers(self, mi, result_queue, abort, timeout=5.): + if not mi.isbn: + return + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + cover_data, ext = Covers(isbn)(entry).get_cover(br, timeout) + if not ext: + ext = 'jpg' + result_queue.put((True, cover_data, ext, self.name)) + except Exception, e: + result_queue.put((False, self.exception_to_string(e), + traceback.format_exc(), self.name)) + + +def report(verbose): + if verbose: + import 
traceback + traceback.print_exc() + +def replace_monthsfr(datefr): + # Replace french months by english equivalent for parse_date + frtoen = { + u'[jJ]anvier': u'jan', + u'[fF].vrier': u'feb', + u'[mM]ars': u'mar', + u'[aA]vril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uin': u'jun', + u'[jJ]uillet': u'jul', + u'[aA]o.t': u'aug', + u'[sS]eptembre': u'sep', + u'[Oo]ctobre': u'oct', + u'[nN]ovembre': u'nov', + u'[dD].cembre': u'dec' } + for k in frtoen.iterkeys(): + tmp = re.sub(k, frtoen[k], datefr) + if tmp <> datefr: break + return tmp + +class Query(object): + + BASE_URL = 'http://fr.nicebooks.com/' + + def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20): + assert not(title is None and author is None and publisher is None \ + and isbn is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + + q = '' + if isbn is not None: + q += isbn + else: + + if title is not None: + q += title + if author is not None: + q += author + if publisher is not None: + q += publisher + if keywords is not None: + q += keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = 'search?' 
+ urlencode({'q':q,'s':'Rechercher'}) + + def __call__(self, browser, verbose, timeout = 5.): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + #nb of page to call + try: + nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text) + except: + #direct hit + return [feed] + + nbpagetoquery = ceil(min(nbresults, self.max_results)/10) + pages =[feed] + if nbpagetoquery > 1: + for i in xrange(2, nbpagetoquery + 1): + try: + urldata = self.urldata + '&p=' + str(i) + raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read() + except Exception, e: + continue + if '<title>404 - ' in raw: + continue + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + continue + pages.append(feed) + + results = [] + for x in pages: + results.extend([i.find_class('title')[0].get('href') \ + for i in x.xpath("//ul[@id='results']/li")]) + return results[:self.max_results] + +class ResultList(list): + + BASE_URL = 'http://fr.nicebooks.com' + + def __init__(self): + self.repub = re.compile(r'\s*.diteur\s*', re.I) + self.reauteur = re.compile(r'\s*auteur.*', re.I) + self.reautclean = re.compile(r'\s*\(.*\)\s*') + + def get_title(self, entry): + title = deepcopy(entry.find("div[@id='book-info']")) + title.remove(title.find("dl[@title='Informations sur le livre']")) + title = ' '.join([i.text_content() for i in title.iterchildren()]) + return title.replace('\n', '') + + def get_authors(self, entry): + author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + 
authortext = [] + for x in author.getiterator('dt'): + if self.reauteur.match(x.text): + elt = x.getnext() + i = 0 + while elt.tag <> 'dt' and i < 20: + authortext.append(elt.text_content()) + elt = elt.getnext() + i += 1 + break + if len(authortext) == 1: + authortext = [self.reautclean.sub('', authortext[0])] + return authortext + + def get_description(self, entry, verbose): + try: + return 'RESUME:\n' + entry.xpath("//p[@id='book-description']")[0].text + except: + report(verbose) + return None + + def get_publisher(self, entry): + publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + publitext = None + for x in publisher.getiterator('dt'): + if self.repub.match(x.text): + publitext = x.getnext().text_content() + break + return publitext + + def get_date(self, entry, verbose): + date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + for x in date.getiterator('dt'): + if x.text == 'Date de parution': + d = x.getnext().text_content() + break + if not len(d): + return None + try: + default = utcnow().replace(day=15) + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + except: + report(verbose) + d = None + return d + + def get_ISBN(self, entry): + isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + isbntext = None + for x in isbn.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content() + if not check_isbn(isbntext): + return None + break + return isbntext + + def get_language(self, entry): + language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + langtext = None + for x in language.getiterator('dt'): + if x.text == 'Langue': + langtext = x.getnext().text_content() + break + return langtext + + def fill_MI(self, entry, title, authors, verbose): + mi = MetaInformation(title, authors) + mi.comments = self.get_description(entry, verbose) + mi.publisher = self.get_publisher(entry) + mi.pubdate = 
self.get_date(entry, verbose) + mi.isbn = self.get_ISBN(entry) + mi.author_sort = authors_to_sort_string(authors) + mi.language = self.get_language(entry) + return mi + + def get_individual_metadata(self, browser, linkdata, verbose): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//div[@id='container']")[0] + + def populate(self, entries, browser, verbose=False): + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, verbose)) + + def populate_single(self, feed, verbose=False): + try: + entry = feed.xpath("//div[@id='container']")[0] + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, verbose)) + +class NiceBooksError(Exception): + pass + +class ISBNNotFound(NiceBooksError): + pass + +class Covers(object): + + def __init__(self, isbn = None): + assert isbn is not None + self.urlimg = '' + self.isbn = isbn + self.isbnf = False + + def __call__(self, entry = None): + try: + self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href') + except: + return self + isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']") + isbntext = None + for x in isbno.getiterator('dt'): + if x.text == 'ISBN': + isbntext = 
x.getnext().text_content() + break + if isbntext is not None: + self.isbnf = True + return self + + def check_cover(self): + if self.urlimg: + return True + else: + return False + + def get_cover(self, browser, timeout = 5.): + try: + return browser.open_novisit(self.urlimg, timeout=timeout).read(), \ + self.urlimg.rpartition('.')[-1] + except Exception, err: + if isinstance(getattr(err, 'args', [None])[0], socket.timeout): + err = NiceBooksError(_('Nicebooks timed out. Try again later.')) + raise err + if not len(self.urlimg): + if not self.isbnf: + raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.')) + raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher')) + + +def search(title=None, author=None, publisher=None, isbn=None, + verbose=False, max_results=5, keywords=None): + br = browser() + entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose) + + if entries is None: + return + + #List of entry + ans = ResultList() + if len(entries) > 1: + ans.populate(entries, br, verbose) + else: + ans.populate_single(entries[0], verbose) + return ans + +def check_for_cover(isbn): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False)[0] + return Covers(isbn)(entry).check_cover() + +def cover_from_isbn(isbn, timeout = 5.): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False, timeout)[0] + return Covers(isbn)(entry).get_cover(br, timeout) + + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Nicebooks. You must specify one of title, author, + ISBN, publisher or keywords. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. + It can also get covers if the option is activated. 
+ ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-i', '--isbn', help='Book ISBN') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-c', '--covers', default=0, + help='Covers: 1-Check/ 2-Download') + parser.add_option('-p', '--coverspath', default='', + help='Covers files path') + parser.add_option('-m', '--max-results', default=20, + help='Maximum number of results to fetch') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + covact = int(opts.covers) + if covact == 1: + textcover = 'No cover found!' 
+ if check_for_cover(result.isbn): + textcover = 'A cover was found for this book' + print textcover + elif covact == 2: + cover_data, ext = cover_from_isbn(result.isbn) + if not ext: + ext = 'jpg' + cpath = result.isbn + if len(opts.coverspath): + cpath = os.path.normpath(opts.coverspath + '/' + result.isbn) + oname = os.path.abspath(cpath+'.'+ext) + open(oname, 'wb').write(cover_data) + print 'Cover saved to file ', oname + print + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file From fd711e6075e2dec43ab37c76fad9ed299fcdc71d Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 01:28:56 +0100 Subject: [PATCH 034/132] Minor fix for nicebooks.py --- src/calibre/ebooks/metadata/nicebooks.py | 49 +++++++++++------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 28fb2de562..98ecdf3625 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -52,7 +52,7 @@ class NiceBooksCovers(CoverDownload): br = browser() try: entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] - if Covers(isbn)(entry).check_cover(): + if Covers(mi.isbn)(entry).check_cover(): self.debug('cover for', mi.isbn, 'found') ans.set() except Exception, e: @@ -64,7 +64,7 @@ class NiceBooksCovers(CoverDownload): br = browser() try: entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] - cover_data, ext = Covers(isbn)(entry).get_cover(br, timeout) + cover_data, ext = Covers(mi.isbn)(entry).get_cover(br, timeout) if not ext: ext = 'jpg' result_queue.put((True, cover_data, ext, self.name)) @@ -109,20 +109,12 @@ class Query(object): self.max_results = int(max_results) - q = '' if isbn is not None: - q += isbn + q = isbn else: - - if title is not None: - q += title - if author is not None: - q += author - if publisher is not None: - q += publisher - if keywords is not None: - q 
+= keywords - + q = ' '.join([i for i in (title, author, publisher, keywords) \ + if i is not None]) + if isinstance(q, unicode): q = q.encode('utf-8') self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'}) @@ -185,15 +177,15 @@ class ResultList(list): BASE_URL = 'http://fr.nicebooks.com' def __init__(self): - self.repub = re.compile(r'\s*.diteur\s*', re.I) - self.reauteur = re.compile(r'\s*auteur.*', re.I) - self.reautclean = re.compile(r'\s*\(.*\)\s*') + self.repub = re.compile(u'\s*.diteur\s*', re.I) + self.reauteur = re.compile(u'\s*auteur.*', re.I) + self.reautclean = re.compile(u'\s*\(.*\)\s*') def get_title(self, entry): title = deepcopy(entry.find("div[@id='book-info']")) title.remove(title.find("dl[@title='Informations sur le livre']")) title = ' '.join([i.text_content() for i in title.iterchildren()]) - return title.replace('\n', '') + return unicode(title.replace('\n', '')) def get_authors(self, entry): author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") @@ -203,7 +195,7 @@ class ResultList(list): elt = x.getnext() i = 0 while elt.tag <> 'dt' and i < 20: - authortext.append(elt.text_content()) + authortext.append(unicode(elt.text_content())) elt = elt.getnext() i += 1 break @@ -213,7 +205,7 @@ class ResultList(list): def get_description(self, entry, verbose): try: - return 'RESUME:\n' + entry.xpath("//p[@id='book-description']")[0].text + return 'RESUME:\n' + unicode(entry.xpath("//p[@id='book-description']")[0].text) except: report(verbose) return None @@ -225,15 +217,16 @@ class ResultList(list): if self.repub.match(x.text): publitext = x.getnext().text_content() break - return publitext + return unicode(publitext).strip() def get_date(self, entry, verbose): date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + d = '' for x in date.getiterator('dt'): if x.text == 'Date de parution': d = x.getnext().text_content() break - if not len(d): + if len(d) == 0: return None try: default = 
utcnow().replace(day=15) @@ -252,8 +245,9 @@ class ResultList(list): isbntext = x.getnext().text_content() if not check_isbn(isbntext): return None + isbntext = isbntext.replace('-', '') break - return isbntext + return unicode(isbntext) def get_language(self, entry): language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") @@ -262,7 +256,7 @@ class ResultList(list): if x.text == 'Langue': langtext = x.getnext().text_content() break - return langtext + return unicode(langtext).strip() def fill_MI(self, entry, title, authors, verbose): mi = MetaInformation(title, authors) @@ -371,12 +365,12 @@ class Covers(object): def search(title=None, author=None, publisher=None, isbn=None, - verbose=False, max_results=5, keywords=None): + max_results=5, verbose=False, keywords=None): br = browser() entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, keywords=keywords, max_results=max_results)(br, verbose) - if entries is None: + if entries is None or len(entries) == 0: return #List of entry @@ -434,6 +428,9 @@ def main(args=sys.argv): report(True) parser.print_help() return 1 + if results is None or len(results) == 0: + print 'No result found for this search!' 
+ return 0 for result in results: print unicode(result).encode(preferred_encoding, 'replace') covact = int(opts.covers) From bc98b043fd4a7e7a09ab765c1d94f5782bda8676 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 01:29:22 +0100 Subject: [PATCH 035/132] Fix for download cover regression --- src/calibre/gui2/dialogs/metadata_single.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 0b9b33868c..1eae761561 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -716,10 +716,10 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.title.setText(book.title) self.authors.setText(authors_to_string(book.authors)) if book.author_sort: self.author_sort.setText(book.author_sort) - if d.opt_overwrite_cover_image.isChecked() and book.has_cover: - self.fetch_cover() if book.publisher: self.publisher.setEditText(book.publisher) if book.isbn: self.isbn.setText(book.isbn) + if d.opt_overwrite_cover_image.isChecked() and book.has_cover: + self.fetch_cover() if book.pubdate: d = book.pubdate self.pubdate.setDate(QDate(d.year, d.month, d.day)) From 681c451238bbcf4d0f9e7c8102ef9e83de79e9ce Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 09:14:44 +0100 Subject: [PATCH 036/132] Disable by default my plugins --- src/calibre/customize/ui.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 844269e453..e963a17df9 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -121,6 +121,8 @@ def enable_plugin(plugin_or_name): default_disabled_plugins = set([ 'Douban Books', 'Douban.com covers', + 'NiceBooks', 'NiceBooksCovers', + 'Fictionwise' ]) def is_disabled(plugin): From c5cbaffd20b042150a4c654584bbc526e613f5f6 Mon Sep 17 00:00:00 2001 From: 
Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 09:19:59 +0100 Subject: [PATCH 037/132] Externalize metadata plugin in fictionwise.py --- src/calibre/customize/builtins.py | 1 + src/calibre/ebooks/metadata/fetch.py | 18 ------------------ src/calibre/ebooks/metadata/fictionwise.py | 19 +++++++++++++++++++ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ce5275d35e..4815375563 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -484,6 +484,7 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing, Fictionwise from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks +from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers from calibre.ebooks.metadata.nicebooks import NiceBooksCovers diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index c9d6a74cb2..dedd251640 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -267,24 +267,6 @@ class LibraryThing(MetadataSource): # {{{ # }}} -class Fictionwise(MetadataSource): # {{{ - - author = 'Sengian' - name = 'Fictionwise' - description = _('Downloads metadata from Fictionwise') - - has_html_comments = True - - def fetch(self): - from calibre.ebooks.metadata.fictionwise import search - try: - self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose) - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() - - # }}} def result_index(source, result): if not result.isbn: diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 706d38b559..828ea31c3a 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py 
+++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -15,9 +15,28 @@ from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ authors_to_sort_string from calibre.library.comments import sanitize_comments_html +from calibre.ebooks.metadata.fetch import MetadataSource from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow +class Fictionwise(MetadataSource): # {{{ + + author = 'Sengian' + name = 'Fictionwise' + description = _('Downloads metadata from Fictionwise') + + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} + def report(verbose): if verbose: From 9c30a416120d257e5bd9078408287683d150c191 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 09:50:53 +0100 Subject: [PATCH 038/132] Correct nicebook max result problem --- src/calibre/ebooks/metadata/nicebooks.py | 45 +++++++++++------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 98ecdf3625..e72d4b26ae 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -5,7 +5,6 @@ __docformat__ = 'restructuredtext en' import sys, textwrap, re, traceback, socket from urllib import urlencode -from functools import partial from math import ceil from copy import deepcopy @@ -147,7 +146,7 @@ class Query(object): #direct hit return [feed] - nbpagetoquery = ceil(min(nbresults, self.max_results)/10) + nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10)) pages =[feed] if nbpagetoquery > 1: for i in xrange(2, nbpagetoquery + 1): @@ -193,11 +192,9 @@ class ResultList(list): for x in author.getiterator('dt'): if 
self.reauteur.match(x.text): elt = x.getnext() - i = 0 - while elt.tag <> 'dt' and i < 20: + while elt.tag == 'dd': authortext.append(unicode(elt.text_content())) elt = elt.getnext() - i += 1 break if len(authortext) == 1: authortext = [self.reautclean.sub('', authortext[0])] @@ -291,29 +288,32 @@ class ResultList(list): return feed.xpath("//div[@id='container']")[0] def populate(self, entries, browser, verbose=False): - for x in entries: + #single entry + if len(entries) ==1: try: - entry = self.get_individual_metadata(browser, x, verbose) + entry = entries[0].xpath("//div[@id='container']")[0] title = self.get_title(entry) authors = self.get_authors(entry) except Exception, e: if verbose: print 'Failed to get all details for an entry' print e - continue + return self.append(self.fill_MI(entry, title, authors, verbose)) + else: + #multiple entries + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, verbose)) - def populate_single(self, feed, verbose=False): - try: - entry = feed.xpath("//div[@id='container']")[0] - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - return - self.append(self.fill_MI(entry, title, authors, verbose)) class NiceBooksError(Exception): pass @@ -372,13 +372,10 @@ def search(title=None, author=None, publisher=None, isbn=None, if entries is None or len(entries) == 0: return - + #List of entry ans = ResultList() - if len(entries) > 1: - ans.populate(entries, br, verbose) - else: - ans.populate_single(entries[0], verbose) + ans.populate(entries, br, verbose) return ans def check_for_cover(isbn): From 3a37d7e78fa94dff29c86bde480e085463070f56 Mon Sep 17 00:00:00 2001 From: 
Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 10:24:56 +0100 Subject: [PATCH 039/132] Optimize metadata retrieval --- src/calibre/ebooks/metadata/nicebooks.py | 65 +++++++++++++++++------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index e72d4b26ae..f7cffa959b 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -181,13 +181,15 @@ class ResultList(list): self.reautclean = re.compile(u'\s*\(.*\)\s*') def get_title(self, entry): - title = deepcopy(entry.find("div[@id='book-info']")) + # title = deepcopy(entry.find("div[@id='book-info']")) + title = deepcopy(entry) title.remove(title.find("dl[@title='Informations sur le livre']")) title = ' '.join([i.text_content() for i in title.iterchildren()]) return unicode(title.replace('\n', '')) def get_authors(self, entry): - author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + author = entry.find("dl[@title='Informations sur le livre']") authortext = [] for x in author.getiterator('dt'): if self.reauteur.match(x.text): @@ -202,22 +204,46 @@ class ResultList(list): def get_description(self, entry, verbose): try: - return 'RESUME:\n' + unicode(entry.xpath("//p[@id='book-description']")[0].text) + return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text) except: report(verbose) return None - + + def get_book_info(self, entry, mi): + entry = entry.find("dl[@title='Informations sur le livre']") + for x in entry.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content().replace('-', '') + if check_isbn(isbntext): + mi.isbn = unicode(isbntext) + elif self.repub.match(x.text): + mi.publisher = unicode(x.getnext().text_content()) + elif x.text == 'Langue': + mi.language = 
unicode(x.getnext().text_content()) + elif x.text == 'Date de parution': + d = x.getnext().text_content() + try: + default = utcnow().replace(day=15) + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + mi.pubdate = d + except: + report(verbose) + return mi + def get_publisher(self, entry): - publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + publisher = entry publitext = None for x in publisher.getiterator('dt'): if self.repub.match(x.text): publitext = x.getnext().text_content() break - return unicode(publitext).strip() + return unicode(publitext) def get_date(self, entry, verbose): - date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + date = entry d = '' for x in date.getiterator('dt'): if x.text == 'Date de parution': @@ -235,35 +261,37 @@ class ResultList(list): return d def get_ISBN(self, entry): - isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + isbn = entry isbntext = None for x in isbn.getiterator('dt'): if x.text == 'ISBN': - isbntext = x.getnext().text_content() + isbntext = x.getnext().text_content().replace('-', '') if not check_isbn(isbntext): return None - isbntext = isbntext.replace('-', '') break return unicode(isbntext) def get_language(self, entry): - language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + language = entry langtext = None for x in language.getiterator('dt'): if x.text == 'Langue': langtext = x.getnext().text_content() break - return unicode(langtext).strip() + return unicode(langtext) def fill_MI(self, entry, title, authors, 
verbose): mi = MetaInformation(title, authors) - mi.comments = self.get_description(entry, verbose) - mi.publisher = self.get_publisher(entry) - mi.pubdate = self.get_date(entry, verbose) - mi.isbn = self.get_ISBN(entry) mi.author_sort = authors_to_sort_string(authors) - mi.language = self.get_language(entry) - return mi + mi.comments = self.get_description(entry, verbose) + # entry = entry.find("dl[@title='Informations sur le livre']") + # mi.publisher = self.get_publisher(entry) + # mi.pubdate = self.get_date(entry, verbose) + # mi.isbn = self.get_ISBN(entry) + # mi.language = self.get_language(entry) + return self.get_book_info(entry, mi) def get_individual_metadata(self, browser, linkdata, verbose): try: @@ -292,6 +320,7 @@ class ResultList(list): if len(entries) ==1: try: entry = entries[0].xpath("//div[@id='container']")[0] + entry = entry.find("div[@id='book-info']") title = self.get_title(entry) authors = self.get_authors(entry) except Exception, e: From 4887bac205622d0c6fe486278286b7eecbc30acc Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 10:29:55 +0100 Subject: [PATCH 040/132] bug --- src/calibre/ebooks/metadata/nicebooks.py | 52 +----------------------- 1 file changed, 1 insertion(+), 51 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index f7cffa959b..9a06bad998 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -230,57 +230,6 @@ class ResultList(list): except: report(verbose) return mi - - def get_publisher(self, entry): - # publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") - publisher = entry - publitext = None - for x in publisher.getiterator('dt'): - if self.repub.match(x.text): - publitext = x.getnext().text_content() - break - return unicode(publitext) - - def get_date(self, entry, verbose): - # date = entry.find("div[@id='book-info']/dl[@title='Informations sur 
le livre']") - date = entry - d = '' - for x in date.getiterator('dt'): - if x.text == 'Date de parution': - d = x.getnext().text_content() - break - if len(d) == 0: - return None - try: - default = utcnow().replace(day=15) - d = replace_monthsfr(d) - d = parse_date(d, assume_utc=True, default=default) - except: - report(verbose) - d = None - return d - - def get_ISBN(self, entry): - # isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") - isbn = entry - isbntext = None - for x in isbn.getiterator('dt'): - if x.text == 'ISBN': - isbntext = x.getnext().text_content().replace('-', '') - if not check_isbn(isbntext): - return None - break - return unicode(isbntext) - - def get_language(self, entry): - # language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") - language = entry - langtext = None - for x in language.getiterator('dt'): - if x.text == 'Langue': - langtext = x.getnext().text_content() - break - return unicode(langtext) def fill_MI(self, entry, title, authors, verbose): mi = MetaInformation(title, authors) @@ -334,6 +283,7 @@ class ResultList(list): for x in entries: try: entry = self.get_individual_metadata(browser, x, verbose) + entry = entry.find("div[@id='book-info']") title = self.get_title(entry) authors = self.get_authors(entry) except Exception, e: From 3490c73ad93fa9bd55fd0d9ed513ded5eb6ea1c9 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 11:10:21 +0100 Subject: [PATCH 041/132] Optimisation of nicebooks covers --- src/calibre/ebooks/metadata/nicebooks.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 9a06bad998..51858e4b77 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -266,7 +266,7 @@ class ResultList(list): def populate(self, entries, browser, verbose=False): #single entry 
- if len(entries) ==1: + if len(entries) == 1 and not isinstance(entries[0], str): try: entry = entries[0].xpath("//div[@id='container']")[0] entry = entry.find("div[@id='book-info']") @@ -314,25 +314,20 @@ class Covers(object): except: return self isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']") - isbntext = None for x in isbno.getiterator('dt'): - if x.text == 'ISBN': - isbntext = x.getnext().text_content() + if x.text == 'ISBN' and check_isbn(x.getnext().text_content()): + self.isbnf = True break - if isbntext is not None: - self.isbnf = True return self def check_cover(self): - if self.urlimg: - return True - else: - return False + return True if self.urlimg else False def get_cover(self, browser, timeout = 5.): try: - return browser.open_novisit(self.urlimg, timeout=timeout).read(), \ + cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \ self.urlimg.rpartition('.')[-1] + return cover, ext if ext else 'jpg' except Exception, err: if isinstance(getattr(err, 'args', [None])[0], socket.timeout): err = NiceBooksError(_('Nicebooks timed out. 
Try again later.')) @@ -417,8 +412,6 @@ def main(args=sys.argv): print textcover elif covact == 2: cover_data, ext = cover_from_isbn(result.isbn) - if not ext: - ext = 'jpg' cpath = result.isbn if len(opts.coverspath): cpath = os.path.normpath(opts.coverspath + '/' + result.isbn) From 251cde290283b6b2f29fed61ad638a9c5a504e72 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 7 Dec 2010 21:40:34 +0100 Subject: [PATCH 042/132] Remove unecessary check --- src/calibre/gui2/dialogs/metadata_single.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 3205b1d23c..eb9ae71397 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -781,7 +781,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if book.series_index is not None: self.series_index.setValue(book.series_index) if book.has_cover: - if d.opt_auto_download_cover.isChecked() and book.has_cover: + if d.opt_auto_download_cover.isChecked(): self.fetch_cover() else: self.fetch_cover_button.setFocus(Qt.OtherFocusReason) From 824f8b5a67fc354e1dd9dad7dc8dd1c183275295 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 7 Dec 2010 21:43:09 +0100 Subject: [PATCH 043/132] Use clean_ascii_chars in txt/processor --- src/calibre/ebooks/txt/processor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index dac1e34df7..50d8419110 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,6 +9,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.utils.cleantext import clean_ascii_chars __license__ = 'GPL v3' __copyright__ = '2009, John 
Schember <john@nachtimwald.com>' @@ -31,10 +32,8 @@ def convert_basic(txt, title='', epub_split_size_kb=0): txt = re.sub('(?<=.)\s+$', '', txt) # Remove excessive line breaks. txt = re.sub('\n{3,}', '\n\n', txt) - #remove ASCII invalid chars : 0 to 8 and 11-14 to 24 - chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) - illegal_chars = re.compile(u'|'.join(map(unichr, chars))) - txt = illegal_chars.sub('', txt) + #remove ASCII invalid chars + txt = clean_ascii_chars(txt) #Takes care if there is no point to split if epub_split_size_kb > 0: if isinstance(txt, unicode): From da4cdeb1d1763ef7b4fdf19a5538ba0439b5d97f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 7 Dec 2010 21:50:10 +0100 Subject: [PATCH 044/132] Introduce fictionwise as a disabled plugin --- src/calibre/customize/builtins.py | 5 +++-- src/calibre/customize/ui.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 5f3aab142e..06da355d6a 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -484,6 +484,7 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers +from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX @@ -491,8 +492,8 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, + LibraryThing, DoubanBooks, NiceBooks, 
Fictionwise, CSV_XML, EPUB_MOBI, BIBTEX, + Unmanifested, Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers] plugins += [ ComicInput, diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index c360122842..2c9daed994 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -120,7 +120,8 @@ def enable_plugin(plugin_or_name): config['enabled_plugins'] = ep default_disabled_plugins = set([ - 'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers' + 'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers', + 'Fictionwise' ]) def is_disabled(plugin): From 4d20351e8b583e883cdfa4695c987ed70fc7d6bc Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 8 Dec 2010 06:47:25 +0100 Subject: [PATCH 045/132] Add threading to nicebooks.py --- src/calibre/ebooks/metadata/nicebooks.py | 142 ++++++++++++++--------- 1 file changed, 85 insertions(+), 57 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 8914e2d985..7beececd7e 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -3,7 +3,8 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' -import sys, textwrap, re, traceback, socket +import sys, textwrap, re, traceback, socket, threading +from Queue import Queue from urllib import urlencode from math import ceil from copy import deepcopy @@ -23,7 +24,7 @@ from calibre.utils.config import OptionParser class NiceBooks(MetadataSource): name = 'Nicebooks' - description = _('Downloads metadata from french Nicebooks') + description = _('Downloads metadata from French Nicebooks') supported_platforms = ['windows', 'osx', 'linux'] author = 'Sengian' version = (1, 0, 0) @@ -78,10 +79,50 @@ class NiceBooksError(Exception): class ISBNNotFound(NiceBooksError): pass +class BrowserThread(threading.Thread): + + def 
__init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): + self.url = url + self.ex = ex + self.name = name + self.verbose = verbose + self.timeout = timeout + self.result = None + threading.Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + try: + raw = browser().open_novisit(self.url, timeout=self.timeout).read() + except Exception, e: + report(self.verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + self.result = None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise self.ex(_('%s timed out. Try again later.') % self.name) + raise self.ex(_('%s encountered an error.') % self.name) + if '<title>404 - ' in raw: + report(self.verbose) + self.result = None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + self.result = soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + self.result = soupparser.fromstring(clean_ascii_chars(raw)) + except: + self.result = None + def report(verbose): if verbose: traceback.print_exc() + class Query(object): BASE_URL = 'http://fr.nicebooks.com/' @@ -224,68 +265,53 @@ class ResultList(list): report(verbose) return mi - def fill_MI(self, entry, title, authors, verbose): + def fill_MI(self, data, verbose): + '''create and return an mi if possible, None otherwise''' + try: + entry = data.xpath("//div[@id='container']/div[@id='book-info']")[0] + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return None mi = MetaInformation(title, authors) mi.author_sort = authors_to_sort_string(authors) mi.comments = self.get_description(entry, verbose) return self.get_book_info(entry, mi, verbose) - def get_individual_metadata(self, browser, linkdata, verbose): - try: - raw = browser.open_novisit(self.BASE_URL + linkdata).read() - except Exception, e: - report(verbose) 
- if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise NiceBooksError(_('Nicebooks timed out. Try again later.')) - raise NiceBooksError(_('Nicebooks encountered an error.')) - if '<title>404 - ' in raw: - report(verbose) - return - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - feed = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - feed = soupparser.fromstring(clean_ascii_chars(raw)) - except: - return None + def producer(self, q, data, verbose=False): + for x in data: + thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=NiceBooksError, + name='Nicebooks') + thread.start() + q.put(thread, True) - # get results - return feed.xpath("//div[@id='container']")[0] + def consumer(self, q, total_entries, verbose=False): + while len(self) < total_entries: + thread = q.get(True) + thread.join() + mi, order = thread.get_result() + if mi is None: + self.append(None) + self.append(self.fill_MI(mi, verbose)) - def populate(self, entries, browser, verbose=False): - #single entry + def populate(self, entries, verbose=False, brcall=3): if len(entries) == 1 and not isinstance(entries[0], str): - try: - entry = entries[0].xpath("//div[@id='container']")[0] - entry = entry.find("div[@id='book-info']") - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - return - self.append(self.fill_MI(entry, title, authors, verbose)) + #single entry + mi = self.fill_MI(entries[0], verbose) + if mi: + self.append(mi) else: - #multiple entries - for x in entries: - try: - entry = self.get_individual_metadata(browser, x, verbose) - entry = entry.find("div[@id='book-info']") - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an 
entry' - print e - continue - self.append(self.fill_MI(entry, title, authors, verbose)) + #multiple entries + q = Queue(brcall) + prod_thread = threading.Thread(target=self.producer, args=(q, entries, verbose)) + cons_thread = threading.Thread(target=self.consumer, args=(q, len(entries), verbose)) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() class Covers(object): @@ -328,14 +354,14 @@ def search(title=None, author=None, publisher=None, isbn=None, max_results=5, verbose=False, keywords=None): br = browser() entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, - keywords=keywords, max_results=max_results)(br, verbose,timeout = 10.) + keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.) if entries is None or len(entries) == 0: return None #List of entry ans = ResultList() - ans.populate(entries, br, verbose) + ans.populate(entries, verbose) return ans def check_for_cover(isbn): @@ -409,3 +435,5 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) + +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\nicebooks.py" -m 5 -a mankel >data.html \ No newline at end of file From 1610a739afb09ccb9d211234eafec5e635daf532 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 8 Dec 2010 20:47:47 +0100 Subject: [PATCH 046/132] Threading in fictionwise and some cleanup --- src/calibre/ebooks/metadata/fictionwise.py | 160 ++++++++++++--------- src/calibre/ebooks/metadata/nicebooks.py | 6 +- src/calibre/utils/cleantext.py | 32 ++++- 3 files changed, 127 insertions(+), 71 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 1d6aceecdd..a06516c7dc 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -4,6 +4,7 @@ __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' import sys, 
textwrap, re, traceback, socket +from threading import Thread from Queue import Queue from urllib import urlencode @@ -17,7 +18,7 @@ from calibre.library.comments import sanitize_comments_html from calibre.ebooks.metadata.fetch import MetadataSource from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow -from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.cleantext import clean_ascii_chars, unescape class Fictionwise(MetadataSource): # {{{ @@ -40,7 +41,45 @@ class Fictionwise(MetadataSource): # {{{ class FictionwiseError(Exception): pass - +class BrowserThread(Thread): + + def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): + self.url = url + self.ex = ex + self.plugname = name + self.verbose = verbose + self.timeout = timeout + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + try: + raw = browser().open_novisit(self.url, timeout=self.timeout).read() + except Exception, e: + report(self.verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + self.result = None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise self.ex(_('%s timed out. 
Try again later.') % self.plugname) + raise self.ex(_('%s encountered an error.') % self.plugname) + if '<title>404 - ' in raw: + report(self.verbose) + self.result = None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + self.result = soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + self.result = soupparser.fromstring(clean_ascii_chars(raw)) + except: + self.result = None + def report(verbose): if verbose: @@ -180,10 +219,13 @@ class ResultList(list): for elt in elts: elt.drop_tree() - def output_entry(self, entry, prettyout = True, htmlrm="\d+"): + def output_entry(self, entry, prettyout = True, rmhtmlchar=True): out = tostring(entry, pretty_print=prettyout) - #try to work around tostring to remove this encoding for exemle - reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)') + #remove html chars + if rmhtmlchar: + out = unescape(out, rm=True) + # Remove \n\t\r. + reclean = re.compile('(\n+|\t+|\r+)') return reclean.sub('', out) def get_title(self, entry): @@ -211,7 +253,7 @@ class ResultList(list): return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues())) def get_description(self, entry): - description = self.output_entry(entry.xpath('./p')[1],htmlrm="") + description = self.output_entry(entry.xpath('./p')[1],rmhtmlchar=False) description = self.redesc.search(description) if not description or not description.group("desc"): return None @@ -265,9 +307,24 @@ class ResultList(list): isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))] return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] - def fill_MI(self, entry, title, authors, ratings, verbose): + def fill_MI(self, data, verbose): + inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False, + 'ul': False, 'span': False} + inv_xpath =('./table',) + try: + entry = data.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0] + self.clean_entry(entry, 
invalid_tags=inv_tags, invalid_xpath=inv_xpath) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + return None mi = MetaInformation(title, authors) - mi.rating = ratings + ratings = entry.xpath("./p/table") + if len(ratings) >= 2: + mi.rating = self.get_rating(ratings[1], verbose) mi.comments = self.get_description(entry) mi.publisher = self.get_publisher(entry) mi.tags = self.get_tags(entry) @@ -276,67 +333,36 @@ class ResultList(list): mi.author_sort = authors_to_sort_string(authors) return mi - def get_individual_metadata(self, browser, linkdata, verbose): - try: - raw = browser.open_novisit(self.BASE_URL + linkdata).read() - except Exception, e: - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise FictionwiseError(_('Fictionwise timed out. Try again later.')) - raise FictionwiseError(_('Fictionwise encountered an error.')) - if '<title>404 - ' in raw: - report(verbose) - return - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - return soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - return None + def producer(self, q, data, verbose=False): + for x in data: + thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=FictionwiseError, + name='Fictionwise') + thread.start() + q.put(thread, True) - def populate(self, entries, browser, verbose=False): - inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False, - 'ul': False, 'span': False} - inv_xpath =('./table',) - #single entry + def consumer(self, q, total_entries, verbose=False): + while len(self) < total_entries: + thread = q.get(True) + thread.join() + mi = thread.get_result() + if mi is None: + self.append(None) + else: + 
self.append(self.fill_MI(mi, verbose)) + + def populate(self, entries, verbose=False, brcall=3): if len(entries) == 1 and not isinstance(entries[0], str): - try: - entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") - self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) - title = self.get_title(entry) - #maybe strenghten the search - ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print _('Failed to get all details for an entry') - print e - return - self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + #single entry + self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries - for x in entries: - try: - entry = self.get_individual_metadata(browser, x, verbose) - entry = entry.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0] - self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) - title = self.get_title(entry) - #maybe strenghten the search - ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print _('Failed to get all details for an entry') - print e - continue - self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + q = Queue(brcall) + prod_thread = Thread(target=self.producer, args=(q, entries, verbose)) + cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose)) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() def search(title=None, author=None, publisher=None, isbn=None, @@ -349,7 +375,7 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList() ans.populate(entries, br, verbose) - return ans + return [x for x in ans if x is not None] def option_parser(): @@ -391,3 +417,5 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) + +# calibre-debug -e "H:\Mes 
eBooks\Developpement\calibre\src\calibre\ebooks\metadata\fictionwise.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 580e645320..5bd360ed6c 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -302,9 +302,7 @@ class ResultList(list): def populate(self, entries, verbose=False, brcall=3): if len(entries) == 1 and not isinstance(entries[0], str): #single entry - mi = self.fill_MI(entries[0], verbose) - if mi: - self.append(mi) + self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries q = Queue(brcall) @@ -364,7 +362,7 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList() ans.populate(entries, verbose) - return [x for x in ans if x] + return [x for x in ans if x is not None] def check_for_cover(isbn): br = browser() diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py index b4afe7576d..a27f74529e 100644 --- a/src/calibre/utils/cleantext.py +++ b/src/calibre/utils/cleantext.py @@ -3,7 +3,8 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' -import re +import re, htmlentitydefs +from functools import partial _ascii_pat = None @@ -21,3 +22,32 @@ def clean_ascii_chars(txt, charlist=None): pat = re.compile(u'|'.join(map(unichr, charlist))) return pat.sub('', txt) +## +# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html +# Removes HTML or XML character references and entities from a text string. +# +# @param text The HTML (or XML) source text. +# @return The plain text, as a Unicode string, if necessary. 
+ +def unescape(text, rm=False, rchar=u''): + def fixup(m, rm=rm, rchar=rchar): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return unichr(int(text[3:-1], 16)) + else: + return unichr(int(text[2:-1])) + except ValueError: + pass + else: + # named entity + try: + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + except KeyError: + pass + if rm: + return rchar #replace by char + return text # leave as is + return re.sub("&#?\w+;", fixup, text) \ No newline at end of file From f766eb871c54fa249f2c0e6b71067ee9517b5de8 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 8 Dec 2010 22:50:57 +0100 Subject: [PATCH 047/132] Add threading to Amazon (still lagging like hell) --- src/calibre/ebooks/metadata/amazonfr.py | 151 +++++++++++++++------ src/calibre/ebooks/metadata/fictionwise.py | 21 +-- src/calibre/ebooks/metadata/nicebooks.py | 8 +- 3 files changed, 128 insertions(+), 52 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index 156fff3d75..6d8c2e407c 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -3,11 +3,12 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' import sys, textwrap, re, traceback +from threading import Thread +from Queue import Queue from urllib import urlencode from math import ceil -from lxml import html -from lxml.html import soupparser +from lxml.html import soupparser, tostring from calibre.utils.date import parse_date, utcnow, replace_months from calibre.utils.cleantext import clean_ascii_chars @@ -116,6 +117,48 @@ def report(verbose): if verbose: traceback.print_exc() +class AmazonError(Exception): + pass + +class BrowserThread(Thread): + + def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): + self.url = url + self.ex = ex + self.plugname = name + self.verbose = verbose + self.timeout = timeout + 
self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + try: + raw = browser().open_novisit(self.url, timeout=self.timeout).read() + except Exception, e: + report(self.verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + self.result = None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise self.ex(_('%s timed out. Try again later.') % self.plugname) + raise self.ex(_('%s encountered an error.') % self.plugname) + if '<title>404 - ' in raw: + report(self.verbose) + self.result = None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + self.result = soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + self.result = soupparser.fromstring(clean_ascii_chars(raw)) + except: + self.result = None + class Query(object): @@ -189,7 +232,7 @@ class Query(object): def __call__(self, browser, verbose, timeout = 5.): if verbose: - print 'Query:', self.urldata + print _('Query: %s') % self.urldata try: raw = browser.open_novisit(self.urldata, timeout=timeout).read() @@ -197,10 +240,12 @@ class Query(object): report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: - return - raise + return None, self.urldata + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise AmazonError(_('Amazon timed out. 
Try again later.')) + raise AmazonError(_('Amazon encountered an error.')) if '<title>404 - ' in raw: - return + return None, self.urldata raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] @@ -315,7 +360,7 @@ class ResultList(list): inv_class = ('seeAll', 'emptyClear') inv_tags ={'img': True, 'a': False} self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class) - description = html.tostring(description, method='html', encoding=unicode).strip() + description = tostring(description, method='html', encoding=unicode).strip() # remove all attributes from tags description = self.reattr.sub(r'<\1>', description) # Remove the notice about text referring to out of print editions @@ -327,7 +372,7 @@ class ResultList(list): report(verbose) return None - def get_tags(self, entry, browser, verbose): + def get_tags(self, entry, verbose): try: tags = entry.get_element_by_id('tagContentHolder') testptag = tags.find_class('see-all') @@ -338,7 +383,7 @@ class ResultList(list): if alink[0].get('class') == 'tgJsActive': continue link = self.baseurl + alink[0].get('href') - entry = self.get_individual_metadata(browser, link, verbose) + entry = self.get_individual_metadata(link, verbose) tags = entry.get_element_by_id('tagContentHolder') break tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag'] @@ -402,26 +447,41 @@ class ResultList(list): mi.rating = float(ratings[0])/float(ratings[1]) * 5 return mi - def fill_MI(self, entry, title, authors, browser, verbose): + def fill_MI(self, entry, verbose): + try: + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + print _('URL who failed: %s') % x + report(verbose) + return None mi = MetaInformation(title, authors) mi.author_sort = authors_to_sort_string(authors) - mi.comments = self.get_description(entry, verbose) - mi = self.get_book_info(entry, mi, verbose) - 
mi.tags = self.get_tags(entry, browser, verbose) + try: + mi.comments = self.get_description(entry, verbose) + mi = self.get_book_info(entry, mi, verbose) + mi.tags = self.get_tags(entry, verbose) + except: + pass return mi - def get_individual_metadata(self, browser, linkdata, verbose): + def get_individual_metadata(self, url, verbose): try: - raw = browser.open_novisit(linkdata).read() + raw = browser().open_novisit(url).read() except Exception, e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: - return - raise + return None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise AmazonError(_('Amazon timed out. Try again later.')) + raise AmazonError(_('Amazon encountered an error.')) if '<title>404 - ' in raw: report(verbose) - return + return None raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: @@ -432,27 +492,34 @@ class ResultList(list): return soupparser.fromstring(clean_ascii_chars(raw)) except: report(verbose) - return + return None - def populate(self, entries, browser, verbose=False): - for x in entries: - try: - entry = self.get_individual_metadata(browser, x, verbose) - # clean results - # inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop') - # inv_class = ('buyingDetailsGrid', 'productImageGrid') - # inv_tags ={'script': True, 'style': True, 'form': False} - # self.clean_entry(entry, invalid_id=inv_ids) - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - print 'URL who failed:', x - report(verbose) - continue - self.append(self.fill_MI(entry, title, authors, browser, verbose)) + def producer(self, q, data, verbose=False): + for x in data: + thread = BrowserThread(x, verbose=verbose, ex=AmazonError, + name='Amazon') + thread.start() + q.put(thread, True) + + def consumer(self, q, total_entries, verbose=False): + 
while len(self) < total_entries: + thread = q.get(True) + thread.join() + mi = thread.get_result() + if mi is None: + self.append(None) + else: + self.append(self.fill_MI(mi, verbose)) + + def populate(self, entries, verbose=False, brcall=5): + #multiple entries + q = Queue(brcall) + prod_thread = Thread(target=self.producer, args=(q, entries, verbose)) + cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose)) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() def search(title=None, author=None, publisher=None, isbn=None, @@ -466,8 +533,8 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList(baseurl, lang) - ans.populate(entries, br, verbose) - return ans + ans.populate(entries, verbose) + return [x for x in ans if x is not None] def option_parser(): parser = OptionParser(textwrap.dedent(\ @@ -506,7 +573,7 @@ def main(args=sys.argv): parser.print_help() return 1 if results is None or len(results) == 0: - print 'No result found for this search!' 
+ print _('No result found for this search!') return 0 for result in results: print unicode(result).encode(preferred_encoding, 'replace') @@ -514,3 +581,5 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) + +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index a06516c7dc..892e286810 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -80,11 +80,11 @@ class BrowserThread(Thread): except: self.result = None - def report(verbose): if verbose: traceback.print_exc() + class Query(object): BASE_URL = 'http://www.fictionwise.com/servlet/mw' @@ -322,15 +322,18 @@ class ResultList(list): print e return None mi = MetaInformation(title, authors) - ratings = entry.xpath("./p/table") - if len(ratings) >= 2: - mi.rating = self.get_rating(ratings[1], verbose) - mi.comments = self.get_description(entry) - mi.publisher = self.get_publisher(entry) - mi.tags = self.get_tags(entry) - mi.pubdate = self.get_date(entry, verbose) - mi.isbn = self.get_ISBN(entry) mi.author_sort = authors_to_sort_string(authors) + try: + ratings = entry.xpath("./p/table") + if len(ratings) >= 2: + mi.rating = self.get_rating(ratings[1], verbose) + mi.comments = self.get_description(entry) + mi.publisher = self.get_publisher(entry) + mi.tags = self.get_tags(entry) + mi.pubdate = self.get_date(entry, verbose) + mi.isbn = self.get_ISBN(entry) + except: + pass return mi def producer(self, q, data, verbose=False): diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 5bd360ed6c..8911b31c08 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -279,8 +279,12 @@ class ResultList(list): return None mi = MetaInformation(title, authors) 
mi.author_sort = authors_to_sort_string(authors) - mi.comments = self.get_description(entry, verbose) - return self.get_book_info(entry, mi, verbose) + try: + mi.comments = self.get_description(entry, verbose) + mi = self.get_book_info(entry, mi, verbose) + except: + pass + return mi def producer(self, q, data, verbose=False): for x in data: From 8f7bc53128ca3b1d1b9e7fb4e607b4610f527a62 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 9 Dec 2010 00:13:08 +0100 Subject: [PATCH 048/132] Improve speed: first minimization of browser creation calls --- src/calibre/ebooks/metadata/amazonfr.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index 6d8c2e407c..eaab7001b7 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -129,14 +129,15 @@ class BrowserThread(Thread): self.verbose = verbose self.timeout = timeout self.result = None + self.br = browser() Thread.__init__(self) def get_result(self): - return self.result + return self.result, self.br def run(self): try: - raw = browser().open_novisit(self.url, timeout=self.timeout).read() + raw = self.br.open_novisit(self.url, timeout=self.timeout).read() except Exception, e: report(self.verbose) if callable(getattr(e, 'getcode', None)) and \ @@ -447,7 +448,7 @@ class ResultList(list): mi.rating = float(ratings[0])/float(ratings[1]) * 5 return mi - def fill_MI(self, entry, verbose): + def fill_MI(self, entry, br, verbose): try: title = self.get_title(entry) authors = self.get_authors(entry) @@ -463,14 +464,14 @@ class ResultList(list): try: mi.comments = self.get_description(entry, verbose) mi = self.get_book_info(entry, mi, verbose) - mi.tags = self.get_tags(entry, verbose) + mi.tags = self.get_tags(entry, br, verbose) except: pass return mi - def get_individual_metadata(self, url, verbose): + def get_individual_metadata(self, url, br, 
verbose): try: - raw = browser().open_novisit(url).read() + raw = br.open_novisit(url).read() except Exception, e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ @@ -505,11 +506,11 @@ class ResultList(list): while len(self) < total_entries: thread = q.get(True) thread.join() - mi = thread.get_result() + mi, br = thread.get_result() if mi is None: self.append(None) else: - self.append(self.fill_MI(mi, verbose)) + self.append(self.fill_MI(mi, br, verbose)) def populate(self, entries, verbose=False, brcall=5): #multiple entries @@ -581,5 +582,8 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) + # import cProfile + # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()")) + # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp")) # calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html \ No newline at end of file From a74346498729e91e18f65b15bb536b04581f4a1e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 11 Dec 2010 13:55:31 +0100 Subject: [PATCH 049/132] Minor modifications to Nicebooks/Fictionwise --- src/calibre/ebooks/metadata/fictionwise.py | 5 +++-- src/calibre/ebooks/metadata/nicebooks.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 892e286810..efb19ca249 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -69,6 +69,7 @@ class BrowserThread(Thread): if '<title>404 - ' in raw: report(self.verbose) self.result = None + return None raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: @@ -137,12 +138,12 @@ class Query(object): report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: - return + 
return None if isinstance(getattr(e, 'args', [None])[0], socket.timeout): raise FictionwiseError(_('Fictionwise timed out. Try again later.')) raise FictionwiseError(_('Fictionwise encountered an error.')) if '<title>404 - ' in raw: - return + return None raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 8911b31c08..cdf915c827 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -108,6 +108,7 @@ class BrowserThread(Thread): if '<title>404 - ' in raw: report(self.verbose) self.result = None + return None raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: From 8aa50c106e0c2f0db9c8ef294fa71e94173b3d2c Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 11 Dec 2010 13:57:06 +0100 Subject: [PATCH 050/132] Amazon threading --- src/calibre/ebooks/metadata/amazonfr.py | 143 +++++++++++++----------- 1 file changed, 79 insertions(+), 64 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index eaab7001b7..96bac89690 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' import sys, textwrap, re, traceback -from threading import Thread +from threading import Thread, Lock from Queue import Queue from urllib import urlencode from math import ceil @@ -122,9 +122,12 @@ class AmazonError(Exception): class BrowserThread(Thread): - def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): + def __init__(self, url, qbr, qsync, nb, verbose=False, timeout=10., ex=Exception, name='Meta'): self.url = url self.ex = ex + self.qbr = qbr + self.qsync = qsync + self.nb = nb self.plugname = name self.verbose = verbose self.timeout = timeout @@ -133,10 
+136,11 @@ class BrowserThread(Thread): Thread.__init__(self) def get_result(self): - return self.result, self.br + return self.result def run(self): try: + browser = self.qbr.get(True) raw = self.br.open_novisit(self.url, timeout=self.timeout).read() except Exception, e: report(self.verbose) @@ -146,9 +150,13 @@ class BrowserThread(Thread): if isinstance(getattr(e, 'args', [None])[0], socket.timeout): raise self.ex(_('%s timed out. Try again later.') % self.plugname) raise self.ex(_('%s encountered an error.') % self.plugname) + finally: + self.qbr.put(browser, True) + if '<title>404 - ' in raw: report(self.verbose) self.result = None + return None raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: @@ -159,6 +167,8 @@ class BrowserThread(Thread): self.result = soupparser.fromstring(clean_ascii_chars(raw)) except: self.result = None + finally: + self.qsync.put(self.nb, True) class Query(object): @@ -174,7 +184,7 @@ class Query(object): assert (max_results < 21) self.max_results = int(max_results) - self.renbres = re.compile(u'\s*(\d+)\s*') + self.renbres = re.compile(u'\s*([0-9.,]+)\s*') q = { 'search-alias' : 'stripbooks' , 'unfiltered' : '1', @@ -262,6 +272,7 @@ class Query(object): #nb of page try: nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text) + nbresults = [re.sub(r'[.,]', '', x) for x in nbresults] except: return None, self.urldata @@ -294,11 +305,14 @@ class Query(object): for i in x.xpath("//a/span[@class='srTitle']")]) return results[:self.max_results], self.baseurl -class ResultList(list): +class ResultList(object): def __init__(self, baseurl, lang = 'all'): self.baseurl = baseurl self.lang = lang + self.thread = [] + self.res = [] + self.nbtag = 0 self.repub = re.compile(u'\((.*)\)') self.rerat = re.compile(u'([0-9.]+)') self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') @@ -383,15 +397,12 @@ class ResultList(list): if alink: if alink[0].get('class') == 'tgJsActive': continue - link = 
self.baseurl + alink[0].get('href') - entry = self.get_individual_metadata(link, verbose) - tags = entry.get_element_by_id('tagContentHolder') - break + return self.baseurl + alink[0].get('href'), True tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag'] except: report(verbose) - tags = [] - return tags + tags = [], False + return tags, False def get_book_info(self, entry, mi, verbose): try: @@ -429,9 +440,12 @@ class ResultList(list): if check_isbn(isbn): mi.isbn = unicode(isbn) elif len(elt) > 1: - isbn = elt[1].find('b').tail.replace('-', '').strip() - if check_isbn(isbn): - mi.isbn = unicode(isbn) + isbnone = elt[1].find('b').tail.replace('-', '').strip() + if check_isbn(isbnone): + mi.isbn = unicode(isbnone) + else: + #assume ASIN-> find a check for asin + mi.isbn = unicode(isbn) #Langue elt = filter(lambda x: self.relang.search(x.find('b').text), elts) if elt: @@ -448,7 +462,7 @@ class ResultList(list): mi.rating = float(ratings[0])/float(ratings[1]) * 5 return mi - def fill_MI(self, entry, br, verbose): + def fill_MI(self, entry, verbose): try: title = self.get_title(entry) authors = self.get_authors(entry) @@ -464,63 +478,65 @@ class ResultList(list): try: mi.comments = self.get_description(entry, verbose) mi = self.get_book_info(entry, mi, verbose) - mi.tags = self.get_tags(entry, br, verbose) except: pass return mi - def get_individual_metadata(self, url, br, verbose): - try: - raw = br.open_novisit(url).read() - except Exception, e: - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise AmazonError(_('Amazon timed out. 
Try again later.')) - raise AmazonError(_('Amazon encountered an error.')) - if '<title>404 - ' in raw: - report(verbose) - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - return soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - report(verbose) - return None - - def producer(self, q, data, verbose=False): - for x in data: - thread = BrowserThread(x, verbose=verbose, ex=AmazonError, + def producer(self, sync, data, br, verbose=False): + for i in xrange(len(data)): + thread = BrowserThread(data[i], br, sync, i, verbose=verbose, ex=AmazonError, name='Amazon') thread.start() - q.put(thread, True) + self.thread.append(thread) - def consumer(self, q, total_entries, verbose=False): - while len(self) < total_entries: - thread = q.get(True) - thread.join() - mi, br = thread.get_result() - if mi is None: - self.append(None) - else: - self.append(self.fill_MI(mi, br, verbose)) + def consumer(self, sync, syncbis, br, total_entries, verbose=False): + i=0 + while i < total_entries: + nb = int(sync.get(True)) + entry = self.thread[nb].get_result() + i+=1 + if entry is not None: + mi = self.fill_MI(entry, verbose) + if mi is not None: + mi.tags, atag = self.get_tags(entry, verbose) + self.res[nb] = mi + if atag: + threadbis = BrowserThread(mi.tags, br, syncbis, nb, verbose=verbose, ex=AmazonError, + name='Amazon') + self.thread[nb] = threadbis + self.nbtag +=1 + threadbis.start() - def populate(self, entries, verbose=False, brcall=5): + def populate(self, entries, ibr, verbose=False, brcall=3): #multiple entries - q = Queue(brcall) - prod_thread = Thread(target=self.producer, args=(q, entries, verbose)) - cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose)) + br = Queue(brcall) + cbr = Queue(brcall-1) + + syncp = Queue(1) + syncc = Queue(len(entries)) + + for i in xrange(brcall-1): + br.put(browser(), True) + 
cbr.put(browser(), True) + br.put(ibr, True) + + self.res = [None]*len(entries) + + prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) + cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose)) prod_thread.start() cons_thread.start() prod_thread.join() cons_thread.join() + + #finish processing + for i in xrange(self.nbtag): + nb = int(syncc.get(True)) + tags = self.thread[nb].get_result() + if tags is not None: + self.res[nb].tags = self.get_tags(tags, verbose)[0] + + return self.res def search(title=None, author=None, publisher=None, isbn=None, @@ -534,8 +550,7 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList(baseurl, lang) - ans.populate(entries, verbose) - return [x for x in ans if x is not None] + return [x for x in ans.populate(entries, br, verbose) if x is not None] def option_parser(): parser = OptionParser(textwrap.dedent(\ @@ -581,9 +596,9 @@ def main(args=sys.argv): print if __name__ == '__main__': - sys.exit(main()) - # import cProfile + # sys.exit(main()) + import cProfile # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()")) - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp")) + sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp_threading_1")) # calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html \ No newline at end of file From 34c6caeeecfa2ea5d6f934c2e79e057155351854 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 11 Dec 2010 18:22:33 +0100 Subject: [PATCH 051/132] Remove threading (no gain) --- src/calibre/ebooks/metadata/amazonfr.py | 150 ++++++------------------ 1 file changed, 38 insertions(+), 112 deletions(-) diff --git 
a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py index 96bac89690..3842977654 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonfr.py @@ -2,9 +2,7 @@ from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' -import sys, textwrap, re, traceback -from threading import Thread, Lock -from Queue import Queue +import sys, textwrap, re, traceback, socket from urllib import urlencode from math import ceil @@ -108,10 +106,6 @@ class Amazon(MetadataSource): self.exception = e self.tb = traceback.format_exc() - # @property - # def string_customization_help(self): - # return _('You can select here the language for metadata search with amazon.com') - def report(verbose): if verbose: @@ -120,56 +114,6 @@ def report(verbose): class AmazonError(Exception): pass -class BrowserThread(Thread): - - def __init__(self, url, qbr, qsync, nb, verbose=False, timeout=10., ex=Exception, name='Meta'): - self.url = url - self.ex = ex - self.qbr = qbr - self.qsync = qsync - self.nb = nb - self.plugname = name - self.verbose = verbose - self.timeout = timeout - self.result = None - self.br = browser() - Thread.__init__(self) - - def get_result(self): - return self.result - - def run(self): - try: - browser = self.qbr.get(True) - raw = self.br.open_novisit(self.url, timeout=self.timeout).read() - except Exception, e: - report(self.verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - self.result = None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise self.ex(_('%s timed out. 
Try again later.') % self.plugname) - raise self.ex(_('%s encountered an error.') % self.plugname) - finally: - self.qbr.put(browser, True) - - if '<title>404 - ' in raw: - report(self.verbose) - self.result = None - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - self.result = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - self.result = soupparser.fromstring(clean_ascii_chars(raw)) - except: - self.result = None - finally: - self.qsync.put(self.nb, True) - class Query(object): @@ -305,14 +249,11 @@ class Query(object): for i in x.xpath("//a/span[@class='srTitle']")]) return results[:self.max_results], self.baseurl -class ResultList(object): +class ResultList(list): def __init__(self, baseurl, lang = 'all'): self.baseurl = baseurl self.lang = lang - self.thread = [] - self.res = [] - self.nbtag = 0 self.repub = re.compile(u'\((.*)\)') self.rerat = re.compile(u'([0-9.]+)') self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') @@ -482,61 +423,45 @@ class ResultList(object): pass return mi - def producer(self, sync, data, br, verbose=False): - for i in xrange(len(data)): - thread = BrowserThread(data[i], br, sync, i, verbose=verbose, ex=AmazonError, - name='Amazon') - thread.start() - self.thread.append(thread) + def get_individual_metadata(self, url, br, verbose): + try: + raw = br.open_novisit(url).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise AmazonError(_('Amazon timed out. 
Try again later.')) + raise AmazonError(_('Amazon encountered an error.')) + if '<title>404 - ' in raw: + report(verbose) + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_chars(raw)) + except: + report(verbose) + return None - def consumer(self, sync, syncbis, br, total_entries, verbose=False): - i=0 - while i < total_entries: - nb = int(sync.get(True)) - entry = self.thread[nb].get_result() - i+=1 + def populate(self, entries, br, verbose=False): + #multiple entries + for x in entries: + entry = self.get_individual_metadata(x, br, verbose) if entry is not None: mi = self.fill_MI(entry, verbose) if mi is not None: mi.tags, atag = self.get_tags(entry, verbose) - self.res[nb] = mi if atag: - threadbis = BrowserThread(mi.tags, br, syncbis, nb, verbose=verbose, ex=AmazonError, - name='Amazon') - self.thread[nb] = threadbis - self.nbtag +=1 - threadbis.start() - - def populate(self, entries, ibr, verbose=False, brcall=3): - #multiple entries - br = Queue(brcall) - cbr = Queue(brcall-1) - - syncp = Queue(1) - syncc = Queue(len(entries)) - - for i in xrange(brcall-1): - br.put(browser(), True) - cbr.put(browser(), True) - br.put(ibr, True) - - self.res = [None]*len(entries) - - prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) - cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose)) - prod_thread.start() - cons_thread.start() - prod_thread.join() - cons_thread.join() - - #finish processing - for i in xrange(self.nbtag): - nb = int(syncc.get(True)) - tags = self.thread[nb].get_result() - if tags is not None: - self.res[nb].tags = self.get_tags(tags, verbose)[0] - - return self.res + tags = self.get_individual_metadata(mi.tags, br, verbose) + if tags is not None: + mi.tags = self.get_tags(tags, verbose)[0] + self.append(mi) def 
search(title=None, author=None, publisher=None, isbn=None, @@ -550,7 +475,8 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList(baseurl, lang) - return [x for x in ans.populate(entries, br, verbose) if x is not None] + ans.populate(entries, br, verbose) + return [x for x in ans if x is not None] def option_parser(): parser = OptionParser(textwrap.dedent(\ @@ -599,6 +525,6 @@ if __name__ == '__main__': # sys.exit(main()) import cProfile # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()")) - sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp_threading_1")) + sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp_2")) # calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html \ No newline at end of file From d5bc18b5c2b3ab0bb2dfa86e65191b8ccf4c7a67 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 11 Dec 2010 22:07:35 +0100 Subject: [PATCH 052/132] Modify amazon to get social meta and split in 2 plugins --- src/calibre/customize/builtins.py | 6 +- .../metadata/{amazonfr.py => amazonbis.py} | 207 +++++++++++------- src/calibre/ebooks/metadata/fetch.py | 30 +-- 3 files changed, 143 insertions(+), 100 deletions(-) rename src/calibre/ebooks/metadata/{amazonfr.py => amazonbis.py} (76%) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 06da355d6a..4798c46516 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -480,10 +480,10 @@ from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO -from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, 
Amazon, \ - LibraryThing +from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, LibraryThing from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers +from calibre.ebooks.metadata.amazonbis import Amazon, AmazonSocial from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers @@ -491,7 +491,7 @@ from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck -plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, +plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, AmazonSocial, LibraryThing, DoubanBooks, NiceBooks, Fictionwise, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers] diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonbis.py similarity index 76% rename from src/calibre/ebooks/metadata/amazonfr.py rename to src/calibre/ebooks/metadata/amazonbis.py index 3842977654..a94883b003 100644 --- a/src/calibre/ebooks/metadata/amazonfr.py +++ b/src/calibre/ebooks/metadata/amazonbis.py @@ -19,73 +19,56 @@ from calibre.utils.config import OptionParser from calibre.library.comments import sanitize_comments_html -class AmazonFr(MetadataSource): +# class AmazonFr(MetadataSource): - name = 'Amazon French' - description = _('Downloads metadata from amazon.fr') - supported_platforms = ['windows', 'osx', 'linux'] - author = 'Sengian' - version = (1, 0, 0) - has_html_comments = True + # name = 'Amazon French' + # description = _('Downloads metadata from amazon.fr') + # supported_platforms = ['windows', 'osx', 'linux'] + # author = 'Sengian' + # version = (1, 0, 0) + # has_html_comments = True - def fetch(self): - try: - self.results = 
search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose, lang='fr') - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() + # def fetch(self): + # try: + # self.results = search(self.title, self.book_author, self.publisher, + # self.isbn, max_results=10, verbose=self.verbose, lang='fr') + # except Exception, e: + # self.exception = e + # self.tb = traceback.format_exc() -class AmazonEs(MetadataSource): +# class AmazonEs(MetadataSource): - name = 'Amazon Spanish' - description = _('Downloads metadata from amazon.com in spanish') - supported_platforms = ['windows', 'osx', 'linux'] - author = 'Sengian' - version = (1, 0, 0) - has_html_comments = True + # name = 'Amazon Spanish' + # description = _('Downloads metadata from amazon.com in spanish') + # supported_platforms = ['windows', 'osx', 'linux'] + # author = 'Sengian' + # version = (1, 0, 0) + # has_html_comments = True - def fetch(self): - try: - self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose, lang='es') - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() + # def fetch(self): + # try: + # self.results = search(self.title, self.book_author, self.publisher, + # self.isbn, max_results=10, verbose=self.verbose, lang='es') + # except Exception, e: + # self.exception = e + # self.tb = traceback.format_exc() -class AmazonEn(MetadataSource): +# class AmazonDe(MetadataSource): - name = 'Amazon English' - description = _('Downloads metadata from amazon.com in english') - supported_platforms = ['windows', 'osx', 'linux'] - author = 'Sengian' - version = (1, 0, 0) - has_html_comments = True + # name = 'Amazon German' + # description = _('Downloads metadata from amazon.de') + # supported_platforms = ['windows', 'osx', 'linux'] + # author = 'Sengian' + # version = (1, 0, 0) + # has_html_comments = True - def fetch(self): - try: - self.results = 
search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose, lang='en') - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() - -class AmazonDe(MetadataSource): - - name = 'Amazon German' - description = _('Downloads metadata from amazon.de') - supported_platforms = ['windows', 'osx', 'linux'] - author = 'Sengian' - version = (1, 0, 0) - has_html_comments = True - - def fetch(self): - try: - self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose, lang='de') - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() + # def fetch(self): + # try: + # self.results = search(self.title, self.book_author, self.publisher, + # self.isbn, max_results=10, verbose=self.verbose, lang='de') + # except Exception, e: + # self.exception = e + # self.tb = traceback.format_exc() class Amazon(MetadataSource): @@ -93,15 +76,31 @@ class Amazon(MetadataSource): description = _('Downloads metadata from amazon.com') supported_platforms = ['windows', 'osx', 'linux'] author = 'Kovid Goyal & Sengian' - version = (1, 1, 0) + version = (1, 0, 0) has_html_comments = True def fetch(self): - # if not self.site_customization: - # return try: self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose, lang='all') + self.isbn, max_results=5, verbose=self.verbose, lang='all') + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class AmazonSocial(MetadataSource): + + name = 'AmazonSocial' + metadata_type = 'social' + description = _('Downloads social metadata from amazon.com') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Kovid Goyal & Sengian' + version = (1, 0, 1) + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=5, verbose=self.verbose, 
lang='all') except Exception, e: self.exception = e self.tb = traceback.format_exc() @@ -450,7 +449,6 @@ class ResultList(list): return None def populate(self, entries, br, verbose=False): - #multiple entries for x in entries: entry = self.get_individual_metadata(x, br, verbose) if entry is not None: @@ -471,13 +469,40 @@ def search(title=None, author=None, publisher=None, isbn=None, keywords=keywords, max_results=max_results,rlang=lang)(br, verbose) if entries is None or len(entries) == 0: - return + return None #List of entry ans = ResultList(baseurl, lang) ans.populate(entries, br, verbose) return [x for x in ans if x is not None] +def get_social_metadata(title, authors, publisher, isbn, verbose=False, + max_results=1, lang='all'): + mi = MetaInformation(title, authors) + if not isbn or not check_isbn(isbn): + return [mi] + + amazresults = search(isbn=isbn, verbose=verbose, + max_results=max_results, lang='all') + if amazresults is None or amazresults[0] is None: + from calibre.ebooks.metadata.xisbn import xisbn + for i in xisbn.get_associated_isbns(isbn): + amazresults = search(isbn=i, verbose=verbose, + max_results=max_results, lang='all') + if amazresults is not None and amazresults[0] is not None: + break + if amazresults is None or amazresults[0] is None: + return [mi] + + miaz = amazresults[0] + if miaz.rating is not None: + mi.rating = miaz.rating + if miaz.comments is not None: + mi.comments = miaz.comments + if miaz.tags is not None: + mi.tags = miaz.tags + return [mi] + def option_parser(): parser = OptionParser(textwrap.dedent(\ _('''\ @@ -490,41 +515,59 @@ def option_parser(): All & english & french & german & spanish ''' ))) - parser.add_option('-t', '--title', help='Book title') - parser.add_option('-a', '--author', help='Book author(s)') - parser.add_option('-p', '--publisher', help='Book publisher') - parser.add_option('-i', '--isbn', help='Book ISBN') - parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-t', '--title', 
help=_('Book title')) + parser.add_option('-a', '--author', help=_('Book author(s)')) + parser.add_option('-p', '--publisher', help=_('Book publisher')) + parser.add_option('-i', '--isbn', help=_('Book ISBN')) + parser.add_option('-k', '--keywords', help=_('Keywords')) + parser.add_option('-s', '--social', default=0, action='count', + help=_('Get social data only')) parser.add_option('-m', '--max-results', default=10, - help='Maximum number of results to fetch') + help=_('Maximum number of results to fetch')) parser.add_option('-l', '--lang', default='all', - help='Chosen language for metadata search (all, en, fr, es, de)') + help=_('Chosen language for metadata search (all, en, fr, es, de)')) parser.add_option('-v', '--verbose', default=0, action='count', - help='Be more verbose about errors') + help=_('Be more verbose about errors')) return parser def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) try: - results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher, - keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results, - lang=opts.lang) + if opts.social: + results = get_social_metadata(opts.title, opts.author, + opts.publisher, opts.isbn, verbose=opts.verbose, lang=opts.lang) + else: + results = search(opts.title, opts.author, isbn=opts.isbn, + publisher=opts.publisher, keywords=opts.keywords, verbose=opts.verbose, + max_results=opts.max_results, lang=opts.lang) except AssertionError: report(True) parser.print_help() return 1 - if results is None or len(results) == 0: + if results is None and len(results) == 0: print _('No result found for this search!') return 0 for result in results: print unicode(result).encode(preferred_encoding, 'replace') print + + #test social + # '''Test xisbn''' + # print get_social_metadata('Learning Python', None, None, '8324616489')[0] + # print + # '''Test sophisticated comment formatting''' + # print get_social_metadata('Angels & Demons', None, None, 
'9781416580829')[0] + # print + # '''Random tests''' + # print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')[0] + # print + # print get_social_metadata('The Great Gatsby', None, None, '0743273567')[0] if __name__ == '__main__': - # sys.exit(main()) - import cProfile - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()")) - sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp_2")) + sys.exit(main()) + # import cProfile + # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) + # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile_tmp_2")) -# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html \ No newline at end of file +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonbis.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index b797a477d6..f1bf88da84 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -226,24 +226,24 @@ class ISBNDB(MetadataSource): # {{{ # }}} -class Amazon(MetadataSource): # {{{ +# class Amazon(MetadataSource): # {{{ - name = 'Amazon' - metadata_type = 'social' - description = _('Downloads social metadata from amazon.com') + # name = 'Amazon' + # metadata_type = 'social' + # description = _('Downloads social metadata from amazon.com') - has_html_comments = True + # has_html_comments = True - def fetch(self): - if not self.isbn: - return - from calibre.ebooks.metadata.amazon import get_social_metadata - try: - self.results = get_social_metadata(self.title, self.book_author, - self.publisher, self.isbn) - except Exception, e: - 
self.exception = e - self.tb = traceback.format_exc() + # def fetch(self): + # if not self.isbn: + # return + # from calibre.ebooks.metadata.amazon import get_social_metadata + # try: + # self.results = get_social_metadata(self.title, self.book_author, + # self.publisher, self.isbn) + # except Exception, e: + # self.exception = e + # self.tb = traceback.format_exc() # }}} From b2004ad77bb3e1d7f6630f417740cc3cbd089cb1 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 11 Dec 2010 22:41:37 +0100 Subject: [PATCH 053/132] Remove threading from fictionwise --- src/calibre/ebooks/metadata/fictionwise.py | 112 +++++++-------------- 1 file changed, 38 insertions(+), 74 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index efb19ca249..418a8ca771 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -4,8 +4,6 @@ __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' import sys, textwrap, re, traceback, socket -from threading import Thread -from Queue import Queue from urllib import urlencode from lxml.html import soupparser, tostring @@ -20,7 +18,7 @@ from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow from calibre.utils.cleantext import clean_ascii_chars, unescape -class Fictionwise(MetadataSource): # {{{ +class Fictionwise(MetadataSource): author = 'Sengian' name = 'Fictionwise' @@ -36,51 +34,10 @@ class Fictionwise(MetadataSource): # {{{ self.exception = e self.tb = traceback.format_exc() - # }}} class FictionwiseError(Exception): pass -class BrowserThread(Thread): - - def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): - self.url = url - self.ex = ex - self.plugname = name - self.verbose = verbose - self.timeout = timeout - self.result = None - Thread.__init__(self) - - def get_result(self): - return self.result - - def 
run(self): - try: - raw = browser().open_novisit(self.url, timeout=self.timeout).read() - except Exception, e: - report(self.verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - self.result = None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise self.ex(_('%s timed out. Try again later.') % self.plugname) - raise self.ex(_('%s encountered an error.') % self.plugname) - if '<title>404 - ' in raw: - report(self.verbose) - self.result = None - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - self.result = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - self.result = soupparser.fromstring(clean_ascii_chars(raw)) - except: - self.result = None - def report(verbose): if verbose: traceback.print_exc() @@ -161,15 +118,16 @@ class Query(object): results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results] #return feed if no links ie normally a single book or nothing if not results: - results = [feed] - return results + return [feed], False + return results, True class ResultList(list): BASE_URL = 'http://www.fictionwise.com' COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0} - def __init__(self): + def __init__(self, islink): + self.islink = islink self.retitle = re.compile(r'\[[^\[\]]+\]') self.rechkauth = re.compile(r'.*book\s*by', re.I) self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I) @@ -337,47 +295,53 @@ class ResultList(list): pass return mi - def producer(self, q, data, verbose=False): - for x in data: - thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=FictionwiseError, - name='Fictionwise') - thread.start() - q.put(thread, True) + def get_individual_metadata(self, url, br, verbose): + try: + raw = br.open_novisit(url).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + 
e.getcode() == 404: + return None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise FictionwiseError(_('Fictionwise timed out. Try again later.')) + raise FictionwiseError(_('Fictionwise encountered an error.')) + if '<title>404 - ' in raw: + report(verbose) + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_chars(raw)) + except: + report(verbose) + return None - def consumer(self, q, total_entries, verbose=False): - while len(self) < total_entries: - thread = q.get(True) - thread.join() - mi = thread.get_result() - if mi is None: - self.append(None) - else: - self.append(self.fill_MI(mi, verbose)) - - def populate(self, entries, verbose=False, brcall=3): - if len(entries) == 1 and not isinstance(entries[0], str): + def populate(self, entries, br, verbose=False): + if not self.islink: #single entry self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries - q = Queue(brcall) - prod_thread = Thread(target=self.producer, args=(q, entries, verbose)) - cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose)) - prod_thread.start() - cons_thread.start() - prod_thread.join() - cons_thread.join() + for x in entries: + entry = self.get_individual_metadata(self.BASE_URL+x, br, verbose) + if entry is not None: + self.append(self.fill_MI(entry, verbose)) def search(title=None, author=None, publisher=None, isbn=None, min_viewability='none', verbose=False, max_results=5, keywords=None): br = browser() - entries = Query(title=title, author=author, publisher=publisher, + entries, islink = Query(title=title, author=author, publisher=publisher, keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.) 
#List of entry - ans = ResultList() + ans = ResultList(islink) ans.populate(entries, br, verbose) return [x for x in ans if x is not None] From 1d968f71b71bdbf01a7d7ef654dd953e6806a5cb Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 11 Dec 2010 23:19:25 +0100 Subject: [PATCH 054/132] Remove threading from fictionwise and nicebooks --- src/calibre/ebooks/metadata/fictionwise.py | 1 + src/calibre/ebooks/metadata/nicebooks.py | 112 ++++++++------------- 2 files changed, 41 insertions(+), 72 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 418a8ca771..914fa2b228 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -337,6 +337,7 @@ def search(title=None, author=None, publisher=None, isbn=None, min_viewability='none', verbose=False, max_results=5, keywords=None): br = browser() + islink = False entries, islink = Query(title=title, author=author, publisher=publisher, keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.) 
diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index cdf915c827..3886eae201 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -80,46 +80,6 @@ class NiceBooksError(Exception): class ISBNNotFound(NiceBooksError): pass -class BrowserThread(Thread): - - def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): - self.url = url - self.ex = ex - self.plugname = name - self.verbose = verbose - self.timeout = timeout - self.result = None - Thread.__init__(self) - - def get_result(self): - return self.result - - def run(self): - try: - raw = browser().open_novisit(self.url, timeout=self.timeout).read() - except Exception, e: - report(self.verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - self.result = None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise self.ex(_('%s timed out. Try again later.') % self.plugname) - raise self.ex(_('%s encountered an error.') % self.plugname) - if '<title>404 - ' in raw: - report(self.verbose) - self.result = None - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - self.result = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - self.result = soupparser.fromstring(clean_ascii_chars(raw)) - except: - self.result = None - def report(verbose): if verbose: traceback.print_exc() @@ -156,7 +116,7 @@ class Query(object): report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: - return + return None if isinstance(getattr(e, 'args', [None])[0], socket.timeout): raise NiceBooksError(_('Nicebooks timed out. 
Try again later.')) raise NiceBooksError(_('Nicebooks encountered an error.')) @@ -178,7 +138,7 @@ class Query(object): nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text) except: #direct hit - return [feed] + return [feed], False nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10)) pages =[feed] @@ -207,13 +167,14 @@ class Query(object): for x in pages: results.extend([i.find_class('title')[0].get('href') \ for i in x.xpath("//ul[@id='results']/li")]) - return results[:self.max_results] + return results[:self.max_results], True class ResultList(list): BASE_URL = 'http://fr.nicebooks.com' - def __init__(self): + def __init__(self, islink): + self.islink = islink self.repub = re.compile(u'\s*.diteur\s*', re.I) self.reauteur = re.compile(u'\s*auteur.*', re.I) self.reautclean = re.compile(u'\s*\(.*\)\s*') @@ -287,36 +248,42 @@ class ResultList(list): pass return mi - def producer(self, q, data, verbose=False): - for x in data: - thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=NiceBooksError, - name='Nicebooks') - thread.start() - q.put(thread, True) + def get_individual_metadata(self, url, br, verbose): + try: + raw = br.open_novisit(url).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise NiceBooksError(_('NiceBooks timed out. 
Try again later.')) + raise NiceBooksError(_('NiceBooks encountered an error.')) + if '<title>404 - ' in raw: + report(verbose) + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_chars(raw)) + except: + report(verbose) + return None - def consumer(self, q, total_entries, verbose=False): - while len(self) < total_entries: - thread = q.get(True) - thread.join() - mi = thread.get_result() - if mi is None: - self.append(None) - else: - self.append(self.fill_MI(mi, verbose)) - - def populate(self, entries, verbose=False, brcall=3): - if len(entries) == 1 and not isinstance(entries[0], str): + def populate(self, entries, br, verbose=False): + if not self.islink: #single entry self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries - q = Queue(brcall) - prod_thread = Thread(target=self.producer, args=(q, entries, verbose)) - cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose)) - prod_thread.start() - cons_thread.start() - prod_thread.join() - cons_thread.join() + for x in entries: + entry = self.get_individual_metadata(self.BASE_URL+x, br, verbose) + if entry is not None: + self.append(self.fill_MI(entry, verbose)) class Covers(object): @@ -358,15 +325,16 @@ class Covers(object): def search(title=None, author=None, publisher=None, isbn=None, max_results=5, verbose=False, keywords=None): br = browser() - entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, + islink = False + entries, islink = Query(title=title, author=author, isbn=isbn, publisher=publisher, keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.) 
if entries is None or len(entries) == 0: return None #List of entry - ans = ResultList() - ans.populate(entries, verbose) + ans = ResultList(islink) + ans.populate(entries, br, verbose) return [x for x in ans if x is not None] def check_for_cover(isbn): From 9a3933354ab261cc35fb2fc9ff8ad7a47b75b58f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 12 Dec 2010 00:23:47 +0100 Subject: [PATCH 055/132] Minor fix to amazon social --- src/calibre/ebooks/metadata/amazonbis.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonbis.py b/src/calibre/ebooks/metadata/amazonbis.py index a94883b003..f86f00b94f 100644 --- a/src/calibre/ebooks/metadata/amazonbis.py +++ b/src/calibre/ebooks/metadata/amazonbis.py @@ -98,9 +98,11 @@ class AmazonSocial(MetadataSource): has_html_comments = True def fetch(self): + if not self.isbn: + return try: - self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=5, verbose=self.verbose, lang='all') + self.results = get_social_metadata(self.title, self.book_author, self.publisher, + self.isbn, verbose=self.verbose, lang='all')[0] except Exception, e: self.exception = e self.tb = traceback.format_exc() From f5736c59316d98042266006394cc2fc8b65b0ad7 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 12 Dec 2010 00:45:08 +0100 Subject: [PATCH 056/132] ... 
--- src/calibre/ebooks/metadata/nicebooks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 3886eae201..c852a81873 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -4,8 +4,6 @@ __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' import sys, textwrap, re, traceback, socket -from threading import Thread -from Queue import Queue from urllib import urlencode from math import ceil from copy import deepcopy From ae781ae61433d57ab14ddb6d033246105f70afd1 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 12 Dec 2010 03:19:23 +0100 Subject: [PATCH 057/132] Add localisation site in amazon social (fr, de) --- src/calibre/ebooks/metadata/amazonbis.py | 90 ++++++++++++++++++------ 1 file changed, 70 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonbis.py b/src/calibre/ebooks/metadata/amazonbis.py index f86f00b94f..acd7f97c1e 100644 --- a/src/calibre/ebooks/metadata/amazonbis.py +++ b/src/calibre/ebooks/metadata/amazonbis.py @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' import sys, textwrap, re, traceback, socket +from threading import Thread from urllib import urlencode from math import ceil @@ -10,6 +11,7 @@ from lxml.html import soupparser, tostring from calibre.utils.date import parse_date, utcnow, replace_months from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.localization import get_lang from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ @@ -101,8 +103,36 @@ class AmazonSocial(MetadataSource): if not self.isbn: return try: - self.results = get_social_metadata(self.title, self.book_author, self.publisher, + lang = get_lang() + lang = lang[:2] if 
re.match(r'(fr.*|de.*)', lang) else 'all' + if lang == 'all': + self.results = get_social_metadata(self.title, self.book_author, self.publisher, self.isbn, verbose=self.verbose, lang='all')[0] + else: + tmploc = ThreadwithResults(AmazonError, self.verbose, get_social_metadata, self.title, + self.book_author, self.publisher,self.isbn, verbose=self.verbose, lang=lang) + tmpnoloc = ThreadwithResults(AmazonError, self.verbose, get_social_metadata, self.title, + self.book_author, self.publisher, self.isbn, verbose=self.verbose, lang='all') + tmploc.start() + tmpnoloc.start() + tmploc.join() + tmpnoloc.join() + tmploc= tmploc.get_result() + if tmploc is not None: + tmploc = tmploc[0] + tmpnoloc= tmpnoloc.get_result() + if tmpnoloc is not None: + tmpnoloc = tmpnoloc[0] + print tmpnoloc + + if tmploc is not None and tmpnoloc is not None: + if tmploc.rating is None: + tmploc.rating = tmpnoloc.rating + if tmploc.comments is not None: + tmploc.comments = tmpnoloc.comments + if tmploc.tags is None: + tmploc.tags = tmpnoloc.tags + self.results = tmploc except Exception, e: self.exception = e self.tb = traceback.format_exc() @@ -115,6 +145,25 @@ def report(verbose): class AmazonError(Exception): pass +class ThreadwithResults(Thread): + def __init__(self, error, verb, func, *args, **kargs): + self.func = func + self.args = args + self.kargs = kargs + self.verbose = verb + self.ex = error + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + try: + self.result = self.func(*self.args, **self.kargs) + except Exception, e: + report(self.verbose) + raise self.ex(_('An error was encountered in the function threading')) class Query(object): @@ -123,10 +172,10 @@ class Query(object): BASE_URL_DE = 'http://www.amazon.de' def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, - max_results=20, rlang='all'): + max_results=10, rlang='all'): assert not(title is None and author is None and publisher is 
None \ and isbn is None and keywords is None) - assert (max_results < 21) + assert (max_results < 11) self.max_results = int(max_results) self.renbres = re.compile(u'\s*([0-9.,]+)\s*') @@ -151,17 +200,17 @@ class Query(object): #many options available } - if rlang =='all': + if rlang =='all' or rlang =='en': q['sort'] = 'relevanceexprank' self.urldata = self.BASE_URL_ALL - elif rlang =='es': - q['sort'] = 'relevanceexprank' - q['field-language'] = 'Spanish' - self.urldata = self.BASE_URL_ALL - elif rlang =='en': - q['sort'] = 'relevanceexprank' - q['field-language'] = 'English' - self.urldata = self.BASE_URL_ALL + # elif rlang =='es': + # q['sort'] = 'relevanceexprank' + # q['field-language'] = 'Spanish' + # self.urldata = self.BASE_URL_ALL + # elif rlang =='en': + # q['sort'] = 'relevanceexprank' + # q['field-language'] = 'English' + # self.urldata = self.BASE_URL_ALL elif rlang =='fr': q['sort'] = 'relevancerank' self.urldata = self.BASE_URL_FR @@ -250,7 +299,7 @@ class Query(object): for i in x.xpath("//a/span[@class='srTitle']")]) return results[:self.max_results], self.baseurl -class ResultList(list): +class ResultList(object): def __init__(self, baseurl, lang = 'all'): self.baseurl = baseurl @@ -451,6 +500,7 @@ class ResultList(list): return None def populate(self, entries, br, verbose=False): + res = [] for x in entries: entry = self.get_individual_metadata(x, br, verbose) if entry is not None: @@ -461,7 +511,8 @@ class ResultList(list): tags = self.get_individual_metadata(mi.tags, br, verbose) if tags is not None: mi.tags = self.get_tags(tags, verbose)[0] - self.append(mi) + res.append(mi) + return res def search(title=None, author=None, publisher=None, isbn=None, @@ -475,8 +526,7 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList(baseurl, lang) - ans.populate(entries, br, verbose) - return [x for x in ans if x is not None] + return [x for x in ans.populate(entries, br, verbose) if x is not None] def 
get_social_metadata(title, authors, publisher, isbn, verbose=False, max_results=1, lang='all'): @@ -485,12 +535,12 @@ def get_social_metadata(title, authors, publisher, isbn, verbose=False, return [mi] amazresults = search(isbn=isbn, verbose=verbose, - max_results=max_results, lang='all') + max_results=max_results, lang=lang) if amazresults is None or amazresults[0] is None: from calibre.ebooks.metadata.xisbn import xisbn for i in xisbn.get_associated_isbns(isbn): amazresults = search(isbn=i, verbose=verbose, - max_results=max_results, lang='all') + max_results=max_results, lang=lang) if amazresults is not None and amazresults[0] is not None: break if amazresults is None or amazresults[0] is None: @@ -514,7 +564,7 @@ def option_parser(): ISBN, publisher or keywords. Will fetch a maximum of 10 matches, so you should make your query as specific as possible. You can chose the language for metadata retrieval: - All & english & french & german & spanish + english & french & german ''' ))) parser.add_option('-t', '--title', help=_('Book title')) @@ -527,7 +577,7 @@ def option_parser(): parser.add_option('-m', '--max-results', default=10, help=_('Maximum number of results to fetch')) parser.add_option('-l', '--lang', default='all', - help=_('Chosen language for metadata search (all, en, fr, es, de)')) + help=_('Chosen language for metadata search (en, fr, de)')) parser.add_option('-v', '--verbose', default=0, action='count', help=_('Be more verbose about errors')) return parser From 5c89b576e31b85e17cf14e85a72b1b876f87579c Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 12 Dec 2010 11:57:00 +0100 Subject: [PATCH 058/132] Fix threading in amazon --- src/calibre/ebooks/metadata/amazonbis.py | 195 +++++++++++++---------- 1 file changed, 109 insertions(+), 86 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonbis.py b/src/calibre/ebooks/metadata/amazonbis.py index acd7f97c1e..7060ca4cb5 100644 --- a/src/calibre/ebooks/metadata/amazonbis.py +++ 
b/src/calibre/ebooks/metadata/amazonbis.py @@ -4,6 +4,7 @@ __copyright__ = '2010, sengian <sengian1@gmail.com>' import sys, textwrap, re, traceback, socket from threading import Thread +from Queue import Queue from urllib import urlencode from math import ceil @@ -21,57 +22,6 @@ from calibre.utils.config import OptionParser from calibre.library.comments import sanitize_comments_html -# class AmazonFr(MetadataSource): - - # name = 'Amazon French' - # description = _('Downloads metadata from amazon.fr') - # supported_platforms = ['windows', 'osx', 'linux'] - # author = 'Sengian' - # version = (1, 0, 0) - # has_html_comments = True - - # def fetch(self): - # try: - # self.results = search(self.title, self.book_author, self.publisher, - # self.isbn, max_results=10, verbose=self.verbose, lang='fr') - # except Exception, e: - # self.exception = e - # self.tb = traceback.format_exc() - -# class AmazonEs(MetadataSource): - - # name = 'Amazon Spanish' - # description = _('Downloads metadata from amazon.com in spanish') - # supported_platforms = ['windows', 'osx', 'linux'] - # author = 'Sengian' - # version = (1, 0, 0) - # has_html_comments = True - - # def fetch(self): - # try: - # self.results = search(self.title, self.book_author, self.publisher, - # self.isbn, max_results=10, verbose=self.verbose, lang='es') - # except Exception, e: - # self.exception = e - # self.tb = traceback.format_exc() - -# class AmazonDe(MetadataSource): - - # name = 'Amazon German' - # description = _('Downloads metadata from amazon.de') - # supported_platforms = ['windows', 'osx', 'linux'] - # author = 'Sengian' - # version = (1, 0, 0) - # has_html_comments = True - - # def fetch(self): - # try: - # self.results = search(self.title, self.book_author, self.publisher, - # self.isbn, max_results=10, verbose=self.verbose, lang='de') - # except Exception, e: - # self.exception = e - # self.tb = traceback.format_exc() - class Amazon(MetadataSource): name = 'Amazon' @@ -83,8 +33,33 @@ class 
Amazon(MetadataSource): def fetch(self): try: - self.results = search(self.title, self.book_author, self.publisher, + lang = get_lang() + lang = lang[:2] if re.match(r'(fr.*|de.*)', lang) else 'all' + if lang == 'all': + self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=5, verbose=self.verbose, lang='all') + else: + tmploc = ThreadwithResults(search, self.title, self.book_author, + self.publisher,self.isbn, max_results=5, + verbose=self.verbose, lang=lang) + tmpnoloc = ThreadwithResults(search, self.title, self.book_author, + self.publisher, self.isbn, max_results=5, + verbose=self.verbose, lang='all') + tmploc.start() + tmpnoloc.start() + tmploc.join() + tmpnoloc.join() + tmploc= tmploc.get_result() + tmpnoloc= tmpnoloc.get_result() + + tempres = None + if tmpnoloc is not None: + tempres = tmpnoloc + if tmploc is not None: + tempres = tmploc + if tmpnoloc is not None: + tempres.extend(tmpnoloc) + self.results = tmpres except Exception, e: self.exception = e self.tb = traceback.format_exc() @@ -107,12 +82,12 @@ class AmazonSocial(MetadataSource): lang = lang[:2] if re.match(r'(fr.*|de.*)', lang) else 'all' if lang == 'all': self.results = get_social_metadata(self.title, self.book_author, self.publisher, - self.isbn, verbose=self.verbose, lang='all')[0] + self.isbn, verbose=self.verbose, lang='all')[0] else: - tmploc = ThreadwithResults(AmazonError, self.verbose, get_social_metadata, self.title, - self.book_author, self.publisher,self.isbn, verbose=self.verbose, lang=lang) - tmpnoloc = ThreadwithResults(AmazonError, self.verbose, get_social_metadata, self.title, - self.book_author, self.publisher, self.isbn, verbose=self.verbose, lang='all') + tmploc = ThreadwithResults(get_social_metadata, self.title, self.book_author, + self.publisher,self.isbn, verbose=self.verbose, lang=lang) + tmpnoloc = ThreadwithResults(get_social_metadata, self.title, self.book_author, + self.publisher, self.isbn, verbose=self.verbose, lang='all') 
tmploc.start() tmpnoloc.start() tmploc.join() @@ -123,15 +98,13 @@ class AmazonSocial(MetadataSource): tmpnoloc= tmpnoloc.get_result() if tmpnoloc is not None: tmpnoloc = tmpnoloc[0] - print tmpnoloc - - if tmploc is not None and tmpnoloc is not None: - if tmploc.rating is None: - tmploc.rating = tmpnoloc.rating - if tmploc.comments is not None: - tmploc.comments = tmpnoloc.comments - if tmploc.tags is None: - tmploc.tags = tmpnoloc.tags + if tmpnoloc is not None: + if tmploc.rating is None: + tmploc.rating = tmpnoloc.rating + if tmploc.comments is not None: + tmploc.comments = tmpnoloc.comments + if tmploc.tags is None: + tmploc.tags = tmpnoloc.tags self.results = tmploc except Exception, e: self.exception = e @@ -146,12 +119,10 @@ class AmazonError(Exception): pass class ThreadwithResults(Thread): - def __init__(self, error, verb, func, *args, **kargs): + def __init__(self, func, *args, **kargs): self.func = func self.args = args self.kargs = kargs - self.verbose = verb - self.ex = error self.result = None Thread.__init__(self) @@ -159,11 +130,8 @@ class ThreadwithResults(Thread): return self.result def run(self): - try: - self.result = self.func(*self.args, **self.kargs) - except Exception, e: - report(self.verbose) - raise self.ex(_('An error was encountered in the function threading')) + self.result = self.func(*self.args, **self.kargs) + class Query(object): @@ -172,10 +140,10 @@ class Query(object): BASE_URL_DE = 'http://www.amazon.de' def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, - max_results=10, rlang='all'): + max_results=20, rlang='all'): assert not(title is None and author is None and publisher is None \ and isbn is None and keywords is None) - assert (max_results < 11) + assert (max_results < 21) self.max_results = int(max_results) self.renbres = re.compile(u'\s*([0-9.,]+)\s*') @@ -304,6 +272,9 @@ class ResultList(object): def __init__(self, baseurl, lang = 'all'): self.baseurl = baseurl self.lang = lang + 
self.thread = [] + self.res = [] + self.nbtag = 0 self.repub = re.compile(u'\((.*)\)') self.rerat = re.compile(u'([0-9.]+)') self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') @@ -499,20 +470,72 @@ class ResultList(object): report(verbose) return None - def populate(self, entries, br, verbose=False): - res = [] - for x in entries: - entry = self.get_individual_metadata(x, br, verbose) + def fetchdatathread(self, qbr, qsync, nb, url, verbose): + try: + browser = qbr.get(True) + entry = self.get_individual_metadata(url, browser, verbose) + except: + report(verbose) + entry = None + finally: + qbr.put(browser, True) + qsync.put(nb, True) + return entry + + def producer(self, sync, urls, br, verbose=False): + for i in xrange(len(urls)): + thread = ThreadwithResults(self.fetchdatathread, br, sync, + i, urls[i], verbose) + thread.start() + self.thread.append(thread) + + def consumer(self, sync, syncbis, br, total_entries, verbose=False): + i=0 + while i < total_entries: + nb = int(sync.get(True)) + self.thread[nb].join() + entry = self.thread[nb].get_result() + i+=1 if entry is not None: mi = self.fill_MI(entry, verbose) if mi is not None: mi.tags, atag = self.get_tags(entry, verbose) + self.res[nb] = mi if atag: - tags = self.get_individual_metadata(mi.tags, br, verbose) - if tags is not None: - mi.tags = self.get_tags(tags, verbose)[0] - res.append(mi) - return res + threadbis = ThreadwithResults(self.fetchdatathread, + br, syncbis, nb, mi.tags, verbose) + self.thread[nb] = threadbis + self.nbtag +=1 + threadbis.start() + + def populate(self, entries, ibr, verbose=False, brcall=3): + br = Queue(brcall) + cbr = Queue(brcall-1) + + syncp = Queue(1) + syncc = Queue(len(entries)) + + for i in xrange(brcall-1): + br.put(browser(), True) + cbr.put(browser(), True) + br.put(ibr, True) + + self.res = [None]*len(entries) + + prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) + cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, 
len(entries), verbose)) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() + + #finish processing + for i in xrange(self.nbtag): + nb = int(syncc.get(True)) + tags = self.thread[nb].get_result() + if tags is not None: + self.res[nb].tags = self.get_tags(tags, verbose)[0] + return self.res def search(title=None, author=None, publisher=None, isbn=None, @@ -561,7 +584,7 @@ def option_parser(): %prog [options] Fetch book metadata from Amazon. You must specify one of title, author, - ISBN, publisher or keywords. Will fetch a maximum of 10 matches, + ISBN, publisher or keywords. Will fetch a maximum of 20 matches, so you should make your query as specific as possible. You can chose the language for metadata retrieval: english & french & german From 0a2b5d4c2381d12e8cf711b701408cffbf621593 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 12 Dec 2010 13:36:34 +0100 Subject: [PATCH 059/132] Add threadind to nicebooks (some problems with autor= mankell in interface with multiple authors, not the plugin) --- src/calibre/ebooks/metadata/amazonbis.py | 7 ++- src/calibre/ebooks/metadata/nicebooks.py | 80 +++++++++++++++++++++--- 2 files changed, 77 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazonbis.py b/src/calibre/ebooks/metadata/amazonbis.py index 7060ca4cb5..dd973ba3d8 100644 --- a/src/calibre/ebooks/metadata/amazonbis.py +++ b/src/calibre/ebooks/metadata/amazonbis.py @@ -186,7 +186,12 @@ class Query(object): q['sort'] = 'relevancerank' self.urldata = self.BASE_URL_DE self.baseurl = self.urldata - + + if title == _('Unknown'): + title=None + if author == _('Unknown'): + author=None + if isbn is not None: q['field-isbn'] = isbn.replace('-', '') else: diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index c852a81873..3f4f24902c 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -4,6 +4,8 @@ 
__copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' import sys, textwrap, re, traceback, socket +from threading import Thread +from Queue import Queue from urllib import urlencode from math import ceil from copy import deepcopy @@ -39,7 +41,7 @@ class NiceBooks(MetadataSource): class NiceBooksCovers(CoverDownload): name = 'Nicebooks covers' - description = _('Downloads covers from french Nicebooks') + description = _('Downloads covers from French Nicebooks') supported_platforms = ['windows', 'osx', 'linux'] author = 'Sengian' type = _('Cover download') @@ -78,6 +80,20 @@ class NiceBooksError(Exception): class ISBNNotFound(NiceBooksError): pass +class ThreadwithResults(Thread): + def __init__(self, func, *args, **kargs): + self.func = func + self.args = args + self.kargs = kargs + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + self.result = self.func(*self.args, **self.kargs) + def report(verbose): if verbose: traceback.print_exc() @@ -97,6 +113,10 @@ class Query(object): if isbn is not None: q = isbn else: + if title == _('Unknown'): + title=None + if author == _('Unknown'): + author=None q = ' '.join([i for i in (title, author, publisher, keywords) \ if i is not None]) @@ -173,6 +193,7 @@ class ResultList(list): def __init__(self, islink): self.islink = islink + self.thread = [] self.repub = re.compile(u'\s*.diteur\s*', re.I) self.reauteur = re.compile(u'\s*auteur.*', re.I) self.reautclean = re.compile(u'\s*\(.*\)\s*') @@ -227,7 +248,6 @@ class ResultList(list): return mi def fill_MI(self, data, verbose): - '''create and return an mi if possible, None otherwise''' try: entry = data.xpath("//div[@id='container']/div[@id='book-info']")[0] title = self.get_title(entry) @@ -272,16 +292,58 @@ class ResultList(list): report(verbose) return None - def populate(self, entries, br, verbose=False): + def fetchdatathread(self, qbr, qsync, nb, url, verbose): + try: + browser 
= qbr.get(True) + entry = self.get_individual_metadata(url, browser, verbose) + except: + report(verbose) + entry = None + finally: + qbr.put(browser, True) + qsync.put(nb, True) + return entry + + def producer(self, sync, urls, br, verbose=False): + for i in xrange(len(urls)): + thread = ThreadwithResults(self.fetchdatathread, br, sync, + i, self.BASE_URL+urls[i], verbose) + thread.start() + self.thread.append(thread) + + def consumer(self, sync, total_entries, verbose=False): + res=[None]*total_entries + i=0 + while i < total_entries: + nb = int(sync.get(True)) + self.thread[nb].join() + entry = self.thread[nb].get_result() + mi = None + i+=1 + if entry is not None: + mi = self.fill_MI(entry, verbose) + res[nb]=mi + return res + + def populate(self, entries, br, verbose=False, brcall=3): if not self.islink: #single entry self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries - for x in entries: - entry = self.get_individual_metadata(self.BASE_URL+x, br, verbose) - if entry is not None: - self.append(self.fill_MI(entry, verbose)) + pbr = Queue(brcall) + sync = Queue(1) + for i in xrange(brcall-1): + pbr.put(browser(), True) + pbr.put(br, True) + + prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) + cons_thread = ThreadwithResults(self.consumer, sync, len(entries), verbose) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() + self.extend(cons_thread.get_result()) class Covers(object): @@ -321,7 +383,7 @@ class Covers(object): def search(title=None, author=None, publisher=None, isbn=None, - max_results=5, verbose=False, keywords=None): + max_results=10, verbose=False, keywords=None): br = browser() islink = False entries, islink = Query(title=title, author=author, isbn=isbn, publisher=publisher, @@ -407,4 +469,4 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) -# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\nicebooks.py" -m 5 -a 
mankel >data.html \ No newline at end of file +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\nicebooks.py" -m 10 -a mankel >data.html \ No newline at end of file From 43ecf8c40d8f447dbfcbcaf686fa353ab8e3a57e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 12 Dec 2010 14:39:36 +0100 Subject: [PATCH 060/132] Add threading to fictionwise --- src/calibre/ebooks/metadata/fictionwise.py | 76 ++++++++++++++++++++-- src/calibre/ebooks/metadata/nicebooks.py | 4 +- 2 files changed, 70 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 914fa2b228..909d186702 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -4,6 +4,8 @@ __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' import sys, textwrap, re, traceback, socket +from threading import Thread +from Queue import Queue from urllib import urlencode from lxml.html import soupparser, tostring @@ -23,7 +25,6 @@ class Fictionwise(MetadataSource): author = 'Sengian' name = 'Fictionwise' description = _('Downloads metadata from Fictionwise') - has_html_comments = True def fetch(self): @@ -38,6 +39,20 @@ class Fictionwise(MetadataSource): class FictionwiseError(Exception): pass +class ThreadwithResults(Thread): + def __init__(self, func, *args, **kargs): + self.func = func + self.args = args + self.kargs = kargs + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + self.result = self.func(*self.args, **self.kargs) + def report(verbose): if verbose: traceback.print_exc() @@ -50,8 +65,13 @@ class Query(object): def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20): assert not(title is None and author is None and publisher is None and keywords is None) assert (max_results < 21) - + + if title == 
_('Unknown'): + title=None + if author == _('Unknown'): + author=None self.max_results = int(max_results) + q = { 'template' : 'searchresults_adv.htm' , 'searchtitle' : '', 'searchauthor' : '', @@ -72,6 +92,7 @@ class Query(object): #b.DateFirstPublished, b.FWPublishDate 'sortby' : 'b.SortTitle' } + if title is not None: q['searchtitle'] = title if author is not None: @@ -128,6 +149,7 @@ class ResultList(list): def __init__(self, islink): self.islink = islink + self.thread = [] self.retitle = re.compile(r'\[[^\[\]]+\]') self.rechkauth = re.compile(r'.*book\s*by', re.I) self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I) @@ -321,16 +343,56 @@ class ResultList(list): report(verbose) return None - def populate(self, entries, br, verbose=False): + def fetchdatathread(self, qbr, qsync, nb, url, verbose): + try: + browser = qbr.get(True) + entry = self.get_individual_metadata(url, browser, verbose) + except: + report(verbose) + entry = None + finally: + qbr.put(browser, True) + qsync.put(nb, True) + return entry + + def producer(self, sync, urls, br, verbose=False): + for i in xrange(len(urls)): + thread = ThreadwithResults(self.fetchdatathread, br, sync, + i, self.BASE_URL+urls[i], verbose) + thread.start() + self.thread.append(thread) + + def consumer(self, sync, total_entries, verbose=False): + res=[None]*total_entries + i=0 + while i < total_entries: + nb = int(sync.get(True)) + self.thread[nb].join() + entry = self.thread[nb].get_result() + i+=1 + if entry is not None: + res[nb] = self.fill_MI(entry, verbose) + return res + + def populate(self, entries, br, verbose=False, brcall=3): if not self.islink: #single entry self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries - for x in entries: - entry = self.get_individual_metadata(self.BASE_URL+x, br, verbose) - if entry is not None: - self.append(self.fill_MI(entry, verbose)) + pbr = Queue(brcall) + sync = Queue(1) + for i in 
xrange(brcall-1): + pbr.put(browser(), True) + pbr.put(br, True) + + prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) + cons_thread = ThreadwithResults(self.consumer, sync, len(entries), verbose) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() + self.extend(cons_thread.get_result()) def search(title=None, author=None, publisher=None, isbn=None, diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 3f4f24902c..6cb7c9a6ae 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -318,11 +318,9 @@ class ResultList(list): nb = int(sync.get(True)) self.thread[nb].join() entry = self.thread[nb].get_result() - mi = None i+=1 if entry is not None: - mi = self.fill_MI(entry, verbose) - res[nb]=mi + res[nb] = self.fill_MI(entry, verbose) return res def populate(self, entries, br, verbose=False, brcall=3): From d4e4c8b1564de4acc09850c1b66207fca3ca2741 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 12 Dec 2010 18:31:18 +0100 Subject: [PATCH 061/132] Replace amazon default plugin --- src/calibre/customize/builtins.py | 2 +- src/calibre/ebooks/metadata/amazon.py | 741 +++++++++++++++++++---- src/calibre/ebooks/metadata/amazonbis.py | 653 -------------------- 3 files changed, 633 insertions(+), 763 deletions(-) delete mode 100644 src/calibre/ebooks/metadata/amazonbis.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 4798c46516..342d0e8456 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -483,7 +483,7 @@ from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, LibraryThing from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers -from calibre.ebooks.metadata.amazonbis import Amazon, AmazonSocial 
+from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index cf96c9732c..1362349685 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -1,130 +1,653 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian <sengian1@gmail.com>' -''' -Fetch metadata using Amazon AWS -''' -import sys, re +import sys, textwrap, re, traceback, socket +from threading import Thread +from Queue import Queue +from urllib import urlencode +from math import ceil -from lxml import html -from lxml.html import soupparser +from lxml.html import soupparser, tostring -from calibre import browser -from calibre.ebooks.metadata import check_isbn -from calibre.ebooks.metadata.book.base import Metadata +from calibre.utils.date import parse_date, utcnow, replace_months +from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.localization import get_lang +from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.utils.config import OptionParser from calibre.library.comments import sanitize_comments_html -def find_asin(br, isbn): - q = 'http://www.amazon.com/s?field-keywords='+isbn - raw = br.open_novisit(q).read() - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - root = html.fromstring(raw) - revs = root.xpath('//*[@class="asinReviewsSummary" and @name]') - revs = 
[x.get('name') for x in revs] - if revs: - return revs[0] -def to_asin(br, isbn): - if len(isbn) == 13: +class Amazon(MetadataSource): + + name = 'Amazon' + description = _('Downloads metadata from amazon.com') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Kovid Goyal & Sengian' + version = (1, 0, 0) + has_html_comments = True + + def fetch(self): try: - asin = find_asin(br, isbn) + lang = get_lang() + lang = lang[:2] if re.match(r'(fr.*|de.*)', lang) else 'all' + if lang == 'all': + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose, lang='all') + else: + tmploc = ThreadwithResults(search, self.title, self.book_author, + self.publisher,self.isbn, max_results=5, + verbose=self.verbose, lang=lang) + tmpnoloc = ThreadwithResults(search, self.title, self.book_author, + self.publisher, self.isbn, max_results=5, + verbose=self.verbose, lang='all') + tmploc.start() + tmpnoloc.start() + tmploc.join() + tmpnoloc.join() + tmploc= tmploc.get_result() + tmpnoloc= tmpnoloc.get_result() + + tempres = None + if tmpnoloc is not None: + tempres = tmpnoloc + if tmploc is not None: + tempres = tmploc + if tmpnoloc is not None: + tempres.extend(tmpnoloc) + self.results = tempres + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class AmazonSocial(MetadataSource): + + name = 'AmazonSocial' + metadata_type = 'social' + description = _('Downloads social metadata from amazon.com') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Kovid Goyal & Sengian' + version = (1, 0, 1) + has_html_comments = True + + def fetch(self): + if not self.isbn: + return + try: + lang = get_lang() + lang = lang[:2] if re.match(r'(fr.*|de.*)', lang) else 'all' + if lang == 'all': + self.results = get_social_metadata(self.title, self.book_author, self.publisher, + self.isbn, verbose=self.verbose, lang='all')[0] + else: + tmploc = ThreadwithResults(get_social_metadata, self.title, 
self.book_author, + self.publisher,self.isbn, verbose=self.verbose, lang=lang) + tmpnoloc = ThreadwithResults(get_social_metadata, self.title, self.book_author, + self.publisher, self.isbn, verbose=self.verbose, lang='all') + tmploc.start() + tmpnoloc.start() + tmploc.join() + tmpnoloc.join() + tmploc= tmploc.get_result() + if tmploc is not None: + tmploc = tmploc[0] + tmpnoloc= tmpnoloc.get_result() + if tmpnoloc is not None: + tmpnoloc = tmpnoloc[0] + if tmpnoloc is not None: + if tmploc.rating is None: + tmploc.rating = tmpnoloc.rating + if tmploc.comments is not None: + tmploc.comments = tmpnoloc.comments + if tmploc.tags is None: + tmploc.tags = tmpnoloc.tags + self.results = tmploc + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + +def report(verbose): + if verbose: + traceback.print_exc() + +class AmazonError(Exception): + pass + +class ThreadwithResults(Thread): + def __init__(self, func, *args, **kargs): + self.func = func + self.args = args + self.kargs = kargs + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + self.result = self.func(*self.args, **self.kargs) + + +class Query(object): + + BASE_URL_ALL = 'http://www.amazon.com' + BASE_URL_FR = 'http://www.amazon.fr' + BASE_URL_DE = 'http://www.amazon.de' + + def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, + max_results=20, rlang='all'): + assert not(title is None and author is None and publisher is None \ + and isbn is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + self.renbres = re.compile(u'\s*([0-9.,]+)\s*') + + q = { 'search-alias' : 'stripbooks' , + 'unfiltered' : '1', + 'field-keywords' : '', + 'field-author' : '', + 'field-title' : '', + 'field-isbn' : '', + 'field-publisher' : '' + #get to amazon detailed search page to get all options + # 'node' : '', + # 'field-binding' : '', + #before, during, after + # 
'field-dateop' : '', + #month as number + # 'field-datemod' : '', + # 'field-dateyear' : '', + #french only + # 'field-collection' : '', + #many options available + } + + if rlang =='all' or rlang =='en': + q['sort'] = 'relevanceexprank' + self.urldata = self.BASE_URL_ALL + # elif rlang =='es': + # q['sort'] = 'relevanceexprank' + # q['field-language'] = 'Spanish' + # self.urldata = self.BASE_URL_ALL + # elif rlang =='en': + # q['sort'] = 'relevanceexprank' + # q['field-language'] = 'English' + # self.urldata = self.BASE_URL_ALL + elif rlang =='fr': + q['sort'] = 'relevancerank' + self.urldata = self.BASE_URL_FR + elif rlang =='de': + q['sort'] = 'relevancerank' + self.urldata = self.BASE_URL_DE + self.baseurl = self.urldata + + if title == _('Unknown'): + title=None + if author == _('Unknown'): + author=None + + if isbn is not None: + q['field-isbn'] = isbn.replace('-', '') + else: + if title is not None: + q['field-title'] = title + if author is not None: + q['field-author'] = author + if publisher is not None: + q['field-publisher'] = publisher + if keywords is not None: + q['field-keywords'] = keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q) + + def __call__(self, browser, verbose, timeout = 5.): + if verbose: + print _('Query: %s') % self.urldata + + try: + raw = browser.open_novisit(self.urldata, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None, self.urldata + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise AmazonError(_('Amazon timed out. 
Try again later.')) + raise AmazonError(_('Amazon encountered an error.')) + if '<title>404 - ' in raw: + return None, self.urldata + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + + try: + feed = soupparser.fromstring(raw) except: - import traceback - traceback.print_exc() - asin = None - else: - asin = isbn - return asin + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_chars(raw)) + except: + return None, self.urldata -def get_social_metadata(title, authors, publisher, isbn): - mi = Metadata(title, authors) - if not isbn: - return mi - isbn = check_isbn(isbn) - if not isbn: - return mi - br = browser() - asin = to_asin(br, isbn) - if asin and get_metadata(br, asin, mi): - return mi - from calibre.ebooks.metadata.xisbn import xisbn - for i in xisbn.get_associated_isbns(isbn): - asin = to_asin(br, i) - if asin and get_metadata(br, asin, mi): - return mi - return mi + #nb of page + try: + nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text) + nbresults = [re.sub(r'[.,]', '', x) for x in nbresults] + except: + return None, self.urldata -def get_metadata(br, asin, mi): - q = 'http://amzn.com/'+asin - try: - raw = br.open_novisit(q).read() - except Exception, e: - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return False - raise - if '<title>404 - ' in raw: - return False - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - root = soupparser.fromstring(raw) - except: - return False - ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]') - if ratings: - pat = re.compile(r'([0-9.]+) out of (\d+) stars') - r = ratings[0] - for elem in r.xpath('descendant::*[@title]'): - t = elem.get('title') - m = pat.match(t) - if m is not None: + pages =[feed] + if len(nbresults) > 1: + nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1]))) + for i in 
xrange(2, nbpagetoquery + 1): try: - mi.rating = float(m.group(1))/float(m.group(2)) * 5 - break + urldata = self.urldata + '&page=' + str(i) + raw = browser.open_novisit(urldata, timeout=timeout).read() + except Exception, e: + continue + if '<title>404 - ' in raw: + continue + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) except: - pass + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_chars(raw)) + except: + continue + pages.append(feed) - desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') - if desc: - desc = desc[0] - for c in desc.xpath('descendant::*[@class="seeAll" or' - ' @class="emptyClear" or @href]'): - c.getparent().remove(c) - desc = html.tostring(desc, method='html', encoding=unicode).strip() - # remove all attributes from tags - desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) - # Collapse whitespace - #desc = re.sub('\n+', '\n', desc) - #desc = re.sub(' +', ' ', desc) - # Remove the notice about text referring to out of print editions - desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc) - # Remove comments - desc = re.sub(r'(?s)<!--.*?-->', '', desc) - mi.comments = sanitize_comments_html(desc) + results = [] + for x in pages: + results.extend([i.getparent().get('href') \ + for i in x.xpath("//a/span[@class='srTitle']")]) + return results[:self.max_results], self.baseurl - return True +class ResultList(object): + def __init__(self, baseurl, lang = 'all'): + self.baseurl = baseurl + self.lang = lang + self.thread = [] + self.res = [] + self.nbtag = 0 + self.repub = re.compile(u'\((.*)\)') + self.rerat = re.compile(u'([0-9.]+)') + self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') + self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>') + self.recom = re.compile(r'(?s)<!--.*?-->') + self.republi = re.compile(u'(Editeur|Publisher|Verlag)', re.I) + self.reisbn = re.compile(u'(ISBN-10|ISBN-10|ASIN)', 
re.I) + self.relang = re.compile(u'(Language|Langue|Sprache)', re.I) + self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client|Durchschnittliche\s*Kundenbewertung)', re.I) + self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit|Produktinformation)', re.I) + + def strip_tags_etree(self, etreeobj, invalid_tags): + for (itag, rmv) in invalid_tags.iteritems(): + if rmv: + for elts in etreeobj.getiterator(itag): + elts.drop_tree() + else: + for elts in etreeobj.getiterator(itag): + elts.drop_tag() + + def clean_entry(self, entry, invalid_tags = {'script': True}, + invalid_id = (), invalid_class=()): + #invalid_tags: remove tag and keep content if False else remove + #remove tags + if invalid_tags: + self.strip_tags_etree(entry, invalid_tags) + #remove id + if invalid_id: + for eltid in invalid_id: + elt = entry.get_element_by_id(eltid) + if elt is not None: + elt.drop_tree() + #remove class + if invalid_class: + for eltclass in invalid_class: + elts = entry.find_class(eltclass) + if elts is not None: + for elt in elts: + elt.drop_tree() + + def get_title(self, entry): + title = entry.get_element_by_id('btAsinTitle') + if title is not None: + title = title.text + return unicode(title.replace('\n', '').strip()) + + def get_authors(self, entry): + author = entry.get_element_by_id('btAsinTitle') + while author.getparent().tag != 'div': + author = author.getparent() + author = author.getparent() + authortext = [] + for x in author.getiterator('a'): + authortext.append(unicode(x.text_content().strip())) + return authortext + + def get_description(self, entry, verbose): + try: + description = entry.get_element_by_id("productDescription").find("div[@class='content']") + inv_class = ('seeAll', 'emptyClear') + inv_tags ={'img': True, 'a': False} + self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class) + description = tostring(description, method='html', encoding=unicode).strip() + # remove all 
attributes from tags + description = self.reattr.sub(r'<\1>', description) + # Remove the notice about text referring to out of print editions + description = self.reoutp.sub('', description) + # Remove comments + description = self.recom.sub('', description) + return unicode(sanitize_comments_html(description)) + except: + report(verbose) + return None + + def get_tags(self, entry, verbose): + try: + tags = entry.get_element_by_id('tagContentHolder') + testptag = tags.find_class('see-all') + if testptag: + for x in testptag: + alink = x.xpath('descendant-or-self::a') + if alink: + if alink[0].get('class') == 'tgJsActive': + continue + return self.baseurl + alink[0].get('href'), True + tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag'] + except: + report(verbose) + tags = [], False + return tags, False + + def get_book_info(self, entry, mi, verbose): + try: + entry = entry.get_element_by_id('SalesRank').getparent() + except: + try: + for z in entry.getiterator('h2'): + if self.reprod.search(z.text_content()): + entry = z.getparent().find("div[@class='content']/ul") + break + except: + report(verbose) + return mi + elts = entry.findall('li') + #pub & date + elt = filter(lambda x: self.republi.search(x.find('b').text), elts) + if elt: + pub = elt[0].find('b').tail + mi.publisher = unicode(self.repub.sub('', pub).strip()) + d = self.repub.search(pub) + if d is not None: + d = d.group(1) + try: + default = utcnow().replace(day=15) + if self.lang != 'all': + d = replace_months(d, self.lang) + d = parse_date(d, assume_utc=True, default=default) + mi.pubdate = d + except: + report(verbose) + #ISBN + elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts) + if elt: + isbn = elt[0].find('b').tail.replace('-', '').strip() + if check_isbn(isbn): + mi.isbn = unicode(isbn) + elif len(elt) > 1: + isbnone = elt[1].find('b').tail.replace('-', '').strip() + if check_isbn(isbnone): + mi.isbn = unicode(isbnone) + else: + #assume ASIN-> find a check for 
asin + mi.isbn = unicode(isbn) + #Langue + elt = filter(lambda x: self.relang.search(x.find('b').text), elts) + if elt: + langue = elt[0].find('b').tail.strip() + if langue: + mi.language = unicode(langue) + #ratings + elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts) + if elt: + ratings = elt[0].find_class('swSprite') + if ratings: + ratings = self.rerat.findall(ratings[0].get('title')) + if len(ratings) == 2: + mi.rating = float(ratings[0])/float(ratings[1]) * 5 + return mi + + def fill_MI(self, entry, verbose): + try: + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + print _('URL who failed: %s') % x + report(verbose) + return None + mi = MetaInformation(title, authors) + mi.author_sort = authors_to_sort_string(authors) + try: + mi.comments = self.get_description(entry, verbose) + mi = self.get_book_info(entry, mi, verbose) + except: + pass + return mi + + def get_individual_metadata(self, url, br, verbose): + try: + raw = br.open_novisit(url).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise AmazonError(_('Amazon timed out. 
Try again later.')) + raise AmazonError(_('Amazon encountered an error.')) + if '<title>404 - ' in raw: + report(verbose) + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + return soupparser.fromstring(clean_ascii_chars(raw)) + except: + report(verbose) + return None + + def fetchdatathread(self, qbr, qsync, nb, url, verbose): + try: + browser = qbr.get(True) + entry = self.get_individual_metadata(url, browser, verbose) + except: + report(verbose) + entry = None + finally: + qbr.put(browser, True) + qsync.put(nb, True) + return entry + + def producer(self, sync, urls, br, verbose=False): + for i in xrange(len(urls)): + thread = ThreadwithResults(self.fetchdatathread, br, sync, + i, urls[i], verbose) + thread.start() + self.thread.append(thread) + + def consumer(self, sync, syncbis, br, total_entries, verbose=False): + i=0 + while i < total_entries: + nb = int(sync.get(True)) + self.thread[nb].join() + entry = self.thread[nb].get_result() + i+=1 + if entry is not None: + mi = self.fill_MI(entry, verbose) + if mi is not None: + mi.tags, atag = self.get_tags(entry, verbose) + self.res[nb] = mi + if atag: + threadbis = ThreadwithResults(self.fetchdatathread, + br, syncbis, nb, mi.tags, verbose) + self.thread[nb] = threadbis + self.nbtag +=1 + threadbis.start() + + def populate(self, entries, ibr, verbose=False, brcall=3): + br = Queue(brcall) + cbr = Queue(brcall-1) + + syncp = Queue(1) + syncc = Queue(len(entries)) + + for i in xrange(brcall-1): + br.put(browser(), True) + cbr.put(browser(), True) + br.put(ibr, True) + + self.res = [None]*len(entries) + + prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) + cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose)) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() + + #finish processing + 
for i in xrange(self.nbtag): + nb = int(syncc.get(True)) + tags = self.thread[nb].get_result() + if tags is not None: + self.res[nb].tags = self.get_tags(tags, verbose)[0] + return self.res + + +def search(title=None, author=None, publisher=None, isbn=None, + max_results=5, verbose=False, keywords=None, lang='all'): + br = browser() + entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher, + keywords=keywords, max_results=max_results,rlang=lang)(br, verbose) + + if entries is None or len(entries) == 0: + return None + + #List of entry + ans = ResultList(baseurl, lang) + return [x for x in ans.populate(entries, br, verbose) if x is not None] + +def get_social_metadata(title, authors, publisher, isbn, verbose=False, + max_results=1, lang='all'): + mi = MetaInformation(title, authors) + if not isbn or not check_isbn(isbn): + return [mi] + + amazresults = search(isbn=isbn, verbose=verbose, + max_results=max_results, lang=lang) + if amazresults is None or amazresults[0] is None: + from calibre.ebooks.metadata.xisbn import xisbn + for i in xisbn.get_associated_isbns(isbn): + amazresults = search(isbn=i, verbose=verbose, + max_results=max_results, lang=lang) + if amazresults is not None and amazresults[0] is not None: + break + if amazresults is None or amazresults[0] is None: + return [mi] + + miaz = amazresults[0] + if miaz.rating is not None: + mi.rating = miaz.rating + if miaz.comments is not None: + mi.comments = miaz.comments + if miaz.tags is not None: + mi.tags = miaz.tags + return [mi] + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + _('''\ + %prog [options] + + Fetch book metadata from Amazon. You must specify one of title, author, + ISBN, publisher or keywords. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. 
+ You can chose the language for metadata retrieval: + english & french & german + ''' + ))) + parser.add_option('-t', '--title', help=_('Book title')) + parser.add_option('-a', '--author', help=_('Book author(s)')) + parser.add_option('-p', '--publisher', help=_('Book publisher')) + parser.add_option('-i', '--isbn', help=_('Book ISBN')) + parser.add_option('-k', '--keywords', help=_('Keywords')) + parser.add_option('-s', '--social', default=0, action='count', + help=_('Get social data only')) + parser.add_option('-m', '--max-results', default=10, + help=_('Maximum number of results to fetch')) + parser.add_option('-l', '--lang', default='all', + help=_('Chosen language for metadata search (en, fr, de)')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Be more verbose about errors')) + return parser def main(args=sys.argv): - # Test xisbn - print get_social_metadata('Learning Python', None, None, '8324616489') - print - - # Test sophisticated comment formatting - print get_social_metadata('Angels & Demons', None, None, '9781416580829') - print - - # Random tests - print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720') - print - print get_social_metadata('The Great Gatsby', None, None, '0743273567') - - return 0 + parser = option_parser() + opts, args = parser.parse_args(args) + try: + if opts.social: + results = get_social_metadata(opts.title, opts.author, + opts.publisher, opts.isbn, verbose=opts.verbose, lang=opts.lang) + else: + results = search(opts.title, opts.author, isbn=opts.isbn, + publisher=opts.publisher, keywords=opts.keywords, verbose=opts.verbose, + max_results=opts.max_results, lang=opts.lang) + except AssertionError: + report(True) + parser.print_help() + return 1 + if results is None and len(results) == 0: + print _('No result found for this search!') + return 0 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + print + + #test social + # '''Test 
xisbn''' + # print get_social_metadata('Learning Python', None, None, '8324616489')[0] + # print + # '''Test sophisticated comment formatting''' + # print get_social_metadata('Angels & Demons', None, None, '9781416580829')[0] + # print + # '''Random tests''' + # print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')[0] + # print + # print get_social_metadata('The Great Gatsby', None, None, '0743273567')[0] if __name__ == '__main__': sys.exit(main()) + # import cProfile + # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) + # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile_tmp_2")) + +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonbis.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/amazonbis.py b/src/calibre/ebooks/metadata/amazonbis.py deleted file mode 100644 index dd973ba3d8..0000000000 --- a/src/calibre/ebooks/metadata/amazonbis.py +++ /dev/null @@ -1,653 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL 3' -__copyright__ = '2010, sengian <sengian1@gmail.com>' - -import sys, textwrap, re, traceback, socket -from threading import Thread -from Queue import Queue -from urllib import urlencode -from math import ceil - -from lxml.html import soupparser, tostring - -from calibre.utils.date import parse_date, utcnow, replace_months -from calibre.utils.cleantext import clean_ascii_chars -from calibre.utils.localization import get_lang -from calibre import browser, preferred_encoding -from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.metadata import MetaInformation, check_isbn, \ - authors_to_sort_string -from calibre.ebooks.metadata.fetch import MetadataSource -from calibre.utils.config import OptionParser -from calibre.library.comments import sanitize_comments_html - - -class 
Amazon(MetadataSource): - - name = 'Amazon' - description = _('Downloads metadata from amazon.com') - supported_platforms = ['windows', 'osx', 'linux'] - author = 'Kovid Goyal & Sengian' - version = (1, 0, 0) - has_html_comments = True - - def fetch(self): - try: - lang = get_lang() - lang = lang[:2] if re.match(r'(fr.*|de.*)', lang) else 'all' - if lang == 'all': - self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=5, verbose=self.verbose, lang='all') - else: - tmploc = ThreadwithResults(search, self.title, self.book_author, - self.publisher,self.isbn, max_results=5, - verbose=self.verbose, lang=lang) - tmpnoloc = ThreadwithResults(search, self.title, self.book_author, - self.publisher, self.isbn, max_results=5, - verbose=self.verbose, lang='all') - tmploc.start() - tmpnoloc.start() - tmploc.join() - tmpnoloc.join() - tmploc= tmploc.get_result() - tmpnoloc= tmpnoloc.get_result() - - tempres = None - if tmpnoloc is not None: - tempres = tmpnoloc - if tmploc is not None: - tempres = tmploc - if tmpnoloc is not None: - tempres.extend(tmpnoloc) - self.results = tmpres - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() - -class AmazonSocial(MetadataSource): - - name = 'AmazonSocial' - metadata_type = 'social' - description = _('Downloads social metadata from amazon.com') - supported_platforms = ['windows', 'osx', 'linux'] - author = 'Kovid Goyal & Sengian' - version = (1, 0, 1) - has_html_comments = True - - def fetch(self): - if not self.isbn: - return - try: - lang = get_lang() - lang = lang[:2] if re.match(r'(fr.*|de.*)', lang) else 'all' - if lang == 'all': - self.results = get_social_metadata(self.title, self.book_author, self.publisher, - self.isbn, verbose=self.verbose, lang='all')[0] - else: - tmploc = ThreadwithResults(get_social_metadata, self.title, self.book_author, - self.publisher,self.isbn, verbose=self.verbose, lang=lang) - tmpnoloc = ThreadwithResults(get_social_metadata, self.title, 
self.book_author, - self.publisher, self.isbn, verbose=self.verbose, lang='all') - tmploc.start() - tmpnoloc.start() - tmploc.join() - tmpnoloc.join() - tmploc= tmploc.get_result() - if tmploc is not None: - tmploc = tmploc[0] - tmpnoloc= tmpnoloc.get_result() - if tmpnoloc is not None: - tmpnoloc = tmpnoloc[0] - if tmpnoloc is not None: - if tmploc.rating is None: - tmploc.rating = tmpnoloc.rating - if tmploc.comments is not None: - tmploc.comments = tmpnoloc.comments - if tmploc.tags is None: - tmploc.tags = tmpnoloc.tags - self.results = tmploc - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() - - -def report(verbose): - if verbose: - traceback.print_exc() - -class AmazonError(Exception): - pass - -class ThreadwithResults(Thread): - def __init__(self, func, *args, **kargs): - self.func = func - self.args = args - self.kargs = kargs - self.result = None - Thread.__init__(self) - - def get_result(self): - return self.result - - def run(self): - self.result = self.func(*self.args, **self.kargs) - - -class Query(object): - - BASE_URL_ALL = 'http://www.amazon.com' - BASE_URL_FR = 'http://www.amazon.fr' - BASE_URL_DE = 'http://www.amazon.de' - - def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, - max_results=20, rlang='all'): - assert not(title is None and author is None and publisher is None \ - and isbn is None and keywords is None) - assert (max_results < 21) - - self.max_results = int(max_results) - self.renbres = re.compile(u'\s*([0-9.,]+)\s*') - - q = { 'search-alias' : 'stripbooks' , - 'unfiltered' : '1', - 'field-keywords' : '', - 'field-author' : '', - 'field-title' : '', - 'field-isbn' : '', - 'field-publisher' : '' - #get to amazon detailed search page to get all options - # 'node' : '', - # 'field-binding' : '', - #before, during, after - # 'field-dateop' : '', - #month as number - # 'field-datemod' : '', - # 'field-dateyear' : '', - #french only - # 'field-collection' : '', - #many options 
available - } - - if rlang =='all' or rlang =='en': - q['sort'] = 'relevanceexprank' - self.urldata = self.BASE_URL_ALL - # elif rlang =='es': - # q['sort'] = 'relevanceexprank' - # q['field-language'] = 'Spanish' - # self.urldata = self.BASE_URL_ALL - # elif rlang =='en': - # q['sort'] = 'relevanceexprank' - # q['field-language'] = 'English' - # self.urldata = self.BASE_URL_ALL - elif rlang =='fr': - q['sort'] = 'relevancerank' - self.urldata = self.BASE_URL_FR - elif rlang =='de': - q['sort'] = 'relevancerank' - self.urldata = self.BASE_URL_DE - self.baseurl = self.urldata - - if title == _('Unknown'): - title=None - if author == _('Unknown'): - author=None - - if isbn is not None: - q['field-isbn'] = isbn.replace('-', '') - else: - if title is not None: - q['field-title'] = title - if author is not None: - q['field-author'] = author - if publisher is not None: - q['field-publisher'] = publisher - if keywords is not None: - q['field-keywords'] = keywords - - if isinstance(q, unicode): - q = q.encode('utf-8') - self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q) - - def __call__(self, browser, verbose, timeout = 5.): - if verbose: - print _('Query: %s') % self.urldata - - try: - raw = browser.open_novisit(self.urldata, timeout=timeout).read() - except Exception, e: - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return None, self.urldata - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise AmazonError(_('Amazon timed out. 
Try again later.')) - raise AmazonError(_('Amazon encountered an error.')) - if '<title>404 - ' in raw: - return None, self.urldata - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - - try: - feed = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - return None, self.urldata - - #nb of page - try: - nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text) - nbresults = [re.sub(r'[.,]', '', x) for x in nbresults] - except: - return None, self.urldata - - pages =[feed] - if len(nbresults) > 1: - nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1]))) - for i in xrange(2, nbpagetoquery + 1): - try: - urldata = self.urldata + '&page=' + str(i) - raw = browser.open_novisit(urldata, timeout=timeout).read() - except Exception, e: - continue - if '<title>404 - ' in raw: - continue - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - feed = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - continue - pages.append(feed) - - results = [] - for x in pages: - results.extend([i.getparent().get('href') \ - for i in x.xpath("//a/span[@class='srTitle']")]) - return results[:self.max_results], self.baseurl - -class ResultList(object): - - def __init__(self, baseurl, lang = 'all'): - self.baseurl = baseurl - self.lang = lang - self.thread = [] - self.res = [] - self.nbtag = 0 - self.repub = re.compile(u'\((.*)\)') - self.rerat = re.compile(u'([0-9.]+)') - self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') - self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>') - self.recom = re.compile(r'(?s)<!--.*?-->') - self.republi = re.compile(u'(Editeur|Publisher|Verlag)', re.I) - self.reisbn = re.compile(u'(ISBN-10|ISBN-10|ASIN)', re.I) - self.relang = 
re.compile(u'(Language|Langue|Sprache)', re.I) - self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client|Durchschnittliche\s*Kundenbewertung)', re.I) - self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit|Produktinformation)', re.I) - - def strip_tags_etree(self, etreeobj, invalid_tags): - for (itag, rmv) in invalid_tags.iteritems(): - if rmv: - for elts in etreeobj.getiterator(itag): - elts.drop_tree() - else: - for elts in etreeobj.getiterator(itag): - elts.drop_tag() - - def clean_entry(self, entry, invalid_tags = {'script': True}, - invalid_id = (), invalid_class=()): - #invalid_tags: remove tag and keep content if False else remove - #remove tags - if invalid_tags: - self.strip_tags_etree(entry, invalid_tags) - #remove id - if invalid_id: - for eltid in invalid_id: - elt = entry.get_element_by_id(eltid) - if elt is not None: - elt.drop_tree() - #remove class - if invalid_class: - for eltclass in invalid_class: - elts = entry.find_class(eltclass) - if elts is not None: - for elt in elts: - elt.drop_tree() - - def get_title(self, entry): - title = entry.get_element_by_id('btAsinTitle') - if title is not None: - title = title.text - return unicode(title.replace('\n', '').strip()) - - def get_authors(self, entry): - author = entry.get_element_by_id('btAsinTitle') - while author.getparent().tag != 'div': - author = author.getparent() - author = author.getparent() - authortext = [] - for x in author.getiterator('a'): - authortext.append(unicode(x.text_content().strip())) - return authortext - - def get_description(self, entry, verbose): - try: - description = entry.get_element_by_id("productDescription").find("div[@class='content']") - inv_class = ('seeAll', 'emptyClear') - inv_tags ={'img': True, 'a': False} - self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class) - description = tostring(description, method='html', encoding=unicode).strip() - # remove all attributes from tags - 
description = self.reattr.sub(r'<\1>', description) - # Remove the notice about text referring to out of print editions - description = self.reoutp.sub('', description) - # Remove comments - description = self.recom.sub('', description) - return unicode(sanitize_comments_html(description)) - except: - report(verbose) - return None - - def get_tags(self, entry, verbose): - try: - tags = entry.get_element_by_id('tagContentHolder') - testptag = tags.find_class('see-all') - if testptag: - for x in testptag: - alink = x.xpath('descendant-or-self::a') - if alink: - if alink[0].get('class') == 'tgJsActive': - continue - return self.baseurl + alink[0].get('href'), True - tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag'] - except: - report(verbose) - tags = [], False - return tags, False - - def get_book_info(self, entry, mi, verbose): - try: - entry = entry.get_element_by_id('SalesRank').getparent() - except: - try: - for z in entry.getiterator('h2'): - if self.reprod.search(z.text_content()): - entry = z.getparent().find("div[@class='content']/ul") - break - except: - report(verbose) - return mi - elts = entry.findall('li') - #pub & date - elt = filter(lambda x: self.republi.search(x.find('b').text), elts) - if elt: - pub = elt[0].find('b').tail - mi.publisher = unicode(self.repub.sub('', pub).strip()) - d = self.repub.search(pub) - if d is not None: - d = d.group(1) - try: - default = utcnow().replace(day=15) - if self.lang != 'all': - d = replace_months(d, self.lang) - d = parse_date(d, assume_utc=True, default=default) - mi.pubdate = d - except: - report(verbose) - #ISBN - elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts) - if elt: - isbn = elt[0].find('b').tail.replace('-', '').strip() - if check_isbn(isbn): - mi.isbn = unicode(isbn) - elif len(elt) > 1: - isbnone = elt[1].find('b').tail.replace('-', '').strip() - if check_isbn(isbnone): - mi.isbn = unicode(isbnone) - else: - #assume ASIN-> find a check for asin - mi.isbn = 
unicode(isbn) - #Langue - elt = filter(lambda x: self.relang.search(x.find('b').text), elts) - if elt: - langue = elt[0].find('b').tail.strip() - if langue: - mi.language = unicode(langue) - #ratings - elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts) - if elt: - ratings = elt[0].find_class('swSprite') - if ratings: - ratings = self.rerat.findall(ratings[0].get('title')) - if len(ratings) == 2: - mi.rating = float(ratings[0])/float(ratings[1]) * 5 - return mi - - def fill_MI(self, entry, verbose): - try: - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print _('Failed to get all details for an entry') - print e - print _('URL who failed: %s') % x - report(verbose) - return None - mi = MetaInformation(title, authors) - mi.author_sort = authors_to_sort_string(authors) - try: - mi.comments = self.get_description(entry, verbose) - mi = self.get_book_info(entry, mi, verbose) - except: - pass - return mi - - def get_individual_metadata(self, url, br, verbose): - try: - raw = br.open_novisit(url).read() - except Exception, e: - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise AmazonError(_('Amazon timed out. 
Try again later.')) - raise AmazonError(_('Amazon encountered an error.')) - if '<title>404 - ' in raw: - report(verbose) - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - return soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - report(verbose) - return None - - def fetchdatathread(self, qbr, qsync, nb, url, verbose): - try: - browser = qbr.get(True) - entry = self.get_individual_metadata(url, browser, verbose) - except: - report(verbose) - entry = None - finally: - qbr.put(browser, True) - qsync.put(nb, True) - return entry - - def producer(self, sync, urls, br, verbose=False): - for i in xrange(len(urls)): - thread = ThreadwithResults(self.fetchdatathread, br, sync, - i, urls[i], verbose) - thread.start() - self.thread.append(thread) - - def consumer(self, sync, syncbis, br, total_entries, verbose=False): - i=0 - while i < total_entries: - nb = int(sync.get(True)) - self.thread[nb].join() - entry = self.thread[nb].get_result() - i+=1 - if entry is not None: - mi = self.fill_MI(entry, verbose) - if mi is not None: - mi.tags, atag = self.get_tags(entry, verbose) - self.res[nb] = mi - if atag: - threadbis = ThreadwithResults(self.fetchdatathread, - br, syncbis, nb, mi.tags, verbose) - self.thread[nb] = threadbis - self.nbtag +=1 - threadbis.start() - - def populate(self, entries, ibr, verbose=False, brcall=3): - br = Queue(brcall) - cbr = Queue(brcall-1) - - syncp = Queue(1) - syncc = Queue(len(entries)) - - for i in xrange(brcall-1): - br.put(browser(), True) - cbr.put(browser(), True) - br.put(ibr, True) - - self.res = [None]*len(entries) - - prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) - cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose)) - prod_thread.start() - cons_thread.start() - prod_thread.join() - cons_thread.join() - - #finish processing - 
for i in xrange(self.nbtag): - nb = int(syncc.get(True)) - tags = self.thread[nb].get_result() - if tags is not None: - self.res[nb].tags = self.get_tags(tags, verbose)[0] - return self.res - - -def search(title=None, author=None, publisher=None, isbn=None, - max_results=5, verbose=False, keywords=None, lang='all'): - br = browser() - entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher, - keywords=keywords, max_results=max_results,rlang=lang)(br, verbose) - - if entries is None or len(entries) == 0: - return None - - #List of entry - ans = ResultList(baseurl, lang) - return [x for x in ans.populate(entries, br, verbose) if x is not None] - -def get_social_metadata(title, authors, publisher, isbn, verbose=False, - max_results=1, lang='all'): - mi = MetaInformation(title, authors) - if not isbn or not check_isbn(isbn): - return [mi] - - amazresults = search(isbn=isbn, verbose=verbose, - max_results=max_results, lang=lang) - if amazresults is None or amazresults[0] is None: - from calibre.ebooks.metadata.xisbn import xisbn - for i in xisbn.get_associated_isbns(isbn): - amazresults = search(isbn=i, verbose=verbose, - max_results=max_results, lang=lang) - if amazresults is not None and amazresults[0] is not None: - break - if amazresults is None or amazresults[0] is None: - return [mi] - - miaz = amazresults[0] - if miaz.rating is not None: - mi.rating = miaz.rating - if miaz.comments is not None: - mi.comments = miaz.comments - if miaz.tags is not None: - mi.tags = miaz.tags - return [mi] - -def option_parser(): - parser = OptionParser(textwrap.dedent(\ - _('''\ - %prog [options] - - Fetch book metadata from Amazon. You must specify one of title, author, - ISBN, publisher or keywords. Will fetch a maximum of 20 matches, - so you should make your query as specific as possible. 
- You can chose the language for metadata retrieval: - english & french & german - ''' - ))) - parser.add_option('-t', '--title', help=_('Book title')) - parser.add_option('-a', '--author', help=_('Book author(s)')) - parser.add_option('-p', '--publisher', help=_('Book publisher')) - parser.add_option('-i', '--isbn', help=_('Book ISBN')) - parser.add_option('-k', '--keywords', help=_('Keywords')) - parser.add_option('-s', '--social', default=0, action='count', - help=_('Get social data only')) - parser.add_option('-m', '--max-results', default=10, - help=_('Maximum number of results to fetch')) - parser.add_option('-l', '--lang', default='all', - help=_('Chosen language for metadata search (en, fr, de)')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Be more verbose about errors')) - return parser - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - try: - if opts.social: - results = get_social_metadata(opts.title, opts.author, - opts.publisher, opts.isbn, verbose=opts.verbose, lang=opts.lang) - else: - results = search(opts.title, opts.author, isbn=opts.isbn, - publisher=opts.publisher, keywords=opts.keywords, verbose=opts.verbose, - max_results=opts.max_results, lang=opts.lang) - except AssertionError: - report(True) - parser.print_help() - return 1 - if results is None and len(results) == 0: - print _('No result found for this search!') - return 0 - for result in results: - print unicode(result).encode(preferred_encoding, 'replace') - print - - #test social - # '''Test xisbn''' - # print get_social_metadata('Learning Python', None, None, '8324616489')[0] - # print - # '''Test sophisticated comment formatting''' - # print get_social_metadata('Angels & Demons', None, None, '9781416580829')[0] - # print - # '''Random tests''' - # print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')[0] - # print - # print get_social_metadata('The Great Gatsby', None, None, 
'0743273567')[0] - -if __name__ == '__main__': - sys.exit(main()) - # import cProfile - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile_tmp_2")) - -# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonbis.py" -m 5 -a gore -v>data.html \ No newline at end of file From a54cbc1a91ea517d3457857d23691bd5d971c8f4 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 13 Dec 2010 00:50:53 +0100 Subject: [PATCH 062/132] First draft of google books refactoring & adding threading --- src/calibre/ebooks/metadata/google_books.py | 243 +++++++++++++++----- 1 file changed, 190 insertions(+), 53 deletions(-) diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index 2087b7c489..12d92ca5ae 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -3,7 +3,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import sys, textwrap +import sys, textwrap, traceback, socket +from threading import Thread +from Queue import Queue from urllib import urlencode from functools import partial @@ -11,8 +13,10 @@ from lxml import etree from calibre import browser, preferred_encoding from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow +from calibre.utils.cleantext import clean_ascii_chars NAMESPACES = { 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', @@ -35,9 +39,25 @@ subject = XPath('descendant::dc:subject') description = XPath('descendant::dc:description') language = XPath('descendant::dc:language') +class GoogleBooksError(Exception): + 
pass + +class ThreadwithResults(Thread): + def __init__(self, func, *args, **kargs): + self.func = func + self.args = args + self.kargs = kargs + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + self.result = self.func(*self.args, **self.kargs) + def report(verbose): if verbose: - import traceback traceback.print_exc() @@ -46,48 +66,93 @@ class Query(object): BASE_URL = 'http://books.google.com/books/feeds/volumes?' def __init__(self, title=None, author=None, publisher=None, isbn=None, - max_results=20, min_viewability='none', start_index=1): + max_results=40, min_viewability='none', start_index=1): assert not(title is None and author is None and publisher is None and \ isbn is None) - assert (max_results < 21) + assert (max_results < 41) assert (min_viewability in ('none', 'partial', 'full')) - q = '' + if title == _('Unknown'): + title=None + if author == _('Unknown'): + author=None + self.sindex = str(start_index) + self.maxresults = int(max_results) + + q = [] if isbn is not None: - q += 'isbn:'+isbn + q.append(('isbn:%s') % (isbn,)) else: def build_term(prefix, parts): - return ' '.join('in'+prefix + ':' + x for x in parts) + return ' '.join(('in%s:%s') % (prefix, x) for x in parts) if title is not None: - q += build_term('title', title.split()) + q.append(build_term('title', title.split())) if author is not None: - q += ('+' if q else '')+build_term('author', author.split()) + q.append(build_term('author', author.split())) if publisher is not None: - q += ('+' if q else '')+build_term('publisher', publisher.split()) - + q.append(build_term('publisher', publisher.split())) + q='+'.join(q) + if isinstance(q, unicode): q = q.encode('utf-8') - self.url = self.BASE_URL+urlencode({ + self.urlbase = self.BASE_URL+urlencode({ 'q':q, 'max-results':max_results, - 'start-index':start_index, 'min-viewability':min_viewability, - }) + })+'&start-index=' - def __call__(self, browser, verbose): + def brcall(self, 
browser, url, verbose, timeout): if verbose: - print 'Query:', self.url - feed = etree.fromstring(browser.open(self.url).read()) - #print etree.tostring(feed, pretty_print=True) + print _('Query: %s') % url + + try: + raw = browser.open_novisit(url, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): + raise GoogleBooksError(_('GoogleBooks timed out. Try again later.')) + raise GoogleBooksError(_('GoogleBooks encountered an error.')) + if '<title>404 - ' in raw: + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return etree.fromstring(raw) + except: + try: + #remove ASCII invalid chars (normally not needed) + return etree.fromstring(clean_ascii_chars(raw)) + except: + return None + + def __call__(self, browser, verbose, timeout = 5.): + #get a feed + url = self.urlbase+self.sindex + feed = self.brcall(browser, url, verbose, timeout) + if feed is None: + return None + + # print etree.tostring(feed, pretty_print=True) total = int(total_results(feed)[0].text) + nbresultstoget = total if total<self.maxresults else self.maxresults + start = int(start_index(feed)[0].text) entries = entry(feed) - new_start = start + len(entries) - if new_start > total: - new_start = 0 - return entries, new_start - + while len(entries) < nbresultstoget: + url = self.urlbase+str(start+len(entries)) + feed = self.brcall(browser, url, verbose, timeout) + if feed is None: + break + entries.extend(entry(feed)) + return entries class ResultList(list): + def __init__(self): + self.thread = [] def get_description(self, entry, verbose): try: @@ -164,44 +229,114 @@ class ResultList(list): d = None return d - def populate(self, entries, browser, verbose=False): - for x in entries: + def fill_MI(self, entry, data, verbose): + x = entry + 
try: + title = self.get_title(entry) + x = entry(data)[0] + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + authors = self.get_authors(x) + mi = MetaInformation(title, authors) + mi.author_sort = self.get_author_sort(x, verbose) + mi.comments = self.get_description(x, verbose) + self.get_identifiers(x, mi) + mi.tags = self.get_tags(x, verbose) + mi.publisher = self.get_publisher(x, verbose) + mi.pubdate = self.get_date(x, verbose) + mi.language = self.get_language(x, verbose) + return mi + + def get_individual_metadata(self, url, br, verbose): + if url is None: + return None + try: + raw = br.open_novisit(url).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): + raise GoogleBooksError(_('GoogleBooks timed out. Try again later.')) + raise GoogleBooksError(_('GoogleBooks encountered an error.')) + if '<title>404 - ' in raw: + report(verbose) + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return etree.fromstring(raw) + except: try: - id_url = entry_id(x)[0].text - title = self.get_title(x) + #remove ASCII invalid chars + return etree.fromstring(clean_ascii_chars(raw)) except: report(verbose) - mi = MetaInformation(title, self.get_authors(x)) + return None + + def fetchdatathread(self, qbr, qsync, nb, url, verbose): + try: + browser = qbr.get(True) + entry = self.get_individual_metadata(url, browser, verbose) + except: + report(verbose) + entry = None + finally: + qbr.put(browser, True) + qsync.put(nb, True) + return entry + + def producer(self, sync, entries, br, verbose=False): + for i in xrange(len(entries)): try: - raw = browser.open(id_url).read() - feed = etree.fromstring(raw) - x = entry(feed)[0] - except Exception, e: - if verbose: - print 'Failed to get all 
details for an entry' - print e - mi.author_sort = self.get_author_sort(x, verbose) - mi.comments = self.get_description(x, verbose) - self.get_identifiers(x, mi) - mi.tags = self.get_tags(x, verbose) - mi.publisher = self.get_publisher(x, verbose) - mi.pubdate = self.get_date(x, verbose) - mi.language = self.get_language(x, verbose) - self.append(mi) + id_url = entry_id(entries[i])[0].text + except: + id_url = None + report(verbose) + thread = ThreadwithResults(self.fetchdatathread, br, sync, + i, id_url, verbose) + thread.start() + self.thread.append(thread) + + def consumer(self, entries, sync, total_entries, verbose=False): + res=[None]*total_entries #remove? + i=0 + while i < total_entries: + nb = int(sync.get(True)) + self.thread[nb].join() + data = self.thread[nb].get_result() + res[nb] = self.fill_MI(entries[nb], data, verbose) + i+=1 + return res + + def populate(self, entries, br, verbose=False, brcall=3): + #multiple entries + pbr = Queue(brcall) + sync = Queue(1) + for i in xrange(brcall-1): + pbr.put(browser(), True) + pbr.put(br, True) + + prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) + cons_thread = ThreadwithResults(self.consumer, entries, sync, len(entries), verbose) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() + self.extend(cons_thread.get_result()) def search(title=None, author=None, publisher=None, isbn=None, min_viewability='none', verbose=False, max_results=40): br = browser() - start, entries = 1, [] - while start > 0 and len(entries) <= max_results: - new, start = Query(title=title, author=author, publisher=publisher, - isbn=isbn, min_viewability=min_viewability)(br, verbose) - if not new: - break - entries.extend(new) - - entries = entries[:max_results] + entries = Query(title=title, author=author, publisher=publisher, + isbn=isbn, max_results=max_results, + min_viewability=min_viewability)(br, verbose) ans = ResultList() ans.populate(entries, br, verbose) @@ -214,7 
+349,7 @@ def option_parser(): Fetch book metadata from Google. You must specify one of title, author, publisher or ISBN. If you specify ISBN the others are ignored. Will - fetch a maximum of 100 matches, so you should make your query as + fetch a maximum of 20 matches, so you should make your query as specific as possible. ''' )) @@ -244,3 +379,5 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) + +# C:\Users\Pierre>calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\google_books.py" -m 5 -a gore -v>data.html \ No newline at end of file From aa7630f392aa05ec97bfe525b644c78417817cc9 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 13 Dec 2010 08:59:20 +0100 Subject: [PATCH 063/132] Finish to add threading to google_books & minor changes --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/metadata/fetch.py | 24 +++--- src/calibre/ebooks/metadata/google_books.py | 92 ++++++++++++--------- 3 files changed, 69 insertions(+), 50 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 342d0e8456..9e34d33941 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -480,8 +480,9 @@ from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO -from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, LibraryThing +from calibre.ebooks.metadata.fetch import ISBNDB, LibraryThing from calibre.ebooks.metadata.douban import DoubanBooks +from calibre.ebooks.metadata.google_books import GoogleBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial from calibre.ebooks.metadata.fictionwise import Fictionwise diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 
f1bf88da84..d6494de54d 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -172,20 +172,20 @@ class MetadataSource(Plugin): # {{{ # }}} -class GoogleBooks(MetadataSource): # {{{ +# class GoogleBooks(MetadataSource): # {{{ - name = 'Google Books' - description = _('Downloads metadata from Google Books') + # name = 'Google Books' + # description = _('Downloads metadata from Google Books') - def fetch(self): - from calibre.ebooks.metadata.google_books import search - try: - self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, - verbose=self.verbose) - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() + # def fetch(self): + # from calibre.ebooks.metadata.google_books import search + # try: + # self.results = search(self.title, self.book_author, self.publisher, + # self.isbn, max_results=10, + # verbose=self.verbose) + # except Exception, e: + # self.exception = e + # self.tb = traceback.format_exc() # }}} diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index 12d92ca5ae..1eb5d11441 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -12,7 +12,9 @@ from functools import partial from lxml import etree from calibre import browser, preferred_encoding -from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow @@ -39,6 +41,22 @@ subject = XPath('descendant::dc:subject') description = XPath('descendant::dc:description') language = XPath('descendant::dc:language') + +class GoogleBooks(MetadataSource): + + name = 'Google Books' + description = _('Downloads metadata from 
Google Books') + version = (1, 0, 1) + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + class GoogleBooksError(Exception): pass @@ -158,7 +176,7 @@ class ResultList(list): try: desc = description(entry) if desc: - return 'SUMMARY:\n'+desc[0].text + return _('SUMMARY:\n %s') % desc[0].text except: report(verbose) @@ -171,29 +189,27 @@ class ResultList(list): report(verbose) def get_title(self, entry): - candidates = [x.text for x in title(entry)] - return ': '.join(candidates) + return ': '.join([x.text for x in title(entry)]) def get_authors(self, entry): m = creator(entry) - if not m: - m = [] - m = [x.text for x in m] - return m + return [x.text for x in m] if m else [] def get_author_sort(self, entry, verbose): for x in creator(entry): - for key, val in x.attrib.items(): + for key, val in x.attrib.iteritems(): if key.endswith('file-as'): return val def get_identifiers(self, entry, mi): - isbns = [] - for x in identifier(entry): - t = str(x.text).strip() - if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): - if t[:5].upper() == 'ISBN:': - isbns.append(t[5:]) + isbns = [str(x.text).strip() for x in identifier(entry)] + isbns = [t[5:] for t in isbns \ + if t[:5].upper() == 'ISBN:' and check_isbn(t[5:])] + # for x in identifier(entry): + # t = str(x.text).strip() + # if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): + # if t[:5].upper() == 'ISBN:': + # isbns.append(t[5:]) if isbns: mi.isbn = sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] @@ -211,28 +227,26 @@ class ResultList(list): def get_publisher(self, entry, verbose): try: - pub = publisher(entry)[0].text + return publisher(entry)[0].text except: - pub = None - return pub + return None def get_date(self, entry, verbose): try: d = date(entry) if d: default = utcnow().replace(day=15) - d = parse_date(d[0].text, assume_utc=True, 
default=default) + return parse_date(d[0].text, assume_utc=True, default=default) else: - d = None + return None except: report(verbose) - d = None - return d + return None - def fill_MI(self, entry, data, verbose): - x = entry + def fill_MI(self, ent, data, verbose): + x = ent try: - title = self.get_title(entry) + title = self.get_title(x) x = entry(data)[0] except Exception, e: if verbose: @@ -240,7 +254,9 @@ class ResultList(list): print e authors = self.get_authors(x) mi = MetaInformation(title, authors) - mi.author_sort = self.get_author_sort(x, verbose) + tmpautsort = self.get_author_sort(x, verbose) + mi.author_sort = tmpautsort if tmpautsort \ + else authors_to_sort_string(authors) mi.comments = self.get_description(x, verbose) self.get_identifiers(x, mi) mi.tags = self.get_tags(x, verbose) @@ -315,7 +331,6 @@ class ResultList(list): return res def populate(self, entries, br, verbose=False, brcall=3): - #multiple entries pbr = Queue(brcall) sync = Queue(1) for i in xrange(brcall-1): @@ -344,23 +359,23 @@ def search(title=None, author=None, publisher=None, isbn=None, def option_parser(): parser = OptionParser(textwrap.dedent( - '''\ + _('''\ %prog [options] Fetch book metadata from Google. You must specify one of title, author, publisher or ISBN. If you specify ISBN the others are ignored. Will - fetch a maximum of 20 matches, so you should make your query as + fetch a maximum of 40 matches, so you should make your query as specific as possible. 
''' - )) - parser.add_option('-t', '--title', help='Book title') - parser.add_option('-a', '--author', help='Book author(s)') - parser.add_option('-p', '--publisher', help='Book publisher') - parser.add_option('-i', '--isbn', help='Book ISBN') + ))) + parser.add_option('-t', '--title', help=_('Book title')) + parser.add_option('-a', '--author', help=_('Book author(s)')) + parser.add_option('-p', '--publisher', help=_('Book publisher')) + parser.add_option('-i', '--isbn', help=_('Book ISBN')) parser.add_option('-m', '--max-results', default=10, - help='Maximum number of results to fetch') + help=_('Maximum number of results to fetch')) parser.add_option('-v', '--verbose', default=0, action='count', - help='Be more verbose about errors') + help=_('Be more verbose about errors')) return parser def main(args=sys.argv): @@ -373,6 +388,9 @@ def main(args=sys.argv): report(True) parser.print_help() return 1 + if results is None or len(results) == 0: + print _('No result found for this search!') + return 0 for result in results: print unicode(result).encode(preferred_encoding) print @@ -380,4 +398,4 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) -# C:\Users\Pierre>calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\google_books.py" -m 5 -a gore -v>data.html \ No newline at end of file +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\google_books.py" -m 5 -a gore -v>data.html \ No newline at end of file From 6ca1bf64efffd353f9e934bf9649b0f3e92b75bd Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 13 Dec 2010 20:15:28 +0100 Subject: [PATCH 064/132] import modifications --- src/calibre/ebooks/metadata/fetch.py | 38 --------------------- src/calibre/ebooks/metadata/fictionwise.py | 9 +++-- src/calibre/ebooks/metadata/google_books.py | 13 +++++-- 3 files changed, 18 insertions(+), 42 deletions(-) diff --git a/src/calibre/ebooks/metadata/fetch.py 
b/src/calibre/ebooks/metadata/fetch.py index d6494de54d..0c607b9bb7 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -172,23 +172,6 @@ class MetadataSource(Plugin): # {{{ # }}} -# class GoogleBooks(MetadataSource): # {{{ - - # name = 'Google Books' - # description = _('Downloads metadata from Google Books') - - # def fetch(self): - # from calibre.ebooks.metadata.google_books import search - # try: - # self.results = search(self.title, self.book_author, self.publisher, - # self.isbn, max_results=10, - # verbose=self.verbose) - # except Exception, e: - # self.exception = e - # self.tb = traceback.format_exc() - - # }}} - class ISBNDB(MetadataSource): # {{{ name = 'IsbnDB' @@ -226,27 +209,6 @@ class ISBNDB(MetadataSource): # {{{ # }}} -# class Amazon(MetadataSource): # {{{ - - # name = 'Amazon' - # metadata_type = 'social' - # description = _('Downloads social metadata from amazon.com') - - # has_html_comments = True - - # def fetch(self): - # if not self.isbn: - # return - # from calibre.ebooks.metadata.amazon import get_social_metadata - # try: - # self.results = get_social_metadata(self.title, self.book_author, - # self.publisher, self.isbn) - # except Exception, e: - # self.exception = e - # self.tb = traceback.format_exc() - - # }}} - class LibraryThing(MetadataSource): # {{{ name = 'LibraryThing' diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 909d186702..3ab960c846 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' -import sys, textwrap, re, traceback, socket +import sys, re from threading import Thread from Queue import Queue from urllib import urlencode @@ -32,6 +32,7 @@ class Fictionwise(MetadataSource): self.results = search(self.title, self.book_author, self.publisher, 
self.isbn, max_results=10, verbose=self.verbose) except Exception, e: + import traceback self.exception = e self.tb = traceback.format_exc() @@ -55,6 +56,7 @@ class ThreadwithResults(Thread): def report(verbose): if verbose: + import traceback traceback.print_exc() @@ -108,11 +110,12 @@ class Query(object): def __call__(self, browser, verbose, timeout = 5.): if verbose: - print _('Query: %s') % self.BASE_URL+self.urldata + print _('Query: %s POST: %s') % (self.BASE_URL, self.urldata) try: raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -321,6 +324,7 @@ class ResultList(list): try: raw = br.open_novisit(url).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -410,6 +414,7 @@ def search(title=None, author=None, publisher=None, isbn=None, def option_parser(): + import textwrap parser = OptionParser(textwrap.dedent(\ _('''\ %prog [options] diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index 1eb5d11441..cac3cac7d0 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import sys, textwrap, traceback, socket +import sys from threading import Thread from Queue import Queue from urllib import urlencode @@ -53,6 +53,7 @@ class GoogleBooks(MetadataSource): self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose) except Exception, e: + import traceback self.exception = e self.tb = traceback.format_exc() @@ -76,6 +77,7 @@ class ThreadwithResults(Thread): def report(verbose): if verbose: + import traceback traceback.print_exc() @@ -89,6 +91,7 @@ 
class Query(object): isbn is None) assert (max_results < 41) assert (min_viewability in ('none', 'partial', 'full')) + if title == _('Unknown'): title=None if author == _('Unknown'): @@ -125,6 +128,7 @@ class Query(object): try: raw = browser.open_novisit(url, timeout=timeout).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -271,6 +275,7 @@ class ResultList(list): try: raw = br.open_novisit(url).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: @@ -352,12 +357,16 @@ def search(title=None, author=None, publisher=None, isbn=None, entries = Query(title=title, author=author, publisher=publisher, isbn=isbn, max_results=max_results, min_viewability=min_viewability)(br, verbose) + + if entries is None or len(entries) == 0: + return None ans = ResultList() ans.populate(entries, br, verbose) return ans def option_parser(): + import textwrap parser = OptionParser(textwrap.dedent( _('''\ %prog [options] @@ -392,7 +401,7 @@ def main(args=sys.argv): print _('No result found for this search!') return 0 for result in results: - print unicode(result).encode(preferred_encoding) + print unicode(result).encode(preferred_encoding, 'replace') print if __name__ == '__main__': From d374b36e97559efc886a3e4733f54431b284be23 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 13 Dec 2010 21:14:38 +0100 Subject: [PATCH 065/132] ... 
--- src/calibre/ebooks/metadata/fictionwise.py | 11 ++++- src/calibre/ebooks/metadata/nicebooks.py | 57 ++++++++++++---------- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 3ab960c846..9eabcb2ca8 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -120,7 +120,9 @@ class Query(object): if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): raise FictionwiseError(_('Fictionwise timed out. Try again later.')) raise FictionwiseError(_('Fictionwise encountered an error.')) if '<title>404 - ' in raw: @@ -329,7 +331,9 @@ class ResultList(list): if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): raise FictionwiseError(_('Fictionwise timed out. Try again later.')) raise FictionwiseError(_('Fictionwise encountered an error.')) if '<title>404 - ' in raw: @@ -407,6 +411,9 @@ def search(title=None, author=None, publisher=None, isbn=None, entries, islink = Query(title=title, author=author, publisher=publisher, keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.) 
+ if entries is None or len(entries) == 0: + return None + #List of entry ans = ResultList(islink) ans.populate(entries, br, verbose) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 6cb7c9a6ae..cacb511563 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' -import sys, textwrap, re, traceback, socket +import sys, re from threading import Thread from Queue import Queue from urllib import urlencode @@ -35,6 +35,7 @@ class NiceBooks(MetadataSource): self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose) except Exception, e: + import traceback self.exception = e self.tb = traceback.format_exc() @@ -70,6 +71,7 @@ class NiceBooksCovers(CoverDownload): ext = 'jpg' result_queue.put((True, cover_data, ext, self.name)) except Exception, e: + import traceback result_queue.put((False, self.exception_to_string(e), traceback.format_exc(), self.name)) @@ -96,6 +98,7 @@ class ThreadwithResults(Thread): def report(verbose): if verbose: + import traceback traceback.print_exc() @@ -124,18 +127,21 @@ class Query(object): q = q.encode('utf-8') self.urldata = 'search?' 
+ urlencode({'q':q,'s':'Rechercher'}) - def __call__(self, browser, verbose, timeout = 5.): + def brcall(self, browser, url, verbose, timeout): if verbose: - print _('Query: %s') % self.BASE_URL+self.urldata - + print _('Query: %s') % url + try: - raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() + raw = browser.open_novisit(url, timeout=timeout).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): raise NiceBooksError(_('Nicebooks timed out. Try again later.')) raise NiceBooksError(_('Nicebooks encountered an error.')) if '<title>404 - ' in raw: @@ -143,14 +149,19 @@ class Query(object): raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: - feed = soupparser.fromstring(raw) + return soupparser.fromstring(raw) except: try: #remove ASCII invalid chars - feed = soupparser.fromstring(clean_ascii_chars(raw)) + return soupparser.fromstring(clean_ascii_chars(raw)) except: return None + def __call__(self, browser, verbose, timeout = 5.): + feed = self.brcall(browser, self.BASE_URL+self.urldata, verbose, timeout) + if feed is None: + return None + #nb of page to call try: nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text) @@ -162,23 +173,10 @@ class Query(object): pages =[feed] if nbpagetoquery > 1: for i in xrange(2, nbpagetoquery + 1): - try: - urldata = self.urldata + '&p=' + str(i) - raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read() - except Exception, e: + urldata = self.urldata + '&p=' + str(i) + feed = self.brcall(browser, self.BASE_URL+urldata, verbose, timeout) + if feed is None: continue - if '<title>404 - ' in raw: - continue - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: 
- feed = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - feed = soupparser.fromstring(clean_ascii_chars(raw)) - except: - continue pages.append(feed) results = [] @@ -270,11 +268,14 @@ class ResultList(list): try: raw = br.open_novisit(url).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): raise NiceBooksError(_('NiceBooks timed out. Try again later.')) raise NiceBooksError(_('NiceBooks encountered an error.')) if '<title>404 - ' in raw: @@ -372,7 +373,10 @@ class Covers(object): self.urlimg.rpartition('.')[-1] return cover, ext if ext else 'jpg' except Exception, err: - if isinstance(getattr(err, 'args', [None])[0], socket.timeout): + import socket + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): raise NiceBooksError(_('Nicebooks timed out. 
Try again later.')) if not len(self.urlimg): if not self.isbnf: @@ -407,6 +411,7 @@ def cover_from_isbn(isbn, timeout = 5.): def option_parser(): + import textwrap parser = OptionParser(textwrap.dedent(\ _('''\ %prog [options] From 81af8382d630175c34157effb2fd104577dba2e0 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 13 Dec 2010 23:24:12 +0100 Subject: [PATCH 066/132] cleaning --- src/calibre/ebooks/metadata/amazon.py | 65 +++++++++++---------- src/calibre/ebooks/metadata/fictionwise.py | 5 +- src/calibre/ebooks/metadata/google_books.py | 6 +- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 1362349685..aec4fb313a 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -2,7 +2,7 @@ from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' -import sys, textwrap, re, traceback, socket +import sys, re from threading import Thread from Queue import Queue from urllib import urlencode @@ -61,6 +61,7 @@ class Amazon(MetadataSource): tempres.extend(tmpnoloc) self.results = tempres except Exception, e: + import traceback self.exception = e self.tb = traceback.format_exc() @@ -107,12 +108,14 @@ class AmazonSocial(MetadataSource): tmploc.tags = tmpnoloc.tags self.results = tmploc except Exception, e: + import traceback self.exception = e self.tb = traceback.format_exc() def report(verbose): if verbose: + import traceback traceback.print_exc() class AmazonError(Exception): @@ -208,33 +211,40 @@ class Query(object): q = q.encode('utf-8') self.urldata += '/gp/search/ref=sr_adv_b/?' 
+ urlencode(q) - def __call__(self, browser, verbose, timeout = 5.): + def brcall(self, browser, url, verbose, timeout): if verbose: - print _('Query: %s') % self.urldata - + print _('Query: %s') % url + try: - raw = browser.open_novisit(self.urldata, timeout=timeout).read() + raw = browser.open_novisit(url, timeout=timeout).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: - return None, self.urldata - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise AmazonError(_('Amazon timed out. Try again later.')) - raise AmazonError(_('Amazon encountered an error.')) + return None + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): + raise NiceBooksError(_('Nicebooks timed out. Try again later.')) + raise NiceBooksError(_('Nicebooks encountered an error.')) if '<title>404 - ' in raw: - return None, self.urldata + return raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] - try: - feed = soupparser.fromstring(raw) + return soupparser.fromstring(raw) except: try: #remove ASCII invalid chars return soupparser.fromstring(clean_ascii_chars(raw)) except: - return None, self.urldata + return None + + def __call__(self, browser, verbose, timeout = 5.): + feed = self.brcall(browser, self.urldata, verbose, timeout) + if feed is None: + return None, self.urldata #nb of page try: @@ -247,23 +257,10 @@ class Query(object): if len(nbresults) > 1: nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1]))) for i in xrange(2, nbpagetoquery + 1): - try: - urldata = self.urldata + '&page=' + str(i) - raw = browser.open_novisit(urldata, timeout=timeout).read() - except Exception, e: + urldata = self.urldata + '&page=' + str(i) + feed = self.brcall(browser, urldata, verbose, timeout) + if feed is None: continue - if '<title>404 - ' in raw: - continue - raw = xml_to_unicode(raw, 
strip_encoding_pats=True, - resolve_entities=True)[0] - try: - feed = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - continue pages.append(feed) results = [] @@ -453,11 +450,14 @@ class ResultList(object): try: raw = br.open_novisit(url).read() except Exception, e: + import socket report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return None - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): raise AmazonError(_('Amazon timed out. Try again later.')) raise AmazonError(_('Amazon encountered an error.')) if '<title>404 - ' in raw: @@ -584,6 +584,7 @@ def get_social_metadata(title, authors, publisher, isbn, verbose=False, return [mi] def option_parser(): + import textwrap parser = OptionParser(textwrap.dedent(\ _('''\ %prog [options] @@ -648,6 +649,6 @@ if __name__ == '__main__': sys.exit(main()) # import cProfile # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile_tmp_2")) + # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile")) -# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonbis.py" -m 5 -a gore -v>data.html \ No newline at end of file +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 9eabcb2ca8..a50bb2ce04 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -14,11 +14,12 @@ 
from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ authors_to_sort_string -from calibre.library.comments import sanitize_comments_html from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.library.comments import sanitize_comments_html from calibre.utils.config import OptionParser -from calibre.utils.date import parse_date, utcnow from calibre.utils.cleantext import clean_ascii_chars, unescape +from calibre.utils.date import parse_date, utcnow + class Fictionwise(MetadataSource): diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index cac3cac7d0..765bb4a255 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -1,6 +1,6 @@ from __future__ import with_statement __license__ = 'GPL 3' -__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>, 2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' import sys @@ -12,13 +12,13 @@ from functools import partial from lxml import etree from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ authors_to_sort_string from calibre.ebooks.metadata.fetch import MetadataSource -from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.config import OptionParser -from calibre.utils.date import parse_date, utcnow from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.date import parse_date, utcnow NAMESPACES = { 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', From 99921673d62dd26305a47fed9f35c332aee3a1aa Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 14 Dec 2010 00:34:25 +0100 Subject: [PATCH 067/132] Threading optimisation (last I hope), now faster 
than light at least pratchett's for amazon --- src/calibre/ebooks/metadata/amazon.py | 74 +++++++++------------ src/calibre/ebooks/metadata/fictionwise.py | 36 ++-------- src/calibre/ebooks/metadata/google_books.py | 40 +++-------- src/calibre/ebooks/metadata/nicebooks.py | 36 ++-------- 4 files changed, 53 insertions(+), 133 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index aec4fb313a..6eb106c862 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -121,20 +121,6 @@ def report(verbose): class AmazonError(Exception): pass -class ThreadwithResults(Thread): - def __init__(self, func, *args, **kargs): - self.func = func - self.args = args - self.kargs = kargs - self.result = None - Thread.__init__(self) - - def get_result(self): - return self.result - - def run(self): - self.result = self.func(*self.args, **self.kargs) - class Query(object): @@ -269,14 +255,11 @@ class Query(object): for i in x.xpath("//a/span[@class='srTitle']")]) return results[:self.max_results], self.baseurl -class ResultList(object): +class ResultList(list): def __init__(self, baseurl, lang = 'all'): self.baseurl = baseurl self.lang = lang - self.thread = [] - self.res = [] - self.nbtag = 0 self.repub = re.compile(u'\((.*)\)') self.rerat = re.compile(u'([0-9.]+)') self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') @@ -484,63 +467,65 @@ class ResultList(object): entry = None finally: qbr.put(browser, True) - qsync.put(nb, True) - return entry + qsync.put((nb, entry), True) def producer(self, sync, urls, br, verbose=False): for i in xrange(len(urls)): - thread = ThreadwithResults(self.fetchdatathread, br, sync, - i, urls[i], verbose) + thread = Thread(target=self.fetchdatathread, + args=(br, sync, i, urls[i], verbose)) thread.start() - self.thread.append(thread) def consumer(self, sync, syncbis, br, total_entries, verbose=False): i=0 + self.extend([None]*total_entries) while i < total_entries: 
- nb = int(sync.get(True)) - self.thread[nb].join() - entry = self.thread[nb].get_result() + rq = sync.get(True) + nb = int(rq[0]) + entry = rq[1] i+=1 if entry is not None: mi = self.fill_MI(entry, verbose) if mi is not None: mi.tags, atag = self.get_tags(entry, verbose) - self.res[nb] = mi + self[nb] = mi if atag: - threadbis = ThreadwithResults(self.fetchdatathread, - br, syncbis, nb, mi.tags, verbose) - self.thread[nb] = threadbis - self.nbtag +=1 - threadbis.start() + thread = Thread(target=self.fetchdatathread, + args=(br, syncbis, nb, mi.tags, verbose)) + thread.start() + else: + syncbis.put((nb, None), True) + + def final(self, sync, total_entries, verbose): + i=0 + while i < total_entries: + rq = sync.get(True) + nb = int(rq[0]) + tags = rq[1] + i+=1 + if tags is not None: + self[nb].tags = self.get_tags(tags, verbose)[0] def populate(self, entries, ibr, verbose=False, brcall=3): br = Queue(brcall) cbr = Queue(brcall-1) syncp = Queue(1) - syncc = Queue(len(entries)) + syncc = Queue(1) for i in xrange(brcall-1): br.put(browser(), True) cbr.put(browser(), True) br.put(ibr, True) - self.res = [None]*len(entries) - prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose)) + fin_thread = Thread(target=self.final, args=(syncc, len(entries), verbose)) prod_thread.start() cons_thread.start() + fin_thread.start() prod_thread.join() cons_thread.join() - - #finish processing - for i in xrange(self.nbtag): - nb = int(syncc.get(True)) - tags = self.thread[nb].get_result() - if tags is not None: - self.res[nb].tags = self.get_tags(tags, verbose)[0] - return self.res + fin_thread.join() def search(title=None, author=None, publisher=None, isbn=None, @@ -554,7 +539,8 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList(baseurl, lang) - return [x for x in ans.populate(entries, br, verbose) if x is not None] + 
ans.populate(entries, br, verbose) + return [x for x in ans if x is not None] def get_social_metadata(title, authors, publisher, isbn, verbose=False, max_results=1, lang='all'): diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index a50bb2ce04..48dac131cc 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -41,20 +41,6 @@ class Fictionwise(MetadataSource): class FictionwiseError(Exception): pass -class ThreadwithResults(Thread): - def __init__(self, func, *args, **kargs): - self.func = func - self.args = args - self.kargs = kargs - self.result = None - Thread.__init__(self) - - def get_result(self): - return self.result - - def run(self): - self.result = self.func(*self.args, **self.kargs) - def report(verbose): if verbose: import traceback @@ -155,7 +141,6 @@ class ResultList(list): def __init__(self, islink): self.islink = islink - self.thread = [] self.retitle = re.compile(r'\[[^\[\]]+\]') self.rechkauth = re.compile(r'.*book\s*by', re.I) self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I) @@ -361,27 +346,21 @@ class ResultList(list): entry = None finally: qbr.put(browser, True) - qsync.put(nb, True) - return entry + qsync.put((nb, entry), True) def producer(self, sync, urls, br, verbose=False): for i in xrange(len(urls)): - thread = ThreadwithResults(self.fetchdatathread, br, sync, - i, self.BASE_URL+urls[i], verbose) + thread = Thread(target=self.fetchdatathread, + args=(br, sync, i, self.BASE_URL+urls[i], verbose)) thread.start() - self.thread.append(thread) def consumer(self, sync, total_entries, verbose=False): - res=[None]*total_entries + self.extend([None]*total_entries) i=0 while i < total_entries: - nb = int(sync.get(True)) - self.thread[nb].join() - entry = self.thread[nb].get_result() + rq = sync.get(True) + self[int(rq[0])] = self.fill_MI(rq[1], verbose) i+=1 - if entry is not None: - 
res[nb] = self.fill_MI(entry, verbose) - return res def populate(self, entries, br, verbose=False, brcall=3): if not self.islink: @@ -396,12 +375,11 @@ class ResultList(list): pbr.put(br, True) prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) - cons_thread = ThreadwithResults(self.consumer, sync, len(entries), verbose) + cons_thread = Thread(target=self.consumer, args=(sync, len(entries), verbose)) prod_thread.start() cons_thread.start() prod_thread.join() cons_thread.join() - self.extend(cons_thread.get_result()) def search(title=None, author=None, publisher=None, isbn=None, diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index 765bb4a255..fd18f080a0 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -61,20 +61,6 @@ class GoogleBooks(MetadataSource): class GoogleBooksError(Exception): pass -class ThreadwithResults(Thread): - def __init__(self, func, *args, **kargs): - self.func = func - self.args = args - self.kargs = kargs - self.result = None - Thread.__init__(self) - - def get_result(self): - return self.result - - def run(self): - self.result = self.func(*self.args, **self.kargs) - def report(verbose): if verbose: import traceback @@ -173,8 +159,6 @@ class Query(object): return entries class ResultList(list): - def __init__(self): - self.thread = [] def get_description(self, entry, verbose): try: @@ -206,8 +190,7 @@ class ResultList(list): return val def get_identifiers(self, entry, mi): - isbns = [str(x.text).strip() for x in identifier(entry)] - isbns = [t[5:] for t in isbns \ + isbns = [t[5:] for t in [str(x.text).strip() for x in identifier(entry)] \ if t[:5].upper() == 'ISBN:' and check_isbn(t[5:])] # for x in identifier(entry): # t = str(x.text).strip() @@ -309,8 +292,7 @@ class ResultList(list): entry = None finally: qbr.put(browser, True) - qsync.put(nb, True) - return entry + qsync.put((nb, entry), True) def 
producer(self, sync, entries, br, verbose=False): for i in xrange(len(entries)): @@ -319,21 +301,18 @@ class ResultList(list): except: id_url = None report(verbose) - thread = ThreadwithResults(self.fetchdatathread, br, sync, - i, id_url, verbose) + thread = Thread(target=self.fetchdatathread, + args=(br, sync, i, id_url, verbose)) thread.start() - self.thread.append(thread) def consumer(self, entries, sync, total_entries, verbose=False): - res=[None]*total_entries #remove? + self.extend([None]*total_entries) i=0 while i < total_entries: - nb = int(sync.get(True)) - self.thread[nb].join() - data = self.thread[nb].get_result() - res[nb] = self.fill_MI(entries[nb], data, verbose) + rq = sync.get(True) + nb = int(rq[0]) + self[nb] = self.fill_MI(entries[nb], rq[1], verbose) i+=1 - return res def populate(self, entries, br, verbose=False, brcall=3): pbr = Queue(brcall) @@ -343,12 +322,11 @@ class ResultList(list): pbr.put(br, True) prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) - cons_thread = ThreadwithResults(self.consumer, entries, sync, len(entries), verbose) + cons_thread = Thread(target=self.consumer, args=(entries, sync, len(entries), verbose)) prod_thread.start() cons_thread.start() prod_thread.join() cons_thread.join() - self.extend(cons_thread.get_result()) def search(title=None, author=None, publisher=None, isbn=None, diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index cacb511563..1ff5f7fc6b 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -82,20 +82,6 @@ class NiceBooksError(Exception): class ISBNNotFound(NiceBooksError): pass -class ThreadwithResults(Thread): - def __init__(self, func, *args, **kargs): - self.func = func - self.args = args - self.kargs = kargs - self.result = None - Thread.__init__(self) - - def get_result(self): - return self.result - - def run(self): - self.result = self.func(*self.args, **self.kargs) 
- def report(verbose): if verbose: import traceback @@ -191,7 +177,6 @@ class ResultList(list): def __init__(self, islink): self.islink = islink - self.thread = [] self.repub = re.compile(u'\s*.diteur\s*', re.I) self.reauteur = re.compile(u'\s*auteur.*', re.I) self.reautclean = re.compile(u'\s*\(.*\)\s*') @@ -302,27 +287,21 @@ class ResultList(list): entry = None finally: qbr.put(browser, True) - qsync.put(nb, True) - return entry + qsync.put((nb, entry), True) def producer(self, sync, urls, br, verbose=False): for i in xrange(len(urls)): - thread = ThreadwithResults(self.fetchdatathread, br, sync, - i, self.BASE_URL+urls[i], verbose) + thread = Thread(target=self.fetchdatathread, + args=(br, sync, i, self.BASE_URL+urls[i], verbose)) thread.start() - self.thread.append(thread) def consumer(self, sync, total_entries, verbose=False): - res=[None]*total_entries + self.extend([None]*total_entries) i=0 while i < total_entries: - nb = int(sync.get(True)) - self.thread[nb].join() - entry = self.thread[nb].get_result() + rq = sync.get(True) + self[int(rq[0])] = self.fill_MI(rq[1], verbose) i+=1 - if entry is not None: - res[nb] = self.fill_MI(entry, verbose) - return res def populate(self, entries, br, verbose=False, brcall=3): if not self.islink: @@ -337,12 +316,11 @@ class ResultList(list): pbr.put(br, True) prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) - cons_thread = ThreadwithResults(self.consumer, sync, len(entries), verbose) + cons_thread = Thread(target=self.consumer, args=(sync, len(entries), verbose)) prod_thread.start() cons_thread.start() prod_thread.join() cons_thread.join() - self.extend(cons_thread.get_result()) class Covers(object): From 08eb0e1a59309f0749e19f6898201d260703c4c4 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 15 Dec 2010 01:07:40 +0100 Subject: [PATCH 068/132] Minor modifications --- src/calibre/ebooks/metadata/fictionwise.py | 2 +- src/calibre/ebooks/metadata/google_books.py | 11 
++++++----- src/calibre/ebooks/metadata/nicebooks.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 48dac131cc..96638a1788 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -233,7 +233,7 @@ class ResultList(list): description = self.reimg.sub('', description.group("desc")) description = self.recomment.sub('', description) description = self.resanitize.sub('', sanitize_comments_html(description)) - return _('SUMMARY:\n %s') % re.sub(r'\n\s+</p>','\n</p>', description) + return _('SUMMARY:\n%s') % re.sub(r'\n\s+</p>','\n</p>', description) def get_publisher(self, entry): publisher = self.output_entry(entry.xpath('./p')[1]) diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py index fd18f080a0..41b2edfefb 100644 --- a/src/calibre/ebooks/metadata/google_books.py +++ b/src/calibre/ebooks/metadata/google_books.py @@ -146,7 +146,7 @@ class Query(object): # print etree.tostring(feed, pretty_print=True) total = int(total_results(feed)[0].text) - nbresultstoget = total if total<self.maxresults else self.maxresults + nbresultstoget = total if total < self.maxresults else self.maxresults start = int(start_index(feed)[0].text) entries = entry(feed) @@ -156,7 +156,7 @@ class Query(object): if feed is None: break entries.extend(entry(feed)) - return entries + return entries[:nbresultstoget] class ResultList(list): @@ -164,7 +164,7 @@ class ResultList(list): try: desc = description(entry) if desc: - return _('SUMMARY:\n %s') % desc[0].text + return _('SUMMARY:\n%s') % desc[0].text except: report(verbose) @@ -183,7 +183,7 @@ class ResultList(list): m = creator(entry) return [x.text for x in m] if m else [] - def get_author_sort(self, entry, verbose): + def get_author_sort(self, entry): for x in creator(entry): for key, val in x.attrib.iteritems(): if 
key.endswith('file-as'): @@ -216,6 +216,7 @@ class ResultList(list): try: return publisher(entry)[0].text except: + report(verbose) return None def get_date(self, entry, verbose): @@ -241,7 +242,7 @@ class ResultList(list): print e authors = self.get_authors(x) mi = MetaInformation(title, authors) - tmpautsort = self.get_author_sort(x, verbose) + tmpautsort = self.get_author_sort(x) mi.author_sort = tmpautsort if tmpautsort \ else authors_to_sort_string(authors) mi.comments = self.get_description(x, verbose) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 1ff5f7fc6b..4384f93809 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -203,7 +203,7 @@ class ResultList(list): def get_description(self, entry, verbose): try: - return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text) + return _(u'SUMMARY:\n%s') % unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text) except: report(verbose) return None From a64a22a934790ff5fb5dd4b81fc16aeafd5403ab Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 15 Dec 2010 09:10:37 +0100 Subject: [PATCH 069/132] Refactoring of isbndb plugin + add get language --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/metadata/fetch.py | 62 ++--- src/calibre/ebooks/metadata/isbndb.py | 343 ++++++++++++++++++-------- 3 files changed, 270 insertions(+), 138 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 9e34d33941..f95c29a718 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -480,7 +480,8 @@ from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO -from calibre.ebooks.metadata.fetch import ISBNDB, LibraryThing +from 
calibre.ebooks.metadata.fetch import LibraryThing +from calibre.ebooks.metadata.isbndb import ISBNDB from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.google_books import GoogleBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 0c607b9bb7..3bf4c22afe 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -172,40 +172,40 @@ class MetadataSource(Plugin): # {{{ # }}} -class ISBNDB(MetadataSource): # {{{ +# class ISBNDB(MetadataSource): # {{{ - name = 'IsbnDB' - description = _('Downloads metadata from isbndb.com') + # name = 'IsbnDB' + # description = _('Downloads metadata from isbndb.com') - def fetch(self): - if not self.site_customization: - return - from calibre.ebooks.metadata.isbndb import option_parser, create_books - args = ['isbndb'] - if self.isbn: - args.extend(['--isbn', self.isbn]) - else: - if self.title: - args.extend(['--title', self.title]) - if self.book_author: - args.extend(['--author', self.book_author]) - if self.publisher: - args.extend(['--publisher', self.publisher]) - if self.verbose: - args.extend(['--verbose']) - args.append(self.site_customization) # IsbnDb key - try: - opts, args = option_parser().parse_args(args) - self.results = create_books(opts, args) - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() + # def fetch(self): + # if not self.site_customization: + # return + # from calibre.ebooks.metadata.isbndb import option_parser, create_books + # args = ['isbndb'] + # if self.isbn: + # args.extend(['--isbn', self.isbn]) + # else: + # if self.title: + # args.extend(['--title', self.title]) + # if self.book_author: + # args.extend(['--author', self.book_author]) + # if self.publisher: + # args.extend(['--publisher', self.publisher]) + # if self.verbose: + # args.extend(['--verbose']) + # 
args.append(self.site_customization) # IsbnDb key + # try: + # opts, args = option_parser().parse_args(args) + # self.results = create_books(opts, args) + # except Exception, e: + # self.exception = e + # self.tb = traceback.format_exc() - @property - def string_customization_help(self): - ans = _('To use isbndb.com you must sign up for a %sfree account%s ' - 'and enter your access key below.') - return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>') + # @property + # def string_customization_help(self): + # ans = _('To use isbndb.com you must sign up for a %sfree account%s ' + # 'and enter your access key below.') + # return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>') # }}} diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 9169227326..330755fe35 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -5,115 +5,247 @@ Interface to isbndb.com. My key HLLXQX2A. ''' import sys, re -from urllib import quote +from urllib import urlencode +from lxml import etree + +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.ebooks.metadata import MetaInformation, authors_to_sort_string +from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.config import OptionParser -from calibre.ebooks.metadata.book.base import Metadata -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup -from calibre import browser -BASE_URL = 'http://isbndb.com/api/books.xml?access_key=%(key)s&page_number=1&results=subjects,authors,texts&' + +class ISBNDB(MetadataSource): + + name = 'IsbnDB' + description = _('Downloads metadata from isbndb.com') + version = (1, 0, 1) + + def fetch(self): + if not self.site_customization: + return + try: + self.results = search(self.title, self.book_author, self.publisher, self.isbn, + max_results=10, verbose=self.verbose, 
key=self.site_customization) + except Exception, e: + import traceback + self.exception = e + self.tb = traceback.format_exc() + + @property + def string_customization_help(self): + ans = _('To use isbndb.com you must sign up for a %sfree account%s ' + 'and enter your access key below.') + return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>') + class ISBNDBError(Exception): pass -def fetch_metadata(url, max=100, timeout=5.): - books = [] - page_number = 1 - total_results = sys.maxint - br = browser() - while len(books) < total_results and max > 0: +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + + +class Query(object): + + BASE_URL = 'http://isbndb.com/api/books.xml?' + + def __init__(self, key, title=None, author=None, publisher=None, isbn=None, + keywords=None, max_results=40): + assert not(title is None and author is None and publisher is None and \ + isbn is None and keywords is None) + assert (max_results < 41) + + if title == _('Unknown'): + title=None + if author == _('Unknown'): + author=None + self.maxresults = int(max_results) + + if isbn is not None: + q = isbn + i = 'isbn' + elif keywords is not None: + q = ' '.join([e for e in (title, author, publisher, keywords) \ + if e is not None ]) + q = q.strip() + i = 'full' + else: + q = ' '.join([e for e in (title, author, publisher) \ + if e is not None ]) + q = q.strip() + if len(q) == 0: + raise ISBNDBError(_('You must specify at least one of author, title or publisher')) + i = 'combined' + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.url = self.BASE_URL+urlencode({ + 'value1':q, + 'results':'subjects,authors,texts,details', + 'access_key':key, + 'index1':i, + })+'&page_number=' + + def brcall(self, browser, url, verbose, timeout): + if verbose: + print _('Query: %s') % url + try: - raw = br.open(url, timeout=timeout).read() - except Exception, err: - raise ISBNDBError('Could not fetch ISBNDB metadata. 
Error: '+str(err)) - soup = BeautifulStoneSoup(raw, - convertEntities=BeautifulStoneSoup.XML_ENTITIES) - book_list = soup.find('booklist') - if book_list is None: - errmsg = soup.find('errormessage').string - raise ISBNDBError('Error fetching metadata: '+errmsg) - total_results = int(book_list['total_results']) - page_number += 1 - np = '&page_number=%s&'%page_number - url = re.sub(r'\&page_number=\d+\&', np, url) - books.extend(book_list.findAll('bookdata')) - max -= 1 - return books - - -class ISBNDBMetadata(Metadata): - - def __init__(self, book): - Metadata.__init__(self, None) - - def tostring(e): - if not hasattr(e, 'string'): + raw = browser.open_novisit(url, timeout=timeout).read() + except Exception, e: + import socket + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + attr = getattr(e, 'args', [None]) + attr = attr if attr else [None] + if isinstance(attr[0], socket.timeout): + raise ISBNDBError(_('ISBNDB timed out. Try again later.')) + raise ISBNDBError(_('ISBNDB encountered an error.')) + if '<title>404 - ' in raw: + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + return etree.fromstring(raw) + except: + try: + #remove ASCII invalid chars (normally not needed) + return etree.fromstring(clean_ascii_chars(raw)) + except: return None - ans = e.string - if ans is not None: - ans = unicode(ans).strip() - if not ans: - ans = None - return ans - self.isbn = unicode(book.get('isbn13', book.get('isbn'))) - title = tostring(book.find('titlelong')) + def __call__(self, browser, verbose, timeout = 5.): + url = self.url+str(1) + feed = self.brcall(browser, url, verbose, timeout) + if feed is None: + return None + + # print etree.tostring(feed, pretty_print=True) + total = int(feed.find('BookList').get('total_results')) + nbresultstoget = total if total < self.maxresults else self.maxresults + entries = feed.xpath("./BookList/BookData") + i=2 + while 
len(entries) < nbresultstoget: + url = self.url+str(i) + feed = self.brcall(browser, url, verbose, timeout) + i+=1 + if feed is None: + break + entries.extend(feed.xpath("./BookList/BookData")) + return entries[:nbresultstoget] + +class ResultList(list): + + def get_description(self, entry, verbose): + try: + desc = entry.find('Summary') + if desc: + return _(u'SUMMARY:\n%s') % self.output_entry(desc) + except: + report(verbose) + + def get_language(self, entry, verbose): + try: + return entry.find('Details').get('language') + except: + report(verbose) + + def get_title(self, entry): + title = entry.find('TitleLong') if not title: - title = tostring(book.find('title')) - self.title = title - self.title = unicode(self.title).strip() + title = entry.find('Title') + return self.output_entry(title) + + def get_authors(self, entry): authors = [] - au = tostring(book.find('authorstext')) - if au: - au = au.strip() - temp = au.split(',') + au = entry.find('AuthorsText') + if au is not None: + au = self.output_entry(au) + temp = au.split(u',') for au in temp: if not au: continue - authors.extend([a.strip() for a in au.split('&')]) - if authors: - self.authors = authors + authors.extend([a.strip() for a in au.split(u'&')]) + return authors + + def get_author_sort(self, entry, verbose): try: - self.author_sort = tostring(book.find('authors').find('person')) - if self.authors and self.author_sort == self.authors[0]: - self.author_sort = None + return self.output_entry(entry.find('Authors').find('Person')) except: - pass - self.publisher = tostring(book.find('publishertext')) + report(verbose) + return None - summ = tostring(book.find('summary')) - if summ: - self.comments = 'SUMMARY:\n'+summ + def get_isbn(self, entry, verbose): + try: + return unicode(entry.get('isbn13', entry.get('isbn'))) + except: + report(verbose) + + def get_publisher(self, entry, verbose): + try: + return self.output_entry(entry.find('PublisherText')) + except: + report(verbose) + return None + + def 
output_entry(self, entry): + out = etree.tostring(entry, encoding=unicode, method="text") + return out.strip() + + def populate(self, entries, verbose): + for x in entries: + try: + title = self.get_title(x) + authors = self.get_authors(x) + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + continue + mi = MetaInformation(title, authors) + tmpautsort = self.get_author_sort(x, verbose) + mi.author_sort = tmpautsort if tmpautsort is not None \ + else authors_to_sort_string(authors) + mi.comments = self.get_description(x, verbose) + mi.isbn = self.get_isbn(x, verbose) + mi.publisher = self.get_publisher(x, verbose) + mi.language = self.get_language(x, verbose) + self.append(mi) -def build_isbn(base_url, opts): - return base_url + 'index1=isbn&value1='+opts.isbn +def search(title=None, author=None, publisher=None, isbn=None, + max_results=10, verbose=False, keywords=None, key=None): + br = browser() + entries = Query(key, title=title, author=author, isbn=isbn, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.) -def build_combined(base_url, opts): - query = ' '.join([e for e in (opts.title, opts.author, opts.publisher) \ - if e is not None ]) - query = query.strip() - if len(query) == 0: - raise ISBNDBError('You must specify at least one of --author, --title or --publisher') - - query = re.sub(r'\s+', '+', query) - if isinstance(query, unicode): - query = query.encode('utf-8') - return base_url+'index1=combined&value1='+quote(query, '+') + if entries is None or len(entries) == 0: + return None + #List of entry + ans = ResultList() + ans.populate(entries, verbose) + return list(dict((book.isbn, book) for book in ans).values()) def option_parser(): - parser = OptionParser(usage=\ -_(''' -%prog [options] key + import textwrap + parser = OptionParser(textwrap.dedent(\ + _('''\ + %prog [options] key -Fetch metadata for books from isndb.com. 
You can specify either the -books ISBN ID or its title and author. If you specify the title and author, -then more than one book may be returned. + Fetch metadata for books from isndb.com. You can specify either the + books ISBN ID or its title and author. If you specify the title and author, + then more than one book may be returned. -key is the account key you generate after signing up for a free account from isbndb.com. + key is the account key you generate after signing up for a free account from isbndb.com. -''')) + '''))) parser.add_option('-i', '--isbn', default=None, dest='isbn', help=_('The ISBN ID of the book you want metadata for.')) parser.add_option('-a', '--author', dest='author', @@ -122,38 +254,37 @@ key is the account key you generate after signing up for a free account from isb default=None, help=_('The title of the book to search for.')) parser.add_option('-p', '--publisher', default=None, dest='publisher', help=_('The publisher of the book to search for.')) - parser.add_option('-v', '--verbose', default=False, - action='store_true', help=_('Verbose processing')) - + parser.add_option('-k', '--keywords', help=_('Keywords to search for.')) + parser.add_option('-m', '--max-results', default=10, + help=_('Maximum number of results to fetch')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Be more verbose about errors')) return parser - -def create_books(opts, args, timeout=5.): - base_url = BASE_URL%dict(key=args[1]) - if opts.isbn is not None: - url = build_isbn(base_url, opts) - else: - url = build_combined(base_url, opts) - - if opts.verbose: - print ('ISBNDB query: '+url) - - tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] - #remove duplicates ISBN - return list(dict((book.isbn, book) for book in tans).values()) - def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) if len(args) != 2: parser.print_help() - print ('You must supply the isbndb.com key') + 
print + print _('You must supply the isbndb.com key') return 1 - - for book in create_books(opts, args): - print unicode(book).encode('utf-8') - + try: + results = search(opts.title, opts.author, opts.publisher, opts.isbn, key=args[1], + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + if results is None or len(results) == 0: + print _('No result found for this search!') + return 0 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + print return 0 if __name__ == '__main__': sys.exit(main()) + +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\isbndb-bis.py" -m 5 -a gore -v PWEK5WY4>data.html \ No newline at end of file From 0b0619916aa676822bec1cd3228fe67bf794a552 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 19 Dec 2010 11:15:04 +0100 Subject: [PATCH 070/132] Amazon bug+fetch error --- src/calibre/ebooks/metadata/amazon.py | 13 +++++++++++++ src/calibre/ebooks/metadata/fetch.py | 4 ++-- src/calibre/ebooks/metadata/isbndb.py | 3 ++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 6eb106c862..c617a2beaf 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -121,6 +121,19 @@ def report(verbose): class AmazonError(Exception): pass +class ThreadwithResults(Thread): + def __init__(self, func, *args, **kargs): + self.func = func + self.args = args + self.kargs = kargs + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + self.result = self.func(*self.args, **self.kargs) class Query(object): diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index a7709f88b4..dbf0db7bfe 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ 
b/src/calibre/ebooks/metadata/fetch.py @@ -299,8 +299,8 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None, with MetadataSources(fetchers) as manager: manager(title, author, publisher, isbn, verbose) manager.join() - - results = list(fetchers[0].results) if fetchers else [] + + results = list(fetchers[0].results) if fetchers[0].results else [] for fetcher in fetchers[1:]: merge_results(results, fetcher.results) diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index b1a69e37c0..787d70eb51 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -25,7 +25,7 @@ class ISBNDB(MetadataSource): def fetch(self): if not self.site_customization: - return + return None try: self.results = search(self.title, self.book_author, self.publisher, self.isbn, max_results=10, verbose=self.verbose, key=self.site_customization) @@ -231,6 +231,7 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList() ans.populate(entries, verbose) + ans = [x for x in ans if x is not None] return list(dict((book.isbn, book) for book in ans).values()) def option_parser(): From 3c60c677158ece7422696d82af857a7aceb844a8 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 19 Dec 2010 15:32:30 +0100 Subject: [PATCH 071/132] Wrong copy-paste --- src/calibre/ebooks/metadata/amazon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index c617a2beaf..941c80ac62 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -225,8 +225,8 @@ class Query(object): attr = getattr(e, 'args', [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): - raise NiceBooksError(_('Nicebooks timed out. 
Try again later.')) - raise NiceBooksError(_('Nicebooks encountered an error.')) + raise AmazonError(_('Amazon timed out. Try again later.')) + raise AmazonError(_('Amazon encountered an error.')) if '<title>404 - ' in raw: return raw = xml_to_unicode(raw, strip_encoding_pats=True, From 1cc42192a7b0ffa2ecca80faa19039dead70f28d Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 2 Jan 2011 21:55:01 +0100 Subject: [PATCH 072/132] ... --- src/calibre/ebooks/metadata/amazon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 941c80ac62..cc7a4c9d34 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -650,4 +650,4 @@ if __name__ == '__main__': # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile")) -# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html \ No newline at end of file +# calibre-debug -e "D:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html \ No newline at end of file From 6391251cb782e038b8c71e9276cb449c2ff2fec5 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 2 Jan 2011 22:16:32 +0100 Subject: [PATCH 073/132] BIB catalog now support custom fields --- src/calibre/gui2/catalog/catalog_bibtex.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/calibre/gui2/catalog/catalog_bibtex.py b/src/calibre/gui2/catalog/catalog_bibtex.py index 5030cf6ec8..f66b63bd58 100644 --- a/src/calibre/gui2/catalog/catalog_bibtex.py +++ b/src/calibre/gui2/catalog/catalog_bibtex.py @@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en' from calibre.gui2 import gprefs from 
calibre.gui2.catalog.catalog_bibtex_ui import Ui_Form +from calibre.library import db as db_ from PyQt4.Qt import QWidget, QListWidgetItem class PluginWidget(QWidget, Ui_Form): @@ -28,11 +29,14 @@ class PluginWidget(QWidget, Ui_Form): QWidget.__init__(self, parent) self.setupUi(self) from calibre.library.catalog import FIELDS - self.all_fields = [] - for x in FIELDS : - if x != 'all': - self.all_fields.append(x) - QListWidgetItem(x, self.db_fields) + + self.all_fields = [x for x in FIELDS if x != 'all'] + #add custom columns + db = db_() + self.all_fields.extend([x for x in sorted(db.custom_field_keys())]) + #populate + for x in self.all_fields: + QListWidgetItem(x, self.db_fields) def initialize(self, name, db): #not working properly to update self.name = name From 24f24109603649f96985e55c9cb812e6b5d98fc2 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 3 Jan 2011 22:17:31 +0100 Subject: [PATCH 074/132] fetch add trad --- src/calibre/ebooks/metadata/fetch.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index dbf0db7bfe..2adde5d6a3 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -168,7 +168,7 @@ class MetadataSource(Plugin): # {{{ customize_plugin(self, sc) def customization_help(self): - return 'This plugin can only be customized using the GUI' + return _('This plugin can only be customized using the GUI') # }}} @@ -433,7 +433,7 @@ def get_social_metadata(mi, verbose=0): def option_parser(): - parser = OptionParser(textwrap.dedent( + parser = OptionParser(textwrap.dedent(_( '''\ %prog [options] @@ -441,19 +441,19 @@ def option_parser(): of title, author, publisher or ISBN. If you specify ISBN, the others are ignored. 
''' - )) - parser.add_option('-t', '--title', help='Book title') - parser.add_option('-a', '--author', help='Book author(s)') - parser.add_option('-p', '--publisher', help='Book publisher') - parser.add_option('-i', '--isbn', help='Book ISBN') + ))) + parser.add_option('-t', '--title', help=_('Book title')) + parser.add_option('-a', '--author', help=_('Book author(s)')) + parser.add_option('-p', '--publisher', help=_('Book publisher')) + parser.add_option('-i', '--isbn', help=_('Book ISBN')) parser.add_option('-m', '--max-results', default=10, - help='Maximum number of results to fetch') + help=_('Maximum number of results to fetch')) parser.add_option('-k', '--isbndb-key', - help=('The access key for your ISBNDB.com account. ' + help=_('The access key for your ISBNDB.com account. ' 'Only needed if you want to search isbndb.com ' 'and you haven\'t customized the IsbnDB plugin.')) parser.add_option('-v', '--verbose', default=0, action='count', - help='Be more verbose about errors') + help=_('Be more verbose about errors')) return parser def main(args=sys.argv): @@ -469,7 +469,7 @@ def main(args=sys.argv): for name, exception, tb in exceptions+social_exceptions: if exception is not None: - print 'WARNING: Fetching from', name, 'failed with error:' + print _('WARNING: Fetching from %s failed with error:') % (name) print exception print tb From 61f8f592a81eb8d031905b7ac7dab3c6a5eaa2f4 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 5 Jan 2011 01:00:36 +0100 Subject: [PATCH 075/132] ... 
--- src/calibre/ebooks/rtf2xml/ParseRtf.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 4b0bb41d42..33dc585579 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -193,6 +193,14 @@ class ParseRtf: copy_obj.set_dir(self.__debug_dir) copy_obj.remove_files() copy_obj.copy_file(self.__temp_file, "original_file") + #Check to see if the file is correct ascii + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler = RtfInvalidCodeException, + ) + if check_encoding_obj.check_encoding(self.__file): + file_name = self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8') + msg = _('File %s does not appear to be ascii.\n') % file_name + raise InvalidRtfException, msg # Function to check if bracket are well handled if self.__debug_dir or self.__run_level > 2: self.__check_brack_obj = check_brackets.CheckBrackets\ @@ -230,13 +238,6 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass - #Check to see if the file is correct ascii - check_encoding_obj = check_encoding.CheckEncoding( - bug_handler = RtfInvalidCodeException, - ) - if check_encoding_obj.check_encoding(self.__file): - sys.stderr.write(_('File "%s" does not appear to be ascii.\n') \ - % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( in_file = self.__temp_file, From 66b1713e8040648381e56031ab5b486d1ae908d8 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 5 Jan 2011 08:07:10 +0100 Subject: [PATCH 076/132] Fix regression broking handling of sub and sup in RTF input --- resources/templates/rtf.xsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index ea1fc71172..6db1c0388d 100644 --- 
a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -287,7 +287,7 @@ <xsl:value-of select="count(preceding::rtf:footnote) + 1"/> <xsl:text>]</xsl:text> </xsl:when> - <xsl:when test="(@superscript = 'true')"> + <xsl:when test="(@superscript)"> <xsl:element name="sup"> <xsl:element name="span"> <xsl:attribute name="class"> @@ -297,7 +297,7 @@ </xsl:element> </xsl:element> </xsl:when> - <xsl:when test="(@underscript = 'true')"> + <xsl:when test="(@underscript or @subscript)"> <xsl:element name="sub"> <xsl:element name="span"> <xsl:attribute name="class"> From b857f8608f12c29557d6c42e2be3a908f9338e54 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 5 Jan 2011 08:09:28 +0100 Subject: [PATCH 077/132] Add debuging options for RTF input.py --- src/calibre/ebooks/rtf/input.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index fdd501495b..19f944bbb5 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -115,6 +115,10 @@ class RTFInput(InputFormatPlugin): # Write or do not write paragraphs. Default is 0. empty_paragraphs = 1, + + #debug + # deb_dir = "D:\\Mes eBooks\\Developpement\\debug\\rtfdebug", + # run_level = 3 ) parser.parse_rtf() ans = open('out.xml').read() @@ -256,9 +260,8 @@ class RTFInput(InputFormatPlugin): raise ValueError(_('This RTF file has a feature calibre does not ' 'support. 
Convert it to HTML first and then try it.\n%s')%e) - '''dataxml = open('dataxml.xml', 'w') - dataxml.write(xml) - dataxml.close''' + # with open('dataxml.xml', 'w') as dataxml: + # dataxml.write(xml) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: From 5784256e022b700adfeaa99959389aab96868e5b Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 5 Jan 2011 22:39:26 +0100 Subject: [PATCH 078/132] Check if tokens are correct ascii --- src/calibre/ebooks/rtf2xml/process_tokens.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 13ce495e67..2c603ea28d 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -779,12 +779,16 @@ class ProcessTokens: msg =_('Invalid RTF: document doesn\'t start with \\rtf \n') raise self.__exception_handler, msg - ##token = self.evaluate_token(token) the_index = token.find('\\ ') if token is not None and the_index > -1: - msg ='Invalid RTF: token "\\ " not valid.\n' + msg =_('Invalid RTF: token "\\ " not valid.\n') raise self.__exception_handler, msg elif token[:1] == "\\": + try: + token.decode('us-ascii') + except UnicodeError, msg: + msg = _('Invalid RTF: Tokens not ascii encoded.\n%s') % str(msg) + raise self.__exception_handler, msg line = self.process_cw(token) if line is not None: write_obj.write(line) From bb50018eb35e367d4da05bf3b29d43d2ca2bdc95 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 5 Jan 2011 23:47:14 +0100 Subject: [PATCH 079/132] Clean defaut encoding --- .../ebooks/rtf2xml/default_encoding.py | 153 ++++++++++++------ src/calibre/ebooks/rtf2xml/process_tokens.py | 6 +- 2 files changed, 109 insertions(+), 50 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index b932b465d0..0268c29f75 100755 --- 
a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -1,61 +1,118 @@ ######################################################################### # # -# # # copyright 2002 Paul Henry Tremblay # # # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # -# General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, write to the Free Software # -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA # -# 02111-1307 USA # -# # -# # ######################################################################### + +''' +Codepages as to RTF 1.9.1: + 437 United States IBM + 708 Arabic (ASMO 708) + 709 Arabic (ASMO 449+, BCON V4) + 710 Arabic (transparent Arabic) + 711 Arabic (Nafitha Enhanced) + 720 Arabic (transparent ASMO) + 819 Windows 3.1 (United States and Western Europe) + 850 IBM multilingual + 852 Eastern European + 860 Portuguese + 862 Hebrew + 863 French Canadian + 864 Arabic + 865 Norwegian + 866 Soviet Union + 874 Thai + 932 Japanese + 936 Simplified Chinese + 949 Korean + 950 Traditional Chinese + 1250 Eastern European + 1251 Cyrillic + 1252 Western European + 1253 Greek + 1254 Turkish + 1255 Hebrew + 1256 Arabic + 1257 Baltic + 1258 Vietnamese + 1361 Johab + 10000 MAC Roman + 10001 MAC Japan + 10004 MAC Arabic + 10005 MAC Hebrew + 10006 MAC Greek + 10007 MAC Cyrillic + 10029 MAC Latin2 + 10081 MAC Turkish + 57002 Devanagari + 57003 Bengali + 57004 Tamil + 57005 Telugu + 57006 Assamese + 57007 Oriya + 57008 Kannada + 57009 Malayalam + 57010 Gujarati + 57011 Punjabi +''' + class DefaultEncoding: """ Find the default encoding for the doc """ def __init__(self, in_file, bug_handler, run_level = 1,): - """ - Required: - 'file' - Returns: - nothing - """ self.__file = in_file 
self.__bug_handler = bug_handler + self.__platform = 'Windows' + self.__default_num = 'not-defined' + self.__code_page = '1252' + self.__datafetched = False + def find_default_encoding(self): - platform = 'Windows' - default_num = 'not-defined' - code_page = 'ansicpg1252' - read_obj = open(self.__file, 'r') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - if self.__token_info == 'mi<mk<rtfhed-end': - break - if self.__token_info == 'cw<ri<ansi-codpg': - #cw<ri<ansi-codpg<nu<10000 - num = line[20:-1] - if not num: - num = '1252' - code_page = 'ansicpg' + num - if self.__token_info == 'cw<ri<macintosh_': - platform = 'Macintosh' - if self.__token_info == 'cw<ri<deflt-font': - default_num = line[20:-1] - #cw<ri<deflt-font<nu<0 - #action = self.__state_dict.get(self.__state) - #if action == None: - #print self.__state - #action(line) - read_obj.close() - if platform == 'Macintosh': - code_page = 'mac_roman' - return platform, code_page, default_num + if not self.__datafetched: + self._encoding() + self.__datafetched = True + if self.__platform = 'Macintosh': + code_page = self.__code_page + else + code_page = 'ansicpg' + self.__code_page + return platform, code_page, self.__default_num + + def get_codepage(self): + if not self.__datafetched: + self._encoding() + self.__datafetched = True + return self.__code_page + + def get_platform(self): + if not self.__datafetched: + self._encoding() + self.__datafetched = True + return self.__platform + + def _encoding(self): + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + if self.__token_info == 'mi<mk<rtfhed-end': + break + if self.__token_info == 'cw<ri<ansi-codpg': + #cw<ri<ansi-codpg<nu<10000 + self.__code_page = line[20:-1] if line[20:-1] \ + else '1252' + if self.__token_info == 'cw<ri<macintosh_': + self.__platform = 'Macintosh' + elif self.__token_info == 'cw<ri<pc________': + 
self.__platform = 'IBMPC' + elif self.__token_info == 'cw<ri<pca_______': + self.__platform = 'OS/2' + if self.__token_info == 'cw<ri<deflt-font': + self.__default_num = line[20:-1] + #cw<ri<deflt-font<nu<0 + if self.__platform == 'Macintosh': + self.__code_page = 'mac_roman' + elif self.__platform = 'IBMPC': + self.__code_page = '437' + elif self.__platform = 'OS/2': + self.__code_page = '850' + diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 2c603ea28d..36930dedaf 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -163,15 +163,17 @@ class ProcessTokens: 'rtf' : ('ri', 'rtf_______', self.default_func), 'deff' : ('ri', 'deflt-font', self.default_func), 'mac' : ('ri', 'macintosh_', self.default_func), + 'pc' : ('ri', 'pc________', self.default_func), + 'pca' : ('ri', 'pca_______', self.default_func), 'ansi' : ('ri', 'ansi______', self.default_func), 'ansicpg' : ('ri', 'ansi-codpg', self.default_func), # notes => nt 'footnote' : ('nt', 'footnote__', self.default_func), 'ftnalt' : ('nt', 'type______<endnote', self.two_part_func), # anchor => an - 'tc' : ('an', 'toc_______', self.default_func), + 'tc' : ('an', 'toc_______', self.default_func), 'bkmkstt' : ('an', 'book-mk-st', self.default_func), - 'bkmkstart' : ('an', 'book-mk-st', self.default_func), + 'bkmkstart' : ('an', 'book-mk-st', self.default_func), 'bkmkend' : ('an', 'book-mk-en', self.default_func), 'xe' : ('an', 'index-mark', self.default_func), 'rxe' : ('an', 'place_____', self.default_func), From d0655c4d9abe48185831d4eb7eb9dccfb8b88488 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 5 Jan 2011 23:57:34 +0100 Subject: [PATCH 080/132] ... 
--- src/calibre/ebooks/rtf2xml/default_encoding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index 0268c29f75..f89f54ada8 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -72,7 +72,7 @@ class DefaultEncoding: if not self.__datafetched: self._encoding() self.__datafetched = True - if self.__platform = 'Macintosh': + if self.__platform == 'Macintosh': code_page = self.__code_page else code_page = 'ansicpg' + self.__code_page @@ -111,8 +111,8 @@ class DefaultEncoding: #cw<ri<deflt-font<nu<0 if self.__platform == 'Macintosh': self.__code_page = 'mac_roman' - elif self.__platform = 'IBMPC': + elif self.__platform == 'IBMPC': self.__code_page = '437' - elif self.__platform = 'OS/2': + elif self.__platform == 'OS/2': self.__code_page = '850' From 428fcdd1415194c62b2726ae26d2ce842a3536da Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 6 Jan 2011 00:01:24 +0100 Subject: [PATCH 081/132] Move check encoding --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 24 +++++++++++-------- .../ebooks/rtf2xml/default_encoding.py | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 33dc585579..fdd17e3f78 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -193,21 +193,13 @@ class ParseRtf: copy_obj.set_dir(self.__debug_dir) copy_obj.remove_files() copy_obj.copy_file(self.__temp_file, "original_file") - #Check to see if the file is correct ascii - check_encoding_obj = check_encoding.CheckEncoding( - bug_handler = RtfInvalidCodeException, - ) - if check_encoding_obj.check_encoding(self.__file): - file_name = self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8') - msg = _('File %s does not appear to be 
ascii.\n') % file_name - raise InvalidRtfException, msg # Function to check if bracket are well handled if self.__debug_dir or self.__run_level > 2: self.__check_brack_obj = check_brackets.CheckBrackets\ (file = self.__temp_file, bug_handler = RtfInvalidCodeException, ) - # convert Macintosh and Windows line endings to Unix line endings + #convert Macintosh and Windows line endings to Unix line endings #why do this if you don't wb after? line_obj = line_endings.FixLineEndings( in_file = self.__temp_file, @@ -238,7 +230,19 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass - raise InvalidRtfException, msg + #Check to see if the file is correctly encoded + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler = RtfInvalidCodeException, + ) + if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \ + check_encoding_obj.check_encoding(self.__file, 'cp437') and \ + check_encoding_obj.check_encoding(self.__file, 'cp850') and \ + check_encoding_obj.check_encoding(self.__file, 'mac_roman'): + file_name = self.__file if isinstance(self.__file, str) \ + else self.__file.encode('utf-8') + msg = _('File %s does not appear to be correctly encoded.\n') % file_name + raise InvalidRtfException, msg + delete_info_obj = delete_info.DeleteInfo( in_file = self.__temp_file, copy = self.__copy, diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index f89f54ada8..a5c2ab9561 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -74,9 +74,9 @@ class DefaultEncoding: self.__datafetched = True if self.__platform == 'Macintosh': code_page = self.__code_page - else + else: code_page = 'ansicpg' + self.__code_page - return platform, code_page, self.__default_num + return self.__platform, code_page, self.__default_num def get_codepage(self): if not self.__datafetched: From 9e31d706693e4875ea36f80601b015e812d4a862 Mon Sep 17 00:00:00 2001 
From: Sengian <sengian1@gmail.com> Date: Thu, 6 Jan 2011 09:00:49 +0100 Subject: [PATCH 082/132] Activate RTF debug --- src/calibre/ebooks/rtf/input.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 19f944bbb5..05c851a075 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -117,8 +117,8 @@ class RTFInput(InputFormatPlugin): empty_paragraphs = 1, #debug - # deb_dir = "D:\\Mes eBooks\\Developpement\\debug\\rtfdebug", - # run_level = 3 + deb_dir = "D:\\Mes eBooks\\Developpement\\debug\\rtfdebug", + run_level = 3 ) parser.parse_rtf() ans = open('out.xml').read() @@ -260,8 +260,8 @@ class RTFInput(InputFormatPlugin): raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) - # with open('dataxml.xml', 'w') as dataxml: - # dataxml.write(xml) + with open('dataxml.xml', 'w') as dataxml: + dataxml.write(xml) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: From 7ecf2f1e9c974aac94e6dfc260e582f8746f5fe8 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 6 Jan 2011 09:01:13 +0100 Subject: [PATCH 083/132] spell --- src/calibre/ebooks/rtf2xml/convert_to_tags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index ab54c0cbc3..c2244b784a 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -88,7 +88,7 @@ class ConvertToTags: def __open_att_func(self, line): """ Process lines for open tags that have attributes. - The important infor is between [17:-1]. Take this info and split it + The important info is between [17:-1]. Take this info and split it with the delimeter '<'. The first token in this group is the element name. 
The rest are attributes, separated fromt their values by '>'. So read each token one at a time, and split them by '>'. From bbaecb400726cd2b19d6820bbc6ddf83a86fb7e3 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 6 Jan 2011 22:25:12 +0100 Subject: [PATCH 084/132] Allow check encoding to look directly in rtf & improve code checking for invalid chars --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 17 +++-- .../ebooks/rtf2xml/default_encoding.py | 71 ++++++++++++------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index fdd17e3f78..05a4847ce5 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -230,14 +230,21 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass - #Check to see if the file is correctly encoded + #Check to see if the file is correctly encoded + encode_obj = default_encoding.DefaultEncoding( + in_file = self.__temp_file, + run_level = self.__run_level, + bug_handler = RtfInvalidCodeException, + check_raw = True, + ) + platform, code_page, default_font_num = encode_obj.find_default_encoding() check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) - if check_encoding_obj.check_encoding(self.__file, 'cp1252') and \ - check_encoding_obj.check_encoding(self.__file, 'cp437') and \ - check_encoding_obj.check_encoding(self.__file, 'cp850') and \ - check_encoding_obj.check_encoding(self.__file, 'mac_roman'): + enc = encode_obj.get_codepage() + if enc != 'mac_roman': + enc = 'cp' + enc + if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ else self.__file.encode('utf-8') msg = _('File %s does not appear to be correctly encoded.\n') % file_name diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index a5c2ab9561..a4eeac9663 100755 
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -55,18 +55,20 @@ Codepages as to RTF 1.9.1: 57010 Gujarati 57011 Punjabi ''' +import re class DefaultEncoding: """ Find the default encoding for the doc """ - def __init__(self, in_file, bug_handler, run_level = 1,): + def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False): self.__file = in_file self.__bug_handler = bug_handler self.__platform = 'Windows' self.__default_num = 'not-defined' self.__code_page = '1252' self.__datafetched = False + self.__fetchraw = check_raw def find_default_encoding(self): if not self.__datafetched: @@ -92,27 +94,48 @@ class DefaultEncoding: def _encoding(self): with open(self.__file, 'r') as read_obj: - for line in read_obj: - self.__token_info = line[:16] - if self.__token_info == 'mi<mk<rtfhed-end': - break - if self.__token_info == 'cw<ri<ansi-codpg': - #cw<ri<ansi-codpg<nu<10000 - self.__code_page = line[20:-1] if line[20:-1] \ - else '1252' - if self.__token_info == 'cw<ri<macintosh_': - self.__platform = 'Macintosh' - elif self.__token_info == 'cw<ri<pc________': - self.__platform = 'IBMPC' - elif self.__token_info == 'cw<ri<pca_______': - self.__platform = 'OS/2' - if self.__token_info == 'cw<ri<deflt-font': - self.__default_num = line[20:-1] - #cw<ri<deflt-font<nu<0 - if self.__platform == 'Macintosh': - self.__code_page = 'mac_roman' - elif self.__platform == 'IBMPC': - self.__code_page = '437' - elif self.__platform == 'OS/2': - self.__code_page = '850' + if not self.__fetchraw: + for line in read_obj: + self.__token_info = line[:16] + if self.__token_info == 'mi<mk<rtfhed-end': + break + if self.__token_info == 'cw<ri<ansi-codpg': + #cw<ri<ansi-codpg<nu<10000 + self.__code_page = line[20:-1] if line[20:-1] \ + else '1252' + if self.__token_info == 'cw<ri<macintosh_': + self.__platform = 'Macintosh' + self.__code_page = 'mac_roman' + elif self.__token_info == 'cw<ri<pc________': + self.__platform 
= 'IBMPC' + self.__code_page = '437' + elif self.__token_info == 'cw<ri<pca_______': + self.__platform = 'OS/2' + self.__code_page = '850' + if self.__token_info == 'cw<ri<deflt-font': + self.__default_num = line[20:-1] + #cw<ri<deflt-font<nu<0 + else: + fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+') + fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+') + for line in read_obj: + if fenccp.search(line): + self.__code_page = fenccp.search(line).group(1) + break + if fenc.search(line): + enc = fenc.search(line).group(1) + if enc == 'mac': + self.__code_page = 'mac_roman' + elif enc == 'pc': + self.__code_page = '437' + elif enc == 'pca': + self.__code_page = '850' +# if __name__ == '__main__': + # from calibre.ebooks.rtf2xml import default_encoding + # encode_obj = default_encoding.DefaultEncoding( + # in_file = sys.argv[1], + # bug_handler = Exception, + # check_raw = True, + # ) + # print encode_obj.get_codepage() From 18df9457bb326ee588d0faed936cca2792d95661 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 6 Jan 2011 22:53:38 +0100 Subject: [PATCH 085/132] Update get_char_map --- src/calibre/ebooks/rtf2xml/get_char_map.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py index db307b19d6..18e27b2fe7 100755 --- a/src/calibre/ebooks/rtf2xml/get_char_map.py +++ b/src/calibre/ebooks/rtf2xml/get_char_map.py @@ -43,16 +43,16 @@ class GetCharMap: def get_char_map(self, map): if map == 'ansicpg0': map = 'ansicpg1250' - found_map = 0 + found_map = False map_dict = {} self.__char_file.seek(0) - for line in self.__char_file.readlines(): + for line in self.__char_file: if not line.strip(): continue begin_element = '<%s>' % map; end_element = '</%s>' % map if not found_map: if begin_element in line: - found_map = 1 + found_map = True else: if end_element in line: break @@ -62,8 +62,7 @@ class GetCharMap: if not found_map: - msg = 'no 
map found\n' - msg += 'map is "%s"\n'%(map,) + msg = _('no map found\nmap is "%s"\n') %(map,) raise self.__bug_handler, msg return map_dict From b2187360ecec9ddab30e79c48a26f340d1a12911 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 7 Jan 2011 07:36:20 +0100 Subject: [PATCH 086/132] various lttle modification in rtf2xml --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 3 +- src/calibre/ebooks/rtf2xml/hex_2_utf8.py | 98 +++++++++++--------- src/calibre/ebooks/rtf2xml/process_tokens.py | 4 +- 3 files changed, 59 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 05a4847ce5..901188a000 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -326,6 +326,7 @@ class ParseRtf: invalid_rtf_handler = InvalidRtfException, ) hex2utf_obj.convert_hex_2_utf8() + # raise RtfInvalidCodeException, 'stop' self.__bracket_match('hex_2_utf_preamble') fonts_obj = fonts.Fonts( in_file = self.__temp_file, @@ -381,7 +382,7 @@ class ParseRtf: msg += 'self.__run_level is "%s"\n' % self.__run_level raise RtfInvalidCodeException, msg if self.__run_level > 1: - sys.stderr.write('File could be older RTF...\n') + sys.stderr.write(_('File could be older RTF...\n')) if found_destination: if self.__run_level > 1: sys.stderr.write(_( diff --git a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py index d67dce30d2..750d0c9180 100755 --- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py +++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py @@ -54,10 +54,10 @@ class Hex2Utf8: 'convert_to_caps'--wether to convert caps to utf-8 Returns: nothing - """ + """ self.__file = in_file self.__copy = copy - if area_to_convert != 'preamble' and area_to_convert != 'body': + if area_to_convert not in ('preamble', 'body'): msg = ( 'Developer error! 
Wrong flag.\n' 'in module "hex_2_utf8.py\n' @@ -79,7 +79,8 @@ class Hex2Utf8: self.__write_to = tempfile.mktemp() self.__bug_handler = bug_handler self.__invalid_rtf_handler = invalid_rtf_handler - def update_values( self, + + def update_values(self, file, area_to_convert, char_file, @@ -132,6 +133,7 @@ class Hex2Utf8: # self.__convert_symbol = 0 # self.__convert_wingdings = 0 # self.__convert_zapf = 0 + def __initiate_values(self): """ Required: @@ -191,6 +193,7 @@ class Hex2Utf8: 'body' : self.__body_func, 'mi<mk<body-open_' : self.__found_body_func, 'tx<hx<__________' : self.__hex_text_func, + # 'tx<nu<__________' : self.__text_func, } self.__body_state_dict = { 'preamble' : self.__preamble_for_body_func, @@ -209,6 +212,7 @@ class Hex2Utf8: } self.__caps_list = ['false'] self.__font_list = ['not-defined'] + def __hex_text_func(self, line): """ Required: @@ -218,12 +222,12 @@ class Hex2Utf8: token is in the dictionary, then check if the value starts with a "&". If it does, then tag the result as utf text. Otherwise, tag it as normal text. - If the nex_num is not in the dictionary, then a mistake has been + If the hex_num is not in the dictionary, then a mistake has been made. 
""" hex_num = line[17:-1] converted = self.__current_dict.get(hex_num) - if converted != None: + if converted is not None: # tag as utf-8 if converted[0:1] == "&": font = self.__current_dict_name @@ -261,44 +265,45 @@ class Hex2Utf8: # msg = 'no dictionary entry for %s\n' # msg += 'the hexidecimal num is "%s"\n' % (hex_num) # msg += 'dictionary is %s\n' % self.__current_dict_name - msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token + msg = _('Character "&#x%s;" does not appear to be valid (or is a control character)\n') % token raise self.__bug_handler, msg + def __found_body_func(self, line): self.__state = 'body' self.__write_obj.write(line) + def __body_func(self, line): """ When parsing preamble """ self.__write_obj.write(line) + def __preamble_func(self, line): action = self.__preamble_state_dict.get(self.__token_info) - if action != None: + if action is not None: action(line) else: self.__write_obj.write(line) + def __convert_preamble(self): self.__state = 'preamble' - read_obj = open(self.__file, 'r') self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__preamble_state_dict.get(self.__state) - if action == None: - sys.stderr.write('error no state found in hex_2_utf8', - self.__state - ) - action(line) - read_obj.close() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__preamble_state_dict.get(self.__state) + if action is None: + sys.stderr.write(_('error no state found in hex_2_utf8'), + self.__state + ) + action(line) self.__write_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data") copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) + def __preamble_for_body_func(self, line): """ Required: @@ 
-311,6 +316,7 @@ class Hex2Utf8: if self.__token_info == 'mi<mk<body-open_': self.__found_body_func(line) self.__write_obj.write(line) + def __body_for_body_func(self, line): """ Required: @@ -321,10 +327,11 @@ class Hex2Utf8: Used when parsing the body. """ action = self.__in_body_dict.get(self.__token_info) - if action != None: + if action is not None: action(line) else: self.__write_obj.write(line) + def __start_font_func(self, line): """ Required: @@ -348,6 +355,7 @@ class Hex2Utf8: else: self.__current_dict_name = 'default' self.__current_dict = self.__def_dict + def __end_font_func(self, line): """ Required: @@ -376,6 +384,7 @@ class Hex2Utf8: else: self.__current_dict_name = 'default' self.__current_dict = self.__def_dict + def __start_special_font_func_old(self, line): """ Required: @@ -398,6 +407,7 @@ class Hex2Utf8: self.__current_dict.append(self.__dingbats_dict) self.__special_fonts_found += 1 self.__current_dict_name = 'Zapf Dingbats' + def __end_special_font_func(self, line): """ Required: @@ -416,6 +426,7 @@ class Hex2Utf8: self.__current_dict.pop() self.__special_fonts_found -= 1 self.__dict_name = 'default' + def __start_caps_func_old(self, line): """ Required: @@ -427,6 +438,7 @@ class Hex2Utf8: self.__in_caps to 1 """ self.__in_caps = 1 + def __start_caps_func(self, line): """ Required: @@ -440,6 +452,7 @@ class Hex2Utf8: self.__in_caps = 1 value = line[17:-1] self.__caps_list.append(value) + def __end_caps_func(self, line): """ Required: @@ -455,7 +468,8 @@ class Hex2Utf8: else: sys.stderr.write('Module is hex_2_utf8\n') sys.stderr.write('method is __end_caps_func\n') - sys.stderr.write('caps list should be more than one?\n') + sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set + def __text_func(self, line): """ Required: @@ -466,9 +480,8 @@ class Hex2Utf8: if in caps, convert. Otherwise, print out. 
""" text = line[17:-1] - if self.__current_dict_name == 'Symbol'\ - or self.__current_dict_name == 'Wingdings'\ - or self.__current_dict_name == 'Zapf Dingbats': + # print line + if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'): the_string = '' for letter in text: hex_num = hex(ord(letter)) @@ -477,21 +490,21 @@ class Hex2Utf8: hex_num = hex_num[2:] hex_num = '\'%s' % hex_num converted = self.__current_dict.get(hex_num) - if converted == None: + if converted is None: sys.stderr.write('module is hex_2_ut8\n') sys.stderr.write('method is __text_func\n') sys.stderr.write('no hex value for "%s"\n' % hex_num) else: the_string += converted self.__write_obj.write('tx<nu<__________<%s\n' % the_string) + # print the_string else: if self.__caps_list[-1] == 'true' \ and self.__convert_caps\ - and self.__current_dict_name != 'Symbol'\ - and self.__current_dict_name != 'Wingdings'\ - and self.__current_dict_name != 'Zapf Dingbats': + and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'): text = text.upper() self.__write_obj.write('tx<nu<__________<%s\n' % text) + def __utf_to_caps_func(self, line): """ Required: @@ -506,6 +519,7 @@ class Hex2Utf8: # utf_text = utf_text.upper() utf_text = self.__utf_token_to_caps_func(utf_text) self.__write_obj.write('tx<ut<__________<%s\n' % utf_text) + def __utf_token_to_caps_func(self, char_entity): """ Required: @@ -530,28 +544,26 @@ class Hex2Utf8: return char_entity else: return converted + def __convert_body(self): self.__state = 'body' - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__body_state_dict.get(self.__state) - if action == None: - sys.stderr.write('error no state found in hex_2_utf8', - self.__state - ) - action(line) - read_obj.close() + with open(self.__file, 'r') as read_obj: + self.__write_obj = 
open(self.__write_to, 'w') + for line in read_obj: + self.__token_info = line[:16] + action = self.__body_state_dict.get(self.__state) + if action is None: + sys.stderr.write(_('error no state found in hex_2_utf8'), + self.__state + ) + action(line) self.__write_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "body_utf_convert.data") copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) + def convert_hex_2_utf8(self): self.__initiate_values() if self.__area_to_convert == 'preamble': diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 36930dedaf..8217c16a85 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -606,13 +606,13 @@ class ProcessTokens: return 'tx<mc<__________<%s\n' % token def default_func(self, pre, token, num): - if num == None: + if num is None: num = 'true' return 'cw<%s<%s<nu<%s\n' % (pre, token, num) def __list_type_func(self, pre, token, num): type = 'arabic' - if num == None: + if num is None: type = 'Arabic' else: try: From ac07ff853ead790c664051cdb8628a1b1fb30f53 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 7 Jan 2011 08:07:39 +0100 Subject: [PATCH 087/132] Handle non ascii charset in RTF if declared as codepage --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 +- src/calibre/ebooks/rtf2xml/check_encoding.py | 1 + src/calibre/ebooks/rtf2xml/convert_to_tags.py | 50 ++++++++++++++----- .../ebooks/rtf2xml/default_encoding.py | 3 +- 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 901188a000..f9036989b0 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -326,7 +326,6 @@ class ParseRtf: invalid_rtf_handler = InvalidRtfException, ) hex2utf_obj.convert_hex_2_utf8() - # raise 
RtfInvalidCodeException, 'stop' self.__bracket_match('hex_2_utf_preamble') fonts_obj = fonts.Fonts( in_file = self.__temp_file, @@ -523,6 +522,7 @@ class ParseRtf: indent = self.__indent, run_level = self.__run_level, no_dtd = self.__no_dtd, + encoding = encode_obj.get_codepage(), bug_handler = RtfInvalidCodeException, ) tags_obj.convert_to_tags() diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index 4503cbf98a..ae512fa68a 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import sys + class CheckEncoding: def __init__(self, bug_handler): diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index c2244b784a..6563d2e982 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -1,6 +1,9 @@ import os, tempfile -from calibre.ebooks.rtf2xml import copy + +from calibre.ebooks.rtf2xml import copy, check_encoding + public_dtd = 'rtf2xml1.0.dtd' + class ConvertToTags: """ Convert file to XML @@ -10,6 +13,7 @@ class ConvertToTags: bug_handler, dtd_path, no_dtd, + encoding, indent = None, copy = None, run_level = 1, @@ -29,9 +33,14 @@ class ConvertToTags: self.__copy = copy self.__dtd_path = dtd_path self.__no_dtd = no_dtd + if encoding != 'mac_roman': + self.__encoding = 'cp' + encoding + else: + self.__encoding = 'mac_roman' self.__indent = indent self.__run_level = run_level self.__write_to = tempfile.mktemp() + def __initiate_values(self): """ Set values, including those for the dictionary. @@ -61,6 +70,7 @@ class ConvertToTags: 'tx<ut<__________' : self.__text_func, 'mi<tg<empty_____' : self.__empty_func, } + def __open_func(self, line): """ Print the opening tag and newlines when needed. 
@@ -73,6 +83,7 @@ class ConvertToTags: if info in self.__two_new_line: self.__write_extra_new_line() self.__write_obj.write('<%s>' % info) + def __empty_func(self, line): """ Print out empty tag and newlines when needed. @@ -85,6 +96,7 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __open_att_func(self, line): """ Process lines for open tags that have attributes. @@ -119,6 +131,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __empty_att_func(self, line): """ Same as the __open_att_func, except a '/' is placed at the end of the tag. @@ -143,6 +156,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __close_func(self, line): """ Print out the closed tag and new lines, if appropriate. @@ -156,6 +170,7 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __text_func(self, line): """ Simply print out the information between [17:-1] @@ -163,6 +178,7 @@ class ConvertToTags: #tx<nu<__________<Normal; # change this! self.__write_obj.write(line[17:-1]) + def __write_extra_new_line(self): """ Print out extra new lines if the new lines have not exceeded two. If @@ -172,8 +188,10 @@ class ConvertToTags: return if self.__new_line < 2: self.__write_obj.write('\n') + def __default_func(self, line): pass + def __write_new_line(self): """ Print out a new line if a new line has not already been printed out. @@ -183,11 +201,22 @@ class ConvertToTags: if not self.__new_line: self.__write_obj.write('\n') self.__new_line += 1 + def __write_dec(self): """ Write the XML declaration at the top of the document. 
""" - self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') + #keep maximum compatibility with previous version + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler = self.__bug_handler, + ) + if not check_encoding_obj.check_encoding(self.__file): + self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') + elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): + self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding) + else: + self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') + sys.stderr.write(_('Bad RTF encoding, revert to US-ASCII chars and hope for the best')) self.__new_line = 0 self.__write_new_line() if self.__no_dtd: @@ -207,6 +236,7 @@ class ConvertToTags: ) self.__new_line = 0 self.__write_new_line() + def convert_to_tags(self): """ Read in the file one line at a time. Get the important info, between @@ -222,18 +252,14 @@ class ConvertToTags: an empty tag function. """ self.__initiate_values() - read_obj = open(self.__file, 'r') self.__write_obj = open(self.__write_to, 'w') self.__write_dec() - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__token_info) - if action != None: - action(line) - read_obj.close() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__token_info) + if action is not None: + action(line) self.__write_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index a4eeac9663..e145a8a75e 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -132,8 +132,7 @@ class DefaultEncoding: self.__code_page = '850' # if __name__ == '__main__': - 
# from calibre.ebooks.rtf2xml import default_encoding - # encode_obj = default_encoding.DefaultEncoding( + # encode_obj = DefaultEncoding( # in_file = sys.argv[1], # bug_handler = Exception, # check_raw = True, From 24cb5514f08c8bf63d59b35f5fd980d126dd49b0 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 7 Jan 2011 08:15:59 +0100 Subject: [PATCH 088/132] ... --- src/calibre/ebooks/rtf2xml/check_encoding.py | 11 ++++++----- src/calibre/ebooks/rtf2xml/convert_to_tags.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index ae512fa68a..7a7b842db6 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -16,7 +16,7 @@ class CheckEncoding: sys.stderr.write(_('line: %s char: %s\n') % (line_num, char_position)) sys.stderr.write(str(msg) + '\n') - def check_encoding(self, path, encoding='us-ascii'): + def check_encoding(self, path, encoding='us-ascii', verbose = True): line_num = 0 with open(path, 'r') as read_obj: for line in read_obj: @@ -24,10 +24,11 @@ class CheckEncoding: try: line.decode(encoding) except UnicodeError: - if len(line) < 1000: - self.__get_position_error(line, encoding, line_num) - else: - sys.stderr.write(_('line: %d has bad encoding\n') % line_num) + if verbose: + if len(line) < 1000: + self.__get_position_error(line, encoding, line_num) + else: + sys.stderr.write(_('line: %d has bad encoding\n') % line_num) return True return False diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index 6563d2e982..67689eb2d1 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -210,7 +210,7 @@ class ConvertToTags: check_encoding_obj = check_encoding.CheckEncoding( bug_handler = self.__bug_handler, ) - if not check_encoding_obj.check_encoding(self.__file): + if not 
check_encoding_obj.check_encoding(self.__file, verbose = False): self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding) From 56bb15d6ff48a36bbe39660631278ab60c246721 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 7 Jan 2011 22:12:49 +0100 Subject: [PATCH 089/132] Various RTF minor changes --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 1 - src/calibre/ebooks/rtf2xml/delete_info.py | 113 ++++++++++--------- src/calibre/ebooks/rtf2xml/process_tokens.py | 22 ++-- src/calibre/ebooks/rtf2xml/tokenize.py | 2 +- 4 files changed, 69 insertions(+), 69 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index f9036989b0..e994513c68 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -249,7 +249,6 @@ class ParseRtf: else self.__file.encode('utf-8') msg = _('File %s does not appear to be correctly encoded.\n') % file_name raise InvalidRtfException, msg - delete_info_obj = delete_info.DeleteInfo( in_file = self.__temp_file, copy = self.__copy, diff --git a/src/calibre/ebooks/rtf2xml/delete_info.py b/src/calibre/ebooks/rtf2xml/delete_info.py index f79caa3aae..3c93e028b8 100755 --- a/src/calibre/ebooks/rtf2xml/delete_info.py +++ b/src/calibre/ebooks/rtf2xml/delete_info.py @@ -16,7 +16,9 @@ # # ######################################################################### import sys, os, tempfile + from calibre.ebooks.rtf2xml import copy + class DeleteInfo: """Delelet unecessary destination groups""" def __init__(self, @@ -29,17 +31,18 @@ class DeleteInfo: self.__bug_handler = bug_handler self.__copy = copy self.__write_to = tempfile.mktemp() - self.__bracket_count=0 + self.__bracket_count= 0 self.__ob_count = 0 self.__cb_count = 0 - self.__after_asterisk = 0 + self.__after_asterisk = False 
self.__delete = 0 self.__initiate_allow() self.__ob = 0 self.__write_cb = 0 self.__run_level = run_level - self.__found_delete = 0 - self.__list = 0 + self.__found_delete = False + self.__list = False + def __initiate_allow(self): """ Initiate a list of destination groups which should be printed out. @@ -69,6 +72,7 @@ class DeleteInfo: 'delete' : self.__delete_func, 'list' : self.__list_func, } + def __default_func(self,line): """Handle lines when in no special state. Look for an asterisk to begin a special state. Otherwise, print out line.""" @@ -81,13 +85,14 @@ class DeleteInfo: if self.__ob: self.__write_obj.write(self.__ob) self.__ob = line - return 0 + return False else: # write previous bracket, since didn't fine asterisk if self.__ob: self.__write_obj.write(self.__ob) self.__ob = 0 - return 1 + return True + def __delete_func(self,line): """Handle lines when in delete state. Don't print out lines unless the state has ended.""" @@ -95,13 +100,14 @@ class DeleteInfo: self.__state = 'default' if self.__write_cb: self.__write_cb = 0 - return 1 - return 0 + return True + return False + def __asterisk_func(self,line): """ Determine whether to delete info in group Note on self.__cb flag. - If you find that you are in a delete group, and the preivous + If you find that you are in a delete group, and the previous token in not an open bracket (self.__ob = 0), that means that the delete group is nested inside another acceptable detination group. 
In this case, you have alrady written @@ -110,21 +116,21 @@ class DeleteInfo: """ # Test for {\*}, in which case don't enter # delete state - self.__after_asterisk = 0 # only enter this function once - self.__found_delete = 1 + self.__after_asterisk = False # only enter this function once + self.__found_delete = True if self.__token_info == 'cb<nu<clos-brack': if self.__delete_count == self.__cb_count: self.__state = 'default' self.__ob = 0 # changed this because haven't printed out start - return 0 + return False else: # not sure what happens here! # believe I have a '{\*} if self.__run_level > 3: msg = 'flag problem\n' raise self.__bug_handler, msg - return 1 + return True elif self.__token_info in self.__allowable : if self.__ob: self.__write_obj.write(self.__ob) @@ -132,7 +138,7 @@ class DeleteInfo: self.__state = 'default' else: pass - return 1 + return True elif self.__token_info == 'cw<ls<list______': self.__ob = 0 self.__found_list_func(line) @@ -142,75 +148,74 @@ class DeleteInfo: self.__ob = 0 self.__state = 'delete' self.__cb_count = 0 - return 0 + return False else: if self.__run_level > 5: - msg = 'After an asterisk, and found neither an allowable or non-allowble token\n' - msg += 'token is "%s"\n' % self.__token_info + msg = _('After an asterisk, and found neither an allowable or non-allowble token\n\ + token is "%s"\n') % self.__token_info raise self.__bug_handler if not self.__ob: self.__write_cb = 1 self.__ob = 0 self.__state = 'delete' self.__cb_count = 0 - return 0 + return False + def __found_list_func(self, line): """ print out control words in this group """ self.__state = 'list' + def __list_func(self, line): """ Check to see if the group has ended. - Return 1 for all control words. - Return 0 otherwise. + Return True for all control words. + Return False otherwise. 
""" if self.__delete_count == self.__cb_count and self.__token_info ==\ 'cb<nu<clos-brack': self.__state = 'default' if self.__write_cb: self.__write_cb = 0 - return 1 - return 0 + return True + return False elif line[0:2] == 'cw': - return 1 + return True else: - return 0 + return False + def delete_info(self): """Main method for handling other methods. Read one line in at a time, and determine wheter to print the line based on the state.""" - line_to_read = 'dummy' - read_obj = open(self.__file, 'r') self.__write_obj = open(self.__write_to, 'w') - while line_to_read: - #ob<nu<open-brack<0001 - to_print =1 - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - if self.__token_info == 'ob<nu<open-brack': - self.__ob_count = line[-5:-1] - if self.__token_info == 'cb<nu<clos-brack': - self.__cb_count = line[-5:-1] - action = self.__state_dict.get(self.__state) - if not action: - sys.stderr.write('No action in dictionary state is "%s" \n' - % self.__state) - to_print = action(line) - """ - if self.__after_asterisk: - to_print = self.__asterisk_func(line) - elif self.__list: - self.__in_list_func(line) - elif self.__delete: - to_print = self.__delete_func(line) - else: - to_print = self.__default_func(line) - """ - if to_print: - self.__write_obj.write(line) + with open(self.__file, 'r') as read_obj: + for line in read_obj: + #ob<nu<open-brack<0001 + to_print = True + self.__token_info = line[:16] + if self.__token_info == 'ob<nu<open-brack': + self.__ob_count = line[-5:-1] + if self.__token_info == 'cb<nu<clos-brack': + self.__cb_count = line[-5:-1] + action = self.__state_dict.get(self.__state) + if not action: + sys.stderr.write(_('No action in dictionary state is "%s" \n') + % self.__state) + to_print = action(line) + """ + if self.__after_asterisk: + to_print = self.__asterisk_func(line) + elif self.__list: + self.__in_list_func(line) + elif self.__delete: + to_print = self.__delete_func(line) + else: + to_print = 
self.__default_func(line) + """ + if to_print: + self.__write_obj.write(line) self.__write_obj.close() - read_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "delete_info.data") diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 8217c16a85..b3f76d06d7 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -622,7 +622,7 @@ class ProcessTokens: msg = _('Number "%s" cannot be converted to integer\n') % num raise self.__bug_handler, msg type = self.__number_type_dict.get(num) - if type == None: + if type is None: if self.__run_level > 3: msg = _('No type for "%s" in self.__number_type_dict\n') raise self.__bug_handler @@ -634,7 +634,7 @@ class ProcessTokens: if not lang_name: lang_name = "not defined" if self.__run_level > 3: - msg = 'No entry for number "%s"' % num + msg = _('No entry for number "%s"') % num raise self.__bug_handler, msg return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name) @@ -686,9 +686,7 @@ class ProcessTokens: return 'cw<%s<%s<nu<false\n' % (pre, token) ##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token) else: - msg = 'boolean should have some value module process tokens\n' - msg += 'token is ' + token + "\n" - msg += "'" + num + "'" + "\n" + msg = _("boolean should have some value module process tokens\ntoken is %s\n'%s'\n") % (token, num) raise self.__bug_handler, msg def __no_sup_sub_func(self, pre, token, num): @@ -702,11 +700,9 @@ class ProcessTokens: numerator = float(re.search('[0-9.\-]+', numerator).group()) except TypeError, msg: if self.__run_level > 3: - msg = 'no number to process?\n' - msg += 'this indicates that the token ' - msg += ' \(\\li\) should have a number and does not\n' - msg += 'numerator is "%s"\n' % numerator - msg += 'denominator is "%s"\n' % denominator + msg = _('No number to process?\nthis indicates that the token \(\\li\) \ + 
should have a number and does not\nnumerator is \ + "%s"\ndenominator is "%s"\n') % (numerator, denominator) raise self.__bug_handler, msg if 5 > self.__return_code: self.__return_code = 5 @@ -720,17 +716,17 @@ class ProcessTokens: def split_let_num(self, token): match_obj = re.search(self.__num_exp,token) - if match_obj != None: + if match_obj is not None: first = match_obj.group(1) second = match_obj.group(2) if not second: if self.__run_level > 3: - msg = "token is '%s' \n" % token + msg = _("token is '%s' \n") % token raise self.__bug_handler, msg return first, 0 else: if self.__run_level > 3: - msg = "token is '%s' \n" % token + msg = _("token is '%s' \n") % token raise self.__bug_handler return token, 0 return first, second diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index d60909a610..de66415f0c 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -117,7 +117,7 @@ class Tokenize: input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) #remove \n in bin data input_file = self.__bin_exp.sub(lambda x: \ - x.group().replace('\n', '') +'\n', input_file) + x.group().replace('\n', '') + '\n', input_file) #split tokens = re.split(self.__splitexp, input_file) #remove empty tokens and \n From be93bd120ab46c8bfe8959ca1e4186b7992c6fff Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 7 Jan 2011 23:29:35 +0100 Subject: [PATCH 090/132] clean picture handling TODO: update for new rtf --- src/calibre/ebooks/rtf2xml/pict.py | 108 ++++++++++++----------------- 1 file changed, 43 insertions(+), 65 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py index 3a1044520e..be2cd9e600 100755 --- a/src/calibre/ebooks/rtf2xml/pict.py +++ b/src/calibre/ebooks/rtf2xml/pict.py @@ -16,7 +16,9 @@ # # ######################################################################### import sys, os, tempfile + from 
calibre.ebooks.rtf2xml import copy + class Pict: """Process graphic information""" def __init__(self, @@ -36,13 +38,11 @@ class Pict: self.__ob_count = 0 self.__cb_count = 0 self.__pict_count = 0 - self.__in_pict = 0 - self.__already_found_pict = 0 + self.__in_pict = False + self.__already_found_pict = False self.__orig_file = orig_file self.__initiate_pict_dict() self.__out_file = out_file - # this is left over - self.__no_ask = 1 def __initiate_pict_dict(self): self.__pict_dict = { @@ -71,57 +71,43 @@ class Pict: self.__out_file)) else: dir_name = os.path.dirname(self.__orig_file) - # self.__output_to_file_func() self.__dir_name = base_name + "_rtf_pict_dir/" self.__dir_name = os.path.join(dir_name, self.__dir_name) if not os.path.isdir(self.__dir_name): try: os.mkdir(self.__dir_name) except OSError, msg: - msg = str(msg) - msg += "Couldn't make directory '%s':\n" % (self.__dir_name) + msg = _("%sCouldn't make directory '%s':\n") % (str(msg), self.__dir_name) raise self.__bug_handler else: - if self.__no_ask: - user_response = 'r' - else: - msg = 'Do you want to remove all files in %s?\n' % self.__dir_name - msg += 'Type "r" to remove.\n' - msg += 'Type any other key to keep files in place.\n' - sys.stderr.write(msg) - user_response = raw_input() - if user_response == 'r': - if self.__run_level > 1: - sys.stderr.write('Removing files from old pict directory...\n') - all_files = os.listdir(self.__dir_name) - for the_file in all_files: - the_file = os.path.join(self.__dir_name, the_file) - try: - os.remove(the_file) - except OSError: - pass - if self.__run_level > 1: - sys.stderr.write('Files removed.\n') + if self.__run_level > 1: + sys.stderr.write(_('Removing files from old pict directory...\n')) + all_files = os.listdir(self.__dir_name) + for the_file in all_files: + the_file = os.path.join(self.__dir_name, the_file) + try: + os.remove(the_file) + except OSError: + pass + if self.__run_level > 1: + sys.stderr.write(_('Files removed.\n')) def 
__create_pict_file(self): """Create a file for all the pict data to be written to. """ self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf') - write_pic_obj = open(self.__pict_file, 'w') - write_pic_obj.close() self.__write_pic_obj = open(self.__pict_file, 'a') def __in_pict_func(self, line): if self.__cb_count == self.__pict_br_count: - self.__in_pict = 0 + self.__in_pict = False self.__write_pic_obj.write("}\n") - return 1 + return True else: action = self.__pict_dict.get(self.__token_info) if action: - line = action(line) - self.__write_pic_obj.write(line) - return 0 + self.__write_pic_obj.write(action(line)) + return False def __default(self, line, write_obj): """Determine if each token marks the beginning of pict data. @@ -142,50 +128,42 @@ class Pict: write_obj.write('mi<mk<pict-end__\n') if not self.__already_found_pict: self.__create_pict_file() - self.__already_found_pict=1; + self.__already_found_pict=True; self.__print_rtf_header() self.__in_pict = 1 self.__pict_br_count = self.__ob_count self.__cb_count = 0 self.__write_pic_obj.write("{\\pict\n") - return 0 - return 1 + return False + return True def __print_rtf_header(self): """Print to pict file the necessary RTF data for the file to be recognized as an RTF file. 
""" - self.__write_pic_obj.write("{\\rtf1 \n") - self.__write_pic_obj.write("{\\fonttbl\\f0\\null;} \n") - self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n") - self.__write_pic_obj.write("\\pard \n") + self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n") + self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n") def process_pict(self): self.__make_dir() - read_obj = open(self.__file) - write_obj = open(self.__write_to, 'w') - line_to_read = 'dummy' - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - if self.__token_info == 'ob<nu<open-brack': - self.__ob_count = line[-5:-1] - if self.__token_info == 'cb<nu<clos-brack': - self.__cb_count = line[-5:-1] - if not self.__in_pict: - to_print = self.__default(line, write_obj) - if to_print : - write_obj.write(line) - else: - to_print = self.__in_pict_func(line) - if to_print : - write_obj.write(line) - if self.__already_found_pict: - self.__write_pic_obj.write("}\n") - self.__write_pic_obj.close() - read_obj.close() - write_obj.close() + with open(self.__file) as read_obj, open(self.__write_to, 'w') as write_obj: + for line in read_obj: + self.__token_info = line[:16] + if self.__token_info == 'ob<nu<open-brack': + self.__ob_count = line[-5:-1] + if self.__token_info == 'cb<nu<clos-brack': + self.__cb_count = line[-5:-1] + if not self.__in_pict: + to_print = self.__default(line, write_obj) + if to_print : + write_obj.write(line) + else: + to_print = self.__in_pict_func(line) + if to_print : + write_obj.write(line) + if self.__already_found_pict: + self.__write_pic_obj.write("}\n") + self.__write_pic_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "pict.data") From 4aa12408f268ee5c65092fbb90479ac31acc83b7 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 7 Jan 2011 23:36:42 +0100 Subject: [PATCH 091/132] Clean 
RTF combine borders --- src/calibre/ebooks/rtf2xml/combine_borders.py | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/combine_borders.py b/src/calibre/ebooks/rtf2xml/combine_borders.py index 71cd822e30..c0b7185c9b 100755 --- a/src/calibre/ebooks/rtf2xml/combine_borders.py +++ b/src/calibre/ebooks/rtf2xml/combine_borders.py @@ -16,7 +16,9 @@ # # ######################################################################### import os, tempfile + from calibre.ebooks.rtf2xml import copy + class CombineBorders: """Combine borders in RTF tokens to make later processing easier""" def __init__(self, @@ -32,28 +34,31 @@ class CombineBorders: self.__state = 'default' self.__bord_pos = 'default' self.__bord_att = [] + def found_bd(self, line): #cw<bd<bor-t-r-vi self.__state = 'border' self.__bord_pos = line[6:16] + def __default_func(self, line): #cw<bd<bor-t-r-vi if self.__first_five == 'cw<bd': self.found_bd(line) return '' return line + def end_border(self, line, write_obj): - joiner = "|" - border_string = joiner.join(self.__bord_att) + border_string = "|".join(self.__bord_att) self.__bord_att = [] write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos, - border_string)) + border_string)) self.__state = 'default' self.__bord_string = '' if self.__first_five == 'cw<bd': self. 
found_bd(line) else: write_obj.write(line) + def add_to_border_desc(self, line): #cw<bt<bdr-hair__<nu<true #cw<bt<bdr-linew<nu<0.50 @@ -65,26 +70,22 @@ class CombineBorders: else: num = ':' + num self.__bord_att.append(border_desc + num) + def __border_func(self, line, write_obj): if self.__first_five != 'cw<bt': self.end_border(line, write_obj) else: self.add_to_border_desc(line) + def combine_borders(self): - read_obj = open(self.__file, 'r') - write_obj = open(self.__write_to, 'w') - line_to_read = 'dummy' - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__first_five = line[0:5] - if self.__state == 'border': - self.__border_func(line, write_obj) - else: - to_print = self.__default_func(line) - write_obj.write(to_print) - read_obj.close() - write_obj.close() + with open(self.__file, 'r') as read_obj, + open(self.__write_to, 'w') as write_obj: + for line in read_obj: + self.__first_five = line[0:5] + if self.__state == 'border': + self.__border_func(line, write_obj) + else: + write_obj.write(self.__default_func(line)) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "combine_borders.data") From 3356fe6a8b51eb9df2f235f2399e313d18e8fb43 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 8 Jan 2011 08:35:59 +0100 Subject: [PATCH 092/132] cleaning RTF footnote --- src/calibre/ebooks/rtf2xml/footnote.py | 84 ++++++++++++++------------ 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/footnote.py b/src/calibre/ebooks/rtf2xml/footnote.py index a596ca73f6..0027348cde 100755 --- a/src/calibre/ebooks/rtf2xml/footnote.py +++ b/src/calibre/ebooks/rtf2xml/footnote.py @@ -16,7 +16,9 @@ # # ######################################################################### import os, tempfile + from calibre.ebooks.rtf2xml import copy + class Footnote: """ Two public methods are available. 
The first separates all of the @@ -35,6 +37,7 @@ class Footnote: self.__copy = copy self.__write_to = tempfile.mktemp() self.__found_a_footnote = 0 + def __first_line_func(self, line): """ Print the tag info for footnotes. Check whether footnote is an @@ -47,6 +50,7 @@ class Footnote: self.__write_to_foot_obj.write( 'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count) self.__first_line = 0 + def __in_footnote_func(self, line): """Handle all tokens that are part of footnote""" if self.__first_line: @@ -68,6 +72,7 @@ class Footnote: 'mi<mk<footnt-clo\n') else: self.__write_to_foot_obj.write(line) + def __found_footnote(self, line): """ Found a footnote""" self.__found_a_footnote = 1 @@ -81,6 +86,7 @@ class Footnote: 'mi<mk<footnt-ind<%04d\n' % self.__footnote_count) self.__write_to_foot_obj.write( 'mi<mk<footnt-ope<%04d\n' % self.__footnote_count) + def __default_sep(self, line): """Handle all tokens that are not footnote tokens""" if self.__token_info == 'cw<nt<footnote__': @@ -91,6 +97,7 @@ class Footnote: self.__write_obj.write( 'tx<nu<__________<%s\n' % num ) + def __initiate_sep_values(self): """ initiate counters for separate_footnotes method. @@ -102,6 +109,7 @@ class Footnote: self.__in_footnote = 0 self.__first_line = 0 #have not processed the first line of footnote self.__footnote_count = 0 + def separate_footnotes(self): """ Separate all the footnotes in an RTF file and put them at the bottom, @@ -111,58 +119,55 @@ class Footnote: bottom of the main file. 
""" self.__initiate_sep_values() - read_obj = open(self.__file) self.__write_obj = open(self.__write_to, 'w') - self.__footnote_holder = tempfile.mktemp() - self.__write_to_foot_obj = open(self.__footnote_holder, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - # keep track of opening and closing brackets - if self.__token_info == 'ob<nu<open-brack': - self.__ob_count = line[-5:-1] - if self.__token_info == 'cb<nu<clos-brack': - self.__cb_count = line[-5:-1] - # In the middle of footnote text - if self.__in_footnote: - self.__in_footnote_func(line) - # not in the middle of footnote text - else: - self.__default_sep(line) + with open(self.__file) as read_obj: + self.__footnote_holder = tempfile.mktemp() + self.__write_to_foot_obj = open(self.__footnote_holder, 'w') + line_to_read = 1 + while line_to_read: + line_to_read = read_obj.readline() + line = line_to_read + self.__token_info = line[:16] + # keep track of opening and closing brackets + if self.__token_info == 'ob<nu<open-brack': + self.__ob_count = line[-5:-1] + if self.__token_info == 'cb<nu<clos-brack': + self.__cb_count = line[-5:-1] + # In the middle of footnote text + if self.__in_footnote: + self.__in_footnote_func(line) + # not in the middle of footnote text + else: + self.__default_sep(line) self.__write_obj.close() - read_obj.close() self.__write_to_foot_obj.close() - read_obj = open(self.__footnote_holder, 'r') - write_obj = open(self.__write_to, 'a') - write_obj.write( - 'mi<mk<sect-close\n' - 'mi<mk<body-close\n' - 'mi<tg<close_____<section\n' - 'mi<tg<close_____<body\n' - 'mi<tg<close_____<doc\n' - 'mi<mk<footnt-beg\n') - line = 1 - while line: - line = read_obj.readline() - write_obj.write(line) - write_obj.write( - 'mi<mk<footnt-end\n') - read_obj.close() - write_obj.close() + with open(self.__footnote_holder, 'r') as read_obj, + open(self.__write_to, 'a') as write_obj: + write_obj.write( + 
'mi<mk<sect-close\n' + 'mi<mk<body-close\n' + 'mi<tg<close_____<section\n' + 'mi<tg<close_____<body\n' + 'mi<tg<close_____<doc\n' + 'mi<mk<footnt-beg\n') + for line in read_obj: + write_obj.write(line) + write_obj.write( + 'mi<mk<footnt-end\n') os.remove(self.__footnote_holder) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "footnote_separate.data") copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) + def update_info(self, file, copy): """ Unused method """ self.__file = file self.__copy = copy + def __get_foot_body_func(self, line): """ Process lines in main body and look for beginning of footnotes. @@ -172,6 +177,7 @@ class Footnote: self.__state = 'foot' else: self.__write_obj.write(line) + def __get_foot_foot_func(self, line): """ Copy footnotes from bottom of file to a separate, temporary file. @@ -180,6 +186,7 @@ class Footnote: self.__state = 'body' else: self.__write_to_foot_obj.write(line) + def __get_footnotes(self): """ Private method to remove footnotes from main file. Read one line from @@ -203,6 +210,7 @@ class Footnote: read_obj.close() self.__write_obj.close() self.__write_to_foot_obj.close() + def __get_foot_from_temp(self, num): """ Private method for joining footnotes to body. This method reads from @@ -223,6 +231,7 @@ class Footnote: else: if line == look_for: found_foot = 1 + def __join_from_temp(self): """ Private method for rejoining footnotes to body. 
Read from the @@ -242,6 +251,7 @@ class Footnote: line = self.__get_foot_from_temp(line[17:-1]) self.__write_obj.write(line) read_obj.close() + def join_footnotes(self): """ Join the footnotes from the bottom of the file and put them in their From 40430094333177fe8fc1173d684a936ceccd4359 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 9 Jan 2011 14:47:23 +0100 Subject: [PATCH 093/132] Add metadata to info in RTF metadata plugin --- src/calibre/ebooks/metadata/rtf.py | 86 ++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py index ad41125575..bb6392af6d 100644 --- a/src/calibre/ebooks/metadata/rtf.py +++ b/src/calibre/ebooks/metadata/rtf.py @@ -11,6 +11,8 @@ title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL) author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL) comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL) category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL) +tags_pat = re.compile(r'\{\\info.*?\{\\keywords(.*?)(?<!\\)\}', re.DOTALL) +publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL) def get_document_info(stream): """ @@ -93,50 +95,70 @@ def get_metadata(stream): stream.seek(0) cpg = detect_codepage(stream) stream.seek(0) - + title_match = title_pat.search(block) if title_match: title = decode(title_match.group(1).strip(), cpg) + else: + title = _('Unknown') author_match = author_pat.search(block) if author_match: author = decode(author_match.group(1).strip(), cpg) - comment_match = comment_pat.search(block) - if comment_match: - comment = decode(comment_match.group(1).strip(), cpg) - category_match = category_pat.search(block) - if category_match: - category = decode(category_match.group(1).strip(), cpg) + else: + author = None mi = MetaInformation(title, author) if author: mi.authors = 
string_to_authors(author) - mi.comments = comment - mi.category = category + + comment_match = comment_pat.search(block) + if comment_match: + comment = decode(comment_match.group(1).strip(), cpg) + mi.comments = comment + category_match = category_pat.search(block) + if category_match: + category = decode(category_match.group(1).strip(), cpg) + mi.category = category + tags_match = tags_pat.search(block) + if tags_match: + tags = decode(tags_match.group(1).strip(), cpg) + mi.tags = tags + publisher_match = publisher_pat.search(block) + if publisher_match: + publisher = decode(publisher_match.group(1).strip(), cpg) + mi.publisher = publisher + return mi - def create_metadata(stream, options): - md = r'{\info' + md = [r'{\info'] if options.title: title = options.title.encode('ascii', 'ignore') - md += r'{\title %s}'%(title,) + md.append(r'{\title %s}'%(title,)) if options.authors: au = options.authors if not isinstance(au, basestring): au = u', '.join(au) author = au.encode('ascii', 'ignore') - md += r'{\author %s}'%(author,) + md.append(r'{\author %s}'%(author,)) if options.get('category', None): category = options.category.encode('ascii', 'ignore') - md += r'{\category %s}'%(category,) + md.append(r'{\category %s}'%(category,)) comp = options.comment if hasattr(options, 'comment') else options.comments if comp: comment = comp.encode('ascii', 'ignore') - md += r'{\subject %s}'%(comment,) - if len(md) > 6: - md += '}' + md.append(r'{\subject %s}'%(comment,)) + if options.publisher: + publisher = options.publisher.encode('ascii', 'ignore') + md.append(r'{\manager %s}'%(publisher,)) + if options.tags: + tags = u', '.join(options.tags) + tags = tags.encode('ascii', 'ignore') + md.append(r'{\keywords %s}'%(tags,)) + if len(md) > 1: + md.append('}') stream.seek(0) src = stream.read() - ans = src[:6] + md + src[6:] + ans = src[:6] + ''.join(md) + src[6:] stream.seek(0) stream.write(ans) @@ -149,14 +171,15 @@ def set_metadata(stream, options): index = src.rindex('}') 
return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}' src, pos = get_document_info(stream) - if not src: + print 'I was thre' + if src is not None: create_metadata(stream, options) else: olen = len(src) base_pat = r'\{\\name(.*?)(?<!\\)\}' title = options.title - if title != None: + if title is not None: title = title.encode('ascii', 'replace') pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL) if pat.search(src): @@ -164,7 +187,7 @@ def set_metadata(stream, options): else: src = add_metadata_item(src, 'title', title) comment = options.comments - if comment != None: + if comment is not None: comment = comment.encode('ascii', 'replace') pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL) if pat.search(src): @@ -172,7 +195,7 @@ def set_metadata(stream, options): else: src = add_metadata_item(src, 'subject', comment) author = options.authors - if author != None: + if author is not None: author = ', '.join(author) author = author.encode('ascii', 'ignore') pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL) @@ -181,13 +204,30 @@ def set_metadata(stream, options): else: src = add_metadata_item(src, 'author', author) category = options.get('category', None) - if category != None: + if category is not None: category = category.encode('ascii', 'replace') pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL) if pat.search(src): src = pat.sub(r'{\\category ' + category + r'}', src) else: src = add_metadata_item(src, 'category', category) + tags = options.tags + if tags is not None: + tags = ', '.join(tags) + tags = tags.encode('ascii', 'ignore') + pat = re.compile(base_pat.replace('name', 'keywords'), re.DOTALL) + if pat.search(src): + src = pat.sub(r'{\\keywords ' + tags + r'}', src) + else: + src = add_metadata_item(src, 'keywords', tags) + publisher = options.publisher + if publisher is not None: + publisher = publisher.encode('ascii', 'replace') + pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL) + if 
pat.search(src): + src = pat.sub(r'{\\manager ' + publisher + r'}', src) + else: + src = add_metadata_item(src, 'manager', publisher) stream.seek(pos + olen) after = stream.read() stream.seek(pos) From 5ffbfc89b533308b1b442167bf2ed7ccb0e59a8e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 9 Jan 2011 19:26:39 +0100 Subject: [PATCH 094/132] Correct a bug with file opening and convert to with --- src/calibre/ebooks/rtf2xml/combine_borders.py | 2 +- src/calibre/ebooks/rtf2xml/footnote.py | 66 +++++++------------ 2 files changed, 26 insertions(+), 42 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/combine_borders.py b/src/calibre/ebooks/rtf2xml/combine_borders.py index c0b7185c9b..a0bc77e7ad 100755 --- a/src/calibre/ebooks/rtf2xml/combine_borders.py +++ b/src/calibre/ebooks/rtf2xml/combine_borders.py @@ -78,7 +78,7 @@ class CombineBorders: self.add_to_border_desc(line) def combine_borders(self): - with open(self.__file, 'r') as read_obj, + with open(self.__file, 'r') as read_obj, \ open(self.__write_to, 'w') as write_obj: for line in read_obj: self.__first_five = line[0:5] diff --git a/src/calibre/ebooks/rtf2xml/footnote.py b/src/calibre/ebooks/rtf2xml/footnote.py index 0027348cde..c1ffb18ada 100755 --- a/src/calibre/ebooks/rtf2xml/footnote.py +++ b/src/calibre/ebooks/rtf2xml/footnote.py @@ -119,14 +119,11 @@ class Footnote: bottom of the main file. 
""" self.__initiate_sep_values() - self.__write_obj = open(self.__write_to, 'w') - with open(self.__file) as read_obj: - self.__footnote_holder = tempfile.mktemp() - self.__write_to_foot_obj = open(self.__footnote_holder, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read + self.__footnote_holder = tempfile.mktemp() + with open(self.__file) as read_obj, \ + open(self.__write_to, 'w') as self.__write_obj, \ + open(self.__footnote_holder, 'w') as self.__write_to_foot_obj: + for line in read_obj: self.__token_info = line[:16] # keep track of opening and closing brackets if self.__token_info == 'ob<nu<open-brack': @@ -139,9 +136,7 @@ class Footnote: # not in the middle of footnote text else: self.__default_sep(line) - self.__write_obj.close() - self.__write_to_foot_obj.close() - with open(self.__footnote_holder, 'r') as read_obj, + with open(self.__footnote_holder, 'r') as read_obj, \ open(self.__write_to, 'a') as write_obj: write_obj.write( 'mi<mk<sect-close\n' @@ -195,21 +190,15 @@ class Footnote: These two functions do the work of separating the footnotes form the body. 
""" - read_obj = open(self.__file) - self.__write_obj = open(self.__write_to, 'w') - # self.__write_to = "footnote_info.data" - self.__write_to_foot_obj = open(self.__footnote_holder, 'w') - line = 1 - while line: - line = read_obj.readline() - self.__token_info = line[:16] - if self.__state == 'body': - self.__get_foot_body_func(line) - elif self.__state == 'foot': - self.__get_foot_foot_func(line) - read_obj.close() - self.__write_obj.close() - self.__write_to_foot_obj.close() + with open(self.__file) as read_obj, \ + open(self.__write_to, 'w') as self.__write_obj, \ + open(self.__footnote_holder, 'w') as self.__write_to_foot_obj: + for line in read_obj: + self.__token_info = line[:16] + if self.__state == 'body': + self.__get_foot_body_func(line) + elif self.__state == 'foot': + self.__get_foot_foot_func(line) def __get_foot_from_temp(self, num): """ @@ -221,9 +210,7 @@ class Footnote: look_for = 'mi<mk<footnt-ope<' + num + '\n' found_foot = 0 string_to_return = '' - line = 1 - while line: - line = self.__read_from_foot_obj.readline() + for line in self.__read_from_foot_obj: if found_foot: if line == 'mi<mk<footnt-clo\n': return string_to_return @@ -241,16 +228,13 @@ class Footnote: print out to the third file. If no footnote marker is found, simply print out the token (line). 
""" - self.__read_from_foot_obj = open(self.__footnote_holder, 'r') - read_obj = open(self.__write_to, 'r') - self.__write_obj = open(self.__write_to2, 'w') - line = 1 - while line: - line = read_obj.readline() - if line[:16] == 'mi<mk<footnt-ind': - line = self.__get_foot_from_temp(line[17:-1]) - self.__write_obj.write(line) - read_obj.close() + with open(self.__footnote_holder, 'r') as self.__read_from_foot_obj, \ + open(self.__write_to, 'r') as read_obj, \ + open(self.__write_to2, 'w') as self.__write_obj: + for line in read_obj: + if line[:16] == 'mi<mk<footnt-ind': + line = self.__get_foot_from_temp(line[17:-1]) + self.__write_obj.write(line) def join_footnotes(self): """ @@ -268,8 +252,8 @@ class Footnote: self.__state = 'body' self.__get_footnotes() self.__join_from_temp() - self.__write_obj.close() - self.__read_from_foot_obj.close() + # self.__write_obj.close() + # self.__read_from_foot_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to2, "footnote_joined.data") From a335d86cd59ba6d6374fd1aa43473bfe1896b40f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 9 Jan 2011 20:46:15 +0100 Subject: [PATCH 095/132] Add pict.rtf if debugging + simplify extract images in RTFinput --- src/calibre/ebooks/rtf/input.py | 51 ++++++++++++++++++++---------- src/calibre/ebooks/rtf2xml/pict.py | 4 ++- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 05c851a075..545c1fb3c8 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -127,35 +127,52 @@ class RTFInput(InputFormatPlugin): def extract_images(self, picts): self.log('Extracting images...') - - count = 0 + raw = open(picts, 'rb').read() - starts = [] - for match in re.finditer(r'\{\\pict([^}]+)\}', raw): - starts.append(match.start(1)) - + picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw)) + hex = 
re.compile(r'[^a-zA-Z0-9]') + encs = [hex.sub('', pict) for pict in picts] + + count = 0 imap = {} - - for start in starts: - pos, bc = start, 1 - while bc > 0: - if raw[pos] == '}': bc -= 1 - elif raw[pos] == '{': bc += 1 - pos += 1 - pict = raw[start:pos+1] - enc = re.sub(r'[^a-zA-Z0-9]', '', pict) + for enc in encs: if len(enc) % 2 == 1: enc = enc[:-1] data = enc.decode('hex') count += 1 - name = (('%4d'%count).replace(' ', '0'))+'.wmf' + name = '%04d.wmf' % count open(name, 'wb').write(data) imap[count] = name #open(name+'.hex', 'wb').write(enc) return self.convert_images(imap) + # count = 0 + # raw = open(picts, 'rb').read() + # starts = [] + # for match in re.finditer(r'\{\\pict([^}]+)\}', raw): + # starts.append(match.start(1)) + + # imap = {} + # for start in starts: + # pos, bc = start, 1 + # while bc > 0: + # if raw[pos] == '}': bc -= 1 + # elif raw[pos] == '{': bc += 1 + # pos += 1 + # pict = raw[start:pos+1] + # enc = re.sub(r'[^a-zA-Z0-9]', '', pict) + # if len(enc) % 2 == 1: + # enc = enc[:-1] + # data = enc.decode('hex') + # count += 1 + # name = (('%4d'%count).replace(' ', '0'))+'.wmf' + # open(name, 'wb').write(data) + # imap[count] = name + # #open(name+'.hex', 'wb').write(enc) + # return self.convert_images(imap) + def convert_images(self, imap): - for count, val in imap.items(): + for count, val in imap.iteritems(): try: imap[count] = self.convert_image(val) except: diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py index be2cd9e600..a6cc2deade 100755 --- a/src/calibre/ebooks/rtf2xml/pict.py +++ b/src/calibre/ebooks/rtf2xml/pict.py @@ -146,7 +146,8 @@ class Pict: def process_pict(self): self.__make_dir() - with open(self.__file) as read_obj, open(self.__write_to, 'w') as write_obj: + with open(self.__file) as read_obj, \ + open(self.__write_to, 'w') as write_obj: for line in read_obj: self.__token_info = line[:16] if self.__token_info == 'ob<nu<open-brack': @@ -167,6 +168,7 @@ class Pict: copy_obj = 
copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "pict.data") + copy_obj.copy_file(self.__pict_file, "pict.rtf") copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) if self.__pict_count == 0: From 7fb1fdd8e8ad8b6ae2dcfb71a35514e9b4387ca0 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 9 Jan 2011 21:16:00 +0100 Subject: [PATCH 096/132] Remove category field in rtf metadata plugin --- src/calibre/ebooks/metadata/rtf.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py index bb6392af6d..f88250e72a 100644 --- a/src/calibre/ebooks/metadata/rtf.py +++ b/src/calibre/ebooks/metadata/rtf.py @@ -10,7 +10,6 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL) author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL) comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL) -category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL) tags_pat = re.compile(r'\{\\info.*?\{\\keywords(.*?)(?<!\\)\}', re.DOTALL) publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL) @@ -84,13 +83,12 @@ def decode(raw, codec): def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ - title, author, comment, category = None, None, None, None stream.seek(0) if stream.read(5) != r'{\rtf': - return MetaInformation(None, None) + return MetaInformation(_('Unknown'), None) block = get_document_info(stream)[0] if not block: - return MetaInformation(None, None) + return MetaInformation(_('Unknown'), None) stream.seek(0) cpg = detect_codepage(stream) @@ -114,10 +112,6 @@ def get_metadata(stream): if comment_match: comment = decode(comment_match.group(1).strip(), cpg) mi.comments = comment - category_match = 
category_pat.search(block) - if category_match: - category = decode(category_match.group(1).strip(), cpg) - mi.category = category tags_match = tags_pat.search(block) if tags_match: tags = decode(tags_match.group(1).strip(), cpg) @@ -140,9 +134,6 @@ def create_metadata(stream, options): au = u', '.join(au) author = au.encode('ascii', 'ignore') md.append(r'{\author %s}'%(author,)) - if options.get('category', None): - category = options.category.encode('ascii', 'ignore') - md.append(r'{\category %s}'%(category,)) comp = options.comment if hasattr(options, 'comment') else options.comments if comp: comment = comp.encode('ascii', 'ignore') @@ -203,14 +194,6 @@ def set_metadata(stream, options): src = pat.sub(r'{\\author ' + author + r'}', src) else: src = add_metadata_item(src, 'author', author) - category = options.get('category', None) - if category is not None: - category = category.encode('ascii', 'replace') - pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL) - if pat.search(src): - src = pat.sub(r'{\\category ' + category + r'}', src) - else: - src = add_metadata_item(src, 'category', category) tags = options.tags if tags is not None: tags = ', '.join(tags) From ce5ece5750669117531c1e1b1749a37993fb1901 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 9 Jan 2011 22:35:33 +0100 Subject: [PATCH 097/132] Replace keywords by category in rtf metadata --- src/calibre/ebooks/metadata/rtf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py index f88250e72a..3e316ee430 100644 --- a/src/calibre/ebooks/metadata/rtf.py +++ b/src/calibre/ebooks/metadata/rtf.py @@ -10,7 +10,7 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL) author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL) comment_pat = 
re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL) -tags_pat = re.compile(r'\{\\info.*?\{\\keywords(.*?)(?<!\\)\}', re.DOTALL) +tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL) publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL) def get_document_info(stream): @@ -144,7 +144,7 @@ def create_metadata(stream, options): if options.tags: tags = u', '.join(options.tags) tags = tags.encode('ascii', 'ignore') - md.append(r'{\keywords %s}'%(tags,)) + md.append(r'{\category %s}'%(tags,)) if len(md) > 1: md.append('}') stream.seek(0) @@ -198,11 +198,11 @@ def set_metadata(stream, options): if tags is not None: tags = ', '.join(tags) tags = tags.encode('ascii', 'ignore') - pat = re.compile(base_pat.replace('name', 'keywords'), re.DOTALL) + pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL) if pat.search(src): - src = pat.sub(r'{\\keywords ' + tags + r'}', src) + src = pat.sub(r'{\\category ' + tags + r'}', src) else: - src = add_metadata_item(src, 'keywords', tags) + src = add_metadata_item(src, 'category', tags) publisher = options.publisher if publisher is not None: publisher = publisher.encode('ascii', 'replace') From 6e831360a29d9f9e181f238a5565bff3fb7dc253 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 10 Jan 2011 21:27:45 +0100 Subject: [PATCH 098/132] Fix 0 case in rtf cp + case when there is no pictures in a file --- src/calibre/ebooks/rtf2xml/default_encoding.py | 6 ++++-- src/calibre/ebooks/rtf2xml/pict.py | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index e145a8a75e..c7e030e48b 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -101,7 +101,7 @@ class DefaultEncoding: break if self.__token_info == 'cw<ri<ansi-codpg': #cw<ri<ansi-codpg<nu<10000 - self.__code_page = line[20:-1] if 
line[20:-1] \ + self.__code_page = line[20:-1] if int(line[20:-1]) \ else '1252' if self.__token_info == 'cw<ri<macintosh_': self.__platform = 'Macintosh' @@ -120,7 +120,9 @@ class DefaultEncoding: fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+') for line in read_obj: if fenccp.search(line): - self.__code_page = fenccp.search(line).group(1) + cp = fenccp.search(line).group(1) + if not int(cp): + self.__code_page = cp break if fenc.search(line): enc = fenc.search(line).group(1) diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py index a6cc2deade..a8f7746f60 100755 --- a/src/calibre/ebooks/rtf2xml/pict.py +++ b/src/calibre/ebooks/rtf2xml/pict.py @@ -168,7 +168,10 @@ class Pict: copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "pict.data") - copy_obj.copy_file(self.__pict_file, "pict.rtf") + try: + copy_obj.copy_file(self.__pict_file, "pict.rtf") + except: + pass copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) if self.__pict_count == 0: From ff8d8968ef58aed95029456958afc38b7c8c6c3e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 10 Jan 2011 23:52:34 +0100 Subject: [PATCH 099/132] Improve hard line break handling in RTF --- src/calibre/ebooks/rtf2xml/inline.py | 8 +------- src/calibre/ebooks/rtf2xml/process_tokens.py | 5 ++++- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/inline.py b/src/calibre/ebooks/rtf2xml/inline.py index 5ca1cd0783..55e6ed1dbb 100755 --- a/src/calibre/ebooks/rtf2xml/inline.py +++ b/src/calibre/ebooks/rtf2xml/inline.py @@ -51,7 +51,6 @@ class Inline: 'tx<ut<__________' : self.__found_text_func, 'mi<mk<inline-fld' : self.__found_text_func, 'text' : self.__found_text_func, - 'cw<nu<hard-lineb' : self.__found_text_func, #calibre 'cb<nu<clos-brack' : self.__close_bracket_func, 'mi<mk<par-end___' : self.__end_para_func, 'mi<mk<footnt-ope' : self.__end_para_func, @@ -63,7 
+62,6 @@ class Inline: 'tx<hx<__________' : self.__found_text_func, 'tx<ut<__________' : self.__found_text_func, 'text' : self.__found_text_func, - 'cw<nu<hard-lineb' : self.__found_text_func, #calibre 'mi<mk<inline-fld' : self.__found_text_func, 'ob<nu<open-brack': self.__found_open_bracket_func, 'mi<mk<par-end___' : self.__end_para_func, @@ -257,7 +255,6 @@ class Inline: Text can mark the start of a paragraph. If already in a paragraph, check to see if any groups are waiting to be added. If so, use another method to write these groups. - 3. If not check if hardline break, then write """ if self.__place == 'in_list': self.__write_inline() @@ -265,10 +262,7 @@ class Inline: if not self.__in_para: self.__in_para = 1 self.__start_para_func(line) - else: - if self.__token_info == 'cw<nu<hard-lineb': #calibre - self.__write_obj.write('mi<tg<empty_____<hardline-break\n') - if self.__groups_in_waiting[0] != 0: + elif self.__groups_in_waiting[0] != 0: self.__write_inline() def __write_inline(self): diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index b3f76d06d7..9f26bb295b 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -70,6 +70,7 @@ class ProcessTokens: ';' : ('mc', ';', self.ms_sub_func), # this must be wrong '-' : ('mc', '-', self.ms_sub_func), + 'line' : ('mi', 'hardline-break', self.hardline_func), #calibre # misc => ml '*' : ('ml', 'asterisk__', self.default_func), ':' : ('ml', 'colon_____', self.default_func), @@ -77,7 +78,6 @@ class ProcessTokens: 'backslash' : ('nu', '\\', self.text_func), 'ob' : ('nu', '{', self.text_func), 'cb' : ('nu', '}', self.text_func), - 'line' : ('nu', 'hard-lineb', self.default_func), #calibre #'line' : ('nu', ' ', self.text_func), calibre # paragraph formatting => pf 'page' : ('pf', 'page-break', self.default_func), @@ -605,6 +605,9 @@ class ProcessTokens: def ms_sub_func(self, pre, token, num): return 
'tx<mc<__________<%s\n' % token + def hardline_func(self, pre, token, num): + return 'mi<tg<empty_____<%s\n' % token + def default_func(self, pre, token, num): if num is None: num = 'true' From 51751b197e430e902c546e85121082e111041a7b Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 11 Jan 2011 00:00:20 +0100 Subject: [PATCH 100/132] clean rtf2xml inline --- src/calibre/ebooks/rtf2xml/inline.py | 78 +++++++++++++++------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/inline.py b/src/calibre/ebooks/rtf2xml/inline.py index 55e6ed1dbb..83c383fa1f 100755 --- a/src/calibre/ebooks/rtf2xml/inline.py +++ b/src/calibre/ebooks/rtf2xml/inline.py @@ -1,5 +1,7 @@ import sys, os, tempfile + from calibre.ebooks.rtf2xml import copy + """ States. 1. default @@ -36,6 +38,7 @@ class Inline: self.__copy = copy self.__run_level = run_level self.__write_to = tempfile.mktemp() + def __initiate_values(self): """ Initiate all values. @@ -81,12 +84,12 @@ class Inline: self.__in_para = 0 # not in paragraph self.__char_dict = { # character info => ci - 'annotation' : 'annotation', + 'annotation' : 'annotation', 'blue______' : 'blue', 'bold______' : 'bold', - 'caps______' : 'caps', - 'char-style' : 'character-style', - 'dbl-strike' : 'double-strike-through', + 'caps______' : 'caps', + 'char-style' : 'character-style', + 'dbl-strike' : 'double-strike-through', 'emboss____' : 'emboss', 'engrave___' : 'engrave', 'font-color' : 'font-color', @@ -94,7 +97,7 @@ class Inline: 'font-size_' : 'font-size', 'font-style' : 'font-style', 'font-up___' : 'superscript', - 'footnot-mk' : 'footnote-marker', + 'footnot-mk' : 'footnote-marker', 'green_____' : 'green', 'hidden____' : 'hidden', 'italics___' : 'italics', @@ -105,9 +108,10 @@ class Inline: 'strike-thr' : 'strike-through', 'subscript_' : 'subscript', 'superscrip' : 'superscript', - 'underlined' : 'underlined', + 'underlined' : 'underlined', } self.__caps_list = ['false'] + def 
__set_list_func(self, line): """ Requires: @@ -126,6 +130,7 @@ class Inline: self.__place = 'in_list' self.__inline_list = self.__list_inline_list self.__groups_in_waiting = self.__groups_in_waiting_list + def __default_func(self, line): """ Requires: @@ -138,8 +143,8 @@ class Inline: action = self.__default_dict.get(self.__token_info) if action: action(line) - if self.__token_info != 'cw<nu<hard-lineb': #calibre self.__write_obj.write(line) + def __found_open_bracket_func(self, line): """ Requires: @@ -154,6 +159,7 @@ class Inline: self.__groups_in_waiting[0] += 1 self.__inline_list.append({}) self.__inline_list[-1]['contains_inline'] = 0 + def __after_open_bracket_func(self, line): """ Requires: @@ -174,6 +180,7 @@ class Inline: self.__state = 'default' # a non control word? action(line) self.__write_obj.write(line) + def __handle_control_word(self, line): """ Required: @@ -204,6 +211,7 @@ class Inline: elif char_value == 'Zapf Dingbats': self.__write_obj.write('mi<mk<font-dingb\n') """ + def __close_bracket_func(self, line): """ Requires: @@ -242,6 +250,7 @@ class Inline: self.__inline_list.pop() if self.__groups_in_waiting[0] != 0: self.__groups_in_waiting[0] -= 1 + def __found_text_func(self, line): """ Required: @@ -264,7 +273,7 @@ class Inline: self.__start_para_func(line) elif self.__groups_in_waiting[0] != 0: self.__write_inline() - + def __write_inline(self): """ Required: @@ -288,7 +297,7 @@ class Inline: inline_list = self.__inline_list[last_index:] if len(inline_list) <= 0: if self.__run_level > 3: - msg = 'self.__inline_list is %s\n' % self.__inline_list + msg = _('self.__inline_list is %s\n') % self.__inline_list raise self.__bug_handler, msg self.__write_obj.write('error\n') self.__groups_in_waiting[0] = 0 @@ -308,6 +317,7 @@ class Inline: self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key])) self.__write_obj.write('\n') self.__groups_in_waiting[0] = 0 + def __end_para_func(self, line): """ Requires: @@ -336,6 +346,7 @@ class Inline: 
self.__write_obj.write('mi<mk<caps-end__\n') self.__write_obj.write('mi<tg<close_____<inline\n') self.__in_para = 0 + def __start_para_func(self, line): """ Requires: @@ -363,12 +374,14 @@ class Inline: self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key])) self.__write_obj.write('\n') self.__groups_in_waiting[0] = 0 + def __found_field_func(self, line): """ Just a default function to make sure I don't prematurely exit default state """ pass + def form_tags(self): """ Requires: @@ -380,32 +393,27 @@ class Inline: the state. """ self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - token = line[0:-1] - self.__token_info = '' - if token == 'tx<mc<__________<rdblquote'\ - or token == 'tx<mc<__________<ldblquote'\ - or token == 'tx<mc<__________<lquote'\ - or token == 'tx<mc<__________<rquote'\ - or token == 'tx<mc<__________<emdash'\ - or token == 'tx<mc<__________<endash'\ - or token == 'tx<mc<__________<bullet': - self.__token_info = 'text' - else: - self.__token_info = line[:16] - self.__set_list_func(line) - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('No matching state in module inline_for_lists.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj, \ + open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + token = line[0:-1] + self.__token_info = '' + if token == 'tx<mc<__________<rdblquote'\ + or token == 'tx<mc<__________<ldblquote'\ + or token == 'tx<mc<__________<lquote'\ + or token == 'tx<mc<__________<rquote'\ + or token == 'tx<mc<__________<emdash'\ + or token == 'tx<mc<__________<endash'\ + or token == 'tx<mc<__________<bullet': + self.__token_info = 'text' + else: + self.__token_info = line[:16] + self.__set_list_func(line) + action = 
self.__state_dict.get(self.__state) + if action == None: + sys.stderr.write(_('No matching state in module inline_for_lists.py\n')) + sys.stderr.write(self.__state + '\n') + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "inline.data") From bc433b26501da7c0a9391622ea3491f36c057316 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 11 Jan 2011 00:04:41 +0100 Subject: [PATCH 101/132] use calibre function to clean lower ascii char in rtf2xml --- .../ebooks/rtf2xml/replace_illegals.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/replace_illegals.py b/src/calibre/ebooks/rtf2xml/replace_illegals.py index 901cdd289d..9c5c1ef0e9 100755 --- a/src/calibre/ebooks/rtf2xml/replace_illegals.py +++ b/src/calibre/ebooks/rtf2xml/replace_illegals.py @@ -16,7 +16,10 @@ # # ######################################################################### import os, tempfile + from calibre.ebooks.rtf2xml import copy +from calibre.utils.cleantext import clean_ascii_chars + class ReplaceIllegals: """ reaplace illegal lower ascii characters @@ -30,21 +33,14 @@ class ReplaceIllegals: self.__copy = copy self.__run_level = run_level self.__write_to = tempfile.mktemp() + def replace_illegals(self): """ """ - nums = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19] - read_obj = open(self.__file, 'r') - write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - for num in nums: - line = line.replace(chr(num), '') - write_obj.write(line) - read_obj.close() - write_obj.close() + with open(self.__file, 'r') as read_obj, \ + open(self.__write_to, 'w') as write_obj: + for line in read_obj: + write_obj.write(clean_ascii_chars(line)) copy_obj = copy.Copy() if self.__copy: copy_obj.copy_file(self.__write_to, "replace_illegals.data") From 
7ea92e2c672d3e4ef315cd98c9466254304c466d Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 11 Jan 2011 00:30:24 +0100 Subject: [PATCH 102/132] ... --- src/calibre/ebooks/rtf2xml/inline.py | 4 ++-- src/calibre/ebooks/rtf2xml/process_tokens.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/inline.py b/src/calibre/ebooks/rtf2xml/inline.py index 83c383fa1f..8949cf35ad 100755 --- a/src/calibre/ebooks/rtf2xml/inline.py +++ b/src/calibre/ebooks/rtf2xml/inline.py @@ -143,7 +143,7 @@ class Inline: action = self.__default_dict.get(self.__token_info) if action: action(line) - self.__write_obj.write(line) + self.__write_obj.write(line) def __found_open_bracket_func(self, line): """ @@ -410,7 +410,7 @@ class Inline: self.__token_info = line[:16] self.__set_list_func(line) action = self.__state_dict.get(self.__state) - if action == None: + if action is None: sys.stderr.write(_('No matching state in module inline_for_lists.py\n')) sys.stderr.write(self.__state + '\n') action(line) diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 9f26bb295b..1033ebc583 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -52,7 +52,7 @@ class ProcessTokens: self.__return_code = 0 self.dict_token={ # unicode - 'mshex' : ('nu', '__________', self.__ms_hex_func), + 'mshex' : ('nu', '__________', self.__ms_hex_func), # brackets '{' : ('nu', '{', self.ob_func), '}' : ('nu', '}', self.cb_func), From 056f52c84361ce265602541344184d764f92d28d Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 11 Jan 2011 13:41:23 +0100 Subject: [PATCH 103/132] Integrate rtf2xml debug process to calibre --- src/calibre/ebooks/rtf/input.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 
422105e5b3..3f9eda374f 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -77,7 +77,18 @@ class RTFInput(InputFormatPlugin): def generate_xml(self, stream): from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf - ofile = 'out.xml' + debug_dir = getattr(self.opts, 'debug_pipeline', None) + run_lev = 1 + if debug_dir is not None: + try: + debug_dir = os.path.abspath(os.path.normpath(debug_dir + u'/rtfdebug/')) + os.makedirs(debug_dir) + run_lev = 6 + except OSError, ( errno, strerror ): + print strerror + print errno + debug_dir = None + ofile = 'dataxml.xml' parser = ParseRtf( in_file = stream, out_file = ofile, @@ -117,12 +128,13 @@ class RTFInput(InputFormatPlugin): empty_paragraphs = 1, #debug - deb_dir = "D:\\Mes eBooks\\Developpement\\debug\\rtfdebug", - run_level = 3 + deb_dir = debug_dir, + run_level = run_lev, ) parser.parse_rtf() - ans = open('out.xml').read() - os.remove('out.xml') + ans = open('dataxml.xml').read() + if debug_dir is None: + os.remove('dataxml.xml') return ans def extract_images(self, picts): @@ -213,7 +225,7 @@ class RTFInput(InputFormatPlugin): css += '\n'+'\n'.join(font_size_classes) css += '\n' +'\n'.join(color_classes) - for cls, val in border_styles.items(): + for cls, val in border_styles.iteritems(): css += '\n\n.%s {\n%s\n}'%(cls, val) with open('styles.css', 'ab') as f: @@ -277,9 +289,6 @@ class RTFInput(InputFormatPlugin): raise ValueError(_('This RTF file has a feature calibre does not ' 'support. 
Convert it to HTML first and then try it.\n%s')%e) - with open('dataxml.xml', 'w') as dataxml: - dataxml.write(xml) - d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: imap = {} From 576cac2b98cec1fd620d14bffd341f6cd62bdb44 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 11 Jan 2011 18:20:31 +0100 Subject: [PATCH 104/132] Modify rtf2xml debug parameters --- src/calibre/ebooks/rtf/input.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 3f9eda374f..c07764c744 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -81,9 +81,9 @@ class RTFInput(InputFormatPlugin): run_lev = 1 if debug_dir is not None: try: - debug_dir = os.path.abspath(os.path.normpath(debug_dir + u'/rtfdebug/')) - os.makedirs(debug_dir) - run_lev = 6 + debug_dir = os.path.normpath('rtfdebug/') + os.mkdir(debug_dir) + run_lev = 4 except OSError, ( errno, strerror ): print strerror print errno From 9ed1e9419081a984161c20b44dcce7940fc8a072 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 11 Jan 2011 18:39:55 +0100 Subject: [PATCH 105/132] Modify delete_info in rtf2xml --- src/calibre/ebooks/rtf/input.py | 2 - src/calibre/ebooks/rtf2xml/delete_info.py | 49 +++++++++++------------ 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index c07764c744..981a930d54 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -133,8 +133,6 @@ class RTFInput(InputFormatPlugin): ) parser.parse_rtf() ans = open('dataxml.xml').read() - if debug_dir is None: - os.remove('dataxml.xml') return ans def extract_images(self, picts): diff --git a/src/calibre/ebooks/rtf2xml/delete_info.py b/src/calibre/ebooks/rtf2xml/delete_info.py index 3c93e028b8..3a7442addc 100755 --- a/src/calibre/ebooks/rtf2xml/delete_info.py +++ 
b/src/calibre/ebooks/rtf2xml/delete_info.py @@ -34,14 +34,14 @@ class DeleteInfo: self.__bracket_count= 0 self.__ob_count = 0 self.__cb_count = 0 - self.__after_asterisk = False - self.__delete = 0 + # self.__after_asterisk = False + # self.__delete = 0 self.__initiate_allow() self.__ob = 0 - self.__write_cb = 0 + self.__write_cb = False self.__run_level = run_level self.__found_delete = False - self.__list = False + # self.__list = False def __initiate_allow(self): """ @@ -69,7 +69,7 @@ class DeleteInfo: self.__state_dict = { 'default' : self.__default_func, 'after_asterisk' : self.__asterisk_func, - 'delete' : self.__delete_func, + 'delete' : self.__delete_func, 'list' : self.__list_func, } @@ -99,7 +99,7 @@ class DeleteInfo: if self.__delete_count == self.__cb_count: self.__state = 'default' if self.__write_cb: - self.__write_cb = 0 + self.__write_cb = True return True return False @@ -116,7 +116,7 @@ class DeleteInfo: """ # Test for {\*}, in which case don't enter # delete state - self.__after_asterisk = False # only enter this function once + # self.__after_asterisk = False # only enter this function once self.__found_delete = True if self.__token_info == 'cb<nu<clos-brack': if self.__delete_count == self.__cb_count: @@ -128,7 +128,7 @@ class DeleteInfo: # not sure what happens here! 
# believe I have a '{\*} if self.__run_level > 3: - msg = 'flag problem\n' + msg = _('flag problem\n') raise self.__bug_handler, msg return True elif self.__token_info in self.__allowable : @@ -144,18 +144,18 @@ class DeleteInfo: self.__found_list_func(line) elif self.__token_info in self.__not_allowable: if not self.__ob: - self.__write_cb = 1 + self.__write_cb = False self.__ob = 0 self.__state = 'delete' self.__cb_count = 0 return False else: if self.__run_level > 5: - msg = _('After an asterisk, and found neither an allowable or non-allowble token\n\ + msg = _('After an asterisk, and found neither an allowable or non-allowable token\n\ token is "%s"\n') % self.__token_info - raise self.__bug_handler + raise self.__bug_handler, msg if not self.__ob: - self.__write_cb = 1 + self.__write_cb = True self.__ob = 0 self.__state = 'delete' self.__cb_count = 0 @@ -177,7 +177,7 @@ class DeleteInfo: 'cb<nu<clos-brack': self.__state = 'default' if self.__write_cb: - self.__write_cb = 0 + self.__write_cb = False return True return False elif line[0:2] == 'cw': @@ -188,8 +188,8 @@ class DeleteInfo: def delete_info(self): """Main method for handling other methods. 
Read one line in at a time, and determine wheter to print the line based on the state.""" - self.__write_obj = open(self.__write_to, 'w') - with open(self.__file, 'r') as read_obj: + with open(self.__file, 'r') as read_obj, \ + open(self.__write_to, 'w') as self.__write_obj: for line in read_obj: #ob<nu<open-brack<0001 to_print = True @@ -203,19 +203,16 @@ class DeleteInfo: sys.stderr.write(_('No action in dictionary state is "%s" \n') % self.__state) to_print = action(line) - """ - if self.__after_asterisk: - to_print = self.__asterisk_func(line) - elif self.__list: - self.__in_list_func(line) - elif self.__delete: - to_print = self.__delete_func(line) - else: - to_print = self.__default_func(line) - """ + # if self.__after_asterisk: + # to_print = self.__asterisk_func(line) + # elif self.__list: + # self.__in_list_func(line) + # elif self.__delete: + # to_print = self.__delete_func(line) + # else: + # to_print = self.__default_func(line) if to_print: self.__write_obj.write(line) - self.__write_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "delete_info.data") From 5655042dc8b63d04bf6e5ac822e33349e1cd9cd2 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 11 Jan 2011 20:28:13 +0100 Subject: [PATCH 106/132] Modify rtf2xml debug parameters (2) --- src/calibre/ebooks/rtf/input.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 981a930d54..915ca55fc1 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -77,18 +77,19 @@ class RTFInput(InputFormatPlugin): def generate_xml(self, stream): from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf - debug_dir = getattr(self.opts, 'debug_pipeline', None) + ofile = 'dataxml.xml' run_lev = 1 - if debug_dir is not None: + if hasattr(self.opts, 'debug_pipeline'): try: - debug_dir = os.path.normpath('rtfdebug/') + 
debug_dir = 'rtfdebug' os.mkdir(debug_dir) run_lev = 4 except OSError, ( errno, strerror ): print strerror print errno debug_dir = None - ofile = 'dataxml.xml' + else: + debug_dir = None parser = ParseRtf( in_file = stream, out_file = ofile, From 10c2e603e29dd7305a25704f4b7711a85cca7af4 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 15 Jan 2011 13:21:13 +0100 Subject: [PATCH 107/132] Correct handling of \ as \par for old RTF --- src/calibre/ebooks/rtf/input.py | 1 - src/calibre/ebooks/rtf2xml/ParseRtf.py | 11 ++++--- .../ebooks/rtf2xml/default_encoding.py | 1 + src/calibre/ebooks/rtf2xml/process_tokens.py | 4 +-- src/calibre/ebooks/rtf2xml/tokenize.py | 33 +++++++++++++++---- 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 5907bf6b55..a6b8c86e79 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -312,7 +312,6 @@ class RTFInput(InputFormatPlugin): try: xml = self.generate_xml(stream.name) except RtfInvalidCodeException, e: - raise raise ValueError(_('This RTF file has a feature calibre does not ' 'support. 
Convert it to HTML first and then try it.\n%s')%e) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 902ad09c30..73f8f04e1c 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -226,10 +226,6 @@ class ParseRtf: try: return_value = process_tokens_obj.process_tokens() except InvalidRtfException, msg: - try: - os.remove(self.__temp_file) - except OSError: - pass #Check to see if the file is correctly encoded encode_obj = default_encoding.DefaultEncoding( in_file = self.__temp_file, @@ -244,11 +240,16 @@ class ParseRtf: enc = encode_obj.get_codepage() if enc != 'mac_roman': enc = 'cp' + enc + msg = 'Exception in token processing' if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ else self.__file.encode('utf-8') msg = 'File %s does not appear to be correctly encoded.\n' % file_name - raise InvalidRtfException, msg + try: + os.remove(self.__temp_file) + except OSError: + pass + raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( in_file = self.__temp_file, copy = self.__copy, diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index aec33943a9..53887e0d90 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -3,6 +3,7 @@ # copyright 2002 Paul Henry Tremblay # # # ######################################################################### + ''' Codepages as to RTF 1.9.1: 437 United States IBM diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index ff4fbe110c..5066843976 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -70,7 +70,7 @@ class ProcessTokens: ';' : ('mc', ';', self.ms_sub_func), # this must be wrong '-' : ('mc', '-', self.ms_sub_func), - 'line' : ('mi', 
'hardline-break', self.hardline_func), #calibre + 'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre # misc => ml '*' : ('ml', 'asterisk__', self.default_func), ':' : ('ml', 'colon_____', self.default_func), @@ -605,7 +605,7 @@ class ProcessTokens: def ms_sub_func(self, pre, token, num): return 'tx<mc<__________<%s\n' % token - def hardline_func(self, pre, token, num): + def direct_conv_func(self, pre, token, num): return 'mi<tg<empty_____<%s\n' % token def default_func(self, pre, token, num): diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index de66415f0c..20438a2e66 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -27,11 +27,13 @@ class Tokenize: bug_handler, copy = None, run_level = 1, - ): + # out_file = None, + ): self.__file = in_file self.__bug_handler = bug_handler self.__copy = copy self.__write_to = tempfile.mktemp() + # self.__out_file = out_file self.__compile_expressions() #variables self.__uc_char = 0 @@ -113,6 +115,8 @@ class Tokenize: def __sub_reg_split(self,input_file): input_file = self.__replace_spchar.mreplace(input_file) + # this is for older RTF + input_file = self.__par_exp.sub('\n\\par \n', input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) #remove \n in bin data @@ -127,7 +131,7 @@ class Tokenize: # this is for older RTF #line = re.sub(self.__par_exp, '\\par ', line) #return filter(lambda x: len(x) > 0, \ - #(self.__remove_line.sub('', x) for x in tokens)) + #(self.__remove_line.sub('', x) for x in tokens)) def __compile_expressions(self): SIMPLE_RPL = { @@ -153,8 +157,6 @@ class Tokenize: # put a backslash in front of to eliminate special cases and # make processing easier "}": "\\}", - # this is for older RTF - r'\\$': '\\par ', } self.__replace_spchar = MReplace(SIMPLE_RPL) #add ;? 
in case of char following \u @@ -168,10 +170,12 @@ class Tokenize: #why keep backslash whereas \is replaced before? #remove \n from endline char self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") + #this is for old RTF + self.__par_exp = re.compile(r'\\\n+') + # self.__par_exp = re.compile(r'\\$') #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") - #self.__par_exp = re.compile(r'\\$') #self.__remove_line = re.compile(r'\n+') #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") @@ -199,7 +203,24 @@ class Tokenize: copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "tokenize.data") + # if self.__out_file: + # self.__file = self.__out_file copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) - #self.__special_tokens = [ '_', '~', "'", '{', '}' ] \ No newline at end of file + #self.__special_tokens = [ '_', '~', "'", '{', '}' ] + +# import sys +# def main(args=sys.argv): + # if len(args) < 1: + # print 'No file' + # return + # file = 'data_tokens.txt' + # if len(args) == 3: + # file = args[2] + # to = Tokenize(args[1], Exception, out_file = file) + # to.tokenize() + + +# if __name__ == '__main__': + # sys.exit(main()) \ No newline at end of file From 93ef1699dfd732596ad9f10f08aff7aed43eaa21 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 15 Jan 2011 16:11:28 +0100 Subject: [PATCH 108/132] Modify mac-roman encoding, now go to 10000 --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 4 +- .../ebooks/rtf2xml/default_encoding.py | 57 +++++++++++-------- src/calibre/ebooks/rtf2xml/process_tokens.py | 1 - 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 73f8f04e1c..442f5f4ac3 
100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -237,9 +237,7 @@ class ParseRtf: check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) - enc = encode_obj.get_codepage() - if enc != 'mac_roman': - enc = 'cp' + enc + enc = 'cp' + encode_obj.get_codepage() msg = 'Exception in token processing' if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index 53887e0d90..31122318b6 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -74,9 +74,6 @@ class DefaultEncoding: if not self.__datafetched: self._encoding() self.__datafetched = True - if self.__platform == 'Macintosh': - code_page = self.__code_page - else: code_page = 'ansicpg' + self.__code_page return self.__platform, code_page, self.__default_num @@ -94,49 +91,59 @@ class DefaultEncoding: def _encoding(self): with open(self.__file, 'r') as read_obj: + cpfound = False if not self.__fetchraw: for line in read_obj: self.__token_info = line[:16] if self.__token_info == 'mi<mk<rtfhed-end': break - if self.__token_info == 'cw<ri<ansi-codpg': - #cw<ri<ansi-codpg<nu<10000 - self.__code_page = line[20:-1] if int(line[20:-1]) \ - else '1252' if self.__token_info == 'cw<ri<macintosh_': self.__platform = 'Macintosh' - self.__code_page = 'mac_roman' elif self.__token_info == 'cw<ri<pc________': self.__platform = 'IBMPC' - self.__code_page = '437' elif self.__token_info == 'cw<ri<pca_______': self.__platform = 'OS/2' - self.__code_page = '850' + if self.__token_info == 'cw<ri<ansi-codpg' \ + and int(line[20:-1]): + self.__code_page = line[20:-1] if self.__token_info == 'cw<ri<deflt-font': self.__default_num = line[20:-1] + cpfound = True #cw<ri<deflt-font<nu<0 + if self.__platform != 'Windows' and \ + not 
cpfound: + if self.__platform == 'Macintosh': + self.__code_page = '10000' + elif self.__platform == 'IBMPC': + self.__code_page = '437' + elif self.__platform == 'OS/2': + self.__code_page = '850' else: fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+') fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+') + for line in read_obj: + if fenc.search(line): + enc = fenc.search(line).group(1) if fenccp.search(line): cp = fenccp.search(line).group(1) if not int(cp): self.__code_page = cp + cpfound = True break - if fenc.search(line): - enc = fenc.search(line).group(1) - if enc == 'mac': - self.__code_page = 'mac_roman' - elif enc == 'pc': - self.__code_page = '437' - elif enc == 'pca': - self.__code_page = '850' + if self.__platform != 'Windows' and \ + not cpfound: + if enc == 'mac': + self.__code_page = '10000' + elif enc == 'pc': + self.__code_page = '437' + elif enc == 'pca': + self.__code_page = '850' -# if __name__ == '__main__': - # encode_obj = DefaultEncoding( - # in_file = sys.argv[1], - # bug_handler = Exception, - # check_raw = True, - # ) - # print encode_obj.get_codepage() +if __name__ == '__main__': + encode_obj = DefaultEncoding( + in_file = sys.argv[1], + bug_handler = Exception, + check_raw = True, + ) + print encode_obj.get_codepage() diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 5066843976..6ff0519dc2 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -78,7 +78,6 @@ class ProcessTokens: 'backslash' : ('nu', '\\', self.text_func), 'ob' : ('nu', '{', self.text_func), 'cb' : ('nu', '}', self.text_func), - #'line' : ('nu', ' ', self.text_func), calibre # paragraph formatting => pf 'page' : ('pf', 'page-break', self.default_func), 'par' : ('pf', 'par-end___', self.default_func), From 55616a4e2d8c525463e6c440f7e4112ac0782f5f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 15 Jan 2011 20:51:39 +0100 
Subject: [PATCH 109/132] Update info handling to rev RTF 1.9.1 TODO: integrate \userprops --- src/calibre/ebooks/rtf2xml/info.py | 69 ++++++++++++++------ src/calibre/ebooks/rtf2xml/process_tokens.py | 16 ++++- 2 files changed, 62 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/info.py b/src/calibre/ebooks/rtf2xml/info.py index ad0fb8ec06..9f2905f31b 100755 --- a/src/calibre/ebooks/rtf2xml/info.py +++ b/src/calibre/ebooks/rtf2xml/info.py @@ -16,7 +16,9 @@ # # ######################################################################### import sys, os, tempfile + from calibre.ebooks.rtf2xml import copy + class Info: """ Make tags for document-information @@ -42,6 +44,7 @@ class Info: self.__copy = copy self.__run_level = run_level self.__write_to = tempfile.mktemp() + def __initiate_values(self): """ Initiate all values. @@ -58,27 +61,49 @@ class Info: self.__info_table_dict = { 'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'), 'cw<di<author____' : (self.__found_tag_with_text_func, 'author'), + 'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'), + 'cw<di<manager___' : (self.__found_tag_with_text_func, 'manager'), + 'cw<di<company___' : (self.__found_tag_with_text_func, 'company'), 'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'), + 'cw<di<category__' : (self.__found_tag_with_text_func, 'category'), 'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'), 'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'), - 'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'), + 'cw<di<linkbase__' : (self.__found_tag_with_text_func, 'hyperlink-base'), + 'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'), - 'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'), - 'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'), + 'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'), + 'cw<di<edit-time_' : 
(self.__found_tag_with_tokens_func, 'editing-time'), + 'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'), + 'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'), + 'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'), 'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'), + 'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'), 'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'), + 'cw<di<version___' : (self.__single_field_func, 'version'), + 'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'), + 'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'), } self.__token_dict = { 'year______' : 'year', 'month_____' : 'month', 'day_______' : 'day', 'minute____' : 'minute', + 'second____' : 'second', 'revis-time' : 'revision-time', + 'create-tim' : 'creation-time', + 'edit-time_' : 'editing-time', + 'print-time' : 'printing-time', + 'backuptime' : 'backup-time', 'num-of-wor' : 'number-of-words', 'num-of-chr' : 'number-of-characters', + 'numofchrws' : 'number-of-characters-without-space', 'num-of-pag' : 'number-of-pages', + 'version___' : 'version', + 'intern-ver' : 'internal-version-number', + 'internalID' : 'internal-id-number', } + def __before_info_table_func(self, line): """ Required: @@ -92,6 +117,7 @@ class Info: if self.__token_info == 'mi<mk<doc-in-beg': self.__state = 'in_info_table' self.__write_obj.write(line) + def __in_info_table_func(self, line): """ Requires: @@ -112,6 +138,7 @@ class Info: action(line, tag) else: self.__write_obj.write(line) + def __found_tag_with_text_func(self, line, tag): """ Requires: @@ -126,6 +153,7 @@ class Info: """ self.__tag = tag self.__state = 'collect_text' + def __collect_text_func(self, line): """ Requires: @@ -147,6 +175,7 @@ class Info: self.__text_string = '' elif line[0:2] == 'tx': self.__text_string += line[17:-1] + def __found_tag_with_tokens_func(self, line, 
tag): """ Requires: @@ -163,6 +192,7 @@ class Info: self.__state = 'collect_tokens' self.__text_string = 'mi<tg<empty-att_<%s' % tag #mi<tg<empty-att_<page-definition<margin>33\n + def __collect_tokens_func(self, line): """ Requires: @@ -194,18 +224,19 @@ class Info: att = line[6:16] value = line[20:-1] att_changed = self.__token_dict.get(att) - if att_changed == None: + if att_changed is None: if self.__run_level > 3: - msg = 'no dictionary match for %s\n' % att + msg = 'No dictionary match for %s\n' % att raise self.__bug_handler, msg else: self.__text_string += '<%s>%s' % (att_changed, value) + def __single_field_func(self, line, tag): value = line[20:-1] self.__write_obj.write( - 'mi<tg<empty-att_<%s' - '<%s>%s\n' % (tag, tag, value) + 'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value) ) + def __after_info_table_func(self, line): """ Requires: @@ -217,6 +248,7 @@ class Info: the file. """ self.__write_obj.write(line) + def fix_info(self): """ Requires: @@ -234,20 +266,15 @@ class Info: information table, simply write the line to the output file. 
""" self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module styles.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'wb') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('No matching state in module styles.py\n') + sys.stderr.write(self.__state + '\n') + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "info.data") diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 6ff0519dc2..56e61d2b60 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -230,11 +230,15 @@ class ProcessTokens: 'trhdr' : ('tb', 'row-header', self.default_func), # preamble => pr # document information => di + # TODO integrate \userprops 'info' : ('di', 'doc-info__', self.default_func), + 'title' : ('di', 'title_____', self.default_func), 'author' : ('di', 'author____', self.default_func), 'operator' : ('di', 'operator__', self.default_func), - 'title' : ('di', 'title_____', self.default_func), + 'manager' : ('di', 'manager___', self.default_func), + 'company' : ('di', 'company___', self.default_func), 'keywords' : ('di', 'keywords__', self.default_func), + 'category' : ('di', 'category__', self.default_func), 'doccomm' : ('di', 'doc-notes_', self.default_func), 'comment' : ('di', 'doc-notes_', self.default_func), 'subject' : ('di', 'subject___', self.default_func), @@ 
-243,11 +247,19 @@ class ProcessTokens: 'mo' : ('di', 'month_____', self.default_func), 'dy' : ('di', 'day_______', self.default_func), 'min' : ('di', 'minute____', self.default_func), + 'sec' : ('di', 'second____', self.default_func), 'revtim' : ('di', 'revis-time', self.default_func), + 'edmins' : ('di', 'edit-time_', self.default_func), + 'printim' : ('di', 'print-time', self.default_func), + 'buptim' : ('di', 'backuptime', self.default_func), 'nofwords' : ('di', 'num-of-wor', self.default_func), 'nofchars' : ('di', 'num-of-chr', self.default_func), + 'nofcharsws' : ('di', 'numofchrws', self.default_func), 'nofpages' : ('di', 'num-of-pag', self.default_func), - 'edmins' : ('di', 'edit-time_', self.default_func), + 'version' : ('di', 'version___', self.default_func), + 'vern' : ('di', 'intern-ver', self.default_func), + 'hlinkbase' : ('di', 'linkbase__', self.default_func), + 'id' : ('di', 'internalID', self.default_func), # headers and footers => hf 'headerf' : ('hf', 'head-first', self.default_func), 'headerl' : ('hf', 'head-left_', self.default_func), From 77ce7b9c7697cb960ff410b50ad66652e0ce14ec Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 15 Jan 2011 21:38:22 +0100 Subject: [PATCH 110/132] Handling of company tag in info --- src/calibre/ebooks/rtf2xml/delete_info.py | 34 ++++++++--------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/delete_info.py b/src/calibre/ebooks/rtf2xml/delete_info.py index 3ffff7d73a..b3b5bdcad7 100755 --- a/src/calibre/ebooks/rtf2xml/delete_info.py +++ b/src/calibre/ebooks/rtf2xml/delete_info.py @@ -20,7 +20,7 @@ import sys, os, tempfile from calibre.ebooks.rtf2xml import copy class DeleteInfo: - """Delelet unecessary destination groups""" + """Delete unecessary destination groups""" def __init__(self, in_file , bug_handler, @@ -31,17 +31,14 @@ class DeleteInfo: self.__bug_handler = bug_handler self.__copy = copy self.__write_to = tempfile.mktemp() + 
self.__run_level = run_level + self.__initiate_allow() self.__bracket_count= 0 self.__ob_count = 0 self.__cb_count = 0 - # self.__after_asterisk = False - # self.__delete = 0 - self.__initiate_allow() self.__ob = 0 self.__write_cb = False - self.__run_level = run_level self.__found_delete = False - # self.__list = False def __initiate_allow(self): """ @@ -57,6 +54,8 @@ class DeleteInfo: 'cw<an<annotation', 'cw<cm<comment___', 'cw<it<lovr-table', + # info table + 'cw<di<company___', # 'cw<ls<list______', ) self.__not_allowable = ( @@ -116,7 +115,6 @@ class DeleteInfo: """ # Test for {\*}, in which case don't enter # delete state - # self.__after_asterisk = False # only enter this function once self.__found_delete = True if self.__token_info == 'cb<nu<clos-brack': if self.__delete_count == self.__cb_count: @@ -128,7 +126,7 @@ class DeleteInfo: # not sure what happens here! # believe I have a '{\*} if self.__run_level > 3: - msg = 'flag problem\n' + msg = 'Flag problem\n' raise self.__bug_handler, msg return True elif self.__token_info in self.__allowable : @@ -173,8 +171,8 @@ class DeleteInfo: Return True for all control words. Return False otherwise. """ - if self.__delete_count == self.__cb_count and self.__token_info ==\ - 'cb<nu<clos-brack': + if self.__delete_count == self.__cb_count and \ + self.__token_info == 'cb<nu<clos-brack': self.__state = 'default' if self.__write_cb: self.__write_cb = False @@ -186,31 +184,23 @@ class DeleteInfo: return False def delete_info(self): - """Main method for handling other methods. Read one line in at + """Main method for handling other methods. 
Read one line at a time, and determine whether to print the line based on the state.""" with open(self.__file, 'r') as read_obj: with open(self.__write_to, 'w') as self.__write_obj: for line in read_obj: #ob<nu<open-brack<0001 - to_print = True self.__token_info = line[:16] if self.__token_info == 'ob<nu<open-brack': self.__ob_count = line[-5:-1] if self.__token_info == 'cb<nu<clos-brack': self.__cb_count = line[-5:-1] + # Get action to perform action = self.__state_dict.get(self.__state) if not action: sys.stderr.write('No action in dictionary state is "%s" \n' % self.__state) - to_print = action(line) - # if self.__after_asterisk: - # to_print = self.__asterisk_func(line) - # elif self.__list: - # self.__in_list_func(line) - # elif self.__delete: - # to_print = self.__delete_func(line) - # else: - # to_print = self.__default_func(line) - if to_print: + # Print if allowed by action + if action(line): self.__write_obj.write(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: From 8cf62f9feb609e1ec0297bd28da4a9b0e67b1be3 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 15 Jan 2011 21:51:37 +0100 Subject: [PATCH 111/132] Remove empty tags in info --- src/calibre/ebooks/rtf2xml/info.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/info.py b/src/calibre/ebooks/rtf2xml/info.py index 9f2905f31b..f5f1c5851c 100755 --- a/src/calibre/ebooks/rtf2xml/info.py +++ b/src/calibre/ebooks/rtf2xml/info.py @@ -15,7 +15,7 @@ # # # # ######################################################################### -import sys, os, tempfile +import sys, os, tempfile, re from calibre.ebooks.rtf2xml import copy @@ -51,6 +51,7 @@ class Info: """ self.__text_string = '' self.__state = 'before_info_table' + self.rmspace = re.compile(r'\s+') self.__state_dict = { 'before_info_table': self.__before_info_table_func, 'after_info_table': self.__after_info_table_func, @@ -167,11 +168,13 @@ class 
Info: """ if self.__token_info == 'mi<mk<docinf-end': self.__state = 'in_info_table' - self.__write_obj.write( - 'mi<tg<open______<%s\n' - 'tx<nu<__________<%s\n' - 'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag) - ) + #Don't print empty tags + if len(self.rmspace.sub('',self.__text_string)): + self.__write_obj.write( + 'mi<tg<open______<%s\n' + 'tx<nu<__________<%s\n' + 'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag) + ) self.__text_string = '' elif line[0:2] == 'tx': self.__text_string += line[17:-1] From fc42efda4266d5557f0c6a7bdcebd964cc734785 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 16 Jan 2011 00:47:01 +0100 Subject: [PATCH 112/132] Handle inproper \*\csN in body without braces --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 +- src/calibre/ebooks/rtf2xml/delete_info.py | 5 ++- src/calibre/ebooks/rtf2xml/fields_small.py | 52 +++++++++++++--------- src/calibre/ebooks/rtf2xml/tokenize.py | 4 +- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 442f5f4ac3..a28b6f81da 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -375,7 +375,7 @@ class ParseRtf: old_rtf = old_rtf_obj.check_if_old_rtf() if old_rtf: if self.__run_level > 5: - msg = 'older RTF\n' + msg = 'Older RTF\n' msg += 'self.__run_level is "%s"\n' % self.__run_level raise RtfInvalidCodeException, msg if self.__run_level > 1: diff --git a/src/calibre/ebooks/rtf2xml/delete_info.py b/src/calibre/ebooks/rtf2xml/delete_info.py index b3b5bdcad7..80d2a2b2bd 100755 --- a/src/calibre/ebooks/rtf2xml/delete_info.py +++ b/src/calibre/ebooks/rtf2xml/delete_info.py @@ -48,6 +48,7 @@ class DeleteInfo: 'cw<it<listtable_', 'cw<it<revi-table', 'cw<ls<list-lev-d', + # Field allowed 'cw<fd<field-inst', 'cw<an<book-mk-st', 'cw<an<book-mk-en', @@ -86,7 +87,7 @@ class DeleteInfo: self.__ob = line return False 
else: - # write previous bracket, since didn't fine asterisk + # write previous bracket, since didn't find asterisk if self.__ob: self.__write_obj.write(self.__ob) self.__ob = 0 @@ -109,7 +110,7 @@ class DeleteInfo: If you find that you are in a delete group, and the previous token in not an open bracket (self.__ob = 0), that means that the delete group is nested inside another acceptable - detination group. In this case, you have alrady written + detination group. In this case, you have already written the open bracket, so you will need to write the closed one as well. """ diff --git a/src/calibre/ebooks/rtf2xml/fields_small.py b/src/calibre/ebooks/rtf2xml/fields_small.py index 2eac812b12..bbfb17eed9 100755 --- a/src/calibre/ebooks/rtf2xml/fields_small.py +++ b/src/calibre/ebooks/rtf2xml/fields_small.py @@ -15,8 +15,10 @@ # # # # ######################################################################### -import sys, os, tempfile, re +import sys, os, tempfile, re + from calibre.ebooks.rtf2xml import field_strings, copy + class FieldsSmall: """ ================= @@ -24,7 +26,7 @@ Purpose ================= Write tags for bookmarks, index and toc entry fields in a tokenized file. This module does not handle toc or index tables. (This module won't be any -use to use to you unless you use it as part of the other modules.) +use to you unless you use it as part of the other modules.) ----------- Method ----------- @@ -55,6 +57,7 @@ file. self.__copy = copy self.__write_to = tempfile.mktemp() self.__run_level = run_level + def __initiate_values(self): """ Initiate all values. @@ -81,6 +84,7 @@ file. tx = 'tx<nu<__________<(.*?)' reg_st = ob + bk_st + tx + cb self.__book_start = re.compile(r'%s' % reg_st) + def __before_body_func(self, line): """ Requires: @@ -94,6 +98,7 @@ file. if self.__token_info == 'mi<mk<body-open_': self.__state = 'body' self.__write_obj.write(line) + def __body_func(self, line): """ Requires: @@ -110,6 +115,7 @@ file. 
action(line, tag) else: self.__write_obj.write(line) + def __found_bookmark_func(self, line, tag): """ Requires: @@ -125,6 +131,7 @@ file. self.__cb_count = 0 self.__state = 'bookmark' self.__type_of_bookmark = tag + def __bookmark_func(self, line): """ Requires: @@ -153,6 +160,7 @@ file. self.__write_obj.write(line) elif line[0:2] == 'tx': self.__text_string += line[17:-1] + def __parse_index_func(self, my_string): """ Requires: @@ -201,6 +209,7 @@ file. my_changed_string += '<sub-entry>%s' % sub_entry my_changed_string += '\n' return my_changed_string + def __index_see_func(self, my_string): in_see = 0 bracket_count = 0 @@ -226,6 +235,7 @@ file. in_see = 1 changed_string += '%s\n' % line return changed_string, see_string + def __index_bookmark_func(self, my_string): """ Requries: @@ -262,6 +272,7 @@ file. in_bookmark = 1 index_string += '%s\n' % line return index_string, bookmark_string + def __index__format_func(self, my_string): italics = 0 bold =0 @@ -273,6 +284,7 @@ file. if token_info == 'cw<in<index-ital': italics = 1 return italics, bold + def __parse_toc_func(self, my_string): """ Requires: @@ -308,6 +320,7 @@ file. my_changed_string += '<main-entry>%s' % main_entry my_changed_string += '\n' return my_changed_string + def __parse_bookmark_for_toc(self, my_string): """ Requires: @@ -353,6 +366,7 @@ file. in_bookmark = 1 toc_string += '%s\n' % line return toc_string, book_start_string, book_end_string + def __parse_bookmark_func(self, my_string, type): """ Requires: @@ -367,6 +381,7 @@ file. my_changed_string = ('mi<tg<empty-att_<field<type>%s' '<number>%s<update>none\n' % (type, my_string)) return my_changed_string + def __found_toc_index_func(self, line, tag): """ Requires: @@ -382,6 +397,7 @@ file. self.__cb_count = 0 self.__state = 'toc_index' self.__tag = tag + def __toc_index_func(self, line): """ Requires: @@ -409,6 +425,7 @@ file. 
self.__write_obj.write(line) else: self.__text_string += line + def fix_fields(self): """ Requires: @@ -423,24 +440,19 @@ file. bookmark. """ self.__initiate_values() - read_obj = open(self.__file) - self.__write_obj = open(self.__write_to, 'w') - line_to_read = '1' - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - if self.__token_info == 'ob<nu<open-brack': - self.__ob_count = line[-5:-1] - if self.__token_info == 'cb<nu<clos-brack': - self.__cb_count = line[-5:-1] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module fields_small.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + if self.__token_info == 'ob<nu<open-brack': + self.__ob_count = line[-5:-1] + if self.__token_info == 'cb<nu<clos-brack': + self.__cb_count = line[-5:-1] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('No matching state in module fields_small.py\n') + sys.stderr.write(self.__state + '\n') + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "fields_small.data") diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 20438a2e66..9ebd718833 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -115,8 +115,8 @@ class Tokenize: def __sub_reg_split(self,input_file): input_file = self.__replace_spchar.mreplace(input_file) - # this is for older RTF input_file = self.__par_exp.sub('\n\\par \n', input_file) + input_file = self.__cs_ast.sub("\g<1>", input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) input_file = self.__utf_ud.sub("\\{\\uc0 
\g<1>\\}", input_file) #remove \n in bin data @@ -172,6 +172,8 @@ class Tokenize: self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") #this is for old RTF self.__par_exp = re.compile(r'\\\n+') + #handle improper cs char-style with \* before without { + self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)') # self.__par_exp = re.compile(r'\\$') #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") From c81f75f6f6f02e68cf77485aebad44dd28a7594e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 16 Jan 2011 12:05:00 +0100 Subject: [PATCH 113/132] Partial fix for blank line in RTFInput --- resources/templates/rtf.xsl | 9 ++++----- src/calibre/ebooks/rtf/input.py | 33 +++++---------------------------- 2 files changed, 9 insertions(+), 33 deletions(-) diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index 6db1c0388d..58536186d9 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -220,7 +220,7 @@ </xsl:template> <xsl:template name="parse-styles-attrs"> - <!--<xsl:text>position:relative;</xsl:text>--> + <!--<xsl:text>position:relative;</xsl:text> <xsl:if test="@space-before"> <xsl:text>padding-top:</xsl:text> <xsl:value-of select="@space-before"/> @@ -230,7 +230,7 @@ <xsl:text>padding-bottom:</xsl:text> <xsl:value-of select="@space-after"/> <xsl:text>pt;</xsl:text> - </xsl:if> + </xsl:if>--> <xsl:if test="@left-indent"> <xsl:text>padding-left:</xsl:text> <xsl:value-of select="@left-indent"/> @@ -260,11 +260,11 @@ <xsl:text>text-decoration:underline</xsl:text> <xsl:text>;</xsl:text> </xsl:if> - <xsl:if test="@line-spacing"> + <!--<xsl:if test="@line-spacing"> <xsl:text>line-height:</xsl:text> <xsl:value-of select="@line-spacing"/> <xsl:text>pt;</xsl:text> - </xsl:if> + </xsl:if>--> <xsl:if test="(@align = 'just')"> <xsl:text>text-align: justify;</xsl:text> </xsl:if> @@ -314,7 +314,6 @@ </xsl:attribute> <xsl:apply-templates/> 
</xsl:element> - </xsl:otherwise> </xsl:choose> </xsl:template> diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index a6b8c86e79..bf7d11c7ed 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -78,13 +78,14 @@ class RTFInput(InputFormatPlugin): def generate_xml(self, stream): from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf ofile = 'dataxml.xml' - run_lev, debug_dir = 1, None + run_lev, debug_dir, indent_out = 1, None, 0 #just to check if the debug process is lauched, no need of this directory in fact if getattr(self.opts, 'debug_pipeline', None) is not None: try: os.mkdir('rtfdebug') debug_dir = 'rtfdebug' run_lev = 4 + indent_out = 1 except: pass parser = ParseRtf( @@ -108,7 +109,7 @@ class RTFInput(InputFormatPlugin): # Indent resulting XML. # Default is 0 (no indent). - indent = 1, + indent = indent_out, # Form lists from RTF. Default is 1. form_lists = 1, @@ -157,34 +158,10 @@ class RTFInput(InputFormatPlugin): with open(name, 'wb') as f: f.write(data) imap[count] = name - #open(name+'.hex', 'wb').write(enc) + # with open(name+'.hex', 'wb') as f: + # f.write(enc) return self.convert_images(imap) - # count = 0 - # raw = open(picts, 'rb').read() - # starts = [] - # for match in re.finditer(r'\{\\pict([^}]+)\}', raw): - # starts.append(match.start(1)) - - # imap = {} - # for start in starts: - # pos, bc = start, 1 - # while bc > 0: - # if raw[pos] == '}': bc -= 1 - # elif raw[pos] == '{': bc += 1 - # pos += 1 - # pict = raw[start:pos+1] - # enc = re.sub(r'[^a-zA-Z0-9]', '', pict) - # if len(enc) % 2 == 1: - # enc = enc[:-1] - # data = enc.decode('hex') - # count += 1 - # name = (('%4d'%count).replace(' ', '0'))+'.wmf' - # open(name, 'wb').write(data) - # imap[count] = name - # #open(name+'.hex', 'wb').write(enc) - # return self.convert_images(imap) - def convert_images(self, imap): self.default_img = None for count, val in imap.iteritems(): From 92870ad5b862014357a0d993645df57a0515bd0c Mon Sep 
17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 18 Jan 2011 23:22:50 +0100 Subject: [PATCH 114/132] Add comment with test cmd --- src/calibre/ebooks/rtf/input.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index d8301b7120..6361cb7fdb 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -338,3 +338,4 @@ class RTFInput(InputFormatPlugin): opf.render(open('metadata.opf', 'wb')) return os.path.abspath('metadata.opf') +#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug" \ No newline at end of file From 270d5c41f2ef7c9e63dc3756f59f4ba46131037e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 24 Jan 2011 21:55:04 +0100 Subject: [PATCH 115/132] Corrrect edit time field bug --- src/calibre/ebooks/rtf2xml/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf2xml/info.py b/src/calibre/ebooks/rtf2xml/info.py index f5f1c5851c..55cb54b93a 100755 --- a/src/calibre/ebooks/rtf2xml/info.py +++ b/src/calibre/ebooks/rtf2xml/info.py @@ -73,7 +73,6 @@ class Info: 'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'), 'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'), - 'cw<di<edit-time_' : (self.__found_tag_with_tokens_func, 'editing-time'), 'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'), 'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'), @@ -82,6 +81,7 @@ class Info: 'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'), 'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'), 'cw<di<version___' : (self.__single_field_func, 'version'), + 'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'), 'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'), 'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'), 
} From ba1e8510fa188b4166831ba772c7bd7c742fda09 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 24 Jan 2011 23:32:36 +0100 Subject: [PATCH 116/132] RTF hex_2_utf8 cleaning --- src/calibre/ebooks/rtf2xml/hex_2_utf8.py | 65 +++++++++++------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py index 0d17f2da99..38f21fd10b 100755 --- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py +++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py @@ -115,7 +115,7 @@ class Hex2Utf8: """ self.__file=file self.__copy = copy - if area_to_convert != 'preamble' and area_to_convert != 'body': + if area_to_convert not in ('preamble', 'body'): msg = ( 'in module "hex_2_utf8.py\n' '"area_to_convert" must be "body" or "preamble"\n' @@ -143,18 +143,19 @@ class Hex2Utf8: Set values, including those for the dictionaries. The file that contains the maps is broken down into many different sets. For example, for the Symbol font, there is the standard part for - hexidecimal numbers, and the the part for Microsoft charcters. Read + hexidecimal numbers, and the part for Microsoft characters. Read each part in, and then combine them. """ # the default encoding system, the lower map for characters 0 through # 128, and the encoding system for Microsoft characters. 
- # New on 2004-05-8: the self.__char_map is not in diretory with other + # New on 2004-05-8: the self.__char_map is not in directory with other # modules self.__char_file = cStringIO.StringIO(char_set) char_map_obj = get_char_map.GetCharMap( char_file = self.__char_file, bug_handler = self.__bug_handler, ) + print self.__default_char_map up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map) bt_128_dict = char_map_obj.get_char_map(map = 'bottom_128') ms_standard_dict = char_map_obj.get_char_map(map = 'ms_standard') @@ -195,7 +196,6 @@ class Hex2Utf8: 'body' : self.__body_func, 'mi<mk<body-open_' : self.__found_body_func, 'tx<hx<__________' : self.__hex_text_func, - # 'tx<nu<__________' : self.__text_func, } self.__body_state_dict = { 'preamble' : self.__preamble_for_body_func, @@ -235,9 +235,7 @@ class Hex2Utf8: font = self.__current_dict_name if self.__convert_caps\ and self.__caps_list[-1] == 'true'\ - and font != 'Symbol'\ - and font != 'Wingdings'\ - and font != 'Zapf Dingbats': + and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'): converted = self.__utf_token_to_caps_func(converted) self.__write_obj.write( 'tx<ut<__________<%s\n' % converted @@ -247,9 +245,7 @@ class Hex2Utf8: font = self.__current_dict_name if self.__convert_caps\ and self.__caps_list[-1] == 'true'\ - and font != 'Symbol'\ - and font != 'Wingdings'\ - and font != 'Zapf Dingbats': + and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'): converted = converted.upper() self.__write_obj.write( 'tx<nu<__________<%s\n' % converted @@ -289,17 +285,16 @@ class Hex2Utf8: def __convert_preamble(self): self.__state = 'preamble' - self.__write_obj = open(self.__write_to, 'w') - with open(self.__file, 'r') as read_obj: - for line in read_obj: - self.__token_info = line[:16] - action = self.__preamble_state_dict.get(self.__state) - if action is None: - sys.stderr.write('error no state found in hex_2_utf8', - self.__state - ) - action(line) - self.__write_obj.close() + with 
open(self.__write_to, 'w') as self.__write_obj: + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__preamble_state_dict.get(self.__state) + if action is None: + sys.stderr.write('error no state found in hex_2_utf8', + self.__state + ) + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data") @@ -468,9 +463,9 @@ class Hex2Utf8: if len(self.__caps_list) > 1: self.__caps_list.pop() else: - sys.stderr.write('Module is hex_2_utf8\n') - sys.stderr.write('method is __end_caps_func\n') - sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set + sys.stderr.write('Module is hex_2_utf8\n' + 'method is __end_caps_func\n' + 'caps list should be more than one?\n') #self.__in_caps not set def __text_func(self, line): """ @@ -493,8 +488,7 @@ class Hex2Utf8: hex_num = '\'%s' % hex_num converted = self.__current_dict.get(hex_num) if converted is None: - sys.stderr.write('module is hex_2_ut8\n') - sys.stderr.write('method is __text_func\n') + sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n') sys.stderr.write('no hex value for "%s"\n' % hex_num) else: the_string += converted @@ -550,16 +544,15 @@ class Hex2Utf8: def __convert_body(self): self.__state = 'body' with open(self.__file, 'r') as read_obj: - self.__write_obj = open(self.__write_to, 'w') - for line in read_obj: - self.__token_info = line[:16] - action = self.__body_state_dict.get(self.__state) - if action is None: - sys.stderr.write('error no state found in hex_2_utf8', - self.__state - ) - action(line) - self.__write_obj.close() + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__body_state_dict.get(self.__state) + if action is None: + sys.stderr.write('error no state found in hex_2_utf8', + self.__state + ) + action(line) copy_obj = copy.Copy(bug_handler 
= self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "body_utf_convert.data") From 05a90f1bcb5139c7a331a93c323feb8122921dd5 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 25 Jan 2011 00:48:32 +0100 Subject: [PATCH 117/132] ... --- src/calibre/ebooks/rtf2xml/get_char_map.py | 3 --- src/calibre/ebooks/rtf2xml/hex_2_utf8.py | 1 - src/calibre/ebooks/rtf2xml/process_tokens.py | 4 ++-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py index fb3ef28b4f..cb118b0df8 100755 --- a/src/calibre/ebooks/rtf2xml/get_char_map.py +++ b/src/calibre/ebooks/rtf2xml/get_char_map.py @@ -30,8 +30,6 @@ class GetCharMap: 'char_file'--the file with the mappings - - Returns: nothing @@ -62,7 +60,6 @@ class GetCharMap: fields[1].replace('\\colon', ':') map_dict[fields[1]] = fields[3] - if not found_map: msg = 'no map found\nmap is "%s"\n'%(map,) raise self.__bug_handler, msg diff --git a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py index 38f21fd10b..7b8e148661 100755 --- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py +++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py @@ -155,7 +155,6 @@ class Hex2Utf8: char_file = self.__char_file, bug_handler = self.__bug_handler, ) - print self.__default_char_map up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map) bt_128_dict = char_map_obj.get_char_map(map = 'bottom_128') ms_standard_dict = char_map_obj.get_char_map(map = 'ms_standard') diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 56e61d2b60..1edf69b32d 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -757,7 +757,7 @@ class ProcessTokens: def process_cw(self, token): """Change the value of the control word by determining what dictionary it belongs to""" - special = [ '*', ':', '}', '{', '~', 
'_', '-', ';' ] + special = [ '*', ':', '}', '{', '~', '_', '-', ';' ] ##if token != "{" or token != "}": token = token[1:] # strip off leading \ token = token.replace(" ", "") @@ -793,7 +793,7 @@ class ProcessTokens: raise self.__exception_handler, msg the_index = token.find('\\ ') - if token is not None and the_index > -1: + if token is not None and the_index > -1: msg = 'Invalid RTF: token "\\ " not valid.\n' raise self.__exception_handler, msg elif token[:1] == "\\": From 026772d016c47ebdf2d99c9161363944c6db9382 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 27 Jan 2011 23:56:51 +0100 Subject: [PATCH 118/132] Add posssibility of file path export with formts in bibtex catalog --- src/calibre/gui2/catalog/catalog_bibtex.py | 9 +-- src/calibre/gui2/catalog/catalog_bibtex.ui | 9 ++- src/calibre/library/catalog.py | 74 ++++++++++++++-------- 3 files changed, 61 insertions(+), 31 deletions(-) diff --git a/src/calibre/gui2/catalog/catalog_bibtex.py b/src/calibre/gui2/catalog/catalog_bibtex.py index a24f6b0f95..ebfcc6e546 100644 --- a/src/calibre/gui2/catalog/catalog_bibtex.py +++ b/src/calibre/gui2/catalog/catalog_bibtex.py @@ -9,7 +9,6 @@ __docformat__ = 'restructuredtext en' from calibre.gui2 import gprefs from calibre.gui2.catalog.catalog_bibtex_ui import Ui_Form -from calibre.library import db as db_ from PyQt4.Qt import QWidget, QListWidgetItem class PluginWidget(QWidget, Ui_Form): @@ -20,7 +19,9 @@ class PluginWidget(QWidget, Ui_Form): ('bib_entry', 0), #mixed ('bibfile_enc', 0), #utf-8 ('bibfile_enctag', 0), #strict - ('impcit', True) ] + ('impcit', True), + ('addfiles', False), + ] sync_enabled = False formats = set(['bib']) @@ -50,7 +51,7 @@ class PluginWidget(QWidget, Ui_Form): opt_value = gprefs.get(self.name + '_' + opt[0], opt[1]) if opt[0] in ['bibfile_enc', 'bibfile_enctag', 'bib_entry']: getattr(self, opt[0]).setCurrentIndex(opt_value) - elif opt[0] == 'impcit' : + elif opt[0] in ['impcit', 'addfiles'] : getattr(self, 
opt[0]).setChecked(opt_value) else: getattr(self, opt[0]).setText(opt_value) @@ -77,7 +78,7 @@ class PluginWidget(QWidget, Ui_Form): for opt in self.OPTION_FIELDS: if opt[0] in ['bibfile_enc', 'bibfile_enctag', 'bib_entry']: opt_value = getattr(self,opt[0]).currentIndex() - elif opt[0] == 'impcit' : + elif opt[0] in ['impcit', 'addfiles'] : opt_value = getattr(self, opt[0]).isChecked() else : opt_value = unicode(getattr(self, opt[0]).text()) diff --git a/src/calibre/gui2/catalog/catalog_bibtex.ui b/src/calibre/gui2/catalog/catalog_bibtex.ui index 7f4920655d..8712d40148 100644 --- a/src/calibre/gui2/catalog/catalog_bibtex.ui +++ b/src/calibre/gui2/catalog/catalog_bibtex.ui @@ -47,7 +47,7 @@ </item> </widget> </item> - <item row="1" column="1" rowspan="12"> + <item row="1" column="1" rowspan="11"> <widget class="QListWidget" name="db_fields"> <property name="sizePolicy"> <sizepolicy hsizetype="Preferred" vsizetype="Expanding"> @@ -141,6 +141,13 @@ </widget> </item> <item row="8" column="0"> + <widget class="QCheckBox" name="addfiles"> + <property name="text"> + <string>Add files path with formats?</string> + </property> + </widget> + </item> + <item row="9" column="0"> <widget class="QLabel" name="label_3"> <property name="text"> <string>Expression to form the BibTeX citation tag:</string> diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index f0e4778de4..e20eebc517 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -24,10 +24,9 @@ from calibre.utils.logging import default_log as log from calibre.utils.zipfile import ZipFile, ZipInfo from calibre.utils.magick.draw import thumbnail -FIELDS = ['all', 'author_sort', 'authors', 'comments', - 'cover', 'formats', 'id', 'isbn', 'ondevice', 'pubdate', 'publisher', 'rating', - 'series_index', 'series', 'size', 'tags', 'timestamp', 'title', - 'uuid'] +FIELDS = ['all', 'title', 'author_sort', 'authors', 'comments', + 'cover', 'formats','id', 'isbn', 'ondevice', 
'pubdate', 'publisher', + 'rating', 'series_index', 'series', 'size', 'tags', 'timestamp', 'uuid'] #Allowed fields for template TEMPLATE_ALLOWED_FIELDS = [ 'author_sort', 'authors', 'id', 'isbn', 'pubdate', @@ -252,6 +251,15 @@ class BIBTEX(CatalogPlugin): # {{{ "Default: '%default'\n" "Applies to: BIBTEX output format")), + Option('--add-files-path', + default = 'True', + dest = 'addfiles', + action = None, + help = _('Create a file entry if formats is selected for BibTeX entries.\n' + 'Boolean value: True, False\n' + "Default: '%default'\n" + "Applies to: BIBTEX output format")), + Option('--citation-template', default = '{authors}{id}', dest = 'bib_cit', @@ -298,7 +306,7 @@ class BIBTEX(CatalogPlugin): # {{{ from calibre.utils.bibtex import BibTeX def create_bibtex_entry(entry, fields, mode, template_citation, - bibtexdict, citation_bibtex = True): + bibtexdict, citation_bibtex=True, calibre_files=True): #Bibtex doesn't like UTF-8 but keep unicode until writing #Define starting chain or if book valid strict and not book return a Fail string @@ -360,8 +368,13 @@ class BIBTEX(CatalogPlugin): # {{{ bibtex_entry.append(u'isbn = "%s"' % re.sub(u'[\D]', u'', item)) elif field == 'formats' : - item = u', '.join([format.rpartition('.')[2].lower() for format in item]) - bibtex_entry.append(u'formats = "%s"' % item) + #Add file path if format is selected + formats = [format.rpartition('.')[2].lower() for format in item] + bibtex_entry.append(u'formats = "%s"' % u', '.join(formats)) + if calibre_files: + files = [u':%s:%s' % (format, format.rpartition('.')[2].upper())\ + for format in item] + bibtex_entry.append(u'files = "%s"' % u', '.join(files)) elif field == 'series_index' : bibtex_entry.append(u'volume = "%s"' % int(item)) @@ -510,32 +523,41 @@ class BIBTEX(CatalogPlugin): # {{{ citation_bibtex= True else : citation_bibtex= opts.impcit + + #Check add file entry and go to default in case of bad CLI + if isinstance(opts.addfiles, (StringType, UnicodeType)) : + if 
opts.addfiles == 'False' : + addfiles_bibtex = False + elif opts.addfiles == 'True' : + addfiles_bibtex = True + else : + log(" WARNING: incorrect --add-files-path, revert to default") + addfiles_bibtex= True + else : + addfiles_bibtex = opts.addfiles #Preprocess for error and light correction template_citation = preprocess_template(opts.bib_cit) #Open output and write entries - outfile = codecs.open(path_to_output, 'w', bibfile_enc, bibfile_enctag) + with codecs.open(path_to_output, 'w', bibfile_enc, bibfile_enctag)\ + as outfile: + #File header + nb_entries = len(data) + #check in book strict if all is ok else throw a warning into log + if bib_entry == 'book' : + nb_books = len(filter(check_entry_book_valid, data)) + if nb_books < nb_entries : + log(" WARNING: only %d entries in %d are book compatible" % (nb_books, nb_entries)) + nb_entries = nb_books - #File header - nb_entries = len(data) + outfile.write(u'%%%Calibre catalog\n%%%{0} entries in catalog\n\n'.format(nb_entries)) + outfile.write(u'@preamble{"This catalog of %d entries was generated by calibre on %s"}\n\n' + % (nb_entries, nowf().strftime("%A, %d. %B %Y %H:%M").decode(preferred_encoding))) - #check in book strict if all is ok else throw a warning into log - if bib_entry == 'book' : - nb_books = len(filter(check_entry_book_valid, data)) - if nb_books < nb_entries : - log(" WARNING: only %d entries in %d are book compatible" % (nb_books, nb_entries)) - nb_entries = nb_books - - outfile.write(u'%%%Calibre catalog\n%%%{0} entries in catalog\n\n'.format(nb_entries)) - outfile.write(u'@preamble{"This catalog of %d entries was generated by calibre on %s"}\n\n' - % (nb_entries, nowf().strftime("%A, %d. 
%B %Y %H:%M").decode(preferred_encoding))) - - for entry in data: - outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation, - bibtexc, citation_bibtex)) - - outfile.close() + for entry in data: + outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation, + bibtexc, citation_bibtex, addfiles_bibtex)) # }}} class EPUB_MOBI(CatalogPlugin): From ea86886f16ca5dd89f3eca0f250d42fe711951fb Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 28 Jan 2011 00:05:35 +0100 Subject: [PATCH 119/132] bibtex catalog on device modifications --- src/calibre/library/catalog.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index e20eebc517..2b95d0a5be 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -340,7 +340,7 @@ class BIBTEX(CatalogPlugin): # {{{ if field == 'authors' : bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item)) - elif field in ['title', 'publisher', 'cover', 'uuid', + elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice', 'author_sort', 'series'] : bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item))) @@ -374,7 +374,7 @@ class BIBTEX(CatalogPlugin): # {{{ if calibre_files: files = [u':%s:%s' % (format, format.rpartition('.')[2].upper())\ for format in item] - bibtex_entry.append(u'files = "%s"' % u', '.join(files)) + bibtex_entry.append(u'file = "%s"' % u', '.join(files)) elif field == 'series_index' : bibtex_entry.append(u'volume = "%s"' % int(item)) @@ -470,6 +470,8 @@ class BIBTEX(CatalogPlugin): # {{{ if opts.verbose: opts_dict = vars(opts) log("%s(): Generating %s" % (self.name,self.fmt)) + if opts.connected_device['is_device_connected']: + log(" connected_device: %s" % opts.connected_device['name']) if opts_dict['search_text']: log(" --search='%s'" % opts_dict['search_text']) @@ -544,6 +546,7 @@ class 
BIBTEX(CatalogPlugin): # {{{ as outfile: #File header nb_entries = len(data) + #check in book strict if all is ok else throw a warning into log if bib_entry == 'book' : nb_books = len(filter(check_entry_book_valid, data)) @@ -551,6 +554,11 @@ class BIBTEX(CatalogPlugin): # {{{ log(" WARNING: only %d entries in %d are book compatible" % (nb_books, nb_entries)) nb_entries = nb_books + # If connected device, add 'On Device' values to data + if opts.connected_device['is_device_connected'] and 'ondevice' in fields: + for entry in data: + entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[entry['id']]['ondevice'] + outfile.write(u'%%%Calibre catalog\n%%%{0} entries in catalog\n\n'.format(nb_entries)) outfile.write(u'@preamble{"This catalog of %d entries was generated by calibre on %s"}\n\n' % (nb_entries, nowf().strftime("%A, %d. %B %Y %H:%M").decode(preferred_encoding))) From 23f2fc62021ed8b1e9b2dca6f7d409affdeebe0a Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 30 Jan 2011 11:05:07 +0100 Subject: [PATCH 120/132] Minor modifications to catalog --- src/calibre/library/catalog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 084c238f00..8b88e44407 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -232,6 +232,7 @@ class BIBTEX(CatalogPlugin): # {{{ help = _('The fields to output when cataloging books in the ' 'database. 
Should be a comma-separated list of fields.\n' 'Available fields: %s.\n' + 'plus user-created custom fields.\n' 'Example: %s=title,authors,tags\n' "Default: '%%default'\n" "Applies to: BIBTEX output format")%(', '.join(FIELDS), @@ -269,7 +270,7 @@ class BIBTEX(CatalogPlugin): # {{{ dest = 'bib_cit', action = None, help = _('The template for citation creation from database fields.\n' - ' Should be a template with {} enclosed fields.\n' + 'Should be a template with {} enclosed fields.\n' 'Available fields: %s.\n' "Default: '%%default'\n" "Applies to: BIBTEX output format")%', '.join(TEMPLATE_ALLOWED_FIELDS)), From ed4da14df07a4c61a21bfe09c542aa4802863a9d Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Mon, 31 Jan 2011 08:29:42 +0100 Subject: [PATCH 121/132] Correct problems with tag splitting in RTFParser, some encoding refactoring & move all encodings to UTF-8 or US-ASCII for lxml --- src/calibre/ebooks/rtf/input.py | 23 ++------ src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 + src/calibre/ebooks/rtf2xml/colors.py | 54 +++++++++++-------- src/calibre/ebooks/rtf2xml/convert_to_tags.py | 38 ++++++++----- .../ebooks/rtf2xml/default_encoding.py | 4 ++ src/calibre/ebooks/rtf2xml/fonts.py | 36 +++++++------ src/calibre/ebooks/rtf2xml/get_char_map.py | 2 +- src/calibre/ebooks/rtf2xml/tokenize.py | 24 +++++---- 8 files changed, 101 insertions(+), 82 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 6361cb7fdb..caa35a9eda 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -85,6 +85,7 @@ class RTFInput(InputFormatPlugin): debug_dir = 'rtfdebug' run_lev = 4 indent_out = 1 + self.log('Running RTFParser in debug mode') except: pass parser = ParseRtf( @@ -233,22 +234,6 @@ class RTFInput(InputFormatPlugin): with open('styles.css', 'ab') as f: f.write(css) - # def preprocess(self, fname): - # self.log('\tPreprocessing to convert unicode characters') - # try: - # data = open(fname, 
'rb').read() - # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser - # tokenizer = RtfTokenizer(data) - # tokens = RtfTokenParser(tokenizer.tokens) - # data = tokens.toRTF() - # fname = 'preprocessed.rtf' - # with open(fname, 'wb') as f: - # f.write(data) - # except: - # self.log.exception( - # 'Failed to preprocess RTF to convert unicode sequences, ignoring...') - # return fname - def convert_borders(self, doc): border_styles = [] style_map = {} @@ -283,8 +268,6 @@ class RTFInput(InputFormatPlugin): self.opts = options self.log = log self.log('Converting RTF to XML...') - #Name of the preprocesssed RTF file - # fname = self.preprocess(stream.name) try: xml = self.generate_xml(stream.name) except RtfInvalidCodeException, e: @@ -338,4 +321,6 @@ class RTFInput(InputFormatPlugin): opf.render(open('metadata.opf', 'wb')) return os.path.abspath('metadata.opf') -#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug" \ No newline at end of file +#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug" +# os.makedirs('D:\\Mes eBooks\\Developpement\\rtfdebug') +# debug_dir = 'D:\\Mes eBooks\\Developpement\\rtfdebug' \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index a28b6f81da..56e18fe74d 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -238,6 +238,8 @@ class ParseRtf: bug_handler = RtfInvalidCodeException, ) enc = 'cp' + encode_obj.get_codepage() + if enc == 'cp10000': + enc = 'mac_roman' msg = 'Exception in token processing' if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ diff --git a/src/calibre/ebooks/rtf2xml/colors.py b/src/calibre/ebooks/rtf2xml/colors.py index d81b293bbf..eba03547c8 100755 --- a/src/calibre/ebooks/rtf2xml/colors.py +++ b/src/calibre/ebooks/rtf2xml/colors.py @@ -15,8 +15,10 @@ # # # # 
######################################################################### -import sys, os, tempfile, re +import sys, os, tempfile, re + from calibre.ebooks.rtf2xml import copy + class Colors: """ Change lines with color info from color numbers to the actual color names. @@ -40,8 +42,10 @@ class Colors: self.__file = in_file self.__copy = copy self.__bug_handler = bug_handler + self.__line = 0 self.__write_to = tempfile.mktemp() self.__run_level = run_level + def __initiate_values(self): """ Initiate all values. @@ -61,6 +65,7 @@ class Colors: self.__color_num = 1 self.__line_color_exp = re.compile(r'bdr-color_:(\d+)') # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2 + def __before_color_func(self, line): """ Requires: @@ -76,6 +81,7 @@ class Colors: if self.__token_info == 'mi<mk<clrtbl-beg': self.__state = 'in_color_table' self.__write_obj.write(line) + def __default_color_func(self, line): """ Requires: @@ -87,6 +93,7 @@ class Colors: """ hex_num = line[-3:-1] self.__color_string += hex_num + def __blue_func(self, line): """ Requires: @@ -109,6 +116,7 @@ class Colors: ) self.__color_num += 1 self.__color_string = '#' + def __in_color_func(self, line): """ Requires: @@ -127,12 +135,13 @@ class Colors: self.__state = 'after_color_table' else: action = self.__state_dict.get(self.__token_info) - if action == None: + if action is None: sys.stderr.write('in module colors.py\n' 'function is self.__in_color_func\n' 'no action for %s' % self.__token_info ) action(line) + def __after_color_func(self, line): """ Check the to see if it contains color info. 
If it does, extract the @@ -180,6 +189,7 @@ class Colors: else: self.__write_obj.write(line) # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2 + def __sub_from_line_color(self, match_obj): num = match_obj.group(1) try: @@ -191,25 +201,27 @@ class Colors: else: return 'bdr-color_:no-value' hex_num = self.__figure_num(num) - return_value = 'bdr-color_:%s' % hex_num - return return_value + return 'bdr-color_:%s' % hex_num + def __figure_num(self, num): if num == 0: hex_num = 'false' else: hex_num = self.__color_dict.get(num) - if hex_num == None: - if self.__run_level > 3: - msg = 'no value in self.__color_dict for key %s\n' % num - raise self.__bug_hanlder, msg - if hex_num == None: + if hex_num is None: hex_num = '0' + if self.__run_level > 5: + msg = 'no value in self.__color_dict' \ + 'for key %s at line %d\n' % (num, self.__line) + raise self.__bug_handler, msg return hex_num + def __do_nothing_func(self, line): """ Bad RTF will have text in the color table """ pass + def convert_colors(self): """ Requires: @@ -226,20 +238,16 @@ class Colors: info, and substitute the number with the hex number. 
""" self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module fonts.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__line+=1 + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('no matching state in module fonts.py\n') + sys.stderr.write(self.__state + '\n') + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "color.data") diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index 6927537474..1abc672f85 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -33,13 +33,13 @@ class ConvertToTags: self.__copy = copy self.__dtd_path = dtd_path self.__no_dtd = no_dtd - if encoding != 'mac_roman': - self.__encoding = 'cp' + encoding - else: + self.__encoding = 'cp' + encoding + if encoding == 'mac_roman': self.__encoding = 'mac_roman' self.__indent = indent self.__run_level = run_level self.__write_to = tempfile.mktemp() + self.__convert_utf = False def __initiate_values(self): """ @@ -213,7 +213,8 @@ class ConvertToTags: if not check_encoding_obj.check_encoding(self.__file, verbose=False): self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): - self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding) + self.__write_obj.write('<?xml 
version="1.0" encoding="UTF-8" ?>') + self.__convert_utf = True else: self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>') sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and' @@ -253,15 +254,28 @@ class ConvertToTags: an empty tag function. """ self.__initiate_values() - self.__write_obj = open(self.__write_to, 'w') - self.__write_dec() - with open(self.__file, 'r') as read_obj: - for line in read_obj: - self.__token_info = line[:16] - action = self.__state_dict.get(self.__token_info) - if action is not None: - action(line) + with open(self.__write_to, 'w') as self.__write_obj: + self.__write_dec() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__token_info) + if action is not None: + action(line) self.__write_obj.close() + #convert all encodings to UTF8 to avoid unsupported encodings in lxml + if self.__convert_utf: + copy_obj = copy.Copy(bug_handler = self.__bug_handler) + copy_obj.rename(self.__write_to, self.__file) + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as write_obj: + file = read_obj.read() + try: + file = file.decode(self.__encoding) + write_obj.write(file.encode('utf-8')) + except: + sys.stderr.write('Conversion to UTF-8 is not possible,' + ' encoding should be very carefully checked') copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "convert_to_tags.data") diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index 3ddfbcd321..c0a43db800 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -75,12 +75,16 @@ class DefaultEncoding: self._encoding() self.__datafetched = True code_page = 'ansicpg' + self.__code_page + if self.__code_page == '10000': + self.__code_page = 'mac_roman' return self.__platform, code_page, 
self.__default_num def get_codepage(self): if not self.__datafetched: self._encoding() self.__datafetched = True + if self.__code_page == '10000': + self.__code_page = 'mac_roman' return self.__code_page def get_platform(self): diff --git a/src/calibre/ebooks/rtf2xml/fonts.py b/src/calibre/ebooks/rtf2xml/fonts.py index b85717ce48..45ed3c1957 100755 --- a/src/calibre/ebooks/rtf2xml/fonts.py +++ b/src/calibre/ebooks/rtf2xml/fonts.py @@ -16,7 +16,9 @@ # # ######################################################################### import sys, os, tempfile + from calibre.ebooks.rtf2xml import copy + class Fonts: """ Change lines with font info from font numbers to the actual font names. @@ -45,6 +47,7 @@ class Fonts: self.__default_font_num = default_font_num self.__write_to = tempfile.mktemp() self.__run_level = run_level + def __initiate_values(self): """ Initiate all values. @@ -67,6 +70,7 @@ class Fonts: self.__font_table = {} # individual font written self.__wrote_ind_font = 0 + def __default_func(self, line): """ Requires: @@ -79,6 +83,7 @@ class Fonts: if self.__token_info == 'mi<mk<fonttb-beg': self.__state = 'font_table' self.__write_obj.write(line) + def __font_table_func(self, line): """ Requires: @@ -101,6 +106,7 @@ class Fonts: self.__font_num = self.__default_font_num self.__text_line = '' ##self.__write_obj.write(line) + def __font_in_table_func(self, line): """ Requires: @@ -138,6 +144,7 @@ class Fonts: elif self.__token_info == 'mi<mk<fonttb-end': self.__found_end_font_table_func() self.__state = 'after_font_table' + def __found_end_font_table_func(self): """ Required: @@ -150,7 +157,8 @@ class Fonts: if not self.__wrote_ind_font: self.__write_obj.write( 'mi<tg<empty-att_' - '<font-in-table<name>Times<num>0\n' ) + '<font-in-table<name>Times<num>0\n') + def __after_font_table_func(self, line): """ Required: @@ -169,7 +177,7 @@ class Fonts: if self.__token_info == 'cw<ci<font-style': font_num = line[20:-1] font_name = self.__font_table.get(font_num) - if 
font_name == None: + if font_name is None: if self.__run_level > 3: msg = 'no value for %s in self.__font_table\n' % font_num raise self.__bug_handler, msg @@ -182,6 +190,7 @@ class Fonts: ) else: self.__write_obj.write(line) + def convert_fonts(self): """ Required: @@ -197,20 +206,15 @@ class Fonts: info. Substitute a font name for a font number. """ self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module fonts.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('no matching state in module fonts.py\n' \ + + self.__state + '\n') + action(line) default_font_name = self.__font_table.get(self.__default_font_num) if not default_font_name: default_font_name = 'Not Defined' diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py index cb118b0df8..bd487bb6f5 100755 --- a/src/calibre/ebooks/rtf2xml/get_char_map.py +++ b/src/calibre/ebooks/rtf2xml/get_char_map.py @@ -41,7 +41,7 @@ class GetCharMap: def get_char_map(self, map): if map == 'ansicpg0': map = 'ansicpg1250' - if map in ('ansicpg10000', '10000'): + if map == 'ansicpg10000': map = 'mac_roman' found_map = False map_dict = {} diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 9ebd718833..84acd26a57 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -115,6 +115,7 @@ class Tokenize: def 
__sub_reg_split(self,input_file): input_file = self.__replace_spchar.mreplace(input_file) + # this is for older RTF input_file = self.__par_exp.sub('\n\\par \n', input_file) input_file = self.__cs_ast.sub("\g<1>", input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) @@ -126,12 +127,6 @@ class Tokenize: tokens = re.split(self.__splitexp, input_file) #remove empty tokens and \n return filter(lambda x: len(x) > 0 and x != '\n', tokens) - #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) - # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) - # this is for older RTF - #line = re.sub(self.__par_exp, '\\par ', line) - #return filter(lambda x: len(x) > 0, \ - #(self.__remove_line.sub('', x) for x in tokens)) def __compile_expressions(self): SIMPLE_RPL = { @@ -160,7 +155,7 @@ class Tokenize: } self.__replace_spchar = MReplace(SIMPLE_RPL) #add ;? in case of char following \u - self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)" + self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?") self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+") #manage upr/ud situations @@ -174,14 +169,21 @@ class Tokenize: self.__par_exp = re.compile(r'\\\n+') #handle improper cs char-style with \* before without { self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)') - # self.__par_exp = re.compile(r'\\$') + #handle cw using a digit as argument and without space as delimiter + self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)") #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #self.__remove_line = re.compile(r'\n+') - #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") + def __correct_spliting(self, token): + match_obj = re.search(self.__cwdigit_exp, 
token) + if match_obj is None: + return token + else: + return '%s\n%s' % (match_obj.group(1), match_obj.group(2)) + def tokenize(self): """Main class for handling other methods. Reads the file \ , uses method self.sub_reg to make basic substitutions,\ @@ -197,6 +199,8 @@ class Tokenize: tokens = map(self.__unicode_process, tokens) #remove empty items created by removing \uc tokens = filter(lambda x: len(x) > 0, tokens) + #handles bothersome cases + tokens = map(self.__correct_spliting, tokens) #write with open(self.__write_to, 'wb') as write_obj: @@ -205,8 +209,6 @@ class Tokenize: copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "tokenize.data") - # if self.__out_file: - # self.__file = self.__out_file copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) From 056f97c7008037f0eb9d20d9ae508171dd879993 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 5 Feb 2011 12:19:46 +0100 Subject: [PATCH 122/132] Correct splitting problem --- src/calibre/ebooks/rtf/input.py | 6 +++--- src/calibre/ebooks/rtf2xml/colors.py | 2 +- src/calibre/ebooks/rtf2xml/tokenize.py | 10 +--------- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 2ef5932784..6e17e33556 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -321,6 +321,6 @@ class RTFInput(InputFormatPlugin): opf.render(open('metadata.opf', 'wb')) return os.path.abspath('metadata.opf') -#ebook-convert "bad.rtf" test.epub -v -d "D:\Mes eBooks\Developpement\debug" -# os.makedirs('D:\\Mes eBooks\\Developpement\\rtfdebug') -# debug_dir = 'D:\\Mes eBooks\\Developpement\\rtfdebug' +#ebook-convert "bad.rtf" test.epub -v -d "E:\Mes eBooks\Developpement\debug" +# os.makedirs('E:\\Mes eBooks\\Developpement\\rtfdebug') +# debug_dir = 'E:\\Mes eBooks\\Developpement\\rtfdebug' diff --git a/src/calibre/ebooks/rtf2xml/colors.py 
b/src/calibre/ebooks/rtf2xml/colors.py index eba03547c8..e85b59571c 100755 --- a/src/calibre/ebooks/rtf2xml/colors.py +++ b/src/calibre/ebooks/rtf2xml/colors.py @@ -210,7 +210,7 @@ class Colors: hex_num = self.__color_dict.get(num) if hex_num is None: hex_num = '0' - if self.__run_level > 5: + if self.__run_level > 3: msg = 'no value in self.__color_dict' \ 'for key %s at line %d\n' % (num, self.__line) raise self.__bug_handler, msg diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 2c0fa8fcb6..5e01515730 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -117,6 +117,7 @@ class Tokenize: input_file = self.__replace_spchar.mreplace(input_file) # this is for older RTF input_file = self.__par_exp.sub('\n\\par \n', input_file) + input_file = self.__cwdigit_exp.sub("\g<1>\n\g<2>", input_file) input_file = self.__cs_ast.sub("\g<1>", input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) @@ -177,13 +178,6 @@ class Tokenize: #self.__remove_line = re.compile(r'\n+') ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") - def __correct_spliting(self, token): - match_obj = re.search(self.__cwdigit_exp, token) - if match_obj is None: - return token - else: - return '%s\n%s' % (match_obj.group(1), match_obj.group(2)) - def tokenize(self): """Main class for handling other methods. 
Reads the file \ , uses method self.sub_reg to make basic substitutions,\ @@ -199,8 +193,6 @@ class Tokenize: tokens = map(self.__unicode_process, tokens) #remove empty items created by removing \uc tokens = filter(lambda x: len(x) > 0, tokens) - #handles bothersome cases - tokens = map(self.__correct_spliting, tokens) #write with open(self.__write_to, 'wb') as write_obj: From ccf856539aee3de4a462a409aee514dde22b312a Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 5 Feb 2011 17:34:57 +0100 Subject: [PATCH 123/132] Still old paragraph format --- src/calibre/ebooks/rtf/input.py | 5 ++--- src/calibre/ebooks/rtf2xml/ParseRtf.py | 12 +++++----- src/calibre/ebooks/rtf2xml/process_tokens.py | 14 +++++++----- src/calibre/ebooks/rtf2xml/tokenize.py | 23 ++++++++------------ 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 6e17e33556..06a5fa61c9 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -78,7 +78,6 @@ class RTFInput(InputFormatPlugin): from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf ofile = 'dataxml.xml' run_lev, debug_dir, indent_out = 1, None, 0 - #just to check if the debug process is lauched, no need of this directory in fact if getattr(self.opts, 'debug_pipeline', None) is not None: try: os.mkdir('rtfdebug') @@ -322,5 +321,5 @@ class RTFInput(InputFormatPlugin): return os.path.abspath('metadata.opf') #ebook-convert "bad.rtf" test.epub -v -d "E:\Mes eBooks\Developpement\debug" -# os.makedirs('E:\\Mes eBooks\\Developpement\\rtfdebug') -# debug_dir = 'E:\\Mes eBooks\\Developpement\\rtfdebug' +# os.makedirs("E:\\Mes eBooks\\Developpement\\rtfdebug") +# debug_dir = "E:\\Mes eBooks\\Developpement\\rtfdebug" diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 56e18fe74d..9f554467b0 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ 
b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -226,7 +226,7 @@ class ParseRtf: try: return_value = process_tokens_obj.process_tokens() except InvalidRtfException, msg: - #Check to see if the file is correctly encoded + # Check to see if the file is correctly encoded encode_obj = default_encoding.DefaultEncoding( in_file = self.__temp_file, run_level = self.__run_level, @@ -237,14 +237,14 @@ class ParseRtf: check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) - enc = 'cp' + encode_obj.get_codepage() - if enc == 'cp10000': - enc = 'mac_roman' - msg = 'Exception in token processing' + enc = encode_obj.get_codepage() + if enc != 'mac_roman': + enc = 'cp' + enc + msg = '%s\nException in token processing' % str(msg) if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ else self.__file.encode('utf-8') - msg = 'File %s does not appear to be correctly encoded.\n' % file_name + msg +='\nFile %s does not appear to be correctly encoded.\n' % file_name try: os.remove(self.__temp_file) except OSError: diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 1edf69b32d..010d374cbc 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -786,21 +786,23 @@ class ProcessTokens: token = line.replace("\n","") line_count += 1 if line_count == 1 and token != '\\{': - msg = 'Invalid RTF: document doesn\'t start with {\n' + msg = '\nInvalid RTF: document doesn\'t start with {\n' raise self.__exception_handler, msg elif line_count == 2 and token[0:4] != '\\rtf': - msg = 'Invalid RTF: document doesn\'t start with \\rtf \n' + msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n' raise self.__exception_handler, msg the_index = token.find('\\ ') if token is not None and the_index > -1: - msg = 'Invalid RTF: token "\\ " not valid.\n' + msg = '\nInvalid RTF: token "\\ " not 
valid.\nError at line %d'\ + % line_count raise self.__exception_handler, msg elif token[:1] == "\\": try: token.decode('us-ascii') except UnicodeError, msg: - msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg) + msg = '\nInvalid RTF: Tokens not ascii encoded.\n%s\nError at line %d'\ + % (str(msg), line_count) raise self.__exception_handler, msg line = self.process_cw(token) if line is not None: @@ -816,7 +818,7 @@ class ProcessTokens: write_obj.write('tx<nu<__________<%s\n' % field) if not line_count: - msg = 'Invalid RTF: file appears to be empty.\n' + msg = '\nInvalid RTF: file appears to be empty.\n' raise self.__exception_handler, msg copy_obj = copy.Copy(bug_handler = self.__bug_handler) @@ -827,7 +829,7 @@ class ProcessTokens: bad_brackets = self.__check_brackets(self.__file) if bad_brackets: - msg = 'Invalid RTF: document does not have matching brackets.\n' + msg = '\nInvalid RTF: document does not have matching brackets.\n' raise self.__exception_handler, msg else: return self.__return_code \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 5e01515730..062a720d91 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -141,17 +141,17 @@ class Tokenize: "\\_": "\\_ ", "\\:": "\\: ", "\\-": "\\- ", - # turn into a generic token to eliminate special - # cases and make processing easier + #turn into a generic token to eliminate special + #cases and make processing easier "\\{": "\\ob ", - # turn into a generic token to eliminate special - # cases and make processing easier + #turn into a generic token to eliminate special + #cases and make processing easier "\\}": "\\cb ", - # put a backslash in front of to eliminate special cases and - # make processing easier + #put a backslash in front of to eliminate special cases and + #make processing easier "{": "\\{", - # put a backslash in front of to eliminate special cases and - # make 
processing easier + #put a backslash in front of to eliminate special cases and + #make processing easier "}": "\\}", } self.__replace_spchar = MReplace(SIMPLE_RPL) @@ -167,16 +167,11 @@ class Tokenize: #remove \n from endline char self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") #this is for old RTF - self.__par_exp = re.compile(r'\\\n+') + self.__par_exp = re.compile(r'(\\\n+|\\ )') #handle improper cs char-style with \* before without { self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)') #handle cw using a digit as argument and without space as delimiter self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)") - #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") - #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") - #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") - #self.__remove_line = re.compile(r'\n+') - ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") def tokenize(self): """Main class for handling other methods. 
Reads the file \ From 6a4f647668d9b9622c02d824ed16bf6167210465 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 6 Feb 2011 01:14:56 +0100 Subject: [PATCH 124/132] Remove librarything login --- src/calibre/ebooks/metadata/covers.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py index cbd8fc0e99..529b38c1d3 100644 --- a/src/calibre/ebooks/metadata/covers.py +++ b/src/calibre/ebooks/metadata/covers.py @@ -145,12 +145,13 @@ class LibraryThingCovers(CoverDownload): # {{{ return url def has_cover(self, mi, ans, timeout=5.): - if not mi.isbn or not self.site_customization: + # if not mi.isbn or not self.site_customization: + if not mi.isbn: return False from calibre.ebooks.metadata.library_thing import get_browser, login br = get_browser() - un, _, pw = self.site_customization.partition(':') - login(br, un, pw) + # un, _, pw = self.site_customization.partition(':') + # login(br, un, pw) try: self.get_cover_url(mi.isbn, br, timeout=timeout) self.debug('cover for', mi.isbn, 'found') @@ -159,12 +160,13 @@ class LibraryThingCovers(CoverDownload): # {{{ self.debug(e) def get_covers(self, mi, result_queue, abort, timeout=5.): - if not mi.isbn or not self.site_customization: + # if not mi.isbn or not self.site_customization: + if not mi.isbn: return from calibre.ebooks.metadata.library_thing import get_browser, login br = get_browser() - un, _, pw = self.site_customization.partition(':') - login(br, un, pw) + # un, _, pw = self.site_customization.partition(':') + # login(br, un, pw) try: url = self.get_cover_url(mi.isbn, br, timeout=timeout) cover_data = br.open_novisit(url).read() From bcc516afc211f873d8b8d9be712d2968d1960271 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 22 Feb 2011 22:48:36 +0100 Subject: [PATCH 125/132] Meta personalisation --- src/calibre/customize/builtins.py | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 8f83795ef5..5f1dfd9c35 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -506,12 +506,12 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO from calibre.devices.bambook.driver import BAMBOOK -from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ - KentDistrictLibrary +from calibre.ebooks.metadata.fetch import KentDistrictLibrary from calibre.ebooks.metadata.douban import DoubanBooks -#from calibre.ebooks.metadata.google_books import GoogleBooks +from calibre.ebooks.metadata.isbndb import ISBNDB +from calibre.ebooks.metadata.google_books import GoogleBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers -#from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial +from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers From 888aaec88fea2d669d0ed4d2b245351c0013436f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 9 Mar 2011 22:21:02 +0100 Subject: [PATCH 126/132] Metadata compatibility --- src/calibre/customize/builtins.py | 6 +- src/calibre/ebooks/metadata/amazon.py | 259 +++----------------------- src/calibre/ebooks/metadata/fetch.py | 21 +++ 3 files changed, 52 insertions(+), 234 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ba90d20dcc..74f1f9eafe 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -580,12 +580,12 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO from calibre.devices.bambook.driver import BAMBOOK -from 
calibre.ebooks.metadata.fetch import KentDistrictLibrary +from calibre.ebooks.metadata.fetch import KentDistrictLibrary, Amazon from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.isbndb import ISBNDB from calibre.ebooks.metadata.google_books import GoogleBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers -from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial +# from calibre.ebooks.metadata.amazon import Amazon , AmazonSocial from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ AmazonCovers, DoubanCovers, LibrarythingCovers @@ -593,7 +593,7 @@ from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck -plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, AmazonSocial, +plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, #AmazonSocial, KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers, NiceBooksCovers] diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index a2ddc22770..c87249ed39 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -1,7 +1,11 @@ -from __future__ import with_statement -__license__ = 'GPL 3' -__copyright__ = '2010, sengian <sengian1@gmail.com>' +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' +''' +Fetch metadata using Amazon AWS +''' import sys, re from threading import RLock @@ -12,10 +16,6 @@ from calibre import browser from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.book.base import Metadata from 
calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.metadata import MetaInformation, check_isbn, \ - authors_to_sort_string -from calibre.ebooks.metadata.fetch import MetadataSource -from calibre.utils.config import OptionParser from calibre.library.comments import sanitize_comments_html asin_cache = {} @@ -160,229 +160,31 @@ def get_metadata(br, asin, mi): m = pat.match(t) if m is not None: try: - default = utcnow().replace(day=15) - if self.lang != 'all': - d = replace_months(d, self.lang) - d = parse_date(d, assume_utc=True, default=default) - mi.pubdate = d + mi.rating = float(m.group(1))/float(m.group(2)) * 5 + break except: - report(verbose) - #ISBN - elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts) - if elt: - isbn = elt[0].find('b').tail.replace('-', '').strip() - if check_isbn(isbn): - mi.isbn = unicode(isbn) - elif len(elt) > 1: - isbnone = elt[1].find('b').tail.replace('-', '').strip() - if check_isbn(isbnone): - mi.isbn = unicode(isbnone) - else: - #assume ASIN-> find a check for asin - mi.isbn = unicode(isbn) - #Langue - elt = filter(lambda x: self.relang.search(x.find('b').text), elts) - if elt: - langue = elt[0].find('b').tail.strip() - if langue: - mi.language = unicode(langue) - #ratings - elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts) - if elt: - ratings = elt[0].find_class('swSprite') - if ratings: - ratings = self.rerat.findall(ratings[0].get('title')) - if len(ratings) == 2: - mi.rating = float(ratings[0])/float(ratings[1]) * 5 - return mi + pass - def fill_MI(self, entry, verbose): - try: - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print _('Failed to get all details for an entry') - print e - print _('URL who failed: %s') % x - report(verbose) - return None - mi = MetaInformation(title, authors) - mi.author_sort = authors_to_sort_string(authors) - try: - mi.comments = self.get_description(entry, verbose) - mi = 
self.get_book_info(entry, mi, verbose) - except: - pass - return mi + desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') + if desc: + desc = desc[0] + for c in desc.xpath('descendant::*[@class="seeAll" or' + ' @class="emptyClear" or @href]'): + c.getparent().remove(c) + desc = html.tostring(desc, method='html', encoding=unicode).strip() + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Collapse whitespace + #desc = re.sub('\n+', '\n', desc) + #desc = re.sub(' +', ' ', desc) + # Remove the notice about text referring to out of print editions + desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc) + # Remove comments + desc = re.sub(r'(?s)<!--.*?-->', '', desc) + mi.comments = sanitize_comments_html(desc) - def get_individual_metadata(self, url, br, verbose): - try: - raw = br.open_novisit(url).read() - except Exception, e: - import socket - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return None - attr = getattr(e, 'args', [None]) - attr = attr if attr else [None] - if isinstance(attr[0], socket.timeout): - raise AmazonError(_('Amazon timed out. 
Try again later.')) - raise AmazonError(_('Amazon encountered an error.')) - if '<title>404 - ' in raw: - report(verbose) - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - return soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - report(verbose) - return None + return True - def fetchdatathread(self, qbr, qsync, nb, url, verbose): - try: - browser = qbr.get(True) - entry = self.get_individual_metadata(url, browser, verbose) - except: - report(verbose) - entry = None - finally: - qbr.put(browser, True) - qsync.put((nb, entry), True) - - def producer(self, sync, urls, br, verbose=False): - for i in xrange(len(urls)): - thread = Thread(target=self.fetchdatathread, - args=(br, sync, i, urls[i], verbose)) - thread.start() - - def consumer(self, sync, syncbis, br, total_entries, verbose=False): - i=0 - self.extend([None]*total_entries) - while i < total_entries: - rq = sync.get(True) - nb = int(rq[0]) - entry = rq[1] - i+=1 - if entry is not None: - mi = self.fill_MI(entry, verbose) - if mi is not None: - mi.tags, atag = self.get_tags(entry, verbose) - self[nb] = mi - if atag: - thread = Thread(target=self.fetchdatathread, - args=(br, syncbis, nb, mi.tags, verbose)) - thread.start() - else: - syncbis.put((nb, None), True) - - def final(self, sync, total_entries, verbose): - i=0 - while i < total_entries: - rq = sync.get(True) - nb = int(rq[0]) - tags = rq[1] - i+=1 - if tags is not None: - self[nb].tags = self.get_tags(tags, verbose)[0] - - def populate(self, entries, ibr, verbose=False, brcall=3): - br = Queue(brcall) - cbr = Queue(brcall-1) - - syncp = Queue(1) - syncc = Queue(1) - - for i in xrange(brcall-1): - br.put(browser(), True) - cbr.put(browser(), True) - br.put(ibr, True) - - prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) - cons_thread = Thread(target=self.consumer, args=(syncp, syncc, 
cbr, len(entries), verbose)) - fin_thread = Thread(target=self.final, args=(syncc, len(entries), verbose)) - prod_thread.start() - cons_thread.start() - fin_thread.start() - prod_thread.join() - cons_thread.join() - fin_thread.join() - - -def search(title=None, author=None, publisher=None, isbn=None, - max_results=5, verbose=False, keywords=None, lang='all'): - br = browser() - entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher, - keywords=keywords, max_results=max_results,rlang=lang)(br, verbose) - - if entries is None or len(entries) == 0: - return None - - #List of entry - ans = ResultList(baseurl, lang) - ans.populate(entries, br, verbose) - return [x for x in ans if x is not None] - -def get_social_metadata(title, authors, publisher, isbn, verbose=False, - max_results=1, lang='all'): - mi = MetaInformation(title, authors) - if not isbn or not check_isbn(isbn): - return [mi] - - amazresults = search(isbn=isbn, verbose=verbose, - max_results=max_results, lang=lang) - if amazresults is None or amazresults[0] is None: - from calibre.ebooks.metadata.xisbn import xisbn - for i in xisbn.get_associated_isbns(isbn): - amazresults = search(isbn=i, verbose=verbose, - max_results=max_results, lang=lang) - if amazresults is not None and amazresults[0] is not None: - break - if amazresults is None or amazresults[0] is None: - return [mi] - - miaz = amazresults[0] - if miaz.rating is not None: - mi.rating = miaz.rating - if miaz.comments is not None: - mi.comments = miaz.comments - if miaz.tags is not None: - mi.tags = miaz.tags - return [mi] - -def option_parser(): - import textwrap - parser = OptionParser(textwrap.dedent(\ - _('''\ - %prog [options] - - Fetch book metadata from Amazon. You must specify one of title, author, - ISBN, publisher or keywords. Will fetch a maximum of 20 matches, - so you should make your query as specific as possible. 
- You can chose the language for metadata retrieval: - english & french & german - ''' - ))) - parser.add_option('-t', '--title', help=_('Book title')) - parser.add_option('-a', '--author', help=_('Book author(s)')) - parser.add_option('-p', '--publisher', help=_('Book publisher')) - parser.add_option('-i', '--isbn', help=_('Book ISBN')) - parser.add_option('-k', '--keywords', help=_('Keywords')) - parser.add_option('-s', '--social', default=0, action='count', - help=_('Get social data only')) - parser.add_option('-m', '--max-results', default=10, - help=_('Maximum number of results to fetch')) - parser.add_option('-l', '--lang', default='all', - help=_('Chosen language for metadata search (en, fr, de)')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Be more verbose about errors')) - return parser def main(args=sys.argv): import tempfile, os @@ -412,8 +214,3 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) - # import cProfile - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile")) - -# calibre-debug -e "D:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 5936222e24..978e460190 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -212,6 +212,27 @@ class MetadataSource(Plugin): # {{{ # }}} +class Amazon(MetadataSource): # {{{ + + name = 'Amazon' + metadata_type = 'social' + description = _('Downloads social metadata from amazon.com') + + has_html_comments = True + + def fetch(self): + if not self.isbn: + return + from calibre.ebooks.metadata.amazon import get_social_metadata + try: + self.results = 
get_social_metadata(self.title, self.book_author, + self.publisher, self.isbn) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} + class KentDistrictLibrary(MetadataSource): # {{{ name = 'Kent District Library' From 5dc5b93a1fa5a69143d59c7b423b64c1be4cf92f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 9 Mar 2011 22:22:42 +0100 Subject: [PATCH 127/132] Correction of space eating after unicode chars --- src/calibre/ebooks/rtf2xml/tokenize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 062a720d91..45a6e75ed6 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -46,7 +46,8 @@ class Tokenize: def __remove_uc_chars(self, startchar, token): for i in xrange(startchar, len(token)): - if token[i] == " ": + #handle the case of an uc char with a terminating blank before ansi char + if token[i] == " " and self.__uc_char: continue elif self.__uc_char: self.__uc_char -= 1 From 79a28bad795545a42196ceacddbd58566c12d52b Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Fri, 1 Apr 2011 23:10:09 +0200 Subject: [PATCH 128/132] Meta activation --- src/calibre/customize/builtins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ffac87c02e..8d91913b84 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -588,14 +588,14 @@ from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers # from calibre.ebooks.metadata.amazon import Amazon , AmazonSocial from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ - AmazonCovers, DoubanCovers, LibrarythingCovers + AmazonCovers, DoubanCovers #, LibrarythingCovers from calibre.library.catalog import CSV_XML, 
EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, #AmazonSocial, KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers, + Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, #LibrarythingCovers, NiceBooksCovers] plugins += [ ComicInput, From b68b82fc647b7ac9cc95a222f9cf65c690608fb6 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Tue, 3 May 2011 00:41:16 +0200 Subject: [PATCH 129/132] Correct and bug with multiple authors and convert html in comments to markdown text --- src/calibre/library/catalog.py | 17 ++++++++++------- src/calibre/utils/bibtex.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 717e8e2c6b..67f1c16d2d 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -8,6 +8,7 @@ from collections import namedtuple from copy import deepcopy from xml.sax.saxutils import escape from lxml import etree +from types import StringType, UnicodeType from calibre import prints, prepare_string_for_xml, strftime from calibre.constants import preferred_encoding, DEBUG @@ -15,13 +16,16 @@ from calibre.customize import CatalogPlugin from calibre.customize.conversion import OptionRecommendation, DummyReporter from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag, NavigableString from calibre.ebooks.chardet import substitute_entites +from calibre.library.save_to_disk import preprocess_template from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.utils.bibtex import BibTeX from calibre.utils.config import config_dir from calibre.utils.date import format_date, isoformat, is_date_undefined, now as nowf +from 
calibre.utils.html2text import html2text from calibre.utils.icu import capitalize from calibre.utils.logging import default_log as log -from calibre.utils.zipfile import ZipFile, ZipInfo from calibre.utils.magick.draw import thumbnail +from calibre.utils.zipfile import ZipFile, ZipInfo FIELDS = ['all', 'title', 'author_sort', 'authors', 'comments', 'cover', 'formats','id', 'isbn', 'ondevice', 'pubdate', 'publisher', @@ -303,12 +307,6 @@ class BIBTEX(CatalogPlugin): # {{{ def run(self, path_to_output, opts, db, notification=DummyReporter()): - from types import StringType, UnicodeType - - from calibre.library.save_to_disk import preprocess_template - #Bibtex functions - from calibre.utils.bibtex import BibTeX - def create_bibtex_entry(entry, fields, mode, template_citation, bibtexdict, citation_bibtex=True, calibre_files=True): @@ -365,6 +363,11 @@ class BIBTEX(CatalogPlugin): # {{{ #\n removal item = item.replace(u'\r\n',u' ') item = item.replace(u'\n',u' ') + #html to text + try: + item = html2text(item) + except: + log(" WARNING: error in converting comments to text") bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item)) elif field == 'isbn' : diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index d19a6b05fe..518ec96611 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -2905,4 +2905,4 @@ class BibTeX: def bibtex_author_format(self, item): #Format authors for Bibtex compliance (get a list as input) - return self.utf8ToBibtex(u' and'.join([author for author in item])) + return self.utf8ToBibtex(u' and '.join([author for author in item])) From 50df54efa63f27ff41559d7d47ff0dce0790564f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Wed, 11 May 2011 23:44:22 +0200 Subject: [PATCH 130/132] Color None mistingly translated to true instead of 0 --- src/calibre/ebooks/rtf/input.py | 2 +- src/calibre/ebooks/rtf2xml/process_tokens.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 
deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 2f8c11fd50..be032f0598 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -87,7 +87,7 @@ class RTFInput(InputFormatPlugin): indent_out = 1 self.log('Running RTFParser in debug mode') except: - pass + self.log.warn('Impossible to run RTFParser in debug mode') parser = ParseRtf( in_file = stream, out_file = ofile, diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 010d374cbc..7dc88e7f2b 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -197,8 +197,8 @@ class ProcessTokens: # character info => ci 'b' : ('ci', 'bold______', self.bool_st_func), 'blue' : ('ci', 'blue______', self.color_func), - 'caps' : ('ci', 'caps______', self.bool_st_func), - 'cf' : ('ci', 'font-color', self.default_func), + 'caps' : ('ci', 'caps______', self.bool_st_func), + 'cf' : ('ci', 'font-color', self.colorz_func), 'chftn' : ('ci', 'footnot-mk', self.bool_st_func), 'dn' : ('ci', 'font-down_', self.divide_by_2), 'embo' : ('ci', 'emboss____', self.bool_st_func), @@ -624,6 +624,11 @@ class ProcessTokens: num = 'true' return 'cw<%s<%s<nu<%s\n' % (pre, token, num) + def colorz_func(self, pre, token, num): + if num is None: + num = '0' + return 'cw<%s<%s<nu<%s\n' % (pre, token, num) + def __list_type_func(self, pre, token, num): type = 'arabic' if num is None: From c8e820760a17847edbd0f8e11f9a20d4c251b9be Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 10 Sep 2011 11:27:27 +0200 Subject: [PATCH 131/132] Fix RTFinput not handling underlined text --- resources/templates/rtf.xsl | 2 +- src/calibre/ebooks/rtf/input.py | 5 ++- src/calibre/ebooks/rtf2xml/configure_txt.py | 2 +- src/calibre/ebooks/rtf2xml/inline.py | 4 +- src/calibre/ebooks/rtf2xml/process_tokens.py | 41 ++++++++++---------- src/calibre/ebooks/rtf2xml/sections.py | 2 
+- src/calibre/ebooks/rtf2xml/styles.py | 2 - 7 files changed, 30 insertions(+), 28 deletions(-) diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index 58536186d9..9dba87e83a 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -256,7 +256,7 @@ <xsl:value-of select="'italic'"/> <xsl:text>;</xsl:text> </xsl:if> - <xsl:if test="@underline and @underline != 'false'"> + <xsl:if test="@underlined and @underlined != 'false'"> <xsl:text>text-decoration:underline</xsl:text> <xsl:text>;</xsl:text> </xsl:if> diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index be032f0598..c1e649851b 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -41,7 +41,7 @@ border_style_map = { class InlineClass(etree.XSLTExtension): - FMTS = ('italics', 'bold', 'underlined', 'strike-through', 'small-caps') + FMTS = ('italics', 'bold', 'strike-through', 'small-caps') def __init__(self, log): etree.XSLTExtension.__init__(self) @@ -54,6 +54,9 @@ class InlineClass(etree.XSLTExtension): for x in self.FMTS: if input_node.get(x, None) == 'true': classes.append(x) + #underlined is special + if input_node.get('underlined', 'false') != 'false': + classes.append('underlined') fs = input_node.get('font-size', False) if fs: if fs not in self.font_sizes: diff --git a/src/calibre/ebooks/rtf2xml/configure_txt.py b/src/calibre/ebooks/rtf2xml/configure_txt.py index cd4c2558b7..27f06d0d19 100755 --- a/src/calibre/ebooks/rtf2xml/configure_txt.py +++ b/src/calibre/ebooks/rtf2xml/configure_txt.py @@ -25,7 +25,7 @@ class Configure: if self.__show_config_file and self.__configuration_file: sys.stderr.write('configuration file is "%s"\n' % self.__configuration_file) if self.__show_config_file and not self.__configuration_file: - sys.stderr.write('No configuraiton file found; using default vaules\n') + sys.stderr.write('No configuraiton file found; using default values\n') if self.__configuration_file: read_obj = 
open(self.__configuration_file, 'r') line_to_read = 1 diff --git a/src/calibre/ebooks/rtf2xml/inline.py b/src/calibre/ebooks/rtf2xml/inline.py index 7eda0ce429..2d73db9071 100755 --- a/src/calibre/ebooks/rtf2xml/inline.py +++ b/src/calibre/ebooks/rtf2xml/inline.py @@ -411,11 +411,11 @@ class Inline: self.__set_list_func(line) action = self.__state_dict.get(self.__state) if action is None: - sys.stderr.write('No matching state in module inline_for_lists.py\n') + sys.stderr.write('No matching state in module inline.py\n') sys.stderr.write(self.__state + '\n') action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "inline.data") copy_obj.rename(self.__write_to, self.__file) - os.remove(self.__write_to) + os.remove(self.__write_to) \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 7dd602ff46..5e4017c000 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -214,7 +214,27 @@ class ProcessTokens: 'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func), 'up' : ('ci', 'font-up___', self.divide_by_2), 'v' : ('ci', 'hidden____', self.default_func), - # table => tb + # underline + # can't see why it isn't a char info: 'ul'=>'ci' + 'ul' : ('ci', 'underlined<continous', self.two_part_func), + 'uld' : ('ci', 'underlined<dotted', self.two_part_func), + 'uldash' : ('ci', 'underlined<dash', self.two_part_func), + 'uldashd' : ('ci', 'underlined<dash-dot', self.two_part_func), + 'uldashdd' : ('ci', 'underlined<dash-dot-dot', self.two_part_func), + 'uldb' : ('ci', 'underlined<double', self.two_part_func), + 'ulhwave' : ('ci', 'underlined<heavy-wave', self.two_part_func), + 'ulldash' : ('ci', 'underlined<long-dash', self.two_part_func), + 'ulth' : ('ci', 'underlined<thich', self.two_part_func), + 'ulthd' : ('ci', 'underlined<thick-dotted', self.two_part_func), + 'ulthdash' 
: ('ci', 'underlined<thick-dash', self.two_part_func), + 'ulthdashd' : ('ci', 'underlined<thick-dash-dot', self.two_part_func), + 'ulthdashdd' : ('ci', 'underlined<thick-dash-dot-dot', self.two_part_func), + 'ulthldash' : ('ci', 'underlined<thick-long-dash', self.two_part_func), + 'ululdbwave' : ('ci', 'underlined<double-wave', self.two_part_func), + 'ulw' : ('ci', 'underlined<word', self.two_part_func), + 'ulwave' : ('ci', 'underlined<wave', self.two_part_func), + 'ulnone' : ('ci', 'underlined<false', self.two_part_func), + # table => tb 'trowd' : ('tb', 'row-def___', self.default_func), 'cell' : ('tb', 'cell______', self.default_func), 'row' : ('tb', 'row_______', self.default_func), @@ -274,25 +294,6 @@ class ProcessTokens: 'paperh' : ('pa', 'paper-hght', self.divide_by_20), # annotation => an 'annotation' : ('an', 'annotation', self.default_func), - # underline - 'ul' : ('ul', 'underlined<continous', self.two_part_func), - 'uld' : ('ul', 'underlined<dotted', self.two_part_func), - 'uldash' : ('ul', 'underlined<dash', self.two_part_func), - 'uldashd' : ('ul', 'underlined<dash-dot', self.two_part_func), - 'uldashdd' : ('ul', 'underlined<dash-dot-dot', self.two_part_func), - 'uldb' : ('ul', 'underlined<double', self.two_part_func), - 'ulhwave' : ('ul', 'underlined<heavy-wave', self.two_part_func), - 'ulldash' : ('ul', 'underlined<long-dash', self.two_part_func), - 'ulth' : ('ul', 'underlined<thich', self.two_part_func), - 'ulthd' : ('ul', 'underlined<thick-dotted', self.two_part_func), - 'ulthdash' : ('ul', 'underlined<thick-dash', self.two_part_func), - 'ulthdashd' : ('ul', 'underlined<thick-dash-dot', self.two_part_func), - 'ulthdashdd' : ('ul', 'underlined<thick-dash-dot-dot', self.two_part_func), - 'ulthldash' : ('ul', 'underlined<thick-long-dash', self.two_part_func), - 'ululdbwave' : ('ul', 'underlined<double-wave', self.two_part_func), - 'ulw' : ('ul', 'underlined<word', self.two_part_func), - 'ulwave' : ('ul', 'underlined<wave', self.two_part_func), - 
'ulnone' : ('ul', 'underlined<false', self.two_part_func), # border => bd 'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func), 'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func), diff --git a/src/calibre/ebooks/rtf2xml/sections.py b/src/calibre/ebooks/rtf2xml/sections.py index 13bf2c2ddc..a315729525 100755 --- a/src/calibre/ebooks/rtf2xml/sections.py +++ b/src/calibre/ebooks/rtf2xml/sections.py @@ -496,7 +496,7 @@ Instead, ingore all section information in a field-block. self.__token_info = line[:16] action = self.__state_dict.get(self.__state) if action == None: - sys.stderr.write('no no matching state in module sections.py\n') + sys.stderr.write('no matching state in module sections.py\n') sys.stderr.write(self.__state + '\n') action(line) read_obj.close() diff --git a/src/calibre/ebooks/rtf2xml/styles.py b/src/calibre/ebooks/rtf2xml/styles.py index 55f86e4208..7fcbfb24a3 100755 --- a/src/calibre/ebooks/rtf2xml/styles.py +++ b/src/calibre/ebooks/rtf2xml/styles.py @@ -103,8 +103,6 @@ class Styles: 'sect-note_' : 'endnotes-in-section', # list=> ls 'list-text_' : 'list-text', - # this line must be wrong because it duplicates an earlier one - 'list-text_' : 'list-text', 'list______' : 'list', 'list-lev-d' : 'list-level-definition', 'list-cardi' : 'list-cardinal-numbering', From 6328279091f9d40862d0caf07ff777b73c0966b8 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 10 Sep 2011 12:06:50 +0200 Subject: [PATCH 132/132] Add link anchor for internal bookmarks --- resources/templates/rtf.xsl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index 9dba87e83a..0f91d7f4ac 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -98,7 +98,7 @@ <xsl:apply-templates/> </emph> </xsl:when> - <xsl:when test = "@underlined"> + <xsl:when test = "@underlined and @underlined != 'false'"> <emph rend = "paragraph-emph-underlined"> <xsl:apply-templates/> 
</emph> @@ -451,6 +451,15 @@ <xsl:apply-templates/> </xsl:element> </xsl:template> + + <xsl:template match = "rtf:field[@type='bookmark-start']"> + <xsl:element name ="a"> + <xsl:attribute name = "id"> + <xsl:value-of select = "@number"/> + </xsl:attribute> + <xsl:apply-templates/> + </xsl:element> + </xsl:template> <xsl:template match = "rtf:field"> <xsl:apply-templates/>