From 2f5f2a9d335a77abaa97fe34ef86592c3acab5e3 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 26 Jul 2010 22:43:11 +0200 Subject: [PATCH 01/41] Bug correction: negative values of first line indent where converted to positive values causing a lot of formatting problems --- src/calibre/ebooks/rtf2xml/process_tokens.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 19a7d38135..9cb7c3c6a4 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -680,7 +680,7 @@ class ProcessTokens: return the_string def divide_num(self, numerator, denominator): try: - numerator = float(re.search('[0-9.]+', numerator).group()) + numerator = float(re.search('[0-9.\-]+', numerator).group()) #calibre why ignore negative number? Wrong in case of \fi except TypeError, msg: if self.__run_level > 3: msg = 'no number to process?\n' From a2702d99c29c2a2eb86c1f957141544f2e11399b Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 27 Jul 2010 19:33:12 +0200 Subject: [PATCH 02/41] Formatting --- resources/templates/rtf.xsl | 4 ---- src/calibre/ebooks/rtf/input.py | 7 ------- 2 files changed, 11 deletions(-) diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index bf016efaaf..ae054186d4 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -81,7 +81,6 @@ - @@ -182,14 +181,12 @@ - - unnamed @@ -386,7 +383,6 @@ - true true false diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 50f5571d58..df74a7b3cb 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -141,7 +141,6 @@ class RTFInput(InputFormatPlugin): return name - def write_inline_css(self, ic): font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in enumerate(ic.font_sizes)] @@ -152,17 +151,11 @@ class RTFInput(InputFormatPlugin): text-decoration: none; 
font-weight: normal; font-style: normal; font-variant: normal } - span.italics { font-style: italic } - span.bold { font-weight: bold } - span.small-caps { font-variant: small-caps } - span.underlined { text-decoration: underline } - span.strike-through { text-decoration: line-through } - ''') css += '\n'+'\n'.join(font_size_classes) css += '\n' +'\n'.join(color_classes) From 3cf9f7986a174a4404764790800272f2ecdf787d Mon Sep 17 00:00:00 2001 From: Sengian Date: Wed, 28 Jul 2010 00:47:31 +0200 Subject: [PATCH 03/41] Implementation of a multiple replace class based on Dict substitutions. Very fast for large dictionnaries. --- src/calibre/utils/mreplace.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 src/calibre/utils/mreplace.py diff --git a/src/calibre/utils/mreplace.py b/src/calibre/utils/mreplace.py new file mode 100644 index 0000000000..dff5fab578 --- /dev/null +++ b/src/calibre/utils/mreplace.py @@ -0,0 +1,32 @@ +#multiple replace from dictionnary : http://code.activestate.com/recipes/81330/ +__license__ = 'GPL v3' +__copyright__ = '2010, sengian ' +__docformat__ = 'restructuredtext en' + +import re +from UserDict import UserDict + +class MReplace(UserDict): + def __init__(self, dict = None): + UserDict.__init__(self, dict) + self.re = None + self.regex = None + self.compile_regex() + + def compile_regex(self): + if len(self.data) > 0: + keys = sorted(self.data.keys(), key=len) + keys.reverse() + tmp = "(%s)" % "|".join([re.escape(item) for item in keys]) + if self.re != tmp: + self.re = tmp + self.regex = re.compile(self.re) + + def __call__(self, mo): + return self[mo.string[mo.start():mo.end()]] + + def mreplace(self, text): + #Replace without regex compile + if len(self.data) < 1 or self.re is None: + return text + return self.regex.sub(self, text) \ No newline at end of file From 7ebf416513125cee88fc487aa3306a25e4ac6681 Mon Sep 17 00:00:00 2001 From: Sengian Date: Wed, 28 Jul 2010 00:49:37 +0200 Subject: [PATCH 
04/41] Modifications of BIBTEX catalog generation: create a class for bibtex fonctions, use the new Mreplace fonction as the dictionnary is very large. Divide by 10 the total execution time. --- src/calibre/library/catalog.py | 41 ++++++----- src/calibre/utils/bibtex.py | 125 ++++++++++++++++----------------- 2 files changed, 85 insertions(+), 81 deletions(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index a540a8a660..5ee0683b87 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -270,10 +270,10 @@ class BIBTEX(CatalogPlugin): from calibre.library.save_to_disk import preprocess_template #Bibtex functions - from calibre.utils.bibtex import bibtex_author_format, utf8ToBibtex, ValidateCitationKey + from calibre.utils.bibtex import BibTeX def create_bibtex_entry(entry, fields, mode, template_citation, - asccii_bibtex = True, citation_bibtex = True): + bibtexdict, citation_bibtex = True): #Bibtex doesn't like UTF-8 but keep unicode until writing #Define starting chain or if book valid strict and not book return a Fail string @@ -289,7 +289,8 @@ class BIBTEX(CatalogPlugin): if citation_bibtex : # Citation tag - bibtex_entry.append(make_bibtex_citation(entry, template_citation, asccii_bibtex)) + bibtex_entry.append(make_bibtex_citation(entry, template_citation, + bibtexdict)) bibtex_entry = [u' '.join(bibtex_entry)] for field in fields: @@ -304,11 +305,11 @@ class BIBTEX(CatalogPlugin): pass if field == 'authors' : - bibtex_entry.append(u'author = "%s"' % bibtex_author_format(item)) + bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item)) elif field in ['title', 'publisher', 'cover', 'uuid', 'author_sort', 'series'] : - bibtex_entry.append(u'%s = "%s"' % (field, utf8ToBibtex(item, asccii_bibtex))) + bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item))) elif field == 'id' : bibtex_entry.append(u'calibreid = "%s"' % int(item)) @@ -321,13 +322,13 @@ class 
BIBTEX(CatalogPlugin): elif field == 'tags' : #A list to flatten - bibtex_entry.append(u'tags = "%s"' % utf8ToBibtex(u', '.join(item), asccii_bibtex)) + bibtex_entry.append(u'tags = "%s"' % bibtexdict.utf8ToBibtex(u', '.join(item))) elif field == 'comments' : #\n removal item = item.replace(u'\r\n',u' ') item = item.replace(u'\n',u' ') - bibtex_entry.append(u'note = "%s"' % utf8ToBibtex(item, asccii_bibtex)) + bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item)) elif field == 'isbn' : # Could be 9, 10 or 13 digits @@ -345,8 +346,7 @@ class BIBTEX(CatalogPlugin): elif field == 'pubdate' : bibtex_entry.append(u'year = "%s"' % item.year) - bibtex_entry.append(u'month = "%s"' % utf8ToBibtex(strftime("%b", item), - asccii_bibtex)) + bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item))) bibtex_entry = u',\n '.join(bibtex_entry) bibtex_entry += u' }\n\n' @@ -363,7 +363,7 @@ class BIBTEX(CatalogPlugin): else : return True - def make_bibtex_citation(entry, template_citation, asccii_bibtex): + def make_bibtex_citation(entry, template_citation, bibtexclass): #define a function to replace the template entry by its value def tpl_replace(objtplname) : @@ -384,8 +384,9 @@ class BIBTEX(CatalogPlugin): return u'' if len(template_citation) >0 : - tpl_citation = utf8ToBibtex(ValidateCitationKey(re.sub(u'\{[^{}]*\}', - tpl_replace, template_citation)), asccii_bibtex) + tpl_citation = bibtexclass.utf8ToBibtex( + bibtexclass.ValidateCitationKey(re.sub(u'\{[^{}]*\}', + tpl_replace, template_citation))) if len(tpl_citation) >0 : return tpl_citation @@ -397,9 +398,9 @@ class BIBTEX(CatalogPlugin): template_citation = u'%s' % str(entry["id"]) if asccii_bibtex : - return ValidateCitationKey(template_citation.encode('ascii', 'replace')) + return bibtexclass.ValidateCitationKey(template_citation.encode('ascii', 'replace')) else : - return ValidateCitationKey(template_citation) + return bibtexclass.ValidateCitationKey(template_citation) self.fmt = 
path_to_output.rpartition('.')[2] self.notification = notification @@ -467,13 +468,16 @@ class BIBTEX(CatalogPlugin): if not len(data): log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text) + #Initialize BibTeX class + bibtexc = BibTeX() + #Entries writing after Bibtex formating (or not) if bibfile_enc != 'ascii' : - asccii_bibtex = False + bibtexc.ascii_bibtex = False else : - asccii_bibtex = True + bibtexc.ascii_bibtex = True - #Check and go to default in case of bad CLI + #Check citation choice and go to default in case of bad CLI if isinstance(opts.impcit, (StringType, UnicodeType)) : if opts.impcit == 'False' : citation_bibtex= False @@ -485,6 +489,7 @@ class BIBTEX(CatalogPlugin): else : citation_bibtex= opts.impcit + #Preprocess for error and light correction template_citation = preprocess_template(opts.bib_cit) #Open output and write entries @@ -506,7 +511,7 @@ class BIBTEX(CatalogPlugin): for entry in data: outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation, - asccii_bibtex, citation_bibtex)) + bibtexc, citation_bibtex)) outfile.close() diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index f6e596e8f0..5b9193d16d 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """ Collection of python utility-methodes commonly used by other bibliograph packages. From http://pypi.python.org/pypi/bibliograph.core/ @@ -62,10 +60,14 @@ DAMAGE. """ -__docformat__ = 'reStructuredText' __author__ = 'sengian ' +__docformat__ = 'restructuredtext en' import re, string +from UserDict import UserDict + +from calibre.constants import preferred_encoding +from calibre.utils.mreplace import MReplace utf8enc2latex_mapping = { # This is a mapping of Unicode characters to LaTeX equivalents. 
@@ -2842,69 +2844,66 @@ entity_mapping = { '"':'{"}', } -def ValidateCitationKey(text): - """ - removes characters not allowed in BibTeX keys +class BibTeX: + def __init__(self): + self.rep_utf8 = MReplace(utf8enc2latex_mapping) + self.rep_ent = MReplace(entity_mapping) + #Set default conversion to ASCII BibTeX + self.ascii_bibtex = True + # This substitution is based on the description of cite key restrictions at + # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html + self.invalid_cit = re.compile(u'[ "@\',\\#}{~%&$^]') + self.upper = re.compile(u'[' + + string.uppercase.decode(preferred_encoding) + u']') + self.escape = re.compile(u'[~#&%_]') + + def ValidateCitationKey(self, text): + """ + removes characters not allowed in BibTeX keys + >>> ValidateCitationKey(DummyEntry('my@id')) + 'myid' + """ + return self.invalid_cit.sub(u'', text) - >>> from bibliograph.core.utils import _validKey - >>> _validKey(DummyEntry('Foo Bar')) - 'FooBar' + def braceUppercase(self, text): + """ Convert uppercase letters to bibtex encoded uppercase + >>> braceUppercase('Foo Bar') + '{F}oo {B}ar' + """ + return self.upper.sub(lambda m: u'{%s}' % m.group(), text) - >>> _validKey(DummyEntry('my@id')) - 'myid' + def resolveEntities(self, text): + #for entity, entity_map in entity_mapping.iteritems(): + # text = text.replace(entity, entity_map) + #return text + return self.rep_ent.mreplace(text) - """ - # This substitution is based on the description of cite key restrictions at - # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html - return re.sub(u'[ "@\',\\#}{~%&$^]', u'', text) + def resolveUnicode(self, text): + #UTF-8 text as entry + #for unichar, latexenc in utf8enc2latex_mapping.iteritems() : + # text = text.replace(unichar, latexenc) + text = self.rep_utf8.mreplace(text) + return text.replace(u'$}{$', u'') -def BraceUppercase(text): - """ Convert uppercase letters to bibtex encoded uppercase + def escapeSpecialCharacters(self, text): + """ + latex escaping some 
(not all) special characters + """ + text.replace('\\', '\\\\') + return self.escape.sub(lambda m: u'\\%s' % m.group(), text) - >>> from bibliograph.core.utils import _braceUppercase - >>> _braceUppercase('foo bar') - 'foo bar' + #Calibre functions + #Option to go to official ASCII Bibtex or unofficial UTF-8 + #Go from an unicode entry to ASCII Bibtex format without encoding + def utf8ToBibtex(self, text): + if len(text) == 0: + return '' + text.replace('\\', '\\\\') + text = self.resolveEntities(text) + if self.ascii_bibtex : + text = self.resolveUnicode(text) + return self.escapeSpecialCharacters(text) - >>> _braceUppercase('Foo Bar') - '{F}oo {B}ar' - """ - for uc in string.uppercase: - text = text.replace(uc, u'{%s}' % uc) - return text - -def resolveEntities(text): - for entity, entity_map in entity_mapping.iteritems(): - text = text.replace(entity, entity_map) - return text - -def resolveUnicode(text): - #UTF-8 text as entry - for unichar, latexenc in utf8enc2latex_mapping.iteritems() : - text = text.replace(unichar, latexenc) - return text.replace(u'$}{$', u'') - -def escapeSpecialCharacters(text): - """ - latex escaping some (not all) special characters - """ - text.replace('\\', '\\\\') - escape = ['~', '#', '&', '%', '_'] - for c in escape: - text = text.replace(c, '\\' + c ) - return text - -#Calibre functions -#Go from an unicode entry to ASCII Bibtex format without encoding -#Option to go to official ASCII Bibtex or unofficial UTF-8 -def utf8ToBibtex(text, asccii_bibtex = True): - if len(text) == 0: - return '' - text.replace('\\', '\\\\') - text = resolveEntities(text) - if asccii_bibtex : - text = resolveUnicode(text) - return escapeSpecialCharacters(text) - -def bibtex_author_format(item): - #Format authors for Bibtex compliance (get a list as input) - return utf8ToBibtex(u' and'.join([author for author in item])) + def bibtex_author_format(self, item): + #Format authors for Bibtex compliance (get a list as input) + return self.utf8ToBibtex(u' 
and'.join([author for author in item])) From 8512f57866262b66f4cd542ac96cccf2b9c05737 Mon Sep 17 00:00:00 2001 From: Sengian Date: Wed, 28 Jul 2010 23:08:02 +0200 Subject: [PATCH 05/41] Check if RTF is asccii early. Will be effactive after preprocess integration in rtf2xml. --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 7b89407f79..f494b7a9c1 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -17,7 +17,8 @@ ######################################################################### # $Revision: 1.41 $ # $Date: 2006/03/24 23:50:07 $ -import sys,os +import sys, os, codecs + from calibre.ebooks.rtf2xml import headings_to_sections, \ line_endings, footnote, fields_small, default_encoding, \ make_lists, preamble_div, header, colors, group_borders, \ @@ -90,7 +91,6 @@ class ParseRtf: out_file = '', out_dir = None, dtd = '', - #debug = 0, #why? calibre deb_dir = None, convert_symbol = None, convert_wingdings = None, @@ -107,6 +107,7 @@ class ParseRtf: no_dtd = 0, char_data = '', ): + """ Requires: 'file' --file to parse @@ -125,14 +126,16 @@ class ParseRtf: through a file. Only for debugging. 
Returns: Nothing """ + self.__file = in_file self.__out_file = out_file self.__out_dir = out_dir self.__temp_dir = out_dir self.__dtd_path = dtd self.__check_file(in_file,"file_to_parse") + self.__check_ascii(in_file) self.__char_data = char_data - self.__debug_dir = deb_dir #self.__debug_dir = debug calibre + self.__debug_dir = deb_dir self.__check_dir(self.__temp_dir) self.__copy = self.__check_dir(self.__debug_dir) self.__convert_caps = convert_caps @@ -149,19 +152,17 @@ class ParseRtf: self.__group_borders = group_borders self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd - def __check_file(self, the_file, type): """Check to see if files exist""" if hasattr(the_file, 'read'): return if the_file == None: if type == "file_to_parse": - message = "You must provide a file for the script to work" - msg = message + msg = "\nYou must provide a file for the script to work" raise RtfInvalidCodeException, msg elif os.path.exists(the_file): pass # do nothing else: - message = "The file '%s' cannot be found" % the_file + message = "\nThe file '%s' cannot be found" % the_file msg = message raise RtfInvalidCodeException, msg def __check_dir(self, the_dir): @@ -170,7 +171,16 @@ class ParseRtf: return dir_exists = os.path.isdir(the_dir) if not dir_exists: - message = "%s is not a directory" % the_dir + msg = "\n%s is not a directory" % the_dir + raise RtfInvalidCodeException, msg + return 1 + def __check_ascii(self, the_file): + """Check to see if the file is correct ascii""" + try: + test = codecs.open(the_file, 'r', 'ascii', 'strict') + test.close() + except UnicodeError: + message= "\n%s is not a correct ascii file" % the_file msg = message raise RtfInvalidCodeException, msg return 1 From 09c8f13a1f17c869d06ace0d6cf76f0ff9b3fdc7 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 31 Jul 2010 10:47:12 +0200 Subject: [PATCH 06/41] Global overhaul of rtf2xml : RTF fixes (1) --- src/calibre/ebooks/rtf/input.py | 1 + src/calibre/ebooks/rtf2xml/ParseRtf.py | 53 
++++++++------------ src/calibre/ebooks/rtf2xml/check_brackets.py | 10 ++-- src/calibre/ebooks/rtf2xml/line_endings.py | 52 ++++++++----------- src/calibre/ebooks/rtf2xml/process_tokens.py | 2 - src/calibre/ebooks/rtf2xml/tokenize.py | 6 +-- src/calibre/ebooks/txt/processor.py | 5 +- 7 files changed, 52 insertions(+), 77 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index df74a7b3cb..2622d82d99 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -50,6 +50,7 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, + #deb_dir = 'I:\\Calibre\\rtfdebug', # Convert symbol fonts to unicode equivalents. Default # is 1 convert_symbol = 1, diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index f494b7a9c1..3a804792c5 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -143,7 +143,7 @@ class ParseRtf: self.__convert_wingdings = convert_wingdings self.__convert_zapf = convert_zapf self.__run_level = run_level - self.__exit_level = 0 + #self.__exit_level = 0 self.__indent = indent self.__replace_illegals = replace_illegals self.__form_lists = form_lists @@ -162,8 +162,7 @@ class ParseRtf: elif os.path.exists(the_file): pass # do nothing else: - message = "\nThe file '%s' cannot be found" % the_file - msg = message + msg = "\nThe file '%s' cannot be found" % the_file raise RtfInvalidCodeException, msg def __check_dir(self, the_dir): """Check to see if directory exists""" @@ -180,8 +179,7 @@ class ParseRtf: test = codecs.open(the_file, 'r', 'ascii', 'strict') test.close() except UnicodeError: - message= "\n%s is not a correct ascii file" % the_file - msg = message + msg = "\n%s is not a correct ascii file" % the_file raise RtfInvalidCodeException, msg return 1 def parse_rtf(self): @@ -204,27 +202,29 @@ class ParseRtf: copy_obj.set_dir(self.__debug_dir) copy_obj.remove_files() 
copy_obj.copy_file(self.__temp_file, "original_file") - # new as of 2005-08-02. Do I want this? + # Function to check if bracket are well handled if self.__debug_dir or self.__run_level > 2: self.__check_brack_obj = check_brackets.CheckBrackets\ (file = self.__temp_file, bug_handler = RtfInvalidCodeException, ) - # convert Macintosh line endings to Unix line endings + # convert Macintosh and Windows line endings to Unix line endings + #why do this if you don't wb after? line_obj = line_endings.FixLineEndings( in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, copy = self.__copy, - run_level = self.__run_level, + #run_level = self.__run_level, replace_illegals = self.__replace_illegals, ) - return_value = line_obj.fix_endings() - self.__return_code(return_value) + line_obj.fix_endings() + #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it? + #self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, in_file = self.__temp_file, - copy = self.__copy, - run_level = self.__run_level,) + copy = self.__copy,) + #run_level = self.__run_level,) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file = self.__temp_file, @@ -529,36 +529,27 @@ class ParseRtf: ) output_obj.output() os.remove(self.__temp_file) - return self.__exit_level + #return self.__exit_level def __bracket_match(self, file_name): if self.__run_level > 2: good_br, msg = self.__check_brack_obj.check_brackets() if good_br: pass - # sys.stderr.write( msg + ' in ' + file_name + "\n") + #sys.stderr.write( msg + ' in ' + file_name + "\n") else: msg += msg + " in file '" + file_name + "'\n" raise RtfInvalidCodeException, msg - def __return_code(self, num): - if num == None: - return - if int(num) > self.__exit_level: - self.__exit_level = num + #def __return_code(self, num): calibre not used + # if num == None: + # return + # if int(num) > self.__exit_level: + # self.__exit_level = num 
def __make_temp_file(self,file): """Make a temporary file to parse""" write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') write_obj = open(write_file, 'w') - line = "dummy" - while line: - line = read_obj.read(1000) - write_obj.write(line ) + for line in read_obj: + write_obj.write(line) write_obj.close() - return write_file - """ -mi1\n -mi33\n -mi 0: length_byte = len(txt.encode('utf-8')) From 3405615e54da2f2aa7345d1f51525acd250cbd91 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 31 Jul 2010 13:15:47 +0200 Subject: [PATCH 07/41] Remove invalid ASCII characters from plain text files --- src/calibre/ebooks/txt/input.py | 3 ++- src/calibre/ebooks/txt/processor.py | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index b444bf1cf4..935a187d5d 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -57,6 +57,7 @@ class TXTInput(InputFormatPlugin): txt = preserve_spaces(txt) txt = _ent_pat.sub(xml_entity_to_unicode, txt) + txt = txt.encode('utf-8') if options.markdown: log.debug('Running text though markdown conversion...') @@ -79,7 +80,7 @@ class TXTInput(InputFormatPlugin): base = os.path.dirname(stream.name) htmlfile = open(os.path.join(base, 'temp_calibre_txt_input_to_html.html'), 'wb') - htmlfile.write(html.encode('utf-8')) + htmlfile.write(html) #html.encode('utf-8') htmlfile.close() cwd = os.getcwdu() odi = options.debug_pipeline diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 91c274a7b1..6bd635b6df 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -19,7 +19,7 @@ HTML_TEMPLATE = u' ] - + @@ -294,7 +294,7 @@ - + From 1f237c99bfe5bb875f4dc384b4b80938967d7ae9 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 31 Jul 2010 20:01:54 +0200 Subject: [PATCH 10/41] Change in the convert to bibtex reference 
for euro symbol --- src/calibre/utils/bibtex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/utils/bibtex.py b/src/calibre/utils/bibtex.py index 5b9193d16d..09868ccdb1 100644 --- a/src/calibre/utils/bibtex.py +++ b/src/calibre/utils/bibtex.py @@ -80,7 +80,7 @@ utf8enc2latex_mapping = { #Fix some encoding problem between cp1252 and latin1 # from http://www.microsoft.com/typography/unicode/1252.htm - u'\x80': '{\\mbox{\\texteuro}}', # EURO SIGN + u'\x80': '{\\texteuro}', # EURO SIGN u'\x82': '{,}', # SINGLE LOW-9 QUOTATION MARK u'\x83': '$f$', # LATIN SMALL LETTER F WITH HOOK u'\x84': '{,,}', # DOUBLE LOW-9 QUOTATION MARK @@ -746,7 +746,7 @@ utf8enc2latex_mapping = { u'\u205f': '{\\mkern4mu}', u'\u2060': '{\\nolinebreak}', u'\u20a7': '{\\ensuremath{\\Elzpes}}', - u'\u20ac': '{\\mbox{\\texteuro}}', + u'\u20ac': '{\\texteuro}', u'\u20db': '$\\dddot$', u'\u20dc': '$\\ddddot$', u'\u2102': '$\\mathbb{C}$', From 2eb20249319e551f41d4d721c831e3e64abaf72c Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 10 Aug 2010 12:38:59 +0200 Subject: [PATCH 11/41] Merge from trunk --- resources/catalog/stylesheet.css | 142 +++---- resources/content_server/gui.css | 163 ++++---- resources/content_server/index.html | 103 ++--- resources/content_server/mobile.css | 91 ++--- resources/templates/html.css | 361 ++++++++--------- setup/installer/windows/en-us.xml | 19 +- setup/installer/windows/wix-template.xml | 267 ++++++------- src/calibre/ebooks/lrf/html/demo/demo.html | 440 +++++++++++++-------- src/calibre/manual/templates/layout.html | 24 +- src/calibre/manual/xpath.xhtml | 30 +- 10 files changed, 871 insertions(+), 769 deletions(-) diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index 4f9ca9ac41..ea01aeb43b 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -1,102 +1,104 @@ -body { background-color: white; } +body { + background-color: white; +} -p.title { - margin-top:0em; - 
margin-bottom:1em; - text-align:center; - font-style:italic; - font-size:xx-large; - border-bottom: solid black 4px; - } +p.title { + margin-top: 0em; + margin-bottom: 1em; + text-align: center; + font-style: italic; + font-size: xx-large; + border-bottom: solid black 4px; +} p.author { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:large; - } + font-size: large; +} p.tags { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:small; - } + font-size: small; +} p.description { - text-align:left; - font-style:normal; + text-align: left; + font-style: normal; margin-top: 0em; - } +} p.date_index { - font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.letter_index { - font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.author_index { - font-size:large; - text-align:left; - margin-top:0px; - margin-bottom:0px; + font-size: large; + text-align: left; + margin-top: 0px; + margin-bottom: 0px; text-indent: 0em; - } +} p.series { text-align: left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.read_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.unread_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + 
margin-left: 2em; + text-indent: -2em; +} p.date_read { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:6em; - text-indent:-6em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 6em; + text-indent: -6em; +} hr.series_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} hr.annotations_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} \ No newline at end of file diff --git a/resources/content_server/gui.css b/resources/content_server/gui.css index 1bcc4e1eb0..d7a3eda51e 100644 --- a/resources/content_server/gui.css +++ b/resources/content_server/gui.css @@ -1,142 +1,157 @@ body { - background-color: white; + background-color: white; } #banner { - position: absolute; - left: 5px; top: 0px; + position: absolute; + left: 5px; + top: 0px; } /* Search bar */ #search_box { - width: 201px; - height: 31px; - background: url(bg_search_box.png); - top: 5px; right: 20px; - position: absolute; + width: 201px; + height: 31px; + background: url(bg_search_box.png); + top: 5px; + right: 20px; + position: absolute; } + #search_box #s { - float: left; - padding: 0; - margin: 6px 0 0 6px; - border-width: 0px; - font-size: 16px; - width: 159px; - background: transparent; + float: left; + padding: 0; + margin: 6px 0 0 6px; + border-width: 0px; + font-size: 16px; + width: 159px; + background: transparent; } + #search_box #go { - float: right; - margin: 3px 4px 0 0; + float: right; + margin: 3px 4px 0 0; } /* Count bar */ #count_bar { - position: absolute; - right: 30px; - top: 80px; - font-size:smaller; - padding-bottom: 5px; + position: absolute; + right: 30px; + top: 80px; + font-size: smaller; + padding-bottom: 5px; } #count_bar * img { - cursor: pointer; + cursor: pointer; } -#count { cursor: default;} +#count { 
+ cursor: default; +} /* Styles for the book list */ #main { - width:95%; - overflow: auto; - border: solid thin black; - position: absolute; - top: 115px; left: 10px; - z-index: 1; + width: 95%; + overflow: auto; + border: solid thin black; + position: absolute; + top: 115px; + left: 10px; + z-index: 1; } table#book_list thead tr td { - width: 100%; - padding-right: 1em; padding-left: 1em; - text-align: center; - font-weight: bold; - font-size: 130%; - border-bottom: thick solid black; - border-top: thick solid black; - cursor: pointer; - font-family: serif; - padding-top: 0.5ex; padding-bottom: 0.5ex; + width: 100%; + padding-right: 1em; + padding-left: 1em; + text-align: center; + font-weight: bold; + font-size: 130%; + border-bottom: thick solid black; + border-top: thick solid black; + cursor: pointer; + font-family: serif; + padding-top: 0.5ex; + padding-bottom: 0.5ex; } table#book_list tbody tr td { - padding-right: 1em; padding-left: 1em; - /*border-bottom: thin solid black;*/ - padding-bottom: 0.7ex; padding-top: 0.7ex; - margin: 0pt; - cursor: pointer; - + padding-right: 1em; + padding-left: 1em; + /*border-bottom: thin solid black;*/ + padding-bottom: 0.7ex; + padding-top: 0.7ex; + margin: 0pt; + cursor: pointer; } table#book_list * .sort_indicator { - visibility:hidden; - color: #9f9f9f; + visibility: hidden; + color: #9f9f9f; } table#book_list * .rating { - color: #3fbbe4; + color: #3fbbe4; } table#book_list * span.subtitle { - font-size: smaller; + font-size: smaller; } table#book_list * a.format { - text-decoration: none; - color: blue; - font-family: monospace; + text-decoration: none; + color: blue; + font-family: monospace; } table#book_list * a.format:hover { - color: red; + color: red; } table#book_list * a.format:visited { - color: blue; + color: blue; } table#book_list * .comments { - font-size: smaller; - display: none; + font-size: smaller; + display: none; } + /* Loading message */ #loading { - top: 10px; left: 10px; - position: absolute; - 
font-size: 160%; font-family: monospace; - text-align: center; - visibility: hidden; - z-index: 10000; - background-color: #aaaaaa; - opacity: 0.8; - + top: 10px; + left: 10px; + position: absolute; + font-size: 160%; + font-family: monospace; + text-align: center; + visibility: hidden; + z-index: 10000; + background-color: #aaaaaa; + opacity: 0.8; } #loading div { - top: 50%; position: relative; + top: 50%; + position: relative; } #cover_pane { - overflow: auto; - position: absolute; - visibility: hidden; - text-align: right; - z-index: 2; - margin: 0pt; padding: 0pt; border-width: 0pt; -} + overflow: auto; + position: absolute; + visibility: hidden; + text-align: right; + z-index: 2; + margin: 0pt; + padding: 0pt; + border-width: 0pt; +} \ No newline at end of file diff --git a/resources/content_server/index.html b/resources/content_server/index.html index f9f0aff491..ff11acc719 100644 --- a/resources/content_server/index.html +++ b/resources/content_server/index.html @@ -1,49 +1,60 @@ - - - calibre library - - - - - - - - - - - -
- Show first set of books Show previous set of books              Show next set of books Show last set of books -
- -
- - - - - - - -
-
- -
-
- Loading... Loading… -
-
- -
- -
- + + +calibre library + + + + + + + + + + + +
Show first set of books Show previous set of books              Show next set of books Show last set of books
+ +
+ + + + + + + +
+
+ +
+
Loading... Loading… +
+
+ +
+ diff --git a/resources/content_server/mobile.css b/resources/content_server/mobile.css index 9be755b954..e3a4b58422 100644 --- a/resources/content_server/mobile.css +++ b/resources/content_server/mobile.css @@ -1,83 +1,78 @@ /* CSS for the mobile version of the content server webpage */ - .navigation table.buttons { - width: 100%; + width: 100%; } .navigation .button { - width: 50%; + width: 50%; } -.button a, .button:visited a { - padding: 0.5em; - font-size: 1.25em; - border: 1px solid black; - text-color: black; - background-color: #ddd; - border-top: 1px solid ThreeDLightShadow; - border-right: 1px solid ButtonShadow; - border-bottom: 1px solid ButtonShadow; - border-left: 1 px solid ThreeDLightShadow; - -moz-border-radius: 0.25em; - -webkit-border-radius: 0.25em; +.button a,.button:visited a { + padding: 0.5em; + font-size: 1.25em; + border: 1px solid black; + text-color: black; + background-color: #ddd; + border-top: 1px solid ThreeDLightShadow; + border-right: 1px solid ButtonShadow; + border-bottom: 1px solid ButtonShadow; + border-left: 1 px solid ThreeDLightShadow; + -moz-border-radius: 0.25em; + -webkit-border-radius: 0.25em; } .button:hover a { - border-top: 1px solid #666; - border-right: 1px solid #CCC; - border-bottom: 1 px solid #CCC; - border-left: 1 px solid #666; - - + border-top: 1px solid #666; + border-right: 1px solid #CCC; + border-bottom: 1 px solid #CCC; + border-left: 1 px solid #666; } div.navigation { - padding-bottom: 1em; - clear: both; + padding-bottom: 1em; + clear: both; } #search_box { - border: 1px solid #393; - -moz-border-radius: 0.5em; - -webkit-border-radius: 0.5em; - padding: 1em; - margin-bottom: 0.5em; - float: right; + border: 1px solid #393; + -moz-border-radius: 0.5em; + -webkit-border-radius: 0.5em; + padding: 1em; + margin-bottom: 0.5em; + float: right; } #listing { - width: 100%; - border-collapse: collapse; + width: 100%; + border-collapse: collapse; } + #listing td { - padding: 0.25em; + padding: 0.25em; } 
#listing td.thumbnail { - height: 60px; - width: 60px; + height: 60px; + width: 60px; } #listing tr:nth-child(even) { - - background: #eee; + background: #eee; } -#listing .button a{ - display: inline-block; - width: 2.5em; - padding-left: 0em; - padding-right: 0em; - overflow: hidden; - text-align: center; +#listing .button a { + display: inline-block; + width: 2.5em; + padding-left: 0em; + padding-right: 0em; + overflow: hidden; + text-align: center; } #logo { - float: left; + float: left; } #spacer { - clear: both; -} - - + clear: both; +} \ No newline at end of file diff --git a/resources/templates/html.css b/resources/templates/html.css index e9b683ca34..448ec596b9 100644 --- a/resources/templates/html.css +++ b/resources/templates/html.css @@ -34,380 +34,367 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ +@ +namespace url (http: //www.w3.org /1999/xhtml); + @namespace svg url (http: //www.w3.org /2000/svg); + /* blocks */ -@namespace url(http://www.w3.org/1999/xhtml); -@namespace svg url(http://www.w3.org/2000/svg); - -/* blocks */ - -html, div, map, dt, isindex, form { - display: block; +html,div,map,dt,isindex,form { + display: block; } body { - display: block; + display: block; } -p, dl, multicol { - display: block; - margin: 1em 0; +p,dl,multicol { + display: block; + margin: 1em 0; } dd { - display: block; - margin-left: 40px; + display: block; + margin-left: 40px; } blockquote { - display: block; - margin: 1em; + display: block; + margin: 1em; } address { - display: block; - font-style: italic; + display: block; + font-style: italic; } center { - display: block; - text-align: center; + display: block; + text-align: center; } blockquote[type=cite] { - display: block; - margin: 1em 0em; - border-color: blue; - border-width: thin; + display: block; + margin: 1em 0em; + border-color: blue; + border-width: thin; } span[_moz_quote=true] { - color: blue; + color: blue; } pre[_moz_quote=true] { - color: blue; + 
color: blue; } h1 { - display: block; - font-size: 2em; - font-weight: bold; - margin: .67em 0; + display: block; + font-size: 2em; + font-weight: bold; + margin: .67em 0; } h2 { - display: block; - font-size: 1.5em; - font-weight: bold; - margin: .83em 0; + display: block; + font-size: 1.5em; + font-weight: bold; + margin: .83em 0; } h3 { - display: block; - font-size: 1.17em; - font-weight: bold; - margin: 1em 0; + display: block; + font-size: 1.17em; + font-weight: bold; + margin: 1em 0; } h4 { - display: block; - font-weight: bold; - margin: 1.33em 0; + display: block; + font-weight: bold; + margin: 1.33em 0; } h5 { - display: block; - font-size: 0.83em; - font-weight: bold; - margin: 1.67em 0; + display: block; + font-size: 0.83em; + font-weight: bold; + margin: 1.67em 0; } h6 { - display: block; - font-size: 0.67em; - font-weight: bold; - margin: 2.33em 0; + display: block; + font-size: 0.67em; + font-weight: bold; + margin: 2.33em 0; } listing { - display: block; - font-family: monospace; - font-size: medium; - white-space: pre; - margin: 1em 0; + display: block; + font-family: monospace; + font-size: medium; + white-space: pre; + margin: 1em 0; } -xmp, pre, plaintext { - display: block; - font-family: monospace; - white-space: pre; - margin: 1em 0; +xmp,pre,plaintext { + display: block; + font-family: monospace; + white-space: pre; + margin: 1em 0; } /* tables */ - table { - display: table; - border-spacing: 2px; - border-collapse: separate; - margin-top: 0; - margin-bottom: 0; - text-indent: 0; + display: table; + border-spacing: 2px; + border-collapse: separate; + margin-top: 0; + margin-bottom: 0; + text-indent: 0; } table[align="left"] { - float: left; + float: left; } table[align="right"] { - float: right; + float: right; } -table[rules]:not([rules="none"]) { - border-collapse: collapse; +table[rules]:not ([rules="none"] ) { + border-collapse: collapse; } - -/* caption inherits from table not table-outer */ + +/* caption inherits from table not 
table-outer */ caption { - display: table-caption; - text-align: center; + display: table-caption; + text-align: center; } -table[align="center"] > caption { - margin-left: auto; - margin-right: auto; +table[align="center"]>caption { + margin-left: auto; + margin-right: auto; } -table[align="center"] > caption[align="left"] { - margin-right: 0; +table[align="center"]>caption[align="left"] { + margin-right: 0; } -table[align="center"] > caption[align="right"] { - margin-left: 0; +table[align="center"]>caption[align="right"] { + margin-left: 0; } tr { - display: table-row; - vertical-align: inherit; + display: table-row; + vertical-align: inherit; } col { - display: table-column; + display: table-column; } colgroup { - display: table-column-group; + display: table-column-group; } tbody { - display: table-row-group; - vertical-align: middle; + display: table-row-group; + vertical-align: middle; } thead { - display: table-header-group; - vertical-align: middle; + display: table-header-group; + vertical-align: middle; } tfoot { - display: table-footer-group; - vertical-align: middle; + display: table-footer-group; + vertical-align: middle; } /* for XHTML tables without tbody */ -table > tr { - vertical-align: middle; +table>tr { + vertical-align: middle; } -td { - display: table-cell; - vertical-align: inherit; - text-align: inherit; - padding: 1px; +td { + display: table-cell; + vertical-align: inherit; + text-align: inherit; + padding: 1px; } th { - display: table-cell; - vertical-align: inherit; - font-weight: bold; - padding: 1px; + display: table-cell; + vertical-align: inherit; + font-weight: bold; + padding: 1px; } /* inlines */ - -b, strong { - font-weight: bolder; +b,strong { + font-weight: bolder; } -i, cite, em, var, dfn { - font-style: italic; +i,cite,em,var,dfn { + font-style: italic; } -tt, code, kbd, samp { - font-family: monospace; +tt,code,kbd,samp { + font-family: monospace; } -u, ins { - text-decoration: underline; +u,ins { + text-decoration: 
underline; } -s, strike, del { - text-decoration: line-through; +s,strike,del { + text-decoration: line-through; } blink { - text-decoration: blink; + text-decoration: blink; } big { - font-size: larger; + font-size: larger; } small { - font-size: smaller; + font-size: smaller; } sub { - vertical-align: sub; - font-size: smaller; - line-height: normal; + vertical-align: sub; + font-size: smaller; + line-height: normal; } sup { - vertical-align: super; - font-size: smaller; - line-height: normal; + vertical-align: super; + font-size: smaller; + line-height: normal; } nobr { - white-space: nowrap; + white-space: nowrap; } /* titles */ -abbr[title], acronym[title] { - border-bottom: dotted 1px; +abbr[title],acronym[title] { + border-bottom: dotted 1px; } /* lists */ - -ul, menu, dir { - display: block; - list-style-type: disc; - margin: 1em 0; +ul,menu,dir { + display: block; + list-style-type: disc; + margin: 1em 0; } ol { - display: block; - list-style-type: decimal; - margin: 1em 0; + display: block; + list-style-type: decimal; + margin: 1em 0; } li { - display: list-item; + display: list-item; } /* nested lists have no top/bottom margins */ -ul ul, ul ol, ul dir, ul menu, ul dl, -ol ul, ol ol, ol dir, ol menu, ol dl, -dir ul, dir ol, dir dir, dir menu, dir dl, -menu ul, menu ol, menu dir, menu menu, menu dl, -dl ul, dl ol, dl dir, dl menu, dl dl { - margin-top: 0; - margin-bottom: 0; +ul ul,ul ol,ul dir,ul menu,ul dl,ol ul,ol ol,ol dir,ol menu,ol dl,dir ul,dir ol,dir dir,dir menu,dir dl,menu ul,menu ol,menu dir,menu menu,menu dl,dl ul,dl ol,dl dir,dl menu,dl dl + { + margin-top: 0; + margin-bottom: 0; } /* 2 deep unordered lists use a circle */ -ol ul, ul ul, menu ul, dir ul, -ol menu, ul menu, menu menu, dir menu, -ol dir, ul dir, menu dir, dir dir { - list-style-type: circle; +ol ul,ul ul,menu ul,dir ul,ol menu,ul menu,menu menu,dir menu,ol dir,ul dir,menu dir,dir dir + { + list-style-type: circle; } /* 3 deep (or more) unordered lists use a square */ -ol ol ul, 
ol ul ul, ol menu ul, ol dir ul, -ol ol menu, ol ul menu, ol menu menu, ol dir menu, -ol ol dir, ol ul dir, ol menu dir, ol dir dir, -ul ol ul, ul ul ul, ul menu ul, ul dir ul, -ul ol menu, ul ul menu, ul menu menu, ul dir menu, -ul ol dir, ul ul dir, ul menu dir, ul dir dir, -menu ol ul, menu ul ul, menu menu ul, menu dir ul, -menu ol menu, menu ul menu, menu menu menu, menu dir menu, -menu ol dir, menu ul dir, menu menu dir, menu dir dir, -dir ol ul, dir ul ul, dir menu ul, dir dir ul, -dir ol menu, dir ul menu, dir menu menu, dir dir menu, -dir ol dir, dir ul dir, dir menu dir, dir dir dir { - list-style-type: square; +ol ol ul,ol ul ul,ol menu ul,ol dir ul,ol ol menu,ol ul menu,ol menu menu,ol dir menu,ol ol dir,ol ul dir,ol menu dir,ol dir dir,ul ol ul,ul ul ul,ul menu ul,ul dir ul,ul ol menu,ul ul menu,ul menu menu,ul dir menu,ul ol dir,ul ul dir,ul menu dir,ul dir dir,menu ol ul,menu ul ul,menu menu ul,menu dir ul,menu ol menu,menu ul menu,menu menu menu,menu dir menu,menu ol dir,menu ul dir,menu menu dir,menu dir dir,dir ol ul,dir ul ul,dir menu ul,dir dir ul,dir ol menu,dir ul menu,dir menu menu,dir dir menu,dir ol dir,dir ul dir,dir menu dir,dir dir dir + { + list-style-type: square; } - /* leafs */ - -/*
noshade and color attributes are handled completely by + /*
noshade and color attributes are handled completely by * the nsHTMLHRElement attribute mapping code */ hr { - display: block; - height: 2px; - border: 1px inset; - margin: 0.5em auto 0.5em auto; - color: gray; + display: block; + height: 2px; + border: 1px inset; + margin: 0.5em auto 0.5em auto; + color: gray; } hr[size="1"] { - border-style: solid none none none; + border-style: solid none none none; } -img[usemap], object[usemap] { - color: blue; +img[usemap],object[usemap] { + color: blue; } frameset { - display: block ! important; - position: static ! important; - float: none ! important; - border: none ! important; + display: block ! important; + position: static ! important; + float: none ! important; + border: none ! important; } frame { - border: none ! important; + border: none ! important; } iframe { - border: 2px inset; + border: 2px inset; } noframes { - display: none; + display: none; } spacer { - position: static ! important; - float: none ! important; + position: static ! important; + float: none ! important; } /* hidden elements */ -area, base, basefont, head, meta, script, style, title, -noembed, param, link { - display: none; +area,base,basefont,head,meta,script,style,title,noembed,param,link { + display: none; } /* Page breaks at body tags, to help out with LIT-generation */ body { - page-break-before: always; + page-break-before: always; } /* Explicit line-breaks are blocks, sure... 
*/ br { - display: block; + display: block; } /* Images, embedded object, and SVG size defaults */ -img, object, svg|svg { - width: auto; - height: auto; +img,object,svg |svg { + width: auto; + height: auto; } /* These are needed because ADE renders anchors the same as links */ +a { + text-decoration: inherit; + color: inherit; + cursor: inherit +} -a { text-decoration: inherit; color: inherit; cursor: inherit } -a[href] { text-decoration: underline; color: blue; cursor: pointer } +a[href] { + text-decoration: underline; + color: blue; + cursor: pointer +} \ No newline at end of file diff --git a/setup/installer/windows/en-us.xml b/setup/installer/windows/en-us.xml index 89cc25f0a2..ed181c524b 100644 --- a/setup/installer/windows/en-us.xml +++ b/setup/installer/windows/en-us.xml @@ -1,9 +1,16 @@ - - If you are upgrading from a {app} version older than 0.6.17, please uninstall {app} first. Click Advanced to change installation settings. - Computing space requirements, this may take upto five minutes... - Computing space requirements, this may take upto five minutes... - Computing space requirements, this may take upto five minutes... - Please wait while the installer finishes determining your disk space requirements, this may take upto five minutes... + + If you are upgrading from a {app} version older than + 0.6.17, please uninstall {app} first. Click Advanced to change + installation settings. + Computing space requirements, this may take upto five + minutes... + Computing space requirements, this may take upto five + minutes... + Computing space requirements, this may take upto five + minutes... + Please wait while the installer finishes determining + your disk space requirements, this may take upto five minutes... 
diff --git a/setup/installer/windows/wix-template.xml b/setup/installer/windows/wix-template.xml index 37dd8b25a8..1300eba956 100644 --- a/setup/installer/windows/wix-template.xml +++ b/setup/installer/windows/wix-template.xml @@ -1,164 +1,157 @@ - + - - - + - - - - - - - + - - - - - - - - - - + - - {app_components} - - - - - + + + + + - - - - - - - + + + + + + + + + + - - - - + + {app_components} + + + + + - - - - - - - + + + + + + + + + + + + + + + + + + + - - - + - - - + + - - - + + + - - - + + + - - - - + + + - - - + + + + - + + + + + = 501)]]> - - - NEWPRODUCTFOUND - - - - NEWPRODUCTFOUND - + + + NEWPRODUCTFOUND + + + + NEWPRODUCTFOUND + - - - WIXUI_EXITDIALOGOPTIONALCHECKBOX = 1 and NOT Installed + + + WIXUI_EXITDIALOGOPTIONALCHECKBOX = 1 and NOT Installed - + - - - - + + + + - - - - - + + + + + - - - + + + - + diff --git a/src/calibre/ebooks/lrf/html/demo/demo.html b/src/calibre/ebooks/lrf/html/demo/demo.html index 7d2f783ccc..37bed69b88 100644 --- a/src/calibre/ebooks/lrf/html/demo/demo.html +++ b/src/calibre/ebooks/lrf/html/demo/demo.html @@ -1,187 +1,279 @@ -

Demo of html2lrf

-

- This document contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from calibre. To obtain calibre visit
http://calibre-ebook.com -

-
-

Table of Contents

- +

Demo of html2lrf

+

This document contains a demonstration of the capabilities of html2lrf, the HTML to LRF +converter from calibre. To obtain calibre visit
+http://calibre-ebook.com

+
+

Table of Contents

+ -

Lists

- -

Nested lists

-
    -
  1. Item 1
  2. -
      -
    • Sub item 1
    • -
    • Sub item 2
    • -
        -
      1. Sub sub item 1. This is a multiline item with almost correct blocking.
      2. -
      3. Sub sub item 2
      4. -
      -
    -
  3. Item 2
  4. -
-

-

Definition Lists

-
-
Term 1
-
Definition of Term 1. A multi line definition showing correct blocking.
-
Term 2
-
Definition of Term 2
-
-

-


- Table of Contents -

+

Lists

-

Tables

- - - - - - -

A matrix

Column 1Column 2Column 3
Row 1

(1, 1)

Row 2

(2, 2)

Row 3

(3, 3)

-
-

- html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. -

-

- Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables. -

-

- On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan. -

-

Sample Complex Table of Contents

- - - - - - - - - - - - - - -
 PAGE
Prefacev
List of Works of Referencevii
List of Illustrationsxi
ChapterI.History of the Foundation3
II.Exterior of the Church25
III.Interior of the Church33
IV.St. Bartholomew-the-Less and the Hospital63
AppendixI.The Priory Seals73
II.The Priors and Rectors77
III.Inventory of Vestments, etc.79
IV.The Organ80
Index83
- -

-


- Table of Contents -

- -

Text formatting

-

- A simple paragraph of formatted - text, with a ruled line following it. - Superscripts and Subscripts. -

-
-
-

A - similar - paragraph, but now using - CSS - to perform the text formatting.

-
-
A centered phrase
- A right aligned phrase - A normal phrase -
-

A paragraph containing a <blockquote> -

This is blockquoted text. It is rendered in a separate block with margins.
The above text should be distinct from the rest of the paragraph. -

-
-

A very indented paragraph

-

An unindented paragraph

-

A default indented paragraph

-

-


- Table of Contents -

- - -

Inline images

-

- Here I demonstrate the use of inline images in the midst of text. Here is a small image embedded in a sentence. Now we have a slightly larger image that is automatically put in its own block and finally we have a large image which is put on a page by itself. Try changing sizes from S to M to L and see how the images behave. -

+

Nested lists

+
    +
  1. Item 1
  2. +
      +
    • Sub item 1
    • +
    • Sub item 2
    • +
        +
      1. Sub sub item 1. This is a multiline item with almost + correct blocking.
      2. +
      3. Sub sub item 2
      4. +
      +
    +
  3. Item 2
  4. +
+

+

Definition Lists

+
+
Term 1
+
Definition of Term 1. A multi line definition showing correct + blocking.
+
Term 2
+
Definition of Term 2
+

-


- Table of Contents -

+
+Table of Contents

-

Embedded fonts

-

This LRF file has been prepared by embedding Times New Roman and Andale Mono - as the default serif and monospace fonts. This allows it to correctly display - non English characters such as:

-
    -
  • mouse in German: mÅ«s
  • -
  • mouse in Russian: мышь
  • -
-

- Note that embedding fonts in LRF files slows down page turns slightly. -
-

- -

-


- Table of Contents -

- -

Paragraph Emphasis

-
-

beautiful image based dropcaps to emphasize this - paragraph. Image based dropcaps are specified by adding the class = 'libprs500_dropcaps' - attribute to an <img> tag.

-
- -

This is a plain text based dropcaps. It - is not nearly as dramatic, but easier to code ;-) -

-
- -

This is an Example of small-caps. - It can also be used to highlight the start of a paragraph very effectively. -

-
-

A paragraph with a hanging indent. This is especially - useful for highly structured text like verse, or dialogue.

-

-


- Table of Contents -

+

Tables

+ + + + + + + + + + + + + + + + + + + + + + + + + +
+

A matrix

+
Column 1Column 2Column 3
Row 1 +

(1, 1)

+
Row 2 +

(2, 2)

+
Row 3 +

(3, 3)

+
+
+

html2lrf supports both rowspan and colspan, but no other HTML +table attributes, as it uses its own algorithm to determine optimal +placement of cells.

+

Note that if you have custom fonts on your reader, the table may +not be properly aligned. Also html2lrf does not support nested tables.

+

On the next page you'll see a +real life example taken from a Project Gutenberg text with no +modifications. It shows off html2lrf's handling of rowspan and colspan. +

+

Sample Complex Table of Contents

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 PAGE
Prefacev
List of Works of Referencevii
List of Illustrationsxi
ChapterI.History of the Foundation3
II.Exterior of the Church25
III.Interior of the Church33
IV.St. Bartholomew-the-Less and the Hospital63
AppendixI.The Priory Seals73
II.The Priors and Rectors77
III.Inventory of Vestments, etc.79
IV.The Organ80
Index83
-

Recursive link following

-

- html2lrf follows links in HTML files that point to other files, recursively. Thus it can be used to convert a whole tree of HTML files into a single LRF file. -
-

-


- Table of Contents -

+
+Table of Contents

+

Text formatting

+

A simple paragraph of formatted text, with a +ruled line following it. Superscripts and Subscripts. +

+
+
+

A similar paragraph, but +now using CSS to perform the text +formatting.

+
+
A centered phrase
+A right aligned phrase +A normal phrase +
+

A paragraph containing a <blockquote> +

This is blockquoted text. It is rendered in a +separate block with margins.
+The above text should be distinct from the rest of the paragraph.

+
+

A very indented paragraph

+

An unindented paragraph

+

A default indented paragraph

+

+


+Table of Contents

+ + +

Inline images

+

Here I demonstrate the use of inline images in the midst of text. +Here is a small image embedded in a sentence. +Now we have a slightly larger image that is automatically put in its own +block and finally +we have a large image which is put on a page by itself. Try changing +sizes from S to M to L and see how the images behave.

+

+


+Table of Contents

+ +

Embedded fonts

+

This LRF file has been prepared by embedding Times New Roman and +Andale Mono as the default serif and monospace fonts. This allows it to +correctly display non English characters such as:

+
    +
  • mouse in German: mūs
  • +
  • mouse in Russian: мышь
  • +
+

Note that embedding fonts in LRF files slows down page turns +slightly.
+

+ +

+


+Table of Contents

+ +

Paragraph Emphasis

+
+

beautiful image +based dropcaps to emphasize this paragraph. Image based dropcaps are +specified by adding the class = 'libprs500_dropcaps' +attribute to an <img> tag. +


+ +

This is a plain text based dropcaps. It is not +nearly as dramatic, but easier to code ;-)

+
+ +

This is an Example +of small-caps. It can also be used to highlight the start of a paragraph +very effectively.

+
+

A paragraph with a hanging indent. This is +especially useful for highly structured text like verse, or dialogue.
+

+

+


+Table of Contents

+ +

Recursive link following

+

html2lrf follows links in +HTML files that point to other files, recursively. Thus it can be used +to convert a whole tree of HTML files into a single LRF file.
+

+

+


+Table of Contents

diff --git a/src/calibre/manual/templates/layout.html b/src/calibre/manual/templates/layout.html index c5a857650f..8ec8c949e8 100644 --- a/src/calibre/manual/templates/layout.html +++ b/src/calibre/manual/templates/layout.html @@ -1,14 +1,14 @@ -{% extends "!layout.html" %} -{% block sidebarlogo %} - -
- - - - -
-
+{% extends "!layout.html" %} {% block sidebarlogo %} + +
+ +
+
{% endblock %} diff --git a/src/calibre/manual/xpath.xhtml b/src/calibre/manual/xpath.xhtml index 7468e3d856..3a78863236 100644 --- a/src/calibre/manual/xpath.xhtml +++ b/src/calibre/manual/xpath.xhtml @@ -1,19 +1,19 @@ - - A very short ebook - - - -

A very short ebook

-

Written by Kovid Goyal

-
-

A very short ebook to demonstrate the use of XPath.

-
+ +A very short ebook + + + +

A very short ebook

+

Written by Kovid Goyal

+
+

A very short ebook to demonstrate the use of XPath.

+
-

Chapter One

-

This is a truly fascinating chapter.

+

Chapter One

+

This is a truly fascinating chapter.

-

Chapter Two

-

A worthy continuation of a fine tradition.

- +

Chapter Two

+

A worthy continuation of a fine tradition.

+ From ae8fcb1fd4579026c55f8ee6686fcc096b861b30 Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 10 Aug 2010 13:07:29 +0200 Subject: [PATCH 12/41] Correct error with setup.py --- setup.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000..d8bd0267ee --- /dev/null +++ b/setup.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os, optparse + +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) + +import setup.commands as commands +from setup import prints, get_warnings + +def check_version_info(): + vi = sys.version_info + if vi[0] == 2 and vi[1] > 5: + return None + return 'calibre requires python >= 2.6' + +def option_parser(): + parser = optparse.OptionParser() + parser.add_option('-c', '--clean', default=False, action='store_true', + help=('Instead of running the command delete all files generated ' + 'by the command')) + parser.add_option('--clean-backups', default=False, action='store_true', + help='Delete all backup files from the source tree') + parser.add_option('--clean-all', default=False, action='store_true', + help='Delete all machine generated files from the source tree') + return parser + +def clean_backups(): + for root, _, files in os.walk('.'): + for name in files: + for t in ('.pyc', '.pyo', '~', '.swp', '.swo'): + if name.endswith(t): + os.remove(os.path.join(root, name)) + + +def main(args=sys.argv): + if len(args) == 1 or args[1] in ('-h', '--help'): + print 'Usage: python', args[0], 'command', '[options]' + print '\nWhere command is one of:' + print + for x in sorted(commands.__all__): + print '%-20s -'%x, + c = getattr(commands, x) + desc = getattr(c, 'short_description', c.description) + 
print desc + + print '\nTo get help on a particular command, run:' + print '\tpython', args[0], 'command -h' + return 1 + + command = args[1] + if command not in commands.__all__: + print command, 'is not a recognized command.' + print 'Valid commands:', ', '.join(commands.__all__) + return 1 + + command = getattr(commands, command) + + parser = option_parser() + command.add_all_options(parser) + parser.set_usage('Usage: python setup.py %s [options]\n\n'%args[1]+\ + command.description) + + opts, args = parser.parse_args(args) + + if opts.clean_backups: + clean_backups() + + if opts.clean: + prints('Cleaning', args[1]) + command.clean() + return 0 + + if opts.clean_all: + for cmd in commands.__all__: + prints('Cleaning', cmd) + getattr(commands, cmd).clean() + return 0 + + command.run_all(opts) + + warnings = get_warnings() + if warnings: + print + prints('There were', len(warnings), 'warning(s):') + print + for args, kwargs in warnings: + prints('*', *args, **kwargs) + print + + return 0 + +if __name__ == '__main__': + sys.exit(main()) From 7c70914ad30fc358bfcd7c099494b0a43682ba27 Mon Sep 17 00:00:00 2001 From: Sengian Date: Thu, 12 Aug 2010 16:25:09 +0200 Subject: [PATCH 13/41] Global overhaul of rtf2xml: RTFfixes (3) ->removal of preprocessing, first draft of tokenize finished, introduction of \ud:\upr for unicode --- src/calibre/ebooks/rtf2xml/tokenize.py | 104 +++++++++++++++---------- 1 file changed, 64 insertions(+), 40 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 3aa2079fb3..e594fed80d 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -26,7 +26,7 @@ class Tokenize: in_file, bug_handler, copy = None, - #run_level = 1, + run_level = 1, ): self.__file = in_file self.__bug_handler = bug_handler @@ -37,17 +37,22 @@ class Tokenize: self.__uc_char = 0 self.__uc_bin = False self.__uc_value = [1] - - def __from_ms_to_utf8(self,match_obj): - uni_char = 
int(match_obj.group(1)) - if uni_char < 0: - uni_char += 65536 - return '&#x' + str('%X' % uni_char) + ';' - + def __reini_utf8_counters(self): self.__uc_char = 0 self.__uc_bin = False + def __remove_uc_chars(self, startchar, token): + for i in xrange(startchar, len(token)): + if token[i] == " ": + continue + elif self.__uc_char: + self.__uc_char -= 1 + else: + return token[i:] + #if only " " and char to skip + return '' + def __unicode_process(self, token): #change scope in if token == '\{': @@ -55,9 +60,9 @@ class Tokenize: #basic error handling self.__reini_utf8_counters() return token - #change scope out: evaluate dict and rebuild + #change scope out elif token == '\}': - #self.__uc_value.pop() + self.__uc_value.pop() self.__reini_utf8_counters() return token #add a uc control @@ -65,58 +70,65 @@ class Tokenize: self.__uc_value[-1] = int(token[3:]) self.__reini_utf8_counters() return token - #handle uc skippable char + #bin data to slip + elif self.__uc_bin: + self.__uc_bin = False + return '' + #uc char to remove elif self.__uc_char: - #if token[:1] == "\" and token[:1] == "\" - pass + #handle \bin tag in case of uc char to skip + if token[:4] == '\bin': + self.__uc_char -=1 + self.__uc_bin = True + return '' + elif token[:1] == "\\" : + self.__uc_char -=1 + return '' + else: + return self.__remove_uc_chars(0, token) #go for real \u token match_obj = self.__utf_exp.match(token) if match_obj is not None: + self.__reini_utf8_counters() #get value and handle negative case uni_char = int(match_obj.group(1)) uni_len = len(match_obj.group(1)) + 2 if uni_char < 0: uni_char += 65536 uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace') - #if not uc0 - if self.__uc_value[-1]: - self.__uc_char = self.__uc_value[-1] + self.__uc_char = self.__uc_value[-1] #there is only an unicode char if len(token)<= uni_len: return uni_char #an unicode char and something else #must be after as it is splited on \ - elif not self.__uc_value[-1]: - print('not only token uc0 
token: ' + uni_char + token[uni_len:]) + #necessary? maybe for \bin? + elif not self.__uc_char: return uni_char + token[uni_len:] #if not uc0 and chars else: - for i in xrange(uni_len, len(token)): - if token[i] == " ": - continue - elif self.__uc_char > 0: - self.__uc_char -= 1 - else: - return uni_char + token[i:] - #print('uc: ' + str(self.__uc_value) + 'uni: ' + str(uni_char) + 'token: ' + token) + return uni_char + self.__remove_uc_chars(uni_len, token) #default return token - + def __sub_reg_split(self,input_file): input_file = self.__replace_spchar.mreplace(input_file) - #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) - # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) - # this is for older RTF - #line = re.sub(self.__par_exp, '\\par ', line) - input_file = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", input_file) + input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) + input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) + #remove \n in bin data + input_file = self.__bin_exp.sub(lambda x: \ + x.group().replace('\n', '') +'\n', input_file) #split tokens = re.split(self.__splitexp, input_file) #remove empty tokens and \n return filter(lambda x: len(x) > 0 and x != '\n', tokens) + #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) + # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) + # this is for older RTF + #line = re.sub(self.__par_exp, '\\par ', line) #return filter(lambda x: len(x) > 0, \ #(self.__remove_line.sub('', x) for x in tokens)) - - + def __compile_expressions(self): SIMPLE_RPL = { "\\\\": "\\backslash ", @@ -145,18 +157,25 @@ class Tokenize: r'\\$': '\\par ', } self.__replace_spchar = MReplace(SIMPLE_RPL) + #add ;? 
in case of char following \u self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)" - self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") #modify this - #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") + self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?") + self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+") + #manage upr/ud situations + self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \ + r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}") #add \n in split for whole file reading - #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #why keep backslash whereas \is replaced before? + #remove \n from endline char self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") + #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") + #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") + #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #self.__par_exp = re.compile(r'\\$') #self.__remove_line = re.compile(r'\n+') #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") - + def tokenize(self): """Main class for handling other methods. 
Reads the file \ , uses method self.sub_reg to make basic substitutions,\ @@ -170,9 +189,9 @@ class Tokenize: #remove '' and \n in the process tokens = self.__sub_reg_split(input_file) #correct unicode - #tokens = map(self.__unicode_process, tokens) + tokens = map(self.__unicode_process, tokens) #remove empty items created by removing \uc - #tokens = filter(lambda x: len(x) > 0, tokens) + tokens = filter(lambda x: len(x) > 0, tokens) #write write_obj = open(self.__write_to, 'wb') @@ -241,4 +260,9 @@ class Tokenize: neg_uni_char = int(match_obj.group(1)) * -1 # sys.stderr.write(str( neg_uni_char)) uni_char = neg_uni_char + 65536 + return '&#x' + str('%X' % uni_char) + ';''' + '''def __from_ms_to_utf8(self,match_obj): + uni_char = int(match_obj.group(1)) + if uni_char < 0: + uni_char += 65536 return '&#x' + str('%X' % uni_char) + ';''' \ No newline at end of file From b9ed0c6b3d579f1dc2e2c5b94df5e2e8f9ec75d4 Mon Sep 17 00:00:00 2001 From: Sengian Date: Thu, 12 Aug 2010 17:16:37 +0200 Subject: [PATCH 14/41] Global overhaul of rtf2xml: RTFfixes (4) ->minors corrections in line endings and check brackets, move check encoding first to eliminate non ascii RTF --- src/calibre/ebooks/rtf2xml/ParseRtf.py | 40 ++++++++++---------- src/calibre/ebooks/rtf2xml/check_brackets.py | 1 - src/calibre/ebooks/rtf2xml/check_encoding.py | 10 +++-- src/calibre/ebooks/rtf2xml/line_endings.py | 11 ++++-- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 3a804792c5..76bdcc08af 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -133,7 +133,6 @@ class ParseRtf: self.__temp_dir = out_dir self.__dtd_path = dtd self.__check_file(in_file,"file_to_parse") - self.__check_ascii(in_file) self.__char_data = char_data self.__debug_dir = deb_dir self.__check_dir(self.__temp_dir) @@ -152,6 +151,7 @@ class ParseRtf: self.__group_borders = group_borders 
self.__empty_paragraphs = empty_paragraphs self.__no_dtd = no_dtd + def __check_file(self, the_file, type): """Check to see if files exist""" if hasattr(the_file, 'read'): return @@ -164,6 +164,7 @@ class ParseRtf: else: msg = "\nThe file '%s' cannot be found" % the_file raise RtfInvalidCodeException, msg + def __check_dir(self, the_dir): """Check to see if directory exists""" if not the_dir : @@ -173,15 +174,7 @@ class ParseRtf: msg = "\n%s is not a directory" % the_dir raise RtfInvalidCodeException, msg return 1 - def __check_ascii(self, the_file): - """Check to see if the file is correct ascii""" - try: - test = codecs.open(the_file, 'r', 'ascii', 'strict') - test.close() - except UnicodeError: - msg = "\n%s is not a correct ascii file" % the_file - raise RtfInvalidCodeException, msg - return 1 + def parse_rtf(self): """ Parse the file by calling on other classes. @@ -192,6 +185,18 @@ class ParseRtf: depending on the value of 'output' when the instance was created. """ self.__temp_file = self.__make_temp_file(self.__file) + #Check to see if the file is correct ascii first + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler = RtfInvalidCodeException, + ) + if check_encoding_obj.check_encoding(self.__file): + try: + os.remove(self.__temp_file) + except OSError: + pass + sys.stderr.write('File "%s" does not appear to be ascii.\n' \ + % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) + raise InvalidRtfException # if the self.__deb_dir is true, then create a copy object, # set the directory to write to, remove files, and copy # the new temporary file to this directory @@ -214,7 +219,7 @@ class ParseRtf: in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, copy = self.__copy, - #run_level = self.__run_level, + run_level = self.__run_level, replace_illegals = self.__replace_illegals, ) line_obj.fix_endings() @@ -223,8 +228,8 @@ class ParseRtf: tokenize_obj = tokenize.Tokenize( bug_handler = 
RtfInvalidCodeException, in_file = self.__temp_file, - copy = self.__copy,) - #run_level = self.__run_level,) + copy = self.__copy, + run_level = self.__run_level) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file = self.__temp_file, @@ -240,10 +245,6 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass - check_encoding_obj = check_encoding.CheckEncoding( - bug_handler = RtfInvalidCodeException, - ) - check_encoding_obj.check_encoding(self.__file) sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( @@ -548,8 +549,7 @@ class ParseRtf: """Make a temporary file to parse""" write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') - write_obj = open(write_file, 'w') - for line in read_obj: - write_obj.write(line) + write_obj = open(write_file, 'wb') + write_obj.write(read_obj.read()) write_obj.close() return write_file \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/check_brackets.py b/src/calibre/ebooks/rtf2xml/check_brackets.py index 53f9363d63..8917780746 100755 --- a/src/calibre/ebooks/rtf2xml/check_brackets.py +++ b/src/calibre/ebooks/rtf2xml/check_brackets.py @@ -30,7 +30,6 @@ class CheckBrackets: self.__bracket_count += 1 def close_brack(self, line): num = line[-5:-1] - ##self.__open_bracket_num.append(num) try: last_num = self.__open_bracket_num.pop() except: diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index f6810e4909..1f8645bb0c 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -14,12 +14,11 @@ class CheckEncoding: sys.stderr.write(str(msg) + '\n') def check_encoding(self, path, encoding='us-ascii'): read_obj = open(path, 'r') - line_to_read = 1 + input_file = read_obj.read() + 
read_obj.close() line_num = 0 - while line_to_read: + for line in input_file: line_num += 1 - line_to_read = read_obj.readline() - line = line_to_read try: line.decode(encoding) except UnicodeError: @@ -27,6 +26,9 @@ class CheckEncoding: self.__get_position_error(line, encoding, line_num) else: sys.stderr.write('line: %d has bad encoding\n'%line_num) + return True + return False + if __name__ == '__main__': check_encoding_obj = CheckEncoding() check_encoding_obj.check_encoding(sys.argv[1]) diff --git a/src/calibre/ebooks/rtf2xml/line_endings.py b/src/calibre/ebooks/rtf2xml/line_endings.py index e77e5d747c..86546967a7 100755 --- a/src/calibre/ebooks/rtf2xml/line_endings.py +++ b/src/calibre/ebooks/rtf2xml/line_endings.py @@ -23,7 +23,7 @@ class FixLineEndings: bug_handler, in_file = None, copy = None, - #run_level = 1, calibre why keep it? + run_level = 1, replace_illegals = 1, ): self.__file = in_file @@ -32,8 +32,11 @@ class FixLineEndings: self.__write_to = tempfile.mktemp() self.__replace_illegals = replace_illegals def fix_endings(self): - illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') - # always check since I have to get rid of illegal characters + #remove ASCII invalid chars : 0 to 8 and 11-14 to 24 + #always check since I have to get rid of illegal characters + chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) + illegal_regx = re.compile(u'|'.join(map(unichr, chars))) + #illegal_regx = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13') #read read_obj = open(self.__file, 'r') input_file = read_obj.read() @@ -42,7 +45,7 @@ class FixLineEndings: input_file = input_file.replace ('\r\n', '\n') input_file = input_file.replace ('\r', '\n') if self.__replace_illegals: - input_file = re.sub(illegal_regx, '', input_file) + input_file = illegal_regx.sub('', input_file) #write write_obj = open(self.__write_to, 'wb') write_obj.write(input_file) 
From a9fd0ad4ba9acdcc07d5bfcae503c378c25a7303 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 16 Aug 2010 10:08:59 +0200 Subject: [PATCH 15/41] Global overhaul of rtf2xml: RTFfixes (5) ->minors corrections and regression correction --- src/calibre/ebooks/rtf/input.py | 2 +- src/calibre/ebooks/rtf2xml/ParseRtf.py | 14 +- src/calibre/ebooks/rtf2xml/check_encoding.py | 11 +- src/calibre/ebooks/rtf2xml/copy.py | 14 +- src/calibre/ebooks/rtf2xml/process_tokens.py | 163 ++++++++++--------- 5 files changed, 104 insertions(+), 100 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 824da7d6f1..f4fbdf411c 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -50,7 +50,7 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, - deb_dir = 'I:\\Calibre\\rtfdebug', + deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug', # Convert symbol fonts to unicode equivalents. Default # is 1 convert_symbol = 1, diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 76bdcc08af..1230ae150e 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -120,8 +120,6 @@ class ParseRtf: script tries to output to directory where is script is exectued.) 'deb_dir' --debug directory. If a debug_dir is provided, the script will copy each run through as a file to examine in the debug_dir - 'perl_script'--use perl to make tokens. This runs just a bit faster. - (I will probably phase this out.) 'check_brackets' -- make sure the brackets match up after each run through a file. Only for debugging. 
Returns: Nothing @@ -142,7 +140,7 @@ class ParseRtf: self.__convert_wingdings = convert_wingdings self.__convert_zapf = convert_zapf self.__run_level = run_level - #self.__exit_level = 0 + #self.__exit_level = 0 See what this means and if it is consistent self.__indent = indent self.__replace_illegals = replace_illegals self.__form_lists = form_lists @@ -184,19 +182,15 @@ class ParseRtf: A parsed file in XML, either to standard output or to a file, depending on the value of 'output' when the instance was created. """ - self.__temp_file = self.__make_temp_file(self.__file) #Check to see if the file is correct ascii first check_encoding_obj = check_encoding.CheckEncoding( bug_handler = RtfInvalidCodeException, ) if check_encoding_obj.check_encoding(self.__file): - try: - os.remove(self.__temp_file) - except OSError: - pass sys.stderr.write('File "%s" does not appear to be ascii.\n' \ % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) raise InvalidRtfException + self.__temp_file = self.__make_temp_file(self.__file) # if the self.__deb_dir is true, then create a copy object, # set the directory to write to, remove files, and copy # the new temporary file to this directory @@ -223,7 +217,6 @@ class ParseRtf: replace_illegals = self.__replace_illegals, ) line_obj.fix_endings() - #return_value = line_obj.fix_endings() #calibre: no return in this function, why keep it? 
#self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, @@ -550,6 +543,7 @@ class ParseRtf: write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') write_obj = open(write_file, 'wb') - write_obj.write(read_obj.read()) + for line in read_obj: + write_obj.write(line) write_obj.close() return write_file \ No newline at end of file diff --git a/src/calibre/ebooks/rtf2xml/check_encoding.py b/src/calibre/ebooks/rtf2xml/check_encoding.py index 1f8645bb0c..444fd373e4 100755 --- a/src/calibre/ebooks/rtf2xml/check_encoding.py +++ b/src/calibre/ebooks/rtf2xml/check_encoding.py @@ -14,10 +14,10 @@ class CheckEncoding: sys.stderr.write(str(msg) + '\n') def check_encoding(self, path, encoding='us-ascii'): read_obj = open(path, 'r') - input_file = read_obj.read() - read_obj.close() + line_num = 0 - for line in input_file: + error_found = False + for line in read_obj: line_num += 1 try: line.decode(encoding) @@ -26,8 +26,9 @@ class CheckEncoding: self.__get_position_error(line, encoding, line_num) else: sys.stderr.write('line: %d has bad encoding\n'%line_num) - return True - return False + error_found = True + read_obj.close() + return error_found if __name__ == '__main__': check_encoding_obj = CheckEncoding() diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py index ff029c1841..1b620b9fbf 100755 --- a/src/calibre/ebooks/rtf2xml/copy.py +++ b/src/calibre/ebooks/rtf2xml/copy.py @@ -23,6 +23,7 @@ class Copy: def __init__(self, bug_handler, file = None, deb_dir = None, ): self.__file = file self.__bug_handler = bug_handler + def set_dir(self, deb_dir): """Set the temporary directory to write files to""" if deb_dir is None: @@ -33,19 +34,11 @@ class Copy: message = "%(deb_dir)s is not a directory" % vars() raise self.__bug_handler , message Copy.__dir = deb_dir + def remove_files(self ): """Remove files from directory""" self.__remove_the_files(Copy.__dir) - """ - 
list_of_files = os.listdir(Copy.__dir) - list_of_files = os.listdir(the_dir) - for file in list_of_files: - rem_file = os.path.join(Copy.__dir,file) - if os.path.isdir(rem_file): - self.remove_files(rem_file) - else: - os.remove(rem_file) - """ + def __remove_the_files(self, the_dir): """Remove files from directory""" list_of_files = os.listdir(the_dir) @@ -58,6 +51,7 @@ class Copy: os.remove(rem_file) except OSError: pass + def copy_file(self, file, new_file): """ Copy the file to a new name diff --git a/src/calibre/ebooks/rtf2xml/process_tokens.py b/src/calibre/ebooks/rtf2xml/process_tokens.py index 072d8b02e4..2c5c0c7df0 100755 --- a/src/calibre/ebooks/rtf2xml/process_tokens.py +++ b/src/calibre/ebooks/rtf2xml/process_tokens.py @@ -735,8 +735,94 @@ class ProcessTokens: pre, token, action = self.dict_token.get(token, (None, None, None)) if action: return action(pre, token, num) - # unused function - def initiate_token_actions(self): + + def __check_brackets(self, in_file): + self.__check_brack_obj = check_brackets.CheckBrackets\ + (file = in_file) + good_br = self.__check_brack_obj.check_brackets()[0] + if not good_br: + return 1 + def process_tokens(self): + """Main method for handling other methods. """ + + read_obj= open(self.__file, 'r') + write_obj = open(self.__write_to, 'wb') + + '''first_token = 0 + second_token = 0''' + line_count = 0 + + for line in read_obj: + token = line.replace("\n","") + #calibre not necessary normaly, fixed in tokenize + '''if not token: + continue''' + line_count += 1 + #calibre not necessary, encoding checked before + """try: + token.decode('us-ascii') + except UnicodeError, msg: + msg = str(msg) + msg += 'Invalid RTF: File not ascii encoded.\n' + raise self.__exception_handler, msg""" + #calibre: with tokenize, should be first and second line, why bother? 
+ """if not first_token: + if token != '\\{': + msg = 'Invalid RTF: document doesn\'t start with {\n' + raise self.__exception_handler, msg + first_token = 1 + elif line_count == and not second_token: + if token[0:4] != '\\rtf': + msg ='Invalid RTF: document doesn\'t start with \\rtf \n' + raise self.__exception_handler, msg + second_token = 1""" + if line_count == 1 and token != '\\{': + msg = 'Invalid RTF: document doesn\'t start with {\n' + raise self.__exception_handler, msg + elif line_count == 2 and token[0:4] != '\\rtf': + msg ='Invalid RTF: document doesn\'t start with \\rtf \n' + raise self.__exception_handler, msg + + ##token = self.evaluate_token(token) + the_index = token.find('\\ ') + if token is not None and the_index > -1: + msg ='Invalid RTF: token "\\ " not valid.\n' + raise self.__exception_handler, msg + elif token[:1] == "\\": + line = self.process_cw(token) + if line is not None: + write_obj.write(line) + else: + fields = re.split(self.__utf_exp, token) + for field in fields: + if not field: + continue + if field[0:1] == '&': + write_obj.write('tx -1: - msg ='Invalid RTF: token "\\ " not valid. 
\n' - raise self.__exception_handler, msg - elif token[0:1] == "\\": - line = self.process_cw(token) - if line != None: - write_obj.write(line) - else: - fields = re.split(self.__utf_exp, token) - for field in fields: - if not field: - continue - if field[0:1] == '&': - write_obj.write('tx Date: Sun, 26 Sep 2010 17:49:59 +0200 Subject: [PATCH 16/41] Modif debug --- src/calibre/ebooks/rtf/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 1de064df5c..4c7dfd9260 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -51,7 +51,7 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, - deb_dir = 'D:\\calibre\\pierre\\debug\\rtfdebug', + deb_dir = 'H:\\Temp\\Calibre\\rtfdebug', # Convert symbol fonts to unicode equivalents. Default # is 1 convert_symbol = 1, From 9590ba62348930d93c496e507549a8c97d43ef16 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 11 Oct 2010 00:35:07 +0200 Subject: [PATCH 17/41] isbndb.py minor changes --- src/calibre/ebooks/metadata/isbndb.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 221cfc13d1..2bbffc2c8b 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -74,14 +74,14 @@ class ISBNDBMetadata(Metadata): if authors: self.authors = authors try: - self.author_sort = self.tostring(book.find('authors').find('person')) + self.author_sort = tostring(book.find('authors').find('person')) if self.authors and self.author_sort == self.authors[0]: self.author_sort = None except: pass - self.publisher = self.tostring(book.find('publishertext')) + self.publisher = tostring(book.find('publishertext')) - summ = self.tostring(book.find('summary')) + summ = tostring(book.find('summary')) if summ: self.comments = 'SUMMARY:\n'+summ @@ 
-141,7 +141,7 @@ def create_books(opts, args, timeout=5.): print ('ISBNDB query: '+url) tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] - ans = [] + '''ans = [] for x in tans: add = True for y in ans: @@ -149,7 +149,9 @@ def create_books(opts, args, timeout=5.): add = False if add: ans.append(x) - return ans + return ans''' + #remove duplicates ISBN + return dict((book.isbn, book) for book in tans).values() def main(args=sys.argv): parser = option_parser() From 19288b38acd138e4e3702845e1b1b61ef82c0d2d Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 11 Oct 2010 00:36:26 +0200 Subject: [PATCH 18/41] Merge from trunk --- resources/catalog/stylesheet.css | 142 ++++++++++++++-------------- resources/content_server/index.html | 6 +- resources/templates/fb2.xsl | 97 ++++++++++--------- resources/templates/html.css | 35 +++++-- 4 files changed, 154 insertions(+), 126 deletions(-) diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index 4f9ca9ac41..ea01aeb43b 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -1,102 +1,104 @@ -body { background-color: white; } +body { + background-color: white; +} -p.title { - margin-top:0em; - margin-bottom:1em; - text-align:center; - font-style:italic; - font-size:xx-large; - border-bottom: solid black 4px; - } +p.title { + margin-top: 0em; + margin-bottom: 1em; + text-align: center; + font-style: italic; + font-size: xx-large; + border-bottom: solid black 4px; +} p.author { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:large; - } + font-size: large; +} p.tags { - margin-top:0em; - margin-bottom:0em; + margin-top: 0em; + margin-bottom: 0em; text-align: left; text-indent: 1em; - font-size:small; - } + font-size: small; +} p.description { - text-align:left; - font-style:normal; + text-align: left; + font-style: normal; margin-top: 0em; - } +} p.date_index { - 
font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.letter_index { - font-size:x-large; - text-align:center; - font-weight:bold; - margin-top:1em; - margin-bottom:0px; - } + font-size: x-large; + text-align: center; + font-weight: bold; + margin-top: 1em; + margin-bottom: 0px; +} p.author_index { - font-size:large; - text-align:left; - margin-top:0px; - margin-bottom:0px; + font-size: large; + text-align: left; + margin-top: 0px; + margin-bottom: 0px; text-indent: 0em; - } +} p.series { text-align: left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.read_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.unread_book { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:2em; - text-indent:-2em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 2em; + text-indent: -2em; +} p.date_read { - text-align:left; - margin-top:0px; - margin-bottom:0px; - margin-left:6em; - text-indent:-6em; - } + text-align: left; + margin-top: 0px; + margin-bottom: 0px; + margin-left: 6em; + text-indent: -6em; +} hr.series_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} hr.annotations_divider { - width:50%; - margin-left:1em; - margin-top:0em; - margin-bottom:0em; - } + width: 50%; + margin-left: 1em; + margin-top: 0em; + margin-bottom: 0em; +} \ No newline at end of file diff --git a/resources/content_server/index.html b/resources/content_server/index.html index ff11acc719..1bc13096d5 
100644 --- a/resources/content_server/index.html +++ b/resources/content_server/index.html @@ -29,9 +29,9 @@
Show first set of books Show previous set of books               Show previous set of books              Show next set of books Show last set of books - - - + + + + - + - <xsl:value-of select="fb:description/fb:title-info/fb:book-title"/> + <xsl:value-of select="fb:description/fb:title-info/fb:book-title" /> @@ -51,37 +58,37 @@
- +
-
+
    - +
-
- +
+ -
+

- +

- +
- + - +
diff --git a/resources/templates/html.css b/resources/templates/html.css index 448ec596b9..bfbb646afb 100644 --- a/resources/templates/html.css +++ b/resources/templates/html.css @@ -35,9 +35,9 @@ * * ***** END LICENSE BLOCK ***** */ @ -namespace url (http: //www.w3.org /1999/xhtml); - @namespace svg url (http: //www.w3.org /2000/svg); - /* blocks */ +namespace url (http: //www.w3.org /1999/xhtml); + @namespace svg url (http: //www.w3.org /2000/svg); + /* blocks */ html,div,map,dt,isindex,form { display: block; @@ -161,10 +161,29 @@ table[align="right"] { float: right; } -table[rules]:not ([rules="none"] ) { - border-collapse: collapse; -} +table +[ +rules +] +:not + +( +[ +rules += +"none" +] + +) +{ +border-collapse +: + +collapse +; + +} /* caption inherits from table not table-outer */ caption { display: table-caption; @@ -322,7 +341,7 @@ ol ol ul,ol ul ul,ol menu ul,ol dir ul,ol ol menu,ol ul menu,ol menu menu,ol dir } /* leafs */ - /*
noshade and color attributes are handled completely by +/*
noshade and color attributes are handled completely by * the nsHTMLHRElement attribute mapping code */ hr { @@ -381,7 +400,7 @@ br { } /* Images, embedded object, and SVG size defaults */ -img,object,svg |svg { +img,object,svg |svg { width: auto; height: auto; } From 282c6aaa49006086c0887115edd3da1381d663e9 Mon Sep 17 00:00:00 2001 From: Sengian Date: Fri, 15 Oct 2010 08:45:09 +0200 Subject: [PATCH 19/41] Minor modification to isbndb.py --- src/calibre/ebooks/metadata/isbndb.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/metadata/isbndb.py b/src/calibre/ebooks/metadata/isbndb.py index 2bbffc2c8b..615b4ab818 100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -90,10 +90,8 @@ def build_isbn(base_url, opts): return base_url + 'index1=isbn&value1='+opts.isbn def build_combined(base_url, opts): - query = '' - for e in (opts.title, opts.author, opts.publisher): - if e is not None: - query += ' ' + e + query = ' '.join([e for e in (opts.title, opts.author, opts.publisher) \ + if e is not None ]) query = query.strip() if len(query) == 0: raise ISBNDBError('You must specify at least one of --author, --title or --publisher') @@ -141,15 +139,6 @@ def create_books(opts, args, timeout=5.): print ('ISBNDB query: '+url) tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] - '''ans = [] - for x in tans: - add = True - for y in ans: - if y.isbn == x.isbn: - add = False - if add: - ans.append(x) - return ans''' #remove duplicates ISBN return dict((book.isbn, book) for book in tans).values() From 18d2c55d4bccfaff1b32416a7fe7c7507dcaee0b Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 19 Oct 2010 23:10:34 +0200 Subject: [PATCH 20/41] Modify single metadata display to include summary and covers check --- src/calibre/gui2/dialogs/fetch_metadata.py | 8 +- src/calibre/gui2/dialogs/fetch_metadata.ui | 344 ++++++++++----------- 2 files changed, 179 
insertions(+), 173 deletions(-) diff --git a/src/calibre/gui2/dialogs/fetch_metadata.py b/src/calibre/gui2/dialogs/fetch_metadata.py index eb6edce75d..950f014442 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.py +++ b/src/calibre/gui2/dialogs/fetch_metadata.py @@ -48,7 +48,7 @@ class Matches(QAbstractTableModel): return len(self.matches) def columnCount(self, *args): - return 6 + return 8 def headerData(self, section, orientation, role): if role != Qt.DisplayRole: @@ -61,6 +61,8 @@ class Matches(QAbstractTableModel): elif section == 3: text = _("Publisher") elif section == 4: text = _("ISBN") elif section == 5: text = _("Published") + elif section == 6: text = _("Cover?") + elif section == 7: text = _("Summary?") return QVariant(text) else: @@ -87,6 +89,10 @@ class Matches(QAbstractTableModel): elif col == 5: if hasattr(book.pubdate, 'timetuple'): res = strftime('%b %Y', book.pubdate.timetuple()) + elif col == 6 and book.has_cover: + res = 'OK' + elif col == 7 and book.comments: + res = 'OK' if not res: return NONE return QVariant(res) diff --git a/src/calibre/gui2/dialogs/fetch_metadata.ui b/src/calibre/gui2/dialogs/fetch_metadata.ui index 03a362096c..c54ee66044 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.ui +++ b/src/calibre/gui2/dialogs/fetch_metadata.ui @@ -1,172 +1,172 @@ - - - FetchMetadata - - - Qt::WindowModal - - - - 0 - 0 - 830 - 642 - - - - Fetch metadata - - - - :/images/metadata.png:/images/metadata.png - - - - - - <p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below. 
- - - Qt::AlignCenter - - - true - - - true - - - - - - - - - &Access Key: - - - key - - - - - - - - - - Fetch - - - - - - - - - - - - true - - - - - - - Matches - - - - - - Select the book that most closely matches your copy from the list below - - - - - - - - 0 - 1 - - - - true - - - QAbstractItemView::SingleSelection - - - QAbstractItemView::SelectRows - - - - - - - - - - - - - Download &social metadata (tags/rating/etc.) for the selected book - - - - - - - Overwrite author and title with author and title of selected book - - - - - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - - - - - - - - - buttonBox - accepted() - FetchMetadata - accept() - - - 460 - 599 - - - 657 - 530 - - - - - buttonBox - rejected() - FetchMetadata - reject() - - - 417 - 599 - - - 0 - 491 - - - - - + + + FetchMetadata + + + Qt::WindowModal + + + + 0 + 0 + 890 + 642 + + + + Fetch metadata + + + + :/images/metadata.png:/images/metadata.png + + + + + + <p>calibre can find metadata for your books from two locations: <b>Google Books</b> and <b>isbndb.com</b>. <p>To use isbndb.com you must sign up for a <a href="http://www.isbndb.com">free account</a> and enter your access key below. + + + Qt::AlignCenter + + + true + + + true + + + + + + + + + &Access Key: + + + key + + + + + + + + + + Fetch + + + + + + + + + + + + true + + + + + + + Matches + + + + + + Select the book that most closely matches your copy from the list below + + + + + + + + 0 + 1 + + + + true + + + QAbstractItemView::SingleSelection + + + QAbstractItemView::SelectRows + + + + + + + + + + + + + Download &social metadata (tags/rating/etc.) 
for the selected book + + + + + + + Overwrite author and title with author and title of selected book + + + + + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok + + + + + + + + + + + buttonBox + accepted() + FetchMetadata + accept() + + + 460 + 599 + + + 657 + 530 + + + + + buttonBox + rejected() + FetchMetadata + reject() + + + 417 + 599 + + + 0 + 491 + + + + + From b59631db5f348c2cba069ffc725251afc87a3a1c Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 24 Oct 2010 23:26:17 +0200 Subject: [PATCH 21/41] Add a get cover option which overwrite the cover if one is available to metadata_single.py but needs to be modified to remember the option --- src/calibre/gui2/dialogs/fetch_metadata.ui | 11 +++++++++-- src/calibre/gui2/dialogs/metadata_single.py | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/dialogs/fetch_metadata.ui b/src/calibre/gui2/dialogs/fetch_metadata.ui index c54ee66044..0b39089ee3 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.ui +++ b/src/calibre/gui2/dialogs/fetch_metadata.ui @@ -109,6 +109,13 @@ + + + + Overwrite author and title with author and title of selected book + + + @@ -117,9 +124,9 @@ - + - Overwrite author and title with author and title of selected book + Overwrite cover image with downloaded cover if available for the selected book diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index ef1bddca0c..65cfdf57d4 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -709,6 +709,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.title.setText(book.title) self.authors.setText(authors_to_string(book.authors)) if book.author_sort: self.author_sort.setText(book.author_sort) + if d.opt_overwrite_cover_image.isChecked() and book.has_cover: + self.fetch_cover() if book.publisher: self.publisher.setEditText(book.publisher) if book.isbn: self.isbn.setText(book.isbn) if 
book.pubdate: From c7995f136f839c2719f5aada74c59239916bfd7f Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 30 Oct 2010 18:11:50 +0200 Subject: [PATCH 22/41] Finishing the option of downloading cover in single metadata and correcting a bug concerning option saving --- src/calibre/gui2/__init__.py | 2 ++ src/calibre/gui2/dialogs/fetch_metadata.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 4820bd251c..712c6b8a04 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -123,6 +123,8 @@ def _config(): help=_('Download social metadata (tags/rating/etc.)')) c.add_opt('overwrite_author_title_metadata', default=True, help=_('Overwrite author and title with new metadata')) + c.add_opt('overwrite_cover_image', default=False, + help=_('Overwrite cover with new new cover if existing')) c.add_opt('enforce_cpu_limit', default=True, help=_('Limit max simultaneous jobs to number of CPUs')) c.add_opt('tag_browser_hidden_categories', default=set(), diff --git a/src/calibre/gui2/dialogs/fetch_metadata.py b/src/calibre/gui2/dialogs/fetch_metadata.py index 35b5e576e6..a0ee250457 100644 --- a/src/calibre/gui2/dialogs/fetch_metadata.py +++ b/src/calibre/gui2/dialogs/fetch_metadata.py @@ -137,6 +137,7 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): self.fetch_metadata() self.opt_get_social_metadata.setChecked(config['get_social_metadata']) self.opt_overwrite_author_title_metadata.setChecked(config['overwrite_author_title_metadata']) + self.opt_overwrite_cover_image.setChecked(config['overwrite_cover_image']) def show_summary(self, current, *args): @@ -219,6 +220,13 @@ class FetchMetadata(QDialog, Ui_FetchMetadata): _hung_fetchers.add(self.fetcher) if hasattr(self, '_hangcheck') and self._hangcheck.isActive(): self._hangcheck.stop() + #option configure + if self.opt_get_social_metadata.isChecked() != config['get_social_metadata']: + config.set('get_social_metadata', 
self.opt_get_social_metadata.isChecked()) + if self.opt_overwrite_author_title_metadata.isChecked() != config['overwrite_author_title_metadata']: + config.set('overwrite_author_title_metadata', self.opt_overwrite_author_title_metadata.isChecked()) + if self.opt_overwrite_cover_image.isChecked() != config['overwrite_cover_image']: + config.set('overwrite_cover_image', self.opt_overwrite_cover_image.isChecked()) def __enter__(self, *args): return self From c369ff9534d597bda6b7b8910278adaed9b359e9 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 30 Oct 2010 21:59:03 +0200 Subject: [PATCH 23/41] Modify for html correct display --- src/calibre/gui2/dialogs/metadata_single.ui | 1626 +++++++++---------- 1 file changed, 813 insertions(+), 813 deletions(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.ui b/src/calibre/gui2/dialogs/metadata_single.ui index 18bcf2dc4c..29f5d48a11 100644 --- a/src/calibre/gui2/dialogs/metadata_single.ui +++ b/src/calibre/gui2/dialogs/metadata_single.ui @@ -1,813 +1,813 @@ - - - MetadataSingleDialog - - - - 0 - 0 - 887 - 750 - - - - - 0 - 0 - - - - Edit Meta Information - - - - :/images/edit_input.png:/images/edit_input.png - - - true - - - true - - - - - - QFrame::NoFrame - - - true - - - - - 0 - 0 - 879 - 711 - - - - - 0 - - - - - - 800 - 665 - - - - 0 - - - - &Basic metadata - - - - - - Qt::Horizontal - - - - - - - Meta information - - - - - - &Title: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - title - - - - - - - Change the title of this book - - - - - - - Swap the author and title - - - ... - - - - :/images/swap.png:/images/swap.png - - - - 16 - 16 - - - - - - - - &Author(s): - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - authors - - - - - - - Author S&ort: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - author_sort - - - - - - - - - Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles. 
-If the box is colored green, then text matches the individual author's sort strings. If it is colored red, then the authors and this text do not match. - - - - - - - Automatically create the author sort entry based on the current author entry. -Using this button to create author sort will change author sort from red to green. - - - ... - - - - :/images/auto_author_sort.png:/images/auto_author_sort.png - - - - - - - - - &Rating: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - rating - - - - - - - Rating of this book. 0-5 stars - - - Rating of this book. 0-5 stars - - - QAbstractSpinBox::PlusMinus - - - stars - - - 5 - - - - - - - &Publisher: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - publisher - - - - - - - Ta&gs: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - tags - - - - - - - - - Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. - - - - - - - Open Tag Editor - - - Open Tag Editor - - - - :/images/chapters.png:/images/chapters.png - - - - - - - - - &Series: - - - Qt::PlainText - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - series - - - - - - - 5 - - - - - List of known series. You can add new series. - - - List of known series. You can add new series. - - - true - - - QComboBox::InsertAlphabetically - - - - - - - Remove unused series (Series that have no books) - - - ... 
- - - - :/images/trash.png:/images/trash.png - - - - - - - - - IS&BN: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - isbn - - - - - - - - - - Publishe&d: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - pubdate - - - - - - - true - - - - - - - false - - - Book - - - 9999.989999999999782 - - - - - - - MMM yyyy - - - true - - - - - - - true - - - - - - - dd MMM yyyy - - - true - - - - - - - &Date: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - date - - - - - - - - - - &Comments - - - - - - true - - - false - - - - - - - - - - &Fetch metadata from server - - - - - - - - - - - - 0 - 0 - - - - Available Formats - - - - - - - - - 0 - 0 - - - - - 16777215 - 130 - - - - QAbstractItemView::DropOnly - - - - 64 - 64 - - - - - - - - Add a new format for this book to the database - - - ... - - - - :/images/add_book.png:/images/add_book.png - - - - 32 - 32 - - - - - - - - Remove the selected formats for this book from the database. - - - ... - - - - :/images/trash.png:/images/trash.png - - - - 32 - 32 - - - - - - - - Set the cover for the book from the selected format - - - ... - - - - :/images/book.png:/images/book.png - - - - 32 - 32 - - - - - - - - Update metadata from the metadata in the selected format - - - - - - - :/images/edit_input.png:/images/edit_input.png - - - - 32 - 32 - - - - - - - - - - - - - - - 0 - 10 - - - - Book Cover - - - - - - - 0 - 100 - - - - - - - - 6 - - - QLayout::SetMaximumSize - - - 0 - - - - - Change &cover image: - - - cover_path - - - - - - - 6 - - - 0 - - - - - true - - - - - - - &Browse - - - - :/images/document_open.png:/images/document_open.png - - - - - - - Remove border (if any) from cover - - - T&rim - - - - :/images/trim.png:/images/trim.png - - - Qt::ToolButtonTextBesideIcon - - - - - - - Reset cover to default - - - ... 
- - - - :/images/trash.png:/images/trash.png - - - - - - - - - - - - - Download co&ver - - - - - - - Generate a default cover based on the title and author - - - &Generate cover - - - - - - - - - - - - - - - - - &Custom metadata - - - - - - - - - - - - - Qt::Horizontal - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - - - - - - EnLineEdit - QLineEdit -
widgets.h
-
- - EnComboBox - QComboBox -
widgets.h
-
- - TagsLineEdit - QLineEdit -
widgets.h
-
- - FormatList - QListWidget -
calibre/gui2/widgets.h
-
- - ImageView - QWidget -
calibre/gui2/widgets.h
- 1 -
-
- - title - swap_button - authors - author_sort - auto_author_sort - rating - publisher - tags - tag_editor_button - series - remove_series_button - series_index - isbn - date - pubdate - comments - fetch_metadata_button - add_format_button - remove_format_button - button_set_cover - button_set_metadata - formats - cover_path - reset_cover - fetch_cover_button - generate_cover_button - scrollArea - central_widget - button_box - - - - - - - button_box - accepted() - MetadataSingleDialog - accept() - - - 261 - 710 - - - 157 - 274 - - - - - button_box - rejected() - MetadataSingleDialog - reject() - - - 329 - 710 - - - 286 - 274 - - - - -
+ + + MetadataSingleDialog + + + + 0 + 0 + 887 + 750 + + + + + 0 + 0 + + + + Edit Meta Information + + + + :/images/edit_input.png:/images/edit_input.png + + + true + + + true + + + + + + QFrame::NoFrame + + + true + + + + + 0 + 0 + 879 + 711 + + + + + 0 + + + + + + 800 + 665 + + + + 0 + + + + &Basic metadata + + + + + + Qt::Horizontal + + + + + + + Meta information + + + + + + &Title: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + title + + + + + + + Change the title of this book + + + + + + + Swap the author and title + + + ... + + + + :/images/swap.png:/images/swap.png + + + + 16 + 16 + + + + + + + + &Author(s): + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + authors + + + + + + + Author S&ort: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + author_sort + + + + + + + + + Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles. +If the box is colored green, then text matches the individual author's sort strings. If it is colored red, then the authors and this text do not match. + + + + + + + Automatically create the author sort entry based on the current author entry. +Using this button to create author sort will change author sort from red to green. + + + ... + + + + :/images/auto_author_sort.png:/images/auto_author_sort.png + + + + + + + + + &Rating: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + rating + + + + + + + Rating of this book. 0-5 stars + + + Rating of this book. 0-5 stars + + + QAbstractSpinBox::PlusMinus + + + stars + + + 5 + + + + + + + &Publisher: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + publisher + + + + + + + Ta&gs: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + tags + + + + + + + + + Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. 
+ + + + + + + Open Tag Editor + + + Open Tag Editor + + + + :/images/chapters.png:/images/chapters.png + + + + + + + + + &Series: + + + Qt::PlainText + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + series + + + + + + + 5 + + + + + List of known series. You can add new series. + + + List of known series. You can add new series. + + + true + + + QComboBox::InsertAlphabetically + + + + + + + Remove unused series (Series that have no books) + + + ... + + + + :/images/trash.png:/images/trash.png + + + + + + + + + IS&BN: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + isbn + + + + + + + + + + Publishe&d: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + pubdate + + + + + + + true + + + + + + + false + + + Book + + + 9999.989999999999782 + + + + + + + MMM yyyy + + + true + + + + + + + true + + + + + + + dd MMM yyyy + + + true + + + + + + + &Date: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + date + + + + + + + + + + &Comments + + + + + + true + + + true + + + + + + + + + + &Fetch metadata from server + + + + + + + + + + + + 0 + 0 + + + + Available Formats + + + + + + + + + 0 + 0 + + + + + 16777215 + 130 + + + + QAbstractItemView::DropOnly + + + + 64 + 64 + + + + + + + + Add a new format for this book to the database + + + ... + + + + :/images/add_book.png:/images/add_book.png + + + + 32 + 32 + + + + + + + + Remove the selected formats for this book from the database. + + + ... + + + + :/images/trash.png:/images/trash.png + + + + 32 + 32 + + + + + + + + Set the cover for the book from the selected format + + + ... 
+ + + + :/images/book.png:/images/book.png + + + + 32 + 32 + + + + + + + + Update metadata from the metadata in the selected format + + + + + + + :/images/edit_input.png:/images/edit_input.png + + + + 32 + 32 + + + + + + + + + + + + + + + 0 + 10 + + + + Book Cover + + + + + + + 0 + 100 + + + + + + + + 6 + + + QLayout::SetMaximumSize + + + 0 + + + + + Change &cover image: + + + cover_path + + + + + + + 6 + + + 0 + + + + + true + + + + + + + &Browse + + + + :/images/document_open.png:/images/document_open.png + + + + + + + Remove border (if any) from cover + + + T&rim + + + + :/images/trim.png:/images/trim.png + + + Qt::ToolButtonTextBesideIcon + + + + + + + Reset cover to default + + + ... + + + + :/images/trash.png:/images/trash.png + + + + + + + + + + + + + Download co&ver + + + + + + + Generate a default cover based on the title and author + + + &Generate cover + + + + + + + + + + + + + + + + + &Custom metadata + + + + + + + + + + + + + Qt::Horizontal + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok + + + + + + + + EnLineEdit + QLineEdit +
widgets.h
+
+ + EnComboBox + QComboBox +
widgets.h
+
+ + TagsLineEdit + QLineEdit +
widgets.h
+
+ + FormatList + QListWidget +
calibre/gui2/widgets.h
+
+ + ImageView + QWidget +
calibre/gui2/widgets.h
+ 1 +
+
+ + title + swap_button + authors + author_sort + auto_author_sort + rating + publisher + tags + tag_editor_button + series + remove_series_button + series_index + isbn + date + pubdate + comments + fetch_metadata_button + add_format_button + remove_format_button + button_set_cover + button_set_metadata + formats + cover_path + reset_cover + fetch_cover_button + generate_cover_button + scrollArea + central_widget + button_box + + + + + + + button_box + accepted() + MetadataSingleDialog + accept() + + + 261 + 710 + + + 157 + 274 + + + + + button_box + rejected() + MetadataSingleDialog + reject() + + + 329 + 710 + + + 286 + 274 + + + + +
From dd522b051e85ccef7e153a873510ae681988e89c Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 31 Oct 2010 23:37:19 +0100 Subject: [PATCH 24/41] Add a choice to get text instead of html in metadata plugins --- src/calibre/ebooks/metadata/fetch.py | 11 +- src/calibre/utils/html2text.py | 451 +++++++++++++++++++++++++++ 2 files changed, 461 insertions(+), 1 deletion(-) create mode 100644 src/calibre/utils/html2text.py diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 9b8a42e482..87989a4d42 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -10,6 +10,7 @@ from calibre import prints from calibre.utils.config import OptionParser from calibre.utils.logging import default_log from calibre.utils.titlecase import titlecase +from calibre.utils.html2text import html2text from calibre.customize import Plugin from calibre.ebooks.metadata.covers import check_for_cover @@ -79,6 +80,8 @@ class MetadataSource(Plugin): # {{{ mi.comments = None if not c.get('tags', True): mi.tags = [] + if c.get('textconvert', True) and mi.comments is not None: + mi.comments = html2text(mi.comments) except Exception, e: self.exception = e @@ -132,11 +135,17 @@ class MetadataSource(Plugin): # {{{ setattr(w, '_'+x, cb) cb.setChecked(c.get(x, True)) w._layout.addWidget(cb) + #textconvert for comments + cb = QCheckBox(_('Convert comments from %s to text')%(self.name)) + setattr(w, '_textconvert', cb) + cb.setChecked(c.get('textconvert', False)) + w._layout.addWidget(cb) + return w def save_settings(self, w): dl_settings = {} - for x in ('rating', 'tags', 'comments'): + for x in ('rating', 'tags', 'comments', 'textconvert'): dl_settings[x] = getattr(w, '_'+x).isChecked() c = self.config_store() c.set(self.name, dl_settings) diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py new file mode 100644 index 0000000000..b271def4bb --- /dev/null +++ b/src/calibre/utils/html2text.py @@ -0,0 +1,451 @@ 
+#!/usr/bin/env python +"""html2text: Turn HTML into equivalent Markdown-structured text.""" +__version__ = "2.39" +__author__ = "Aaron Swartz (me@aaronsw.com)" +__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." +__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] + +# TODO: +# Support decoded entities with unifiable. + +if not hasattr(__builtins__, 'True'): True, False = 1, 0 +import re, sys, urllib, htmlentitydefs, codecs, StringIO, types +import sgmllib +import urlparse +sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') + +try: from textwrap import wrap +except: pass + +# Use Unicode characters instead of their ascii pseudo-replacements +UNICODE_SNOB = 0 + +# Put the links after each paragraph instead of at the end. +LINKS_EACH_PARAGRAPH = 0 + +# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) +BODY_WIDTH = 0 + +# Don't show internal links (href="#local-anchor") -- corresponding link targets +# won't be visible in the plain text file anyway. 
+SKIP_INTERNAL_LINKS = True + +### Entity Nonsense ### + +def name2cp(k): + if k == 'apos': return ord("'") + if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 + return htmlentitydefs.name2codepoint[k] + else: + k = htmlentitydefs.entitydefs[k] + if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 + return ord(codecs.latin_1_decode(k)[0]) + +unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', +'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', +'ndash':'-', 'oelig':'oe', 'aelig':'ae', +'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', +'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', +'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', +'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', +'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} + +unifiable_n = {} + +for k in unifiable.keys(): + unifiable_n[name2cp(k)] = unifiable[k] + +def charref(name): + if name[0] in ['x','X']: + c = int(name[1:], 16) + else: + c = int(name) + + if not UNICODE_SNOB and c in unifiable_n.keys(): + return unifiable_n[c] + else: + return unichr(c) + +def entityref(c): + if not UNICODE_SNOB and c in unifiable.keys(): + return unifiable[c] + else: + try: name2cp(c) + except KeyError: return "&" + c + else: return unichr(name2cp(c)) + +def replaceEntities(s): + s = s.group(1) + if s[0] == "#": + return charref(s[1:]) + else: return entityref(s) + +r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") +def unescape(s): + return r_unescape.sub(replaceEntities, s) + +def fixattrs(attrs): + # Fix bug in sgmllib.py + if not attrs: return attrs + newattrs = [] + for attr in attrs: + newattrs.append((attr[0], unescape(attr[1]))) + return newattrs + +### End Entity Nonsense ### + +def onlywhite(line): + """Return true if the line does only consist of whitespace characters.""" + for c in line: + if c is not ' ' and c is not ' ': + return c is ' ' + 
return line + +def optwrap(text): + """Wrap all paragraphs in the provided text.""" + if not BODY_WIDTH: + return text + + assert wrap, "Requires Python 2.3." + result = '' + newlines = 0 + for para in text.split("\n"): + if len(para) > 0: + if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': + for line in wrap(para, BODY_WIDTH): + result += line + "\n" + result += "\n" + newlines = 2 + else: + if not onlywhite(para): + result += para + "\n" + newlines = 1 + else: + if newlines < 2: + result += "\n" + newlines += 1 + return result + +def hn(tag): + if tag[0] == 'h' and len(tag) == 2: + try: + n = int(tag[1]) + if n in range(1, 10): return n + except ValueError: return 0 + +class _html2text(sgmllib.SGMLParser): + def __init__(self, out=None, baseurl=''): + sgmllib.SGMLParser.__init__(self) + + if out is None: self.out = self.outtextf + else: self.out = out + self.outtext = u'' + self.quiet = 0 + self.p_p = 0 + self.outcount = 0 + self.start = 1 + self.space = 0 + self.a = [] + self.astack = [] + self.acount = 0 + self.list = [] + self.blockquote = 0 + self.pre = 0 + self.startpre = 0 + self.lastWasNL = 0 + self.abbr_title = None # current abbreviation definition + self.abbr_data = None # last inner HTML (for abbr being defined) + self.abbr_list = {} # stack of abbreviations to write later + self.baseurl = baseurl + + def outtextf(self, s): + self.outtext += s + + def close(self): + sgmllib.SGMLParser.close(self) + + self.pbr() + self.o('', 0, 'end') + + return self.outtext + + def handle_charref(self, c): + self.o(charref(c)) + + def handle_entityref(self, c): + self.o(entityref(c)) + + def unknown_starttag(self, tag, attrs): + self.handle_tag(tag, attrs, 1) + + def unknown_endtag(self, tag): + self.handle_tag(tag, None, 0) + + def previousIndex(self, attrs): + """ returns the index of certain set of attributes (of a link) in the + self.a list + + If the set of attributes is not found, returns None + """ + if not attrs.has_key('href'): return None + 
+ i = -1 + for a in self.a: + i += 1 + match = 0 + + if a.has_key('href') and a['href'] == attrs['href']: + if a.has_key('title') or attrs.has_key('title'): + if (a.has_key('title') and attrs.has_key('title') and + a['title'] == attrs['title']): + match = True + else: + match = True + + if match: return i + + def handle_tag(self, tag, attrs, start): + attrs = fixattrs(attrs) + + if hn(tag): + self.p() + if start: self.o(hn(tag)*"#" + ' ') + + if tag in ['p', 'div']: self.p() + + if tag == "br" and start: self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", 'script']: + if start: self.quiet += 1 + else: self.quiet -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p(); self.o('> ', 0, 1); self.start = 1 + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ['em', 'i', 'u']: self.o("_") + if tag in ['strong', 'b']: self.o("**") + if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` + if tag == "abbr": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + + self.abbr_title = None + self.abbr_data = '' + if attrs.has_key('title'): + self.abbr_title = attrs['title'] + else: + if self.abbr_title != None: + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = '' + + if tag == "a": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): + self.astack.append(attrs) + self.o("[") + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if a: + i = self.previousIndex(a) + if i is not None: + a = self.a[i] + else: + self.acount += 1 + a['count'] = self.acount + a['outcount'] = self.outcount + self.a.append(a) + self.o("][" + `a['count']` + "]") + + if tag == "img" and start: + attrsD = {} + 
for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('src'): + attrs['href'] = attrs['src'] + alt = attrs.get('alt', '') + i = self.previousIndex(attrs) + if i is not None: + attrs = self.a[i] + else: + self.acount += 1 + attrs['count'] = self.acount + attrs['outcount'] = self.outcount + self.a.append(attrs) + self.o("![") + self.o(alt) + self.o("]["+`attrs['count']`+"]") + + if tag == 'dl' and start: self.p() + if tag == 'dt' and not start: self.pbr() + if tag == 'dd' and start: self.o(' ') + if tag == 'dd' and not start: self.pbr() + + if tag in ["ol", "ul"]: + if start: + self.list.append({'name':tag, 'num':0}) + else: + if self.list: self.list.pop() + + self.p() + + if tag == 'li': + if start: + self.pbr() + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + self.o(" "*len(self.list)) #TODO: line up
  1. s > 9 correctly. + if li['name'] == "ul": self.o("* ") + elif li['name'] == "ol": + li['num'] += 1 + self.o(`li['num']`+". ") + self.start = 1 + else: + self.pbr() + + if tag in ["table", "tr"] and start: self.p() + if tag == 'td': self.pbr() + + if tag == "pre": + if start: + self.startpre = 1 + self.pre = 1 + else: + self.pre = 0 + self.p() + + def pbr(self): + if self.p_p == 0: self.p_p = 1 + + def p(self): self.p_p = 2 + + def o(self, data, puredata=0, force=0): + if self.abbr_data is not None: self.abbr_data += data + + if not self.quiet: + if puredata and not self.pre: + data = re.sub('\s+', ' ', data) + if data and data[0] == ' ': + self.space = 1 + data = data[1:] + if not data and not force: return + + if self.startpre: + #self.out(" :") #TODO: not output when already one there + self.startpre = 0 + + bq = (">" * self.blockquote) + if not (force and data and data[0] == ">") and self.blockquote: bq += " " + + if self.pre: + bq += " " + data = data.replace("\n", "\n"+bq) + + if self.start: + self.space = 0 + self.p_p = 0 + self.start = 0 + + if force == 'end': + # It's the end. + self.p_p = 0 + self.out("\n") + self.space = 0 + + + if self.p_p: + self.out(('\n'+bq)*self.p_p) + self.space = 0 + + if self.space: + if not self.lastWasNL: self.out(' ') + self.space = 0 + + if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): + if force == "end": self.out("\n") + + newa = [] + for link in self.a: + if self.outcount > link['outcount']: + self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) + if link.has_key('title'): self.out(" ("+link['title']+")") + self.out("\n") + else: + newa.append(link) + + if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. 
+ + self.a = newa + + if self.abbr_list and force == "end": + for abbr, definition in self.abbr_list.items(): + self.out(" *[" + abbr + "]: " + definition + "\n") + + self.p_p = 0 + self.out(data) + self.lastWasNL = data and data[-1] == '\n' + self.outcount += 1 + + def handle_data(self, data): + if r'\/script>' in data: self.quiet -= 1 + self.o(data, 1) + + def unknown_decl(self, data): pass + +def wrapwrite(text): sys.stdout.write(text.encode('utf8')) + +def html2text_file(html, out=wrapwrite, baseurl=''): + h = _html2text(out, baseurl) + h.feed(html) + h.feed("") + return h.close() + +def html2text(html, baseurl=''): + return optwrap(html2text_file(html, None, baseurl)) + +if __name__ == "__main__": + baseurl = '' + if sys.argv[1:]: + arg = sys.argv[1] + if arg.startswith('http://') or arg.startswith('https://'): + baseurl = arg + j = urllib.urlopen(baseurl) + try: + from feedparser import _getCharacterEncoding as enc + except ImportError: + enc = lambda x, y: ('utf-8', 1) + text = j.read() + encoding = enc(j.headers, text)[0] + if encoding == 'us-ascii': encoding = 'utf-8' + data = text.decode(encoding) + + else: + encoding = 'utf8' + if len(sys.argv) > 2: + encoding = sys.argv[2] + data = open(arg, 'r').read().decode(encoding) + else: + data = sys.stdin.read().decode('utf8') + wrapwrite(html2text(data, baseurl)) + From 9aefafc74506ac60fbc0e0ffbe1c53d48edbc0a5 Mon Sep 17 00:00:00 2001 From: Sengian Date: Mon, 1 Nov 2010 01:22:47 +0100 Subject: [PATCH 25/41] Implemented basic html check and none check to avoid problems with html2text --- src/calibre/ebooks/metadata/fetch.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 87989a4d42..d45a299e39 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -16,6 +16,8 @@ from calibre.ebooks.metadata.covers import check_for_cover metadata_config = None +html_check = 
re.compile("([\<])([^\>]{1,})*([\>])", re.I) + class MetadataSource(Plugin): # {{{ ''' Represents a source to query for metadata. Subclasses must implement @@ -78,10 +80,11 @@ class MetadataSource(Plugin): # {{{ mi.rating = None if not c.get('comments', True): mi.comments = None + if c.get('textconvert', True) and mi.comments is not None \ + and html_check.search(mi.comments) is not None: + mi.comments = html2text(mi.comments) if not c.get('tags', True): mi.tags = [] - if c.get('textconvert', True) and mi.comments is not None: - mi.comments = html2text(mi.comments) except Exception, e: self.exception = e From a8578eee2d4008a547f0cf2ac9c880ef02cf0a37 Mon Sep 17 00:00:00 2001 From: Sengian Date: Tue, 2 Nov 2010 00:05:20 +0100 Subject: [PATCH 26/41] minor corrections linked to bug 7345 --- src/calibre/ebooks/metadata/fetch.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 36a1af9c07..dedd251640 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -17,8 +17,6 @@ from calibre.utils.html2text import html2text metadata_config = None -html_check = re.compile("([\<])([^\>]{1,})*([\>])", re.I) - class MetadataSource(Plugin): # {{{ ''' Represents a source to query for metadata. 
Subclasses must implement @@ -86,9 +84,6 @@ class MetadataSource(Plugin): # {{{ mi.rating = None if not c.get('comments', True): mi.comments = None - if c.get('textconvert', True) and mi.comments is not None \ - and html_check.search(mi.comments) is not None: - mi.comments = html2text(mi.comments) if not c.get('tags', True): mi.tags = [] if self.has_html_comments and mi.comments and \ @@ -151,18 +146,21 @@ class MetadataSource(Plugin): # {{{ setattr(w, '_'+x, cb) cb.setChecked(c.get(x, True)) w._layout.addWidget(cb) - - cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) - setattr(w, '_textcomments', cb) - cb.setChecked(c.get('textcomments', False)) - w._layout.addWidget(cb) + + if self.has_html_comments: + cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name)) + setattr(w, '_textcomments', cb) + cb.setChecked(c.get('textcomments', False)) + w._layout.addWidget(cb) return w def save_settings(self, w): dl_settings = {} - for x in ('rating', 'tags', 'comments', 'textcomments'): + for x in ('rating', 'tags', 'comments'): dl_settings[x] = getattr(w, '_'+x).isChecked() + if self.has_html_comments: + dl_settings['textcomments'] = getattr(w, '_textcomments').isChecked() c = self.config_store() c.set(self.name, dl_settings) if hasattr(w, '_sc'): From a0fc1086364cab8d744530274ac5149ecfdda2f1 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sat, 13 Nov 2010 15:22:18 +0100 Subject: [PATCH 27/41] Adding Fictionwise metadata source --- src/calibre/customize/builtins.py | 4 +- src/calibre/ebooks/metadata/fetch.py | 18 ++ src/calibre/ebooks/metadata/fictionwise.py | 351 +++++++++++++++++++++ 3 files changed, 371 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/fictionwise.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index bd766827a5..04364b6b28 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -481,7 +481,7 @@ from 
calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ - LibraryThing + LibraryThing, Fictionwise from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers @@ -490,7 +490,7 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, + LibraryThing, Fictionwise, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] plugins += [ ComicInput, diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index dedd251640..c9d6a74cb2 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -267,6 +267,24 @@ class LibraryThing(MetadataSource): # {{{ # }}} +class Fictionwise(MetadataSource): # {{{ + + author = 'Sengian' + name = 'Fictionwise' + description = _('Downloads metadata from Fictionwise') + + has_html_comments = True + + def fetch(self): + from calibre.ebooks.metadata.fictionwise import search + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} def result_index(source, result): if not result.isbn: diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py new file mode 100644 index 0000000000..2fa9a1bcee --- /dev/null +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -0,0 +1,351 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian ' +__docformat__ = 
'restructuredtext en' + +import sys, textwrap, re +from urllib import urlencode + +from lxml import html, etree +from lxml.html import soupparser +from lxml.etree import tostring + +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.library.comments import sanitize_comments_html +from calibre.utils.config import OptionParser +from calibre.utils.date import parse_date, utcnow + + +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + +class Query(object): + + BASE_URL = 'http://www.fictionwise.com/servlet/mw' + + def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20): + assert not(title is None and author is None and keywords is None) + assert (max_results < 21) + + self.max_results = max_results + + q = { 'template' : 'searchresults_adv.htm' , + 'searchtitle' : '', + 'searchauthor' : '', + 'searchpublisher' : '', + 'searchkeyword' : '', + #possibilities startoflast, fullname, lastfirst + 'searchauthortype' : 'startoflast', + 'searchcategory' : '', + 'searchcategory2' : '', + 'searchprice_s' : '0', + 'searchprice_e' : 'ANY', + 'searchformat' : '', + 'searchgeo' : 'US', + 'searchfwdatetype' : '', + #maybe use dates fields if needed? 
+ #'sortorder' : 'DESC', + #many options available: b.SortTitle, a.SortName, + #b.DateFirstPublished, b.FWPublishDate + 'sortby' : 'b.SortTitle' + } + if title is not None: + q['searchtitle'] = title + if author is not None: + q['searchauthor'] = author + if publisher is not None: + q['searchpublisher'] = publisher + if keywords is not None: + q['searchkeyword'] = keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = urlencode(q) + + def __call__(self, browser, verbose): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL, self.urldata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get list of results as links + results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]") + results = results[:self.max_results] + results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results] + #return feed if no links ie normally a single book or nothing + if not results: + results = [feed] + return results + +class ResultList(list): + + BASE_URL = 'http://www.fictionwise.com' + COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0} + + def __init__(self): + self.retitle = re.compile(r'\[[^\[\]]+\]') + self.rechkauth = re.compile(r'.*book\s*by', re.I) + self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)' \ + + '<br[^>]+>.{,15}publisher\s*:', re.I) + self.repub = re.compile(r'.*publisher\s*:\s*', re.I) + self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I) + self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I) + self.resplitbr = re.compile(r'<br[^>]+>', re.I) + self.recomment = re.compile(r'(?s)<!--.*?-->') + self.reimg = re.compile(r'<img[^>]*>', 
re.I) + self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I) + self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:') + self.recolor = re.compile('(?P<ncolor>[^/]+).gif') + self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I) + self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I) + + def strip_tags_etree(self, etreeobj, invalid_tags): + for itag in invalid_tags: + for elt in etreeobj.getiterator(itag): + elt.drop_tag() + return etreeobj + + def clean_entry(self, entry, + invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'), + remove_tags_trees = ('script',)): + for it in entry[0].iterchildren(tag='table'): + entry[0].remove(it) + entry[0].remove(entry[0].xpath( 'descendant-or-self::p[1]')[0]) + entry = entry[0] + cleantree = self.strip_tags_etree(entry, invalid_tags) + for itag in remove_tags_trees: + for elts in cleantree.getiterator(itag): + elts.drop_tree() + return cleantree + + def output_entry(self, entry, prettyout = True, htmlrm="\d+"): + out = tostring(entry, pretty_print=prettyout) + reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)') + return reclean.sub('', out) + + def get_title(self, entry): + title = entry.findtext('./') + return self.retitle.sub('', title).strip() + + def get_authors(self, entry): + authortext = entry.find('./br').tail + if not self.rechkauth.search(authortext): + return [] + #TODO: parse all tag if necessary + authortext = self.rechkauth.sub('', authortext) + return [a.strip() for a in authortext.split('&')] + + def get_rating(self, entrytable, verbose): + nbcomment = tostring(entrytable.getprevious()) + try: + nbcomment = self.renbcom.search(nbcomment).group("nbcom") + except: + report(verbose) + return None + hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")], + float(image.get('height', default=0))) \ + for image in entrytable.getiterator('img')) + #ratings as x/20, not sure + return 5*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) + 
+ def get_description(self, entry): + description = self.output_entry(entry.find('./p'),htmlrm="") + description = self.redesc.search(description) + if not description and not description.group("desc"): + return None + #remove invalid tags + description = self.reimg.sub('', description.group("desc")) + description = self.recomment.sub('', description) + description = self.resanitize.sub('', sanitize_comments_html(description)) + return 'SUMMARY:\n' + re.sub(r'\n\s+</p>','\n</p>', description) + + def get_publisher(self, entry): + publisher = self.output_entry(entry.find('./p')) + publisher = filter(lambda x: self.repub.search(x) is not None, + self.resplitbr.split(publisher)) + if not len(publisher): + return None + publisher = self.repub.sub('', publisher[0]) + return publisher.split(',')[0].strip() + + def get_tags(self, entry): + tag = self.output_entry(entry.find('./p')) + tag = filter(lambda x: self.retag.search(x) is not None, + self.resplitbr.split(tag)) + if not len(tag): + return [] + return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/')) + + def get_date(self, entry, verbose): + date = self.output_entry(entry.find('./p')) + date = filter(lambda x: self.redate.search(x) is not None, + self.resplitbr.split(date)) + if not len(date): + return None + #TODO: parse all tag if necessary + try: + d = self.redate.sub('', date[0]) + if d: + default = utcnow().replace(day=15) + d = parse_date(d, assume_utc=True, default=default) + else: + d = None + except: + report(verbose) + d = None + return d + + def get_ISBN(self, entry): + isbns = self.output_entry(entry.getchildren()[2]) + isbns = filter(lambda x: self.reisbn.search(x) is not None, + self.resplitbrdiv.split(isbns)) + if not len(isbns): + return None + #TODO: parse all tag if necessary + isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))] + return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] + + def fill_MI(self, entry, title, authors, ratings, 
verbose): + mi = MetaInformation(title, authors) + mi.rating = ratings + mi.comments = self.get_description(entry) + mi.publisher = self.get_publisher(entry) + mi.tags = self.get_tags(entry) + mi.pubdate = self.get_date(entry, verbose) + mi.isbn = self.get_ISBN(entry) + mi.author_sort = authors_to_sort_string(authors) + # mi.language = self.get_language(x, verbose) + return mi + + def get_individual_metadata(self, browser, linkdata): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + + def populate(self, entries, browser, verbose=False): + for x in entries: + try: + entry = self.get_individual_metadata(browser, x) + entry = self.clean_entry(entry) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + + def populate_single(self, feed, verbose=False): + try: + entry = feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + entry = self.clean_entry(entry) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all 
details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + + +def search(title=None, author=None, publisher=None, isbn=None, + min_viewability='none', verbose=False, max_results=5, + keywords=None): + br = browser() + entries = Query(title=title, author=author, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose) + + #List of entry + ans = ResultList() + if len(entries) > 1: + ans.populate(entries, br, verbose) + else: + ans.populate_single(entries[0], verbose) + return ans + + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Fictionwise. You must specify one of title, author, + or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. + ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-m', '--max-results', default=5, + help='Maximum number of results to fetch') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + print + +if __name__ == '__main__': + sys.exit(main()) From 041fbd293227dbc52dc9d823e37512d4ed441c0e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sat, 13 Nov 2010 22:35:32 +0100 Subject: [PATCH 28/41] Correct rating scale for 
fictionwise.py --- src/calibre/ebooks/metadata/fictionwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 2fa9a1bcee..ca438805ea 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -165,8 +165,8 @@ class ResultList(list): hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")], float(image.get('height', default=0))) \ for image in entrytable.getiterator('img')) - #ratings as x/20, not sure - return 5*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) + #ratings as x/5 + return 1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) def get_description(self, entry): description = self.output_entry(entry.find('./p'),htmlrm="") From c92271dc2d8b71a01e6484d611ec0b28d1d9a6ae Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 18 Nov 2010 21:22:21 +0100 Subject: [PATCH 29/41] minor revisions finctionwise plugin --- src/calibre/ebooks/metadata/fictionwise.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index ca438805ea..de60cd9dca 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -29,10 +29,10 @@ class Query(object): BASE_URL = 'http://www.fictionwise.com/servlet/mw' def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20): - assert not(title is None and author is None and keywords is None) + assert not(title is None and author is None and publisher is None and keywords is None) assert (max_results < 21) - self.max_results = max_results + self.max_results = int(max_results) q = { 'template' : 'searchresults_adv.htm' , 'searchtitle' : '', @@ -327,7 +327,7 @@ def option_parser(): parser.add_option('-a', 
'--author', help='Book author(s)') parser.add_option('-p', '--publisher', help='Book publisher') parser.add_option('-k', '--keywords', help='Keywords') - parser.add_option('-m', '--max-results', default=5, + parser.add_option('-m', '--max-results', default=20, help='Maximum number of results to fetch') parser.add_option('-v', '--verbose', default=0, action='count', help='Be more verbose about errors') From 78e4aba18ce8cd86f2e91834a866029c0f3ab476 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Thu, 18 Nov 2010 21:37:51 +0100 Subject: [PATCH 30/41] Revert --- resources/catalog/stylesheet.css | 198 +++++++++++++++++-------------- src/calibre/ebooks/rtf/input.py | 74 ++++++++++-- 2 files changed, 177 insertions(+), 95 deletions(-) diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index afda6ffc05..057c6c9f42 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -1,87 +1,98 @@ -body { - background-color: white; -} +body { background-color: white; } -p.title { - margin-top: 0em; - margin-bottom: 1em; - text-align: center; - font-style: italic; - font-size: xx-large; - border-bottom: solid black 4px; -} +p.title { + margin-top:0em; + margin-bottom:1em; + text-align:center; + font-style:italic; + font-size:xx-large; + border-bottom: solid black 2px; + } p.author { - margin-top: 0em; - margin-bottom: 0em; - text-align: left; - text-indent: 1em; - font-size: large; -} - -p.tags { - margin-top: 0em; - margin-bottom: 0em; - text-align: left; - text-indent: 1em; - font-size: small; -} - -p.description { - text-align: left; - font-style: normal; - margin-top: 0em; -} - -p.date_index { - font-size: x-large; + margin-top:0em; + margin-bottom:0em; text-align: center; - font-weight: bold; - margin-top: 1em; - margin-bottom: 0px; -} - -p.letter_index { - font-size: x-large; - text-align: center; - font-weight: bold; - margin-top: 1em; - margin-bottom: 0px; -} + text-indent: 0em; + font-size:large; + } 
p.author_index { - font-size: large; - text-align: left; - margin-top: 0px; - margin-bottom: 0px; + font-size:large; + font-weight:bold; + text-align:left; + margin-top:0px; + margin-bottom:-2px; text-indent: 0em; -} + } + +p.tags { + margin-top:0.5em; + margin-bottom:0em; + text-align: left; + text-indent: 0.0in; + } + +p.formats { + font-size:90%; + margin-top:0em; + margin-bottom:0.5em; + text-align: left; + text-indent: 0.0in; + } + +div.description > p:first-child { + margin: 0 0 0 0; + text-indent: 0em; + } + +div.description { + margin: 0 0 0 0; + text-indent: 1em; + } + +p.date_index { + font-size:x-large; + text-align:center; + font-weight:bold; + margin-top:1em; + margin-bottom:0px; + } + +p.letter_index { + font-size:x-large; + text-align:center; + font-weight:bold; + margin-top:1em; + margin-bottom:0px; + } p.series { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 2em; - text-indent: -2em; -} + font-style:italic; + margin-top:2px; + margin-bottom:0px; + margin-left:2em; + text-align:left; + text-indent:-2em; + } p.read_book { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 2em; - text-indent: -2em; -} + text-align:left; + margin-top:0px; + margin-bottom:0px; + margin-left:2em; + text-indent:-2em; + } p.unread_book { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 2em; - text-indent: -2em; -} + text-align:left; + margin-top:0px; + margin-bottom:0px; + margin-left:2em; + text-indent:-2em; + } -p.missing_book { +p.wishlist_item { text-align:left; margin-top:0px; margin-bottom:0px; @@ -90,23 +101,36 @@ p.missing_book { } p.date_read { - text-align: left; - margin-top: 0px; - margin-bottom: 0px; - margin-left: 6em; - text-indent: -6em; -} + text-align:left; + margin-top:0px; + margin-bottom:0px; + margin-left:6em; + text-indent:-6em; + } -hr.series_divider { - width: 50%; - margin-left: 1em; - margin-top: 0em; - margin-bottom: 0em; -} +hr.description_divider { + width:90%; + 
margin-left:5%; + border-top: solid white 0px; + border-right: solid white 0px; + border-bottom: solid black 1px; + border-left: solid white 0px; + } hr.annotations_divider { - width: 50%; - margin-left: 1em; - margin-top: 0em; - margin-bottom: 0em; -} \ No newline at end of file + width:50%; + margin-left:1em; + margin-top:0em; + margin-bottom:0em; + } + +td.publisher, td.date { + font-weight:bold; + text-align:center; + } +td.rating { + text-align: center; + } +td.thumbnail img { + -webkit-box-shadow: 4px 4px 12px #999; + } \ No newline at end of file diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index ec6f9a04d3..32de91c011 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -9,6 +9,36 @@ from lxml import etree from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.utils import PreProcessor +border_style_map = { + 'single' : 'solid', + 'double-thickness-border' : 'double', + 'shadowed-border': 'outset', + 'double-border': 'double', + 'dotted-border': 'dotted', + 'dashed': 'dashed', + 'hairline': 'solid', + 'inset': 'inset', + 'dash-small': 'dashed', + 'dot-dash': 'dotted', + 'dot-dot-dash': 'dotted', + 'outset': 'outset', + 'tripple': 'double', + 'thick-thin-small': 'solid', + 'thin-thick-small': 'solid', + 'thin-thick-thin-small': 'solid', + 'thick-thin-medium': 'solid', + 'thin-thick-medium': 'solid', + 'thin-thick-thin-medium': 'solid', + 'thick-thin-large': 'solid', + 'thin-thick-thin-large': 'solid', + 'wavy': 'ridge', + 'double-wavy': 'ridge', + 'striped': 'ridge', + 'emboss': 'inset', + 'engrave': 'inset', + 'frame': 'ridge', +} + class InlineClass(etree.XSLTExtension): FMTS = ('italics', 'bold', 'underlined', 'strike-through', 'small-caps') @@ -51,7 +81,6 @@ class RTFInput(InputFormatPlugin): parser = ParseRtf( in_file = stream, out_file = ofile, - deb_dir = 'H:\\Temp\\Calibre\\rtfdebug', # Convert symbol fonts to unicode equivalents. 
Default # is 1 convert_symbol = 1, @@ -138,8 +167,7 @@ class RTFInput(InputFormatPlugin): return name - - def write_inline_css(self, ic): + def write_inline_css(self, ic, border_styles): font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in enumerate(ic.font_sizes)] color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in @@ -163,6 +191,10 @@ class RTFInput(InputFormatPlugin): ''') css += '\n'+'\n'.join(font_size_classes) css += '\n' +'\n'.join(color_classes) + + for cls, val in border_styles.items(): + css += '\n\n.%s {\n%s\n}'%(cls, val) + with open('styles.css', 'ab') as f: f.write(css) @@ -182,6 +214,32 @@ class RTFInput(InputFormatPlugin): 'Failed to preprocess RTF to convert unicode sequences, ignoring...') return fname + def convert_borders(self, doc): + border_styles = [] + style_map = {} + for elem in doc.xpath(r'//*[local-name()="cell"]'): + style = ['border-style: hidden', 'border-width: 1px', + 'border-color: black'] + for x in ('bottom', 'top', 'left', 'right'): + bs = elem.get('border-cell-%s-style'%x, None) + if bs: + cbs = border_style_map.get(bs, 'solid') + style.append('border-%s-style: %s'%(x, cbs)) + bw = elem.get('border-cell-%s-line-width'%x, None) + if bw: + style.append('border-%s-width: %spt'%(x, bw)) + bc = elem.get('border-cell-%s-color'%x, None) + if bc: + style.append('border-%s-color: %s'%(x, bc)) + style = ';\n'.join(style) + if style not in border_styles: + border_styles.append(style) + idx = border_styles.index(style) + cls = 'border_style%d'%idx + style_map[cls] = style + elem.set('class', cls) + return style_map + def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.metadata.meta import get_metadata @@ -191,17 +249,16 @@ class RTFInput(InputFormatPlugin): self.log = log self.log('Converting RTF to XML...') #Name of the preprocesssed RTF file - #fname = self.preprocess(stream.name) - fname = stream.name + fname = self.preprocess(stream.name) try: xml = self.generate_xml(fname) 
except RtfInvalidCodeException, e: raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) - dataxml = open('dataxml.xml', 'w') + '''dataxml = open('dataxml.xml', 'w') dataxml.write(xml) - dataxml.close + dataxml.close''' d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: @@ -214,6 +271,7 @@ class RTFInput(InputFormatPlugin): self.log('Parsing XML...') parser = etree.XMLParser(recover=True, no_network=True) doc = etree.fromstring(xml, parser=parser) + border_styles = self.convert_borders(doc) for pict in doc.xpath('//rtf:pict[@num]', namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): num = int(pict.get('num')) @@ -235,7 +293,7 @@ class RTFInput(InputFormatPlugin): preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) res = preprocessor(res) f.write(res) - self.write_inline_css(inline_class) + self.write_inline_css(inline_class, border_styles) stream.seek(0) mi = get_metadata(stream, 'rtf') if not mi.title: From 8f6cc227cd46db8f008720ef7f50250152a5788e Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 00:08:23 +0100 Subject: [PATCH 31/41] Minor modification mreplace.py --- src/calibre/utils/mreplace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/utils/mreplace.py b/src/calibre/utils/mreplace.py index dff5fab578..b9fbc0bded 100644 --- a/src/calibre/utils/mreplace.py +++ b/src/calibre/utils/mreplace.py @@ -17,7 +17,7 @@ class MReplace(UserDict): if len(self.data) > 0: keys = sorted(self.data.keys(), key=len) keys.reverse() - tmp = "(%s)" % "|".join([re.escape(item) for item in keys]) + tmp = "(%s)" % "|".join(map(re.escape, keys)) if self.re != tmp: self.re = tmp self.regex = re.compile(self.re) From 229f511202b408f0627685e4eeab39022604b450 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 00:08:53 +0100 Subject: [PATCH 32/41] Minor modif fictionwise.py --- 
src/calibre/ebooks/metadata/fictionwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index de60cd9dca..706d38b559 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -237,7 +237,7 @@ class ResultList(list): # mi.language = self.get_language(x, verbose) return mi - def get_individual_metadata(self, browser, linkdata): + def get_individual_metadata(self, browser, linkdata, verbose): try: raw = browser.open_novisit(self.BASE_URL + linkdata).read() except Exception, e: @@ -262,7 +262,7 @@ class ResultList(list): def populate(self, entries, browser, verbose=False): for x in entries: try: - entry = self.get_individual_metadata(browser, x) + entry = self.get_individual_metadata(browser, x, verbose) entry = self.clean_entry(entry) title = self.get_title(entry) #ratings: get table for rating then drop From eb4e7154dbcb63863ee70bb8dcc14c508631272f Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 00:16:24 +0100 Subject: [PATCH 33/41] Plugin nicebooks for metadatas and cover. Should be disable by default. 
--- src/calibre/customize/builtins.py | 6 +- src/calibre/ebooks/metadata/nicebooks.py | 458 +++++++++++++++++++++++ 2 files changed, 462 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/nicebooks.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 5723da34a8..ce5275d35e 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -483,15 +483,17 @@ from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing, Fictionwise from calibre.ebooks.metadata.douban import DoubanBooks +from calibre.ebooks.metadata.nicebooks import NiceBooks from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers +from calibre.ebooks.metadata.nicebooks import NiceBooksCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, Fictionwise, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] + LibraryThing, Fictionwise, DoubanBooks, NiceBooks,CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, + Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers] plugins += [ ComicInput, EPUBInput, diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py new file mode 100644 index 0000000000..28fb2de562 --- /dev/null +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -0,0 +1,458 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian <sengian1@gmail.com>' +__docformat__ = 'restructuredtext en' + +import sys, textwrap, re, traceback, socket +from urllib import urlencode +from functools import partial +from math import ceil 
+from copy import deepcopy + +from lxml import html +from lxml.html import soupparser + +from calibre.utils.date import parse_date, utcnow +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.ebooks.metadata.covers import CoverDownload +from calibre.utils.config import OptionParser + +class NiceBooks(MetadataSource): + + name = 'Nicebooks' + description = _('Downloads metadata from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class NiceBooksCovers(CoverDownload): + + name = 'Nicebooks covers' + description = _('Downloads covers from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + type = _('Cover download') + version = (1, 0, 0) + + def has_cover(self, mi, ans, timeout=5.): + if not mi.isbn: + return False + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + if Covers(isbn)(entry).check_cover(): + self.debug('cover for', mi.isbn, 'found') + ans.set() + except Exception, e: + self.debug(e) + + def get_covers(self, mi, result_queue, abort, timeout=5.): + if not mi.isbn: + return + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + cover_data, ext = Covers(isbn)(entry).get_cover(br, timeout) + if not ext: + ext = 'jpg' + result_queue.put((True, cover_data, ext, self.name)) + except Exception, e: + result_queue.put((False, self.exception_to_string(e), + traceback.format_exc(), self.name)) + + +def report(verbose): + if verbose: + import 
traceback + traceback.print_exc() + +def replace_monthsfr(datefr): + # Replace french months by english equivalent for parse_date + frtoen = { + u'[jJ]anvier': u'jan', + u'[fF].vrier': u'feb', + u'[mM]ars': u'mar', + u'[aA]vril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uin': u'jun', + u'[jJ]uillet': u'jul', + u'[aA]o.t': u'aug', + u'[sS]eptembre': u'sep', + u'[Oo]ctobre': u'oct', + u'[nN]ovembre': u'nov', + u'[dD].cembre': u'dec' } + for k in frtoen.iterkeys(): + tmp = re.sub(k, frtoen[k], datefr) + if tmp <> datefr: break + return tmp + +class Query(object): + + BASE_URL = 'http://fr.nicebooks.com/' + + def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20): + assert not(title is None and author is None and publisher is None \ + and isbn is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + + q = '' + if isbn is not None: + q += isbn + else: + + if title is not None: + q += title + if author is not None: + q += author + if publisher is not None: + q += publisher + if keywords is not None: + q += keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = 'search?' 
+ urlencode({'q':q,'s':'Rechercher'}) + + def __call__(self, browser, verbose, timeout = 5.): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + #nb of page to call + try: + nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text) + except: + #direct hit + return [feed] + + nbpagetoquery = ceil(min(nbresults, self.max_results)/10) + pages =[feed] + if nbpagetoquery > 1: + for i in xrange(2, nbpagetoquery + 1): + try: + urldata = self.urldata + '&p=' + str(i) + raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read() + except Exception, e: + continue + if '<title>404 - ' in raw: + continue + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + continue + pages.append(feed) + + results = [] + for x in pages: + results.extend([i.find_class('title')[0].get('href') \ + for i in x.xpath("//ul[@id='results']/li")]) + return results[:self.max_results] + +class ResultList(list): + + BASE_URL = 'http://fr.nicebooks.com' + + def __init__(self): + self.repub = re.compile(r'\s*.diteur\s*', re.I) + self.reauteur = re.compile(r'\s*auteur.*', re.I) + self.reautclean = re.compile(r'\s*\(.*\)\s*') + + def get_title(self, entry): + title = deepcopy(entry.find("div[@id='book-info']")) + title.remove(title.find("dl[@title='Informations sur le livre']")) + title = ' '.join([i.text_content() for i in title.iterchildren()]) + return title.replace('\n', '') + + def get_authors(self, entry): + author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + 
authortext = [] + for x in author.getiterator('dt'): + if self.reauteur.match(x.text): + elt = x.getnext() + i = 0 + while elt.tag <> 'dt' and i < 20: + authortext.append(elt.text_content()) + elt = elt.getnext() + i += 1 + break + if len(authortext) == 1: + authortext = [self.reautclean.sub('', authortext[0])] + return authortext + + def get_description(self, entry, verbose): + try: + return 'RESUME:\n' + entry.xpath("//p[@id='book-description']")[0].text + except: + report(verbose) + return None + + def get_publisher(self, entry): + publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + publitext = None + for x in publisher.getiterator('dt'): + if self.repub.match(x.text): + publitext = x.getnext().text_content() + break + return publitext + + def get_date(self, entry, verbose): + date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + for x in date.getiterator('dt'): + if x.text == 'Date de parution': + d = x.getnext().text_content() + break + if not len(d): + return None + try: + default = utcnow().replace(day=15) + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + except: + report(verbose) + d = None + return d + + def get_ISBN(self, entry): + isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + isbntext = None + for x in isbn.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content() + if not check_isbn(isbntext): + return None + break + return isbntext + + def get_language(self, entry): + language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + langtext = None + for x in language.getiterator('dt'): + if x.text == 'Langue': + langtext = x.getnext().text_content() + break + return langtext + + def fill_MI(self, entry, title, authors, verbose): + mi = MetaInformation(title, authors) + mi.comments = self.get_description(entry, verbose) + mi.publisher = self.get_publisher(entry) + mi.pubdate = 
self.get_date(entry, verbose) + mi.isbn = self.get_ISBN(entry) + mi.author_sort = authors_to_sort_string(authors) + mi.language = self.get_language(entry) + return mi + + def get_individual_metadata(self, browser, linkdata, verbose): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//div[@id='container']")[0] + + def populate(self, entries, browser, verbose=False): + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, verbose)) + + def populate_single(self, feed, verbose=False): + try: + entry = feed.xpath("//div[@id='container']")[0] + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, verbose)) + +class NiceBooksError(Exception): + pass + +class ISBNNotFound(NiceBooksError): + pass + +class Covers(object): + + def __init__(self, isbn = None): + assert isbn is not None + self.urlimg = '' + self.isbn = isbn + self.isbnf = False + + def __call__(self, entry = None): + try: + self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href') + except: + return self + isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']") + isbntext = None + for x in isbno.getiterator('dt'): + if x.text == 'ISBN': + isbntext = 
x.getnext().text_content() + break + if isbntext is not None: + self.isbnf = True + return self + + def check_cover(self): + if self.urlimg: + return True + else: + return False + + def get_cover(self, browser, timeout = 5.): + try: + return browser.open_novisit(self.urlimg, timeout=timeout).read(), \ + self.urlimg.rpartition('.')[-1] + except Exception, err: + if isinstance(getattr(err, 'args', [None])[0], socket.timeout): + err = NiceBooksError(_('Nicebooks timed out. Try again later.')) + raise err + if not len(self.urlimg): + if not self.isbnf: + raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.')) + raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher')) + + +def search(title=None, author=None, publisher=None, isbn=None, + verbose=False, max_results=5, keywords=None): + br = browser() + entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose) + + if entries is None: + return + + #List of entry + ans = ResultList() + if len(entries) > 1: + ans.populate(entries, br, verbose) + else: + ans.populate_single(entries[0], verbose) + return ans + +def check_for_cover(isbn): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False)[0] + return Covers(isbn)(entry).check_cover() + +def cover_from_isbn(isbn, timeout = 5.): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False, timeout)[0] + return Covers(isbn)(entry).get_cover(br, timeout) + + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Nicebooks. You must specify one of title, author, + ISBN, publisher or keywords. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. + It can also get covers if the option is activated. 
+ ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-i', '--isbn', help='Book ISBN') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-c', '--covers', default=0, + help='Covers: 1-Check/ 2-Download') + parser.add_option('-p', '--coverspath', default='', + help='Covers files path') + parser.add_option('-m', '--max-results', default=20, + help='Maximum number of results to fetch') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + covact = int(opts.covers) + if covact == 1: + textcover = 'No cover found!' 
+ if check_for_cover(result.isbn): + textcover = 'A cover was found for this book' + print textcover + elif covact == 2: + cover_data, ext = cover_from_isbn(result.isbn) + if not ext: + ext = 'jpg' + cpath = result.isbn + if len(opts.coverspath): + cpath = os.path.normpath(opts.coverspath + '/' + result.isbn) + oname = os.path.abspath(cpath+'.'+ext) + open(oname, 'wb').write(cover_data) + print 'Cover saved to file ', oname + print + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file From fd711e6075e2dec43ab37c76fad9ed299fcdc71d Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 01:28:56 +0100 Subject: [PATCH 34/41] Minor fix for nicebooks.py --- src/calibre/ebooks/metadata/nicebooks.py | 49 +++++++++++------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 28fb2de562..98ecdf3625 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -52,7 +52,7 @@ class NiceBooksCovers(CoverDownload): br = browser() try: entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] - if Covers(isbn)(entry).check_cover(): + if Covers(mi.isbn)(entry).check_cover(): self.debug('cover for', mi.isbn, 'found') ans.set() except Exception, e: @@ -64,7 +64,7 @@ class NiceBooksCovers(CoverDownload): br = browser() try: entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] - cover_data, ext = Covers(isbn)(entry).get_cover(br, timeout) + cover_data, ext = Covers(mi.isbn)(entry).get_cover(br, timeout) if not ext: ext = 'jpg' result_queue.put((True, cover_data, ext, self.name)) @@ -109,20 +109,12 @@ class Query(object): self.max_results = int(max_results) - q = '' if isbn is not None: - q += isbn + q = isbn else: - - if title is not None: - q += title - if author is not None: - q += author - if publisher is not None: - q += publisher - if keywords is not None: - q += 
keywords - + q = ' '.join([i for i in (title, author, publisher, keywords) \ + if i is not None]) + if isinstance(q, unicode): q = q.encode('utf-8') self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'}) @@ -185,15 +177,15 @@ class ResultList(list): BASE_URL = 'http://fr.nicebooks.com' def __init__(self): - self.repub = re.compile(r'\s*.diteur\s*', re.I) - self.reauteur = re.compile(r'\s*auteur.*', re.I) - self.reautclean = re.compile(r'\s*\(.*\)\s*') + self.repub = re.compile(u'\s*.diteur\s*', re.I) + self.reauteur = re.compile(u'\s*auteur.*', re.I) + self.reautclean = re.compile(u'\s*\(.*\)\s*') def get_title(self, entry): title = deepcopy(entry.find("div[@id='book-info']")) title.remove(title.find("dl[@title='Informations sur le livre']")) title = ' '.join([i.text_content() for i in title.iterchildren()]) - return title.replace('\n', '') + return unicode(title.replace('\n', '')) def get_authors(self, entry): author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") @@ -203,7 +195,7 @@ class ResultList(list): elt = x.getnext() i = 0 while elt.tag <> 'dt' and i < 20: - authortext.append(elt.text_content()) + authortext.append(unicode(elt.text_content())) elt = elt.getnext() i += 1 break @@ -213,7 +205,7 @@ class ResultList(list): def get_description(self, entry, verbose): try: - return 'RESUME:\n' + entry.xpath("//p[@id='book-description']")[0].text + return 'RESUME:\n' + unicode(entry.xpath("//p[@id='book-description']")[0].text) except: report(verbose) return None @@ -225,15 +217,16 @@ class ResultList(list): if self.repub.match(x.text): publitext = x.getnext().text_content() break - return publitext + return unicode(publitext).strip() def get_date(self, entry, verbose): date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + d = '' for x in date.getiterator('dt'): if x.text == 'Date de parution': d = x.getnext().text_content() break - if not len(d): + if len(d) == 0: return None try: default = 
utcnow().replace(day=15) @@ -252,8 +245,9 @@ class ResultList(list): isbntext = x.getnext().text_content() if not check_isbn(isbntext): return None + isbntext = isbntext.replace('-', '') break - return isbntext + return unicode(isbntext) def get_language(self, entry): language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") @@ -262,7 +256,7 @@ class ResultList(list): if x.text == 'Langue': langtext = x.getnext().text_content() break - return langtext + return unicode(langtext).strip() def fill_MI(self, entry, title, authors, verbose): mi = MetaInformation(title, authors) @@ -371,12 +365,12 @@ class Covers(object): def search(title=None, author=None, publisher=None, isbn=None, - verbose=False, max_results=5, keywords=None): + max_results=5, verbose=False, keywords=None): br = browser() entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, keywords=keywords, max_results=max_results)(br, verbose) - if entries is None: + if entries is None or len(entries) == 0: return #List of entry @@ -434,6 +428,9 @@ def main(args=sys.argv): report(True) parser.print_help() return 1 + if results is None or len(results) == 0: + print 'No result found for this search!' 
+ return 0 for result in results: print unicode(result).encode(preferred_encoding, 'replace') covact = int(opts.covers) From bc98b043fd4a7e7a09ab765c1d94f5782bda8676 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 01:29:22 +0100 Subject: [PATCH 35/41] Fix for download cover regression --- src/calibre/gui2/dialogs/metadata_single.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 0b9b33868c..1eae761561 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -716,10 +716,10 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.title.setText(book.title) self.authors.setText(authors_to_string(book.authors)) if book.author_sort: self.author_sort.setText(book.author_sort) - if d.opt_overwrite_cover_image.isChecked() and book.has_cover: - self.fetch_cover() if book.publisher: self.publisher.setEditText(book.publisher) if book.isbn: self.isbn.setText(book.isbn) + if d.opt_overwrite_cover_image.isChecked() and book.has_cover: + self.fetch_cover() if book.pubdate: d = book.pubdate self.pubdate.setDate(QDate(d.year, d.month, d.day)) From 681c451238bbcf4d0f9e7c8102ef9e83de79e9ce Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 09:14:44 +0100 Subject: [PATCH 36/41] Disable by default my plugins --- src/calibre/customize/ui.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 844269e453..e963a17df9 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -121,6 +121,8 @@ def enable_plugin(plugin_or_name): default_disabled_plugins = set([ 'Douban Books', 'Douban.com covers', + 'NiceBooks', 'NiceBooksCovers', + 'Fictionwise' ]) def is_disabled(plugin): From c5cbaffd20b042150a4c654584bbc526e613f5f6 Mon Sep 17 00:00:00 2001 From: Sengian 
<sengian1@gmail.com> Date: Sun, 21 Nov 2010 09:19:59 +0100 Subject: [PATCH 37/41] Externalize metadata plugin in fictionwise.py --- src/calibre/customize/builtins.py | 1 + src/calibre/ebooks/metadata/fetch.py | 18 ------------------ src/calibre/ebooks/metadata/fictionwise.py | 19 +++++++++++++++++++ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ce5275d35e..4815375563 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -484,6 +484,7 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing, Fictionwise from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks +from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers from calibre.ebooks.metadata.nicebooks import NiceBooksCovers diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index c9d6a74cb2..dedd251640 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -267,24 +267,6 @@ class LibraryThing(MetadataSource): # {{{ # }}} -class Fictionwise(MetadataSource): # {{{ - - author = 'Sengian' - name = 'Fictionwise' - description = _('Downloads metadata from Fictionwise') - - has_html_comments = True - - def fetch(self): - from calibre.ebooks.metadata.fictionwise import search - try: - self.results = search(self.title, self.book_author, self.publisher, - self.isbn, max_results=10, verbose=self.verbose) - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() - - # }}} def result_index(source, result): if not result.isbn: diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 706d38b559..828ea31c3a 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ 
b/src/calibre/ebooks/metadata/fictionwise.py @@ -15,9 +15,28 @@ from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn, \ authors_to_sort_string from calibre.library.comments import sanitize_comments_html +from calibre.ebooks.metadata.fetch import MetadataSource from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow +class Fictionwise(MetadataSource): # {{{ + + author = 'Sengian' + name = 'Fictionwise' + description = _('Downloads metadata from Fictionwise') + + has_html_comments = True + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} + def report(verbose): if verbose: From 9c30a416120d257e5bd9078408287683d150c191 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 09:50:53 +0100 Subject: [PATCH 38/41] Correct nicebook max result problem --- src/calibre/ebooks/metadata/nicebooks.py | 45 +++++++++++------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 98ecdf3625..e72d4b26ae 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -5,7 +5,6 @@ __docformat__ = 'restructuredtext en' import sys, textwrap, re, traceback, socket from urllib import urlencode -from functools import partial from math import ceil from copy import deepcopy @@ -147,7 +146,7 @@ class Query(object): #direct hit return [feed] - nbpagetoquery = ceil(min(nbresults, self.max_results)/10) + nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10)) pages =[feed] if nbpagetoquery > 1: for i in xrange(2, nbpagetoquery + 1): @@ -193,11 +192,9 @@ class ResultList(list): for x in author.getiterator('dt'): if 
self.reauteur.match(x.text): elt = x.getnext() - i = 0 - while elt.tag <> 'dt' and i < 20: + while elt.tag == 'dd': authortext.append(unicode(elt.text_content())) elt = elt.getnext() - i += 1 break if len(authortext) == 1: authortext = [self.reautclean.sub('', authortext[0])] @@ -291,29 +288,32 @@ class ResultList(list): return feed.xpath("//div[@id='container']")[0] def populate(self, entries, browser, verbose=False): - for x in entries: + #single entry + if len(entries) ==1: try: - entry = self.get_individual_metadata(browser, x, verbose) + entry = entries[0].xpath("//div[@id='container']")[0] title = self.get_title(entry) authors = self.get_authors(entry) except Exception, e: if verbose: print 'Failed to get all details for an entry' print e - continue + return self.append(self.fill_MI(entry, title, authors, verbose)) + else: + #multiple entries + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, verbose)) - def populate_single(self, feed, verbose=False): - try: - entry = feed.xpath("//div[@id='container']")[0] - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - return - self.append(self.fill_MI(entry, title, authors, verbose)) class NiceBooksError(Exception): pass @@ -372,13 +372,10 @@ def search(title=None, author=None, publisher=None, isbn=None, if entries is None or len(entries) == 0: return - + #List of entry ans = ResultList() - if len(entries) > 1: - ans.populate(entries, br, verbose) - else: - ans.populate_single(entries[0], verbose) + ans.populate(entries, br, verbose) return ans def check_for_cover(isbn): From 3a37d7e78fa94dff29c86bde480e085463070f56 Mon Sep 17 00:00:00 2001 From: 
Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 10:24:56 +0100 Subject: [PATCH 39/41] Optimize metadata retrieval --- src/calibre/ebooks/metadata/nicebooks.py | 65 +++++++++++++++++------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index e72d4b26ae..f7cffa959b 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -181,13 +181,15 @@ class ResultList(list): self.reautclean = re.compile(u'\s*\(.*\)\s*') def get_title(self, entry): - title = deepcopy(entry.find("div[@id='book-info']")) + # title = deepcopy(entry.find("div[@id='book-info']")) + title = deepcopy(entry) title.remove(title.find("dl[@title='Informations sur le livre']")) title = ' '.join([i.text_content() for i in title.iterchildren()]) return unicode(title.replace('\n', '')) def get_authors(self, entry): - author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + author = entry.find("dl[@title='Informations sur le livre']") authortext = [] for x in author.getiterator('dt'): if self.reauteur.match(x.text): @@ -202,22 +204,46 @@ class ResultList(list): def get_description(self, entry, verbose): try: - return 'RESUME:\n' + unicode(entry.xpath("//p[@id='book-description']")[0].text) + return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text) except: report(verbose) return None - + + def get_book_info(self, entry, mi): + entry = entry.find("dl[@title='Informations sur le livre']") + for x in entry.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content().replace('-', '') + if check_isbn(isbntext): + mi.isbn = unicode(isbntext) + elif self.repub.match(x.text): + mi.publisher = unicode(x.getnext().text_content()) + elif x.text == 'Langue': + mi.language = 
unicode(x.getnext().text_content()) + elif x.text == 'Date de parution': + d = x.getnext().text_content() + try: + default = utcnow().replace(day=15) + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + mi.pubdate = d + except: + report(verbose) + return mi + def get_publisher(self, entry): - publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + publisher = entry publitext = None for x in publisher.getiterator('dt'): if self.repub.match(x.text): publitext = x.getnext().text_content() break - return unicode(publitext).strip() + return unicode(publitext) def get_date(self, entry, verbose): - date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + date = entry d = '' for x in date.getiterator('dt'): if x.text == 'Date de parution': @@ -235,35 +261,37 @@ class ResultList(list): return d def get_ISBN(self, entry): - isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + isbn = entry isbntext = None for x in isbn.getiterator('dt'): if x.text == 'ISBN': - isbntext = x.getnext().text_content() + isbntext = x.getnext().text_content().replace('-', '') if not check_isbn(isbntext): return None - isbntext = isbntext.replace('-', '') break return unicode(isbntext) def get_language(self, entry): - language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + # language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + language = entry langtext = None for x in language.getiterator('dt'): if x.text == 'Langue': langtext = x.getnext().text_content() break - return unicode(langtext).strip() + return unicode(langtext) def fill_MI(self, entry, title, authors, 
verbose): mi = MetaInformation(title, authors) - mi.comments = self.get_description(entry, verbose) - mi.publisher = self.get_publisher(entry) - mi.pubdate = self.get_date(entry, verbose) - mi.isbn = self.get_ISBN(entry) mi.author_sort = authors_to_sort_string(authors) - mi.language = self.get_language(entry) - return mi + mi.comments = self.get_description(entry, verbose) + # entry = entry.find("dl[@title='Informations sur le livre']") + # mi.publisher = self.get_publisher(entry) + # mi.pubdate = self.get_date(entry, verbose) + # mi.isbn = self.get_ISBN(entry) + # mi.language = self.get_language(entry) + return self.get_book_info(entry, mi) def get_individual_metadata(self, browser, linkdata, verbose): try: @@ -292,6 +320,7 @@ class ResultList(list): if len(entries) ==1: try: entry = entries[0].xpath("//div[@id='container']")[0] + entry = entry.find("div[@id='book-info']") title = self.get_title(entry) authors = self.get_authors(entry) except Exception, e: From 4887bac205622d0c6fe486278286b7eecbc30acc Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 10:29:55 +0100 Subject: [PATCH 40/41] bug --- src/calibre/ebooks/metadata/nicebooks.py | 52 +----------------------- 1 file changed, 1 insertion(+), 51 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index f7cffa959b..9a06bad998 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -230,57 +230,6 @@ class ResultList(list): except: report(verbose) return mi - - def get_publisher(self, entry): - # publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") - publisher = entry - publitext = None - for x in publisher.getiterator('dt'): - if self.repub.match(x.text): - publitext = x.getnext().text_content() - break - return unicode(publitext) - - def get_date(self, entry, verbose): - # date = entry.find("div[@id='book-info']/dl[@title='Informations sur le 
livre']") - date = entry - d = '' - for x in date.getiterator('dt'): - if x.text == 'Date de parution': - d = x.getnext().text_content() - break - if len(d) == 0: - return None - try: - default = utcnow().replace(day=15) - d = replace_monthsfr(d) - d = parse_date(d, assume_utc=True, default=default) - except: - report(verbose) - d = None - return d - - def get_ISBN(self, entry): - # isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") - isbn = entry - isbntext = None - for x in isbn.getiterator('dt'): - if x.text == 'ISBN': - isbntext = x.getnext().text_content().replace('-', '') - if not check_isbn(isbntext): - return None - break - return unicode(isbntext) - - def get_language(self, entry): - # language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") - language = entry - langtext = None - for x in language.getiterator('dt'): - if x.text == 'Langue': - langtext = x.getnext().text_content() - break - return unicode(langtext) def fill_MI(self, entry, title, authors, verbose): mi = MetaInformation(title, authors) @@ -334,6 +283,7 @@ class ResultList(list): for x in entries: try: entry = self.get_individual_metadata(browser, x, verbose) + entry = entry.find("div[@id='book-info']") title = self.get_title(entry) authors = self.get_authors(entry) except Exception, e: From 3490c73ad93fa9bd55fd0d9ed513ded5eb6ea1c9 Mon Sep 17 00:00:00 2001 From: Sengian <sengian1@gmail.com> Date: Sun, 21 Nov 2010 11:10:21 +0100 Subject: [PATCH 41/41] Optimisation of nicebooks covers --- src/calibre/ebooks/metadata/nicebooks.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 9a06bad998..51858e4b77 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -266,7 +266,7 @@ class ResultList(list): def populate(self, entries, browser, verbose=False): #single entry - if 
len(entries) ==1: + if len(entries) == 1 and not isinstance(entries[0], str): try: entry = entries[0].xpath("//div[@id='container']")[0] entry = entry.find("div[@id='book-info']") @@ -314,25 +314,20 @@ class Covers(object): except: return self isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']") - isbntext = None for x in isbno.getiterator('dt'): - if x.text == 'ISBN': - isbntext = x.getnext().text_content() + if x.text == 'ISBN' and check_isbn(x.getnext().text_content()): + self.isbnf = True break - if isbntext is not None: - self.isbnf = True return self def check_cover(self): - if self.urlimg: - return True - else: - return False + return True if self.urlimg else False def get_cover(self, browser, timeout = 5.): try: - return browser.open_novisit(self.urlimg, timeout=timeout).read(), \ + cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \ self.urlimg.rpartition('.')[-1] + return cover, ext if ext else 'jpg' except Exception, err: if isinstance(getattr(err, 'args', [None])[0], socket.timeout): err = NiceBooksError(_('Nicebooks timed out. Try again later.')) @@ -417,8 +412,6 @@ def main(args=sys.argv): print textcover elif covact == 2: cover_data, ext = cover_from_isbn(result.isbn) - if not ext: - ext = 'jpg' cpath = result.isbn if len(opts.coverspath): cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)