From 05a734cdb68e4ddd0278510cf9c6cf88d7a0a85d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 08:31:24 -0700 Subject: [PATCH 01/11] BiBTeX catalog: Add on device column when available --- src/calibre/library/catalog.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 8ad64c8cdd..4aeaa237f3 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -344,7 +344,7 @@ class BIBTEX(CatalogPlugin): # {{{ if field == 'authors' : bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item)) - elif field in ['title', 'publisher', 'cover', 'uuid', + elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice', 'author_sort', 'series'] : bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item))) @@ -378,7 +378,7 @@ class BIBTEX(CatalogPlugin): # {{{ if calibre_files: files = [u':%s:%s' % (format, format.rpartition('.')[2].upper())\ for format in item] - bibtex_entry.append(u'files = "%s"' % u', '.join(files)) + bibtex_entry.append(u'file = "%s"' % u', '.join(files)) elif field == 'series_index' : bibtex_entry.append(u'volume = "%s"' % int(item)) @@ -474,6 +474,8 @@ class BIBTEX(CatalogPlugin): # {{{ if opts.verbose: opts_dict = vars(opts) log("%s(): Generating %s" % (self.name,self.fmt)) + if opts.connected_device['is_device_connected']: + log(" connected_device: %s" % opts.connected_device['name']) if opts_dict['search_text']: log(" --search='%s'" % opts_dict['search_text']) @@ -548,6 +550,7 @@ class BIBTEX(CatalogPlugin): # {{{ as outfile: #File header nb_entries = len(data) + #check in book strict if all is ok else throw a warning into log if bib_entry == 'book' : nb_books = len(filter(check_entry_book_valid, data)) @@ -555,6 +558,11 @@ class BIBTEX(CatalogPlugin): # {{{ log(" WARNING: only %d entries in %d are book compatible" % (nb_books, nb_entries)) nb_entries = nb_books + # If connected 
device, add 'On Device' values to data + if opts.connected_device['is_device_connected'] and 'ondevice' in fields: + for entry in data: + entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[entry['id']]['ondevice'] + outfile.write(u'%%%Calibre catalog\n%%%{0} entries in catalog\n\n'.format(nb_entries)) outfile.write(u'@preamble{"This catalog of %d entries was generated by calibre on %s"}\n\n' % (nb_entries, nowf().strftime("%A, %d. %B %Y %H:%M").decode(preferred_encoding))) From 1b98c225ae13fffbd26a9afad455cee3c92b05cd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 08:32:42 -0700 Subject: [PATCH 02/11] ... --- src/calibre/library/catalog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 4aeaa237f3..460bf79c87 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -232,6 +232,7 @@ class BIBTEX(CatalogPlugin): # {{{ help = _('The fields to output when cataloging books in the ' 'database. Should be a comma-separated list of fields.\n' 'Available fields: %s.\n' + 'plus user-created custom fields.\n' 'Example: %s=title,authors,tags\n' "Default: '%%default'\n" "Applies to: BIBTEX output format")%(', '.join(FIELDS), @@ -269,7 +270,7 @@ class BIBTEX(CatalogPlugin): # {{{ dest = 'bib_cit', action = None, help = _('The template for citation creation from database fields.\n' - ' Should be a template with {} enclosed fields.\n' + 'Should be a template with {} enclosed fields.\n' 'Available fields: %s.\n' "Default: '%%default'\n" "Applies to: BIBTEX output format")%', '.join(TEMPLATE_ALLOWED_FIELDS)), From 51abcdc4f9bf687350569885f12607bb1bdbe9d8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 08:43:39 -0700 Subject: [PATCH 03/11] RTF Input: More encoding handlig fixes. 
Fixes #8678 (Conversion RTF-file to EPUB failed) --- src/calibre/ebooks/rtf/input.py | 20 +------ src/calibre/ebooks/rtf2xml/ParseRtf.py | 2 + src/calibre/ebooks/rtf2xml/colors.py | 54 +++++++++++-------- src/calibre/ebooks/rtf2xml/convert_to_tags.py | 38 ++++++++----- .../ebooks/rtf2xml/default_encoding.py | 4 ++ src/calibre/ebooks/rtf2xml/fonts.py | 36 +++++++------ src/calibre/ebooks/rtf2xml/get_char_map.py | 2 +- src/calibre/ebooks/rtf2xml/tokenize.py | 31 +++++------ 8 files changed, 102 insertions(+), 85 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index ca6f2c7b95..52f6feb071 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -83,6 +83,7 @@ class RTFInput(InputFormatPlugin): os.mkdir(debug_dir) debug_dir = 'rtfdebug' run_lev = 4 + self.log('Running RTFParser in debug mode') except: pass parser = ParseRtf( @@ -230,22 +231,6 @@ class RTFInput(InputFormatPlugin): with open('styles.css', 'ab') as f: f.write(css) - # def preprocess(self, fname): - # self.log('\tPreprocessing to convert unicode characters') - # try: - # data = open(fname, 'rb').read() - # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser - # tokenizer = RtfTokenizer(data) - # tokens = RtfTokenParser(tokenizer.tokens) - # data = tokens.toRTF() - # fname = 'preprocessed.rtf' - # with open(fname, 'wb') as f: - # f.write(data) - # except: - # self.log.exception( - # 'Failed to preprocess RTF to convert unicode sequences, ignoring...') - # return fname - def convert_borders(self, doc): border_styles = [] style_map = {} @@ -280,8 +265,6 @@ class RTFInput(InputFormatPlugin): self.opts = options self.log = log self.log('Converting RTF to XML...') - #Name of the preprocesssed RTF file - # fname = self.preprocess(stream.name) try: xml = self.generate_xml(stream.name) except RtfInvalidCodeException, e: @@ -335,3 +318,4 @@ class RTFInput(InputFormatPlugin): opf.render(open('metadata.opf', 'wb')) return 
os.path.abspath('metadata.opf') + diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index d673836210..831183f0dd 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -238,6 +238,8 @@ class ParseRtf: bug_handler = RtfInvalidCodeException, ) enc = 'cp' + encode_obj.get_codepage() + if enc == 'cp10000': + enc = 'mac_roman' msg = 'Exception in token processing' if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ diff --git a/src/calibre/ebooks/rtf2xml/colors.py b/src/calibre/ebooks/rtf2xml/colors.py index d81b293bbf..eba03547c8 100755 --- a/src/calibre/ebooks/rtf2xml/colors.py +++ b/src/calibre/ebooks/rtf2xml/colors.py @@ -15,8 +15,10 @@ # # # # ######################################################################### -import sys, os, tempfile, re +import sys, os, tempfile, re + from calibre.ebooks.rtf2xml import copy + class Colors: """ Change lines with color info from color numbers to the actual color names. @@ -40,8 +42,10 @@ class Colors: self.__file = in_file self.__copy = copy self.__bug_handler = bug_handler + self.__line = 0 self.__write_to = tempfile.mktemp() self.__run_level = run_level + def __initiate_values(self): """ Initiate all values. @@ -61,6 +65,7 @@ class Colors: self.__color_num = 1 self.__line_color_exp = re.compile(r'bdr-color_:(\d+)') # cw 3: - msg = 'no value in self.__color_dict for key %s\n' % num - raise self.__bug_hanlder, msg - if hex_num == None: + if hex_num is None: hex_num = '0' + if self.__run_level > 5: + msg = 'no value in self.__color_dict' \ + 'for key %s at line %d\n' % (num, self.__line) + raise self.__bug_handler, msg return hex_num + def __do_nothing_func(self, line): """ Bad RTF will have text in the color table """ pass + def convert_colors(self): """ Requires: @@ -226,20 +238,16 @@ class Colors: info, and substitute the number with the hex number. 
""" self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module fonts.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__line+=1 + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('no matching state in module fonts.py\n') + sys.stderr.write(self.__state + '\n') + action(line) copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "color.data") diff --git a/src/calibre/ebooks/rtf2xml/convert_to_tags.py b/src/calibre/ebooks/rtf2xml/convert_to_tags.py index 6927537474..1abc672f85 100755 --- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py +++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py @@ -33,13 +33,13 @@ class ConvertToTags: self.__copy = copy self.__dtd_path = dtd_path self.__no_dtd = no_dtd - if encoding != 'mac_roman': - self.__encoding = 'cp' + encoding - else: + self.__encoding = 'cp' + encoding + if encoding == 'mac_roman': self.__encoding = 'mac_roman' self.__indent = indent self.__run_level = run_level self.__write_to = tempfile.mktemp() + self.__convert_utf = False def __initiate_values(self): """ @@ -213,7 +213,8 @@ class ConvertToTags: if not check_encoding_obj.check_encoding(self.__file, verbose=False): self.__write_obj.write('') elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): - self.__write_obj.write('' % self.__encoding) + self.__write_obj.write('') + self.__convert_utf = True else: self.__write_obj.write('') 
sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and' @@ -253,15 +254,28 @@ class ConvertToTags: an empty tag function. """ self.__initiate_values() - self.__write_obj = open(self.__write_to, 'w') - self.__write_dec() - with open(self.__file, 'r') as read_obj: - for line in read_obj: - self.__token_info = line[:16] - action = self.__state_dict.get(self.__token_info) - if action is not None: - action(line) + with open(self.__write_to, 'w') as self.__write_obj: + self.__write_dec() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__token_info) + if action is not None: + action(line) self.__write_obj.close() + #convert all encodings to UTF8 to avoid unsupported encodings in lxml + if self.__convert_utf: + copy_obj = copy.Copy(bug_handler = self.__bug_handler) + copy_obj.rename(self.__write_to, self.__file) + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as write_obj: + file = read_obj.read() + try: + file = file.decode(self.__encoding) + write_obj.write(file.encode('utf-8')) + except: + sys.stderr.write('Conversion to UTF-8 is not possible,' + ' encoding should be very carefully checked') copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "convert_to_tags.data") diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index 3ddfbcd321..c0a43db800 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -75,12 +75,16 @@ class DefaultEncoding: self._encoding() self.__datafetched = True code_page = 'ansicpg' + self.__code_page + if self.__code_page == '10000': + self.__code_page = 'mac_roman' return self.__platform, code_page, self.__default_num def get_codepage(self): if not self.__datafetched: self._encoding() self.__datafetched = True + if self.__code_page == '10000': + 
self.__code_page = 'mac_roman' return self.__code_page def get_platform(self): diff --git a/src/calibre/ebooks/rtf2xml/fonts.py b/src/calibre/ebooks/rtf2xml/fonts.py index b85717ce48..45ed3c1957 100755 --- a/src/calibre/ebooks/rtf2xml/fonts.py +++ b/src/calibre/ebooks/rtf2xml/fonts.py @@ -16,7 +16,9 @@ # # ######################################################################### import sys, os, tempfile + from calibre.ebooks.rtf2xml import copy + class Fonts: """ Change lines with font info from font numbers to the actual font names. @@ -45,6 +47,7 @@ class Fonts: self.__default_font_num = default_font_num self.__write_to = tempfile.mktemp() self.__run_level = run_level + def __initiate_values(self): """ Initiate all values. @@ -67,6 +70,7 @@ class Fonts: self.__font_table = {} # individual font written self.__wrote_ind_font = 0 + def __default_func(self, line): """ Requires: @@ -79,6 +83,7 @@ class Fonts: if self.__token_info == 'miTimes0\n' ) + 'Times0\n') + def __after_font_table_func(self, line): """ Required: @@ -169,7 +177,7 @@ class Fonts: if self.__token_info == 'cw 3: msg = 'no value for %s in self.__font_table\n' % font_num raise self.__bug_handler, msg @@ -182,6 +190,7 @@ class Fonts: ) else: self.__write_obj.write(line) + def convert_fonts(self): """ Required: @@ -197,20 +206,15 @@ class Fonts: info. Substitute a font name for a font number. 
""" self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__state) - if action == None: - sys.stderr.write('no no matching state in module fonts.py\n') - sys.stderr.write(self.__state + '\n') - action(line) - read_obj.close() - self.__write_obj.close() + with open(self.__file, 'r') as read_obj: + with open(self.__write_to, 'w') as self.__write_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__state) + if action is None: + sys.stderr.write('no matching state in module fonts.py\n' \ + + self.__state + '\n') + action(line) default_font_name = self.__font_table.get(self.__default_font_num) if not default_font_name: default_font_name = 'Not Defined' diff --git a/src/calibre/ebooks/rtf2xml/get_char_map.py b/src/calibre/ebooks/rtf2xml/get_char_map.py index fb3ef28b4f..5944d1920d 100755 --- a/src/calibre/ebooks/rtf2xml/get_char_map.py +++ b/src/calibre/ebooks/rtf2xml/get_char_map.py @@ -43,7 +43,7 @@ class GetCharMap: def get_char_map(self, map): if map == 'ansicpg0': map = 'ansicpg1250' - if map in ('ansicpg10000', '10000'): + if map == 'ansicpg10000': map = 'mac_roman' found_map = False map_dict = {} diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 20438a2e66..59c2cab082 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -126,12 +126,6 @@ class Tokenize: tokens = re.split(self.__splitexp, input_file) #remove empty tokens and \n return filter(lambda x: len(x) > 0 and x != '\n', tokens) - #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) - # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) - # this is for older RTF - #line = re.sub(self.__par_exp, '\\par ', line) - 
#return filter(lambda x: len(x) > 0, \ - #(self.__remove_line.sub('', x) for x in tokens)) def __compile_expressions(self): SIMPLE_RPL = { @@ -160,7 +154,7 @@ class Tokenize: } self.__replace_spchar = MReplace(SIMPLE_RPL) #add ;? in case of char following \u - self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)" + self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?") self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+") #manage upr/ud situations @@ -172,14 +166,21 @@ class Tokenize: self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") #this is for old RTF self.__par_exp = re.compile(r'\\\n+') - # self.__par_exp = re.compile(r'\\$') + #handle cw using a digit as argument and without space as delimiter + self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)") #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") #self.__remove_line = re.compile(r'\n+') - #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") + def __correct_spliting(self, token): + match_obj = re.search(self.__cwdigit_exp, token) + if match_obj is None: + return token + else: + return '%s\n%s' % (match_obj.group(1), match_obj.group(2)) + def tokenize(self): """Main class for handling other methods. 
Reads the file \ , uses method self.sub_reg to make basic substitutions,\ @@ -187,7 +188,7 @@ class Tokenize: #read with open(self.__file, 'r') as read_obj: input_file = read_obj.read() - + #process simple replacements and split giving us a correct list #remove '' and \n in the process tokens = self.__sub_reg_split(input_file) @@ -195,7 +196,9 @@ class Tokenize: tokens = map(self.__unicode_process, tokens) #remove empty items created by removing \uc tokens = filter(lambda x: len(x) > 0, tokens) - + #handles bothersome cases + tokens = map(self.__correct_spliting, tokens) + #write with open(self.__write_to, 'wb') as write_obj: write_obj.write('\n'.join(tokens)) @@ -203,11 +206,9 @@ class Tokenize: copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "tokenize.data") - # if self.__out_file: - # self.__file = self.__out_file copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) - + #self.__special_tokens = [ '_', '~', "'", '{', '}' ] # import sys @@ -223,4 +224,4 @@ class Tokenize: # if __name__ == '__main__': - # sys.exit(main()) \ No newline at end of file + # sys.exit(main()) From 940969619c108e4cfbf276b91118a1243c75eae8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 11:12:17 -0700 Subject: [PATCH 04/11] Fix #8701 (Updated recipe for LA Times) --- resources/recipes/latimes.recipe | 149 +++++++++++++++++++------------ 1 file changed, 93 insertions(+), 56 deletions(-) diff --git a/resources/recipes/latimes.recipe b/resources/recipes/latimes.recipe index bd426c1f33..930b986315 100644 --- a/resources/recipes/latimes.recipe +++ b/resources/recipes/latimes.recipe @@ -1,73 +1,92 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2011, Darko Miletic ' ''' -latimes.com +www.latimes.com ''' + from calibre.web.feeds.news import BasicNewsRecipe class LATimes(BasicNewsRecipe): - title = u'The Los Angeles Times' - 
__author__ = u'Darko Miletic and Sujata Raman' - description = u'News from Los Angeles' - oldest_article = 7 - max_articles_per_feed = 100 - language = 'en' + title = 'Los Angeles Times' + __author__ = 'Darko Miletic' + description = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California' + publisher = 'Tribune Company' + category = 'news, politics, USA, Los Angeles, world' + oldest_article = 2 + max_articles_per_feed = 200 no_stylesheets = True + encoding = 'utf8' use_embedded_content = False - encoding = 'utf-8' - lang = 'en-US' + language = 'en' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://www.latimes.com/images/logo.png' + cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf' + extra_css = """ + body{font-family: Georgia,"Times New Roman",Times,serif } + img{margin-bottom: 0.4em; margin-top: 0.8em; display:block} + h2{font-size: 1.1em} + .deckhead{font-size: small; text-transform: uppercase} + .small{color: gray; font-size: small} + .date,.time,.copyright{font-size: x-small; color:gray; font-style:italic;} + """ conversion_options = { - 'comment' : description - , 'language' : lang - } + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : 'Yes' + } - extra_css = ''' - h1{font-family :Georgia,"Times New Roman",Times,serif; font-size:large; } - h2{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;} - .story{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;} - .entry-body{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;} - .entry-more{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;} - .credit{color:#666666; font-family :Georgia,"Times New 
Roman",Times,serif; font-size: xx-small;} - .small{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;} - .byline{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;} - .date{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;} - .time{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;} - .copyright{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; } - .subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;} - ''' - - # recursions = 1 - # match_regexps = [r'http://www.latimes.com/.*page=[2-9]'] - - keep_only_tags = [dict(name='div', attrs={'class':["story" ,"entry"] })] + keep_only_tags = [ + dict(name='div', attrs={'class':'story'}) + ,dict(attrs={'class':['entry-header','time','entry-content']}) + ] + remove_tags_after=dict(name='p', attrs={'class':'copyright'}) + remove_tags = [ + dict(name=['meta','link','iframe','object','embed']) + ,dict(attrs={'class':['toolSet','articlerail','googleAd','entry-footer-left','entry-footer-right','entry-footer-social','google-ad-story-bottom','sphereTools']}) + ,dict(attrs={'id':['article-promo','googleads','moduleArticleToolsContainer','gallery-subcontent']}) + ] + remove_attributes=['lang','xmlns:fb','xmlns:og','border','xtags','i','article_body'] - remove_tags = [ dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}), - dict(name='div', attrs={'id':["moduleArticleToolsContainer",]}), - dict(name='p', attrs={'class':["entry-footer",]}), - dict(name='ul', attrs={'class':"article-nav clearfix"}), - dict(name=['iframe']) - ] - - - feeds = [(u'News', u'http://feeds.latimes.com/latimes/news') - ,(u'Local','http://feeds.latimes.com/latimes/news/local') - ,(u'MostEmailed','http://feeds.latimes.com/MostEmailed') - 
,(u'Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/') - ,('OrangeCounty','http://feeds.latimes.com/latimes/news/local/orange/') - ,('National','http://feeds.latimes.com/latimes/news/nationworld/nation') - ,('Politics','http://feeds.latimes.com/latimes/news/politics/') - ,('Business','http://feeds.latimes.com/latimes/business') - ,('Sports','http://feeds.latimes.com/latimes/sports/') - ,('Entertainment','http://feeds.latimes.com/latimes/entertainment/') - ] - + feeds = [ + (u'Top News' , u'http://feeds.latimes.com/latimes/news' ) + ,(u'Local News' , u'http://feeds.latimes.com/latimes/news/local' ) + ,(u'National' , u'http://feeds.latimes.com/latimes/news/nationworld/nation' ) + ,(u'National Politics' , u'http://feeds.latimes.com/latimes/news/politics/' ) + ,(u'Business' , u'http://feeds.latimes.com/latimes/business' ) + ,(u'Education' , u'http://feeds.latimes.com/latimes/news/education' ) + ,(u'Environment' , u'http://feeds.latimes.com/latimes/news/science/environment' ) + ,(u'Religion' , u'http://feeds.latimes.com/latimes/features/religion' ) + ,(u'Science' , u'http://feeds.latimes.com/latimes/news/science' ) + ,(u'Technology' , u'http://feeds.latimes.com/latimes/technology' ) + ,(u'Africa' , u'http://feeds.latimes.com/latimes/africa' ) + ,(u'Asia' , u'http://feeds.latimes.com/latimes/asia' ) + ,(u'Europe' , u'http://feeds.latimes.com/latimes/europe' ) + ,(u'Latin America' , u'http://feeds.latimes.com/latimes/latinamerica' ) + ,(u'Middle East' , u'http://feeds.latimes.com/latimes/middleeast' ) + ,(u'Arts&Culture' , u'http://feeds.feedburner.com/latimes/entertainment/news/arts' ) + ,(u'Entertainment News' , u'http://feeds.feedburner.com/latimes/entertainment/news/' ) + ,(u'Movie News' , u'http://feeds.feedburner.com/latimes/entertainment/news/movies/' ) + ,(u'Movie Reviews' , u'http://feeds.feedburner.com/movies/reviews/' ) + ,(u'Music News' , u'http://feeds.feedburner.com/latimes/entertainment/news/music/' ) + ,(u'Pop Album Reviews' , 
u'http://feeds.feedburner.com/latimes/pop-album-reviews' ) + ,(u'Restaurant Reviews' , u'http://feeds.feedburner.com/latimes/restaurant/reviews' ) + ,(u'Theatar and Dance' , u'http://feeds.feedburner.com/latimes/theaterdance' ) + ,(u'Autos' , u'http://feeds.latimes.com/latimes/classified/automotive/highway1/') + ,(u'Books' , u'http://feeds.latimes.com/features/books' ) + ,(u'Food' , u'http://feeds.latimes.com/latimes/features/food/' ) + ,(u'Health' , u'http://feeds.latimes.com/latimes/features/health/' ) + ,(u'Real Estate' , u'http://feeds.latimes.com/latimes/classified/realestate/' ) + ,(u'Commentary' , u'http://feeds2.feedburner.com/latimes/news/opinion/commentary/' ) + ,(u'Sports' , u'http://feeds.latimes.com/latimes/sports/' ) + ] def get_article_url(self, article): - ans = article.get('feedburner_origlink').rpartition('?')[0] + ans = BasicNewsRecipe.get_article_url(self, article).rpartition('?')[0] try: self.log('Looking for full story link in', ans) @@ -83,4 +102,22 @@ class LATimes(BasicNewsRecipe): pass return ans - + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name ='div' + item.attrs =[] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + return soup From d34451b6d1bcd27d89c6dcbccadaa3c5bb0c079f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 15:45:17 -0700 Subject: [PATCH 05/11] EPUB Input: Filter made media tytpes from the spine, currently only filter Adobe page templates --- src/calibre/ebooks/epub/input.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index ec2004d81c..e22ed27371 100644 --- a/src/calibre/ebooks/epub/input.py +++ 
b/src/calibre/ebooks/epub/input.py @@ -175,6 +175,19 @@ class EPUBInput(InputFormatPlugin): raise ValueError( 'EPUB files with DTBook markup are not supported') + for x in list(opf.iterspine()): + ref = x.get('idref', None) + if ref is None: + x.getparent().remove(x) + continue + for y in opf.itermanifest(): + if y.get('id', None) == ref and y.get('media-type', None) in \ + ('application/vnd.adobe-page-template+xml',): + p = x.getparent() + if p is not None: + p.remove(x) + break + with open('content.opf', 'wb') as nopf: nopf.write(opf.render()) From 5849b45d11203a8172ff6c24dca21a9f651bb54c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 16:25:32 -0700 Subject: [PATCH 06/11] Fix #8688 (Calibre fails to convert some buggy chm ebooks which doesn't have .hhc file.) --- src/calibre/ebooks/chm/reader.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 025e252005..04ce6d5efe 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -139,6 +139,13 @@ class CHMReader(CHMFile): if self.hhc_path not in files and files: self.hhc_path = files[0] + if self.hhc_path == '.hhc' and self.hhc_path not in files: + from calibre import walk + for x in walk(output_dir): + if os.path.basename(x).lower() in ('index.htm', 'index.html'): + self.hhc_path = os.path.relpath(x, output_dir) + break + def _reformat(self, data, htmlpath): try: data = xml_to_unicode(data, strip_encoding_pats=True)[0] From c47bacb016eabdd6870d2a3409b1d2f2ba29f8eb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 17:12:22 -0700 Subject: [PATCH 07/11] Update 20 Minutos --- resources/recipes/20_minutos.recipe | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/resources/recipes/20_minutos.recipe b/resources/recipes/20_minutos.recipe index cb3002a76c..106c0dcffa 100644 --- a/resources/recipes/20_minutos.recipe +++ 
b/resources/recipes/20_minutos.recipe @@ -1,25 +1,25 @@ -# -*- coding: utf-8 __license__ = 'GPL v3' __author__ = 'Luis Hernandez' __copyright__ = 'Luis Hernandez' -description = 'Periódico gratuito en español - v0.8 - 27 Jan 2011' +__version__ = 'v0.85' +__date__ = '31 January 2011' ''' www.20minutos.es ''' - +import re from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1294946868(BasicNewsRecipe): - title = u'20 Minutos' + title = u'20 Minutos new' publisher = u'Grupo 20 Minutos' - __author__ = 'Luis Hernández' - description = 'Periódico gratuito en español' + __author__ = 'Luis Hernandez' + description = 'Free spanish newspaper' cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif' - oldest_article = 5 + oldest_article = 2 max_articles_per_feed = 100 remove_javascript = True @@ -29,6 +29,7 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe): encoding = 'ISO-8859-1' language = 'es' timefmt = '[%a, %d %b, %Y]' + remove_empty_feeds = True keep_only_tags = [ dict(name='div', attrs={'id':['content','vinetas',]}) @@ -43,13 +44,21 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe): remove_tags = [ dict(name='ol', attrs={'class':['navigation',]}) ,dict(name='span', attrs={'class':['action']}) - ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']}) + ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','photo-gallery side-art-block','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']}) ,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']}) ,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']}) ,dict(name='ul', 
attrs={'id':['site-links']}) ,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']}) ] + extra_css = """ + p{text-align: justify; font-size: 100%} + body{ text-align: left; font-size:100% } + h3{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; } + """ + + preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] + feeds = [ (u'Portada' , u'http://www.20minutos.es/rss/') ,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/') @@ -65,6 +74,6 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe): ,(u'Empleo' , u'http://www.20minutos.es/rss/empleo/') ,(u'Cine' , u'http://www.20minutos.es/rss/cine/') ,(u'Musica' , u'http://www.20minutos.es/rss/musica/') - ,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/') + ,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/') ,(u'Comunidad20' , u'http://www.20minutos.es/rss/zona20/') ] From 527f11e32e5d42b2cdda5a3439189dad9dc154ff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 17:15:02 -0700 Subject: [PATCH 08/11] Cinco Dias by Luis Hernandez --- resources/images/news/latimes.png | Bin 0 -> 358 bytes resources/recipes/cinco_dias.recipe | 71 ++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 resources/images/news/latimes.png create mode 100644 resources/recipes/cinco_dias.recipe diff --git a/resources/images/news/latimes.png b/resources/images/news/latimes.png new file mode 100644 index 0000000000000000000000000000000000000000..62bb4d0b8a2586c4884c4ccbac5b481bff096309 GIT binary patch literal 358 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!60wlNoGJgf6n3BBRT^JZv^(q?yd7K3vk;OpT z1B~5HX4`=T%L*LRfizezL(H+Yhk=~Qo-U3d8t11@4CHM#5OL)XZfNmb!6K6*q{XV9 z!gat=b%N#%84>n$d4~)!g@@msFMYq~`s~|WmREDNqm`6p1$K&ee13VRF63-l_VwSa z99j7WD{t=D8onTOa)<9t5pMp*UpJnJuc(w`W)}Wn!KTJ$ahu`&(}j~4JlHxrRq)D- zH7WbWLsmYMf6?2*WJZb$#?VG66YWwZt`|BqgyV)hf9t6-Y4{85kPs z8W`#t8-^GfS{WKxnVRbwm|GbbFwC#DN70a*pOTqYiK4;C%FxKlzyhM7ER_8(Py>Uf LtDnm{r-UW|G&pfi literal 0 HcmV?d00001 diff 
--git a/resources/recipes/cinco_dias.recipe b/resources/recipes/cinco_dias.recipe new file mode 100644 index 0000000000..40241aff5c --- /dev/null +++ b/resources/recipes/cinco_dias.recipe @@ -0,0 +1,71 @@ +__license__ = 'GPL v3' +__author__ = 'Luis Hernandez' +__copyright__ = 'Luis Hernandez' +__version__ = 'v1.2' +__date__ = '31 January 2011' + +''' +http://www.cincodias.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1294946868(BasicNewsRecipe): + + title = u'Cinco Dias' + publisher = u'Grupo Prisa' + + __author__ = 'Luis Hernandez' + description = 'spanish web about money and business, free edition' + + cover_url = 'http://www.prisa.com/images/logos/logo_cinco_dias.gif' + oldest_article = 2 + max_articles_per_feed = 100 + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + + language = 'es' + remove_empty_feeds = True + encoding = 'ISO-8859-1' + timefmt = '[%a, %d %b, %Y]' + + keep_only_tags = [ + dict(name='div', attrs={'class':['cab_articulo cab_noticia','pos_3','txt_noticia','mod_despiece']}) + ,dict(name='p', attrs={'class':['cintillo']}) + ] + + remove_tags_before = dict(name='div' , attrs={'class':['publi_h']}) + remove_tags_after = dict(name='div' , attrs={'class':['tab_util util_estadisticas']}) + + remove_tags = [ + dict(name='div', attrs={'class':['util-1','util-2','util-3','inner estirar','inner1','inner2','inner3','cont','tab_util util_estadisticas','tab_util util_enviar','mod_list_inf','mod_similares','mod_divisas','mod_sectores','mod_termometro','mod post','mod_img','mod_txt','nivel estirar','barra estirar','info_brujula btnBrujula','utilidad_brujula estirar']}) + ,dict(name='li', attrs={'class':['lnk-fcbook','lnk-retweet','lnk-meneame','desplegable','comentarios','list-options','estirar']}) + ,dict(name='ul', attrs={'class':['lista-izquierda','list-options','estirar']}) + ,dict(name='p', attrs={'class':['autor']}) + ] + + extra_css = """ + p{text-align: justify; font-size:
100%} + body{ text-align: left; font-size:100% } + h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; } + h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; } + """ + + feeds = [ + (u'Ultima Hora' , u'http://www.cincodias.com/rss/feed.html?feedId=17029') + ,(u'Empresas' , u'http://www.cincodias.com/rss/feed.html?feedId=19') + ,(u'Mercados' , u'http://www.cincodias.com/rss/feed.html?feedId=20') + ,(u'Economia' , u'http://www.cincodias.com/rss/feed.html?feedId=21') + ,(u'Tecnorama' , u'http://www.cincodias.com/rss/feed.html?feedId=17230') + ,(u'Tecnologia' , u'http://www.cincodias.com/rss/feed.html?feedId=17106') + ,(u'Finanzas Personales' , u'http://www.cincodias.com/rss/feed.html?feedId=22') + ,(u'Fiscalidad' , u'http://www.cincodias.com/rss/feed.html?feedId=17107') + ,(u'Vivienda' , u'http://www.cincodias.com/rss/feed.html?feedId=17108') + ,(u'Tendencias' , u'http://www.cincodias.com/rss/feed.html?feedId=17109') + ,(u'Empleo' , u'http://www.cincodias.com/rss/feed.html?feedId=17110') + ,(u'IBEX 35' , u'http://www.cincodias.com/rss/feed.html?feedId=17125') + ,(u'Sectores' , u'http://www.cincodias.com/rss/feed.html?feedId=17126') + ,(u'Opinion' , u'http://www.cincodias.com/rss/feed.html?feedId=17105') + ] From 360fd374c77d5d3f13c1b98b341a37a809154b72 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 18:01:13 -0700 Subject: [PATCH 09/11] ... --- src/calibre/manual/faq.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 849ded82c9..59f6a9b88d 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -391,6 +391,8 @@ Take your pick: * A tribute to the SONY Librie which was the first e-ink based e-book reader * My wife chose it ;-) +|app| is pronounced cal-i-ber, *not* ca-libre. If you're wondering, |app| is the British/Commonwealth spelling of caliber. Being Indian, that's the natural spelling for me.
+ Why does |app| show only some of my fonts on OS X? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |app| embeds fonts in ebook files it creates. E-book files support embedding only TrueType (.ttf) fonts. Most fonts on OS X systems are in .dfont format, thus they cannot be embedded. |app| shows only TrueType fonts found on your system. You can obtain many TrueType fonts on the web. Simply download the .ttf files and add them to the Library/Fonts directory in your home directory. From 5d4c7388629914e40c122f17a0106a363de3f810 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 18:58:04 -0700 Subject: [PATCH 10/11] Fix #8672 (Converted format disappears while adding a new format) --- src/calibre/gui2/dialogs/metadata_single.py | 8 ++++++-- src/calibre/gui2/metadata/basic_widgets.py | 9 ++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 7a8e4ea8d0..fa20658c12 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -429,10 +429,12 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): old_extensions.add(ext) for ext in new_extensions: self.db.add_format(self.row, ext, open(paths[ext], 'rb'), notify=False) - db_extensions = set([f.lower() for f in self.db.formats(self.row).split(',')]) + dbfmts = self.db.formats(self.row) + db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts + else [])]) extensions = new_extensions.union(old_extensions) for ext in db_extensions: - if ext not in extensions: + if ext not in extensions and ext in self.original_formats: self.db.remove_format(self.row, ext, notify=False) def show_format(self, item, *args): @@ -576,6 +578,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.orig_date = qt_to_dt(self.date.date()) exts = self.db.formats(row) + self.original_formats = [] if exts: exts = 
exts.split(',') for ext in exts: @@ -586,6 +589,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if size is None: continue Format(self.formats, ext, size, timestamp=timestamp) + self.original_formats.append(ext.lower()) self.initialize_combos() diff --git a/src/calibre/gui2/metadata/basic_widgets.py b/src/calibre/gui2/metadata/basic_widgets.py index 590a8be3bb..d3fa5958ab 100644 --- a/src/calibre/gui2/metadata/basic_widgets.py +++ b/src/calibre/gui2/metadata/basic_widgets.py @@ -472,6 +472,7 @@ class FormatsManager(QWidget): # {{{ def initialize(self, db, id_): self.changed = False exts = db.formats(id_, index_is_id=True) + self.original_val = set([]) if exts: exts = exts.split(',') for ext in exts: @@ -482,6 +483,7 @@ class FormatsManager(QWidget): # {{{ if size is None: continue Format(self.formats, ext, size, timestamp=timestamp) + self.original_val.add(ext.lower()) def commit(self, db, id_): if not self.changed: @@ -500,11 +502,12 @@ class FormatsManager(QWidget): # {{{ for ext in new_extensions: db.add_format(id_, ext, open(paths[ext], 'rb'), notify=False, index_is_id=True) - db_extensions = set([f.lower() for f in db.formats(id_, - index_is_id=True).split(',')]) + dbfmts = db.formats(id_, index_is_id=True) + db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts + else [])]) extensions = new_extensions.union(old_extensions) for ext in db_extensions: - if ext not in extensions: + if ext not in extensions and ext in self.original_val: db.remove_format(id_, ext, notify=False, index_is_id=True) self.changed = False From d2ba1812bb0b0d9c95acd6c0e22287ce47502bc9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Jan 2011 20:09:26 -0700 Subject: [PATCH 11/11] Initial import of new metadata download framework --- src/calibre/ebooks/metadata/sources/base.py | 61 +++++ src/calibre/ebooks/metadata/sources/google.py | 215 ++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 
src/calibre/ebooks/metadata/sources/base.py create mode 100644 src/calibre/ebooks/metadata/sources/google.py diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py new file mode 100644 index 0000000000..89ad8a7956 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re + +from calibre.customize import Plugin + +class Source(Plugin): + + type = _('Metadata source') + author = 'Kovid Goyal' + + supported_platforms = ['windows', 'osx', 'linux'] + + result_of_identify_is_complete = True + + def get_author_tokens(self, authors): + 'Take a list of authors and return a list of tokens useful for a ' + 'AND search query' + # Leave ' in there for Irish names + pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]') + for au in authors: + for tok in au.split(): + yield pat.sub('', tok) + + def split_jobs(self, jobs, num): + 'Split a list of jobs into at most num groups, as evenly as possible' + groups = [[] for i in range(num)] + jobs = list(jobs) + while jobs: + for gr in groups: + try: + job = jobs.pop() + except IndexError: + break + gr.append(job) + return [g for g in groups if g] + + def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}): + ''' + Identify a book by its title/author/isbn/etc. + + :param log: A log object, use it to output debugging information/errors + :param result_queue: A result Queue, results should be put into it. 
+ Each result is a Metadata object + :param abort: If abort.is_set() returns True, abort further processing + and return as soon as possible + :param title: The title of the book, can be None + :param authors: A list of authors of the book, can be None + :param identifiers: A dictionary of other identifiers, most commonly + {'isbn':'1234...'} + :return: None if no errors occurred, otherwise a unicode representation + of the error suitable for showing to the user + + ''' + return None + diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py new file mode 100644 index 0000000000..1a3bf6d516 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import time +from urllib import urlencode +from functools import partial +from threading import Thread + +from lxml import etree + +from calibre.ebooks.metadata.sources import Source +from calibre.ebooks.metadata.book.base import Metadata +from calibre.utils.date import parse_date, utcnow +from calibre import browser, as_unicode + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'dc': 'http://purl.org/dc/terms' + } +XPath = partial(etree.XPath, namespaces=NAMESPACES) + +total_results = XPath('//openSearch:totalResults') +start_index = XPath('//openSearch:startIndex') +items_per_page = XPath('//openSearch:itemsPerPage') +entry = XPath('//atom:entry') +entry_id = XPath('descendant::atom:id') +creator = XPath('descendant::dc:creator') +identifier = XPath('descendant::dc:identifier') +title = XPath('descendant::dc:title') +date = XPath('descendant::dc:date') +publisher = XPath('descendant::dc:publisher') +subject = XPath('descendant::dc:subject') +description = XPath('descendant::dc:description') +language 
= XPath('descendant::dc:language') + + + +def to_metadata(browser, log, entry_): + + def get_text(extra, x): + try: + ans = x(extra) + if ans: + ans = ans[0].text + if ans and ans.strip(): + return ans.strip() + except: + log.exception('Programming error:') + return None + + + id_url = entry_id(entry_)[0].text + title_ = ': '.join([x.text for x in title(entry_)]).strip() + authors = [x.text.strip() for x in creator(entry_) if x.text] + if not authors: + authors = [_('Unknown')] + if not id_url or not title_: + # Silently discard this entry (test the extracted text title_, not the always-truthy title XPath object) + return None + + mi = Metadata(title_, authors) + try: + raw = browser.open(id_url).read() + feed = etree.fromstring(raw) + extra = entry(feed)[0] + except: + log.exception('Failed to get additional details for', mi.title) + return mi + + mi.comments = get_text(extra, description) + #mi.language = get_text(extra, language) + mi.publisher = get_text(extra, publisher) + + # Author sort + for x in creator(extra): + for key, val in x.attrib.items(): + if key.endswith('file-as') and val and val.strip(): + mi.author_sort = val + break + # ISBN + isbns = [] + for x in identifier(extra): + t = str(x.text).strip() + if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): + if t[:5].upper() == 'ISBN:': + isbns.append(t[5:]) + if isbns: + mi.isbn = sorted(isbns, key=len)[-1] + + # Tags + try: + btags = [x.text for x in subject(extra) if x.text] + tags = [] + for t in btags: + tags.extend([y.strip() for y in t.split('/')]) + tags = list(sorted(list(set(tags)))) + except: + log.exception('Failed to parse tags:') + tags = [] + if tags: + mi.tags = [x.replace(',', ';') for x in tags] + + # pubdate + pubdate = get_text(extra, date) + if pubdate: + try: + default = utcnow().replace(day=15) + mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) + except: + log.exception('Failed to parse pubdate') + + + return mi + +class Worker(Thread): + + def __init__(self, log, entries, abort, result_queue): + self.browser, self.log, self.entries =
browser(), log, entries + self.abort, self.result_queue = abort, result_queue + Thread.__init__(self) + self.daemon = True + + def run(self): + for i in self.entries: + try: + ans = to_metadata(self.browser, self.log, i) + if ans is not None: + self.result_queue.put(ans) + except: + self.log.exception( + 'Failed to get metadata for identify entry:', + etree.tostring(i)) + if self.abort.is_set(): + break + + +class GoogleBooks(Source): + + name = 'Google Books' + + def create_query(self, log, title=None, authors=None, identifiers={}, + start_index=1): + BASE_URL = 'http://books.google.com/books/feeds/volumes?' + isbn = identifiers.get('isbn', None) + q = '' + if isbn is not None: + q += 'isbn:'+isbn + elif title or authors: + def build_term(prefix, parts): + return ' '.join('in'+prefix + ':' + x for x in parts) + if title is not None: + q += build_term('title', title.split()) + if authors: + q += ('+' if q else '')+build_term('author', + self.get_author_tokens(authors)) + + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + return BASE_URL+urlencode({ + 'q':q, + 'max-results':20, + 'start-index':start_index, + 'min-viewability':'none', + }) + + + def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}): + query = self.create_query(log, title=title, authors=authors, + identifiers=identifiers) + try: + raw = browser().open_novisit(query).read() + except Exception, e: + log.exception('Failed to make identify query: %r'%query) + return as_unicode(e) + + try: + parser = etree.XMLParser(recover=True, no_network=True) + feed = etree.fromstring(raw, parser=parser) + entries = entry(feed) + except Exception, e: + log.exception('Failed to parse identify results') + return as_unicode(e) + + + groups = self.split_jobs(entries, 5) # At most 5 threads + if not groups: + return + workers = [Worker(log, entries, abort, result_queue) for entries in + groups] + + if abort.is_set(): + return + + for worker in workers: 
worker.start() + + has_alive_worker = True + while has_alive_worker and not abort.is_set(): + has_alive_worker = False + for worker in workers: + if worker.is_alive(): + has_alive_worker = True + time.sleep(0.1) + + return None + + + +