From 8af795d9dd34ce97bfa9ca3f07afd3553f64ff9d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Mar 2010 06:58:02 +0530 Subject: [PATCH 1/7] Fix NYTimes --- resources/recipes/nytimes_sub.recipe | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index c126902899..78f6016c94 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe): # List of sections to exclude # To add a section, copy the section name from the allSectionKeywords list above # For example, to exclude 'Dining' and 'Weddings': - # excludeSectionKeywords = ['Dining','Weddings'] + #excludeSectionKeywords = ['Dining','Weddings'] excludeSectionKeywords = [] # List of sections to include (test and debug only) @@ -56,9 +56,12 @@ class NYTimes(BasicNewsRecipe): remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') remove_tags = [dict(attrs={'class':[ + 'articleFooter', + 'articleInline runaroundLeft', 'articleTools', 'columnGroup doubleRule', 'columnGroup last', + 'columnGroup last', 'doubleRule', 'dottedLine', 'entry-meta', @@ -70,6 +73,7 @@ class NYTimes(BasicNewsRecipe): 'relatedSearchesModule', 'side_tool', 'singleAd', + 'subNavigation tabContent active clearfix', ]}), dict(id=[ 'adxLeaderboard', @@ -222,11 +226,11 @@ class NYTimes(BasicNewsRecipe): if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) - excluded = re.compile('|'.join(self.excludeSectionKeywords)) - if excluded.search(key): - self.log("Skipping section %s" % key) - continue - + if len(self.excludeSectionKeywords): + excluded = re.compile('|'.join(self.excludeSectionKeywords)) + if excluded.search(key): + self.log("Skipping section %s" % key) + continue articles[key] = [] ans.append(key) From 70d9a6d3d60b99276467832809f988fe862fa6fb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Mar 2010 07:20:57 +0530 Subject: [PATCH 2/7] Don't choke if the user provides an invalid remove header/footer regexp --- src/calibre/ebooks/conversion/preprocess.py | 25 +++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 29ce0e4296..ada4f1a3af 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -230,14 +230,25 @@ class HTMLPreProcessor(object): end_rules = [] if getattr(self.extra_opts, 'remove_header', None): - end_rules.append( - (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') - ) + try: + end_rules.append( + (re.compile(self.extra_opts.header_regex), lambda match : '') + ) + except: + import traceback + print 'Failed to parse remove_header regexp' + traceback.print_exc() + if getattr(self.extra_opts, 'remove_footer', None): - end_rules.append( - (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') - ) - + try: + end_rules.append( + (re.compile(self.extra_opts.footer_regex), lambda match : '') + ) + except: + import traceback + print 'Failed to parse remove_footer regexp' + traceback.print_exc() + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) if length: From 91f3f2d8b8addbaef9975b0eb7b53d8abfa0b44d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Mar 2010 06:43:16 +0530 Subject: [PATCH 3/7] More intelligent error message when user selects wrong card for send to device operation on SONYs and improved nyimes --- resources/recipes/nytimes_sub.recipe | 5 +++-- src/calibre/devices/prs505/driver.py | 9 +++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 78f6016c94..93df08220d 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -57,9 +57,9 @@ class NYTimes(BasicNewsRecipe): remove_tags_after = dict(id='article') remove_tags = [dict(attrs={'class':[ 'articleFooter', - 'articleInline runaroundLeft', 'articleTools', 'columnGroup doubleRule', + 'columnGroup singleRule', 'columnGroup last', 'columnGroup last', 'doubleRule', @@ -68,6 +68,7 @@ class NYTimes(BasicNewsRecipe): 'icon enlargeThis', 'leftNavTabs', 'module box nav', + 'nextArticleLink', 'nextArticleLink clearfix', 'post-tools', 'relatedSearchesModule', @@ -226,7 +227,7 @@ class NYTimes(BasicNewsRecipe): if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) - if len(self.excludeSectionKeywords): + if self.excludeSectionKeywords: excluded = re.compile('|'.join(self.excludeSectionKeywords)) if excluded.search(key): self.log("Skipping section %s" % key) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index 5d759be47c..448965a913 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -150,7 +150,8 @@ class PRS505(CLI, Device): for location in locations: info = metadata.next() path = location[0] - blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0 + oncard = location[3] + blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0 if self._main_prefix and path.startswith(self._main_prefix): name = path.replace(self._main_prefix, '') @@ -166,7 +167,11 @@ class PRS505(CLI, Device): opts = self.settings() collections = opts.extra_customization.split(',') if opts.extra_customization else [] - booklists[blist].add_book(info, name, collections, *location[1:-1]) + booklist = booklists[blist] + if not hasattr(booklist, 'add_book'): + raise ValueError(('Incorrect upload location %s. Did you choose the' + ' correct card A or B, to send books to?')%oncard) + booklist.add_book(info, name, collections, *location[1:-1]) fix_ids(*booklists) def delete_books(self, paths, end_session=True): From fdaed4a1690298d20e49034656fc3e65ca5412b4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Mar 2010 09:46:54 +0530 Subject: [PATCH 4/7] IEEE Spectrum by Franco Venturi --- resources/recipes/ieeespectrum.recipe | 67 +++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 resources/recipes/ieeespectrum.recipe diff --git a/resources/recipes/ieeespectrum.recipe b/resources/recipes/ieeespectrum.recipe new file mode 100644 index 0000000000..79a107cd9d --- /dev/null +++ b/resources/recipes/ieeespectrum.recipe @@ -0,0 +1,67 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Franco Venturi ' +''' +spectrum.ieee.org +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from string import capwords +from urlparse import urljoin + +class IEEESpectrum(BasicNewsRecipe): + title = 'IEEE Spectrum' + __author__ = 'Franco Venturi' + description = 'Electronics News from IEEE' + publisher = 'IEEE' + category = 'news, electronics, IT, computer science' + oldest_article = 32 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + language = 'en' + index = 'http://spectrum.ieee.org/magazine/' + masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png' + + remove_javascript = True + remove_tags = [dict(name={'script':True, 'object':True})] + remove_attributes = ['height','width','alt'] + keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})] + + +# def get_cover_url(self): +# cover_url = None +# soup = self.index_to_soup(self.index) +# cover_item = soup.find('img',attrs={'image':'cover.gif'}) +# if cover_item: +# cover_url = urljoin(self.index, cover_item['src']) +# return cover_url + + def parse_index(self): + soup = self.index_to_soup(self.index) + content = soup.find(id='gnrlContent') + title = content.find(attrs={'class':'style4'}).string.strip() + date = ' '.join(title.split()[0:2]) + self.timefmt = ' [' + date + ']' + contents = [] + for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}): + if tag['class'] == 'style2': + contents.append((capwords(tag.renderContents().strip()), [])) + elif tag['class'] == 'lstngTitle': + url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0' + contents[-1][1].append({'title': tag.renderContents().strip(), + 'url': url, + 'date': date, + 'description': '', + 'content': '' + }) + elif tag['class'] == 'lstngBody': + contents[-1][1][-1]['description'] = tag.renderContents().strip() + + return contents + + def preprocess_html(self, soup): + for a in soup.findAll('a'): + if not a['href'].lower().startswith('http'): + a['href'] = urljoin(self.index, a['href']) + return soup From 69ec0e1ee56699429b4b5ee4b9cb2bddccaec070 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Mar 2010 11:36:00 +0530 Subject: [PATCH 5/7] ... --- resources/recipes/ieeespectrum.recipe | 12 ++++-------- src/calibre/web/feeds/news.py | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/resources/recipes/ieeespectrum.recipe b/resources/recipes/ieeespectrum.recipe index 79a107cd9d..e2490b2a6c 100644 --- a/resources/recipes/ieeespectrum.recipe +++ b/resources/recipes/ieeespectrum.recipe @@ -29,16 +29,12 @@ class IEEESpectrum(BasicNewsRecipe): keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})] -# def get_cover_url(self): -# cover_url = None -# soup = self.index_to_soup(self.index) -# cover_item = soup.find('img',attrs={'image':'cover.gif'}) -# if cover_item: -# cover_url = urljoin(self.index, cover_item['src']) -# return cover_url - def parse_index(self): soup = self.index_to_soup(self.index) + img = soup.find('img', image='cover.gif', src=True) + if img is not None: + self.cover_url = 'http://spectrum.ieee.org'+img['src'] + content = soup.find(id='gnrlContent') title = content.find(attrs={'class':'style4'}).string.strip() date = ' '.join(title.split()[0:2]) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index d07c135abd..496a1f4d5b 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe): self.log.error(_('Could not download cover: %s')%str(err)) self.log.debug(traceback.format_exc()) if cu is not None: - ext = cu.rpartition('.')[-1] + ext = cu.split('/')[-1].rpartition('.')[-1] if '?' in ext: ext = '' - ext = ext.lower() if ext else 'jpg' + ext = ext.lower() if ext and '/' not in ext else 'jpg' cpath = os.path.join(self.output_dir, 'cover.'+ext) if os.access(cu, os.R_OK): with open(cpath, 'wb') as cfile: From 4a3ebab22ec7393f3b837330b2a358c53cbe18e5 Mon Sep 17 00:00:00 2001 From: GRiker Date: Tue, 23 Mar 2010 16:53:35 -0700 Subject: [PATCH 6/7] GwR fix problem with TPZ files not indexing on Kindle --- src/calibre/ebooks/metadata/topaz.py | 63 +++++++++++++--------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/src/calibre/ebooks/metadata/topaz.py b/src/calibre/ebooks/metadata/topaz.py index 6fe858df53..996cc77770 100644 --- a/src/calibre/ebooks/metadata/topaz.py +++ b/src/calibre/ebooks/metadata/topaz.py @@ -267,7 +267,7 @@ class MetadataUpdater(object): offset += md_len self.metadata[tag] = metadata - def regenerate_headers(self, len_updated_metadata): + def regenerate_headers(self, updated_md_len): headers = {} for tag in self.topaz_headers: @@ -276,22 +276,16 @@ class MetadataUpdater(object): else: headers[tag] = None - # Sort headers based on initial offset - sh = sorted(headers,key=lambda x:(headers[x],headers[x])) - - # Metadata goes last - sh.remove('metadata') - sh.append('metadata') - original_md_len = self.topaz_headers['metadata']['blocks'][0]['len_uncomp'] original_md_offset = self.topaz_headers['metadata']['blocks'][0]['offset'] + delta = updated_md_len - original_md_len # Copy the first 5 bytes of the file: sig + num_recs ths = StringIO.StringIO() ths.write(self.data[:5]) - # Rewrite the offsets for hdr_offsets > metadata original location - for tag in sh[:-1]: + # Rewrite the offsets for hdr_offsets > metadata offset + for tag in headers.keys(): ths.write('c') ths.write(self.encode_vwi(len(tag))) ths.write(tag) @@ -300,32 +294,18 @@ class MetadataUpdater(object): for block in self.topaz_headers[tag]['blocks']: b = self.topaz_headers[tag]['blocks'][block] - if b['offset'] < original_md_offset: + if b['offset'] <= original_md_offset: ths.write(self.encode_vwi(b['offset'])) else: - ths.write(self.encode_vwi(b['offset'] - original_md_len)) + ths.write(self.encode_vwi(b['offset'] + delta)) - ths.write(self.encode_vwi(b['len_uncomp'])) + if tag == 'metadata': + ths.write(self.encode_vwi(updated_md_len)) + else: + ths.write(self.encode_vwi(b['len_uncomp'])) ths.write(self.encode_vwi(b['len_comp'])) else: ths.write(self.encode_vwi(0)) - - # Adjust metadata offset to end - new_md_offset = (len(self.data) - self.base - original_md_len) - - new_md_len = len_updated_metadata - 1 - len('metadata') - 1 - - # Write the metadata header - ths.write('c') - ths.write(self.encode_vwi(len('metadata'))) - ths.write('metadata') - ths.write(self.encode_vwi(1)) - ths.write(self.encode_vwi(new_md_offset)) - - ths.write(self.encode_vwi(new_md_len)) - ths.write(self.encode_vwi(0)) - - self.sorted_headers = sh self.original_md_start = original_md_offset + self.base self.original_md_len = original_md_len return ths.getvalue().encode('iso-8859-1') @@ -364,8 +344,8 @@ class MetadataUpdater(object): self.stream.write(head) self.stream.write('d') self.stream.write(chunk1) - self.stream.write(chunk2) self.stream.write(updated_metadata) + self.stream.write(chunk2) def get_metadata(stream): mu = MetadataUpdater(stream) @@ -377,6 +357,21 @@ def set_metadata(stream, mi): return if __name__ == '__main__': - #print get_metadata(open(sys.argv[1], 'rb')) - mi = MetaInformation(title="My New Title", authors=['Smith, John']) - set_metadata(open(sys.argv[1], 'rb'), mi) + if False: + # Test get_metadata() + print get_metadata(open(sys.argv[1], 'rb')) + else: + # Test set_metadata() + import cStringIO + data = open(sys.argv[1], 'rb') + stream = cStringIO.StringIO() + stream.write(data.read()) + mi = MetaInformation(title="A Marvelously Long Title", authors=['Riker, Gregory; Riker, Charles']) + set_metadata(stream, mi) + + # Write the result + tokens = sys.argv[1].rpartition('.') + updated_data = open(tokens[0]+'-updated' + '.' + tokens[2],'wb') + updated_data.write(stream.getvalue()) + updated_data.close() + From c1a6be45b95c1f46de1ed7872b6d49fa8e688ed5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Mar 2010 08:47:31 +0530 Subject: [PATCH 7/7] ... --- resources/recipes/times_online.recipe | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/resources/recipes/times_online.recipe b/resources/recipes/times_online.recipe index 98c26e6a66..98e96552ce 100644 --- a/resources/recipes/times_online.recipe +++ b/resources/recipes/times_online.recipe @@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe): use_embedded_content = False simultaneous_downloads = 1 encoding = 'ISO-8859-1' - lang = 'en-UK' remove_javascript = True - language = 'en' + language = 'en_GB' recursions = 9 match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']