mirror of https://github.com/kovidgoyal/calibre.git

commit 16c9b7dc1e: Sync to trunk.

New file: resources/recipes/ieeespectrum.recipe (63 lines)
resources/recipes/ieeespectrum.recipe (new file)
@@ -0,0 +1,63 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Franco Venturi <fventuri at comcast.net>'
'''
spectrum.ieee.org
'''

from calibre.web.feeds.news import BasicNewsRecipe
from string import capwords
from urlparse import urljoin

class IEEESpectrum(BasicNewsRecipe):
    title = 'IEEE Spectrum'
    __author__ = 'Franco Venturi'
    description = 'Electronics News from IEEE'
    publisher = 'IEEE'
    category = 'news, electronics, IT, computer science'
    oldest_article = 32
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    index = 'http://spectrum.ieee.org/magazine/'
    masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png'

    remove_javascript = True
    remove_tags = [dict(name={'script':True, 'object':True})]
    remove_attributes = ['height','width','alt']
    keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})]


    def parse_index(self):
        soup = self.index_to_soup(self.index)
        img = soup.find('img', image='cover.gif', src=True)
        if img is not None:
            self.cover_url = 'http://spectrum.ieee.org'+img['src']

        content = soup.find(id='gnrlContent')
        title = content.find(attrs={'class':'style4'}).string.strip()
        date = ' '.join(title.split()[0:2])
        self.timefmt = ' [' + date + ']'
        contents = []
        for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}):
            if tag['class'] == 'style2':
                contents.append((capwords(tag.renderContents().strip()), []))
            elif tag['class'] == 'lstngTitle':
                url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0'
                contents[-1][1].append({'title': tag.renderContents().strip(),
                                        'url': url,
                                        'date': date,
                                        'description': '',
                                        'content': ''
                                       })
            elif tag['class'] == 'lstngBody':
                contents[-1][1][-1]['description'] = tag.renderContents().strip()

        return contents

    def preprocess_html(self, soup):
        for a in soup.findAll('a'):
            if not a['href'].lower().startswith('http'):
                a['href'] = urljoin(self.index, a['href'])
        return soup
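A note on the recipe's preprocess_html: urljoin rebases any relative href against the magazine index, so links still resolve once the article is viewed outside the site. A minimal sketch of the behavior (Python 2, matching the recipe's urlparse import; the article paths are invented examples):

    from urlparse import urljoin

    index = 'http://spectrum.ieee.org/magazine/'
    print urljoin(index, '/computing/some-article')  # site-absolute: http://spectrum.ieee.org/computing/some-article
    print urljoin(index, 'some-article')             # relative: http://spectrum.ieee.org/magazine/some-article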
@@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe):
    # List of sections to exclude
    # To add a section, copy the section name from the allSectionKeywords list above
    # For example, to exclude 'Dining' and 'Weddings':
-   # excludeSectionKeywords = ['Dining','Weddings']
+   #excludeSectionKeywords = ['Dining','Weddings']
    excludeSectionKeywords = []

    # List of sections to include (test and debug only)
@@ -56,20 +56,25 @@ class NYTimes(BasicNewsRecipe):
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [dict(attrs={'class':[
+                           'articleFooter',
                            'articleTools',
                            'columnGroup doubleRule',
+                           'columnGroup singleRule',
                            'columnGroup last',
+                           'columnGroup last',
                            'doubleRule',
                            'dottedLine',
                            'entry-meta',
                            'icon enlargeThis',
                            'leftNavTabs',
                            'module box nav',
+                           'nextArticleLink',
                            'nextArticleLink clearfix',
                            'post-tools',
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
+                           'subNavigation tabContent active clearfix',
                            ]}),
                   dict(id=[
                            'adxLeaderboard',
@@ -222,11 +227,11 @@ class NYTimes(BasicNewsRecipe):

            if div['class'] == 'section-headline':
                key = string.capwords(feed_title(div))
-               excluded = re.compile('|'.join(self.excludeSectionKeywords))
-               if excluded.search(key):
-                   self.log("Skipping section %s" % key)
-                   continue
-
+               if self.excludeSectionKeywords:
+                   excluded = re.compile('|'.join(self.excludeSectionKeywords))
+                   if excluded.search(key):
+                       self.log("Skipping section %s" % key)
+                       continue
                articles[key] = []
                ans.append(key)

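The new guard fixes a real trap: with an empty exclude list, '|'.join([]) yields an empty pattern, and an empty regex matches every string, so every section would have been skipped. A minimal demonstration:

    import re

    excludeSectionKeywords = []
    excluded = re.compile('|'.join(excludeSectionKeywords))  # pattern is ''
    print excluded.search('World') is not None               # True: any section name matches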
@@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe):
    use_embedded_content = False
    simultaneous_downloads = 1
    encoding = 'ISO-8859-1'
-   lang = 'en-UK'
    remove_javascript = True
-   language = 'en'
+   language = 'en_GB'
    recursions = 9
    match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']

@@ -150,7 +150,8 @@ class PRS505(CLI, Device):
        for location in locations:
            info = metadata.next()
            path = location[0]
-           blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
+           oncard = location[3]
+           blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0

            if self._main_prefix and path.startswith(self._main_prefix):
                name = path.replace(self._main_prefix, '')
@@ -166,7 +167,11 @@ class PRS505(CLI, Device):

            opts = self.settings()
            collections = opts.extra_customization.split(',') if opts.extra_customization else []
-           booklists[blist].add_book(info, name, collections, *location[1:-1])
+           booklist = booklists[blist]
+           if not hasattr(booklist, 'add_book'):
+               raise ValueError(('Incorrect upload location %s. Did you choose the'
+                   ' correct card A or B, to send books to?')%oncard)
+           booklist.add_book(info, name, collections, *location[1:-1])
        fix_ids(*booklists)

    def delete_books(self, paths, end_session=True):
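The hasattr check turns a confusing AttributeError into a descriptive ValueError when books are sent to a card slot with no usable booklist. A rough sketch of the surrounding logic; the placeholder tuple is invented for illustration, not the driver's actual representation:

    def booklist_index(oncard):
        # main memory -> 0, card A -> 1, card B -> 2, as in the driver
        return 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0

    booklists = (['main'], [], None)   # hypothetical: card B absent
    booklist = booklists[booklist_index('cardb')]
    if not hasattr(booklist, 'add_book'):
        raise ValueError(('Incorrect upload location %s. Did you choose the'
            ' correct card A or B, to send books to?') % 'cardb')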
@@ -230,14 +230,25 @@ class HTMLPreProcessor(object):

        end_rules = []
        if getattr(self.extra_opts, 'remove_header', None):
-           end_rules.append(
-               (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
-           )
+           try:
+               end_rules.append(
+                   (re.compile(self.extra_opts.header_regex), lambda match : '')
+               )
+           except:
+               import traceback
+               print 'Failed to parse remove_header regexp'
+               traceback.print_exc()

        if getattr(self.extra_opts, 'remove_footer', None):
-           end_rules.append(
-               (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
-           )
+           try:
+               end_rules.append(
+                   (re.compile(self.extra_opts.footer_regex), lambda match : '')
+               )
+           except:
+               import traceback
+               print 'Failed to parse remove_footer regexp'
+               traceback.print_exc()

        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
            if length:
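header_regex and footer_regex come straight from user preferences, so a malformed pattern used to abort the whole conversion; the new try/except logs the failure and carries on. A minimal sketch of the failure mode (the bad pattern is a hypothetical user input):

    import re, traceback

    user_pattern = '(unclosed'   # hypothetical bad input: unbalanced parenthesis
    end_rules = []
    try:
        end_rules.append((re.compile(user_pattern), lambda match: ''))
    except:
        print 'Failed to parse remove_header regexp'
        traceback.print_exc()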
@@ -267,7 +267,7 @@ class MetadataUpdater(object):
            offset += md_len
            self.metadata[tag] = metadata

-   def regenerate_headers(self, len_updated_metadata):
+   def regenerate_headers(self, updated_md_len):

        headers = {}
        for tag in self.topaz_headers:
@@ -276,22 +276,16 @@ class MetadataUpdater(object):
            else:
                headers[tag] = None

-       # Sort headers based on initial offset
-       sh = sorted(headers,key=lambda x:(headers[x],headers[x]))
-
-       # Metadata goes last
-       sh.remove('metadata')
-       sh.append('metadata')
-
        original_md_len = self.topaz_headers['metadata']['blocks'][0]['len_uncomp']
        original_md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
+       delta = updated_md_len - original_md_len

        # Copy the first 5 bytes of the file: sig + num_recs
        ths = StringIO.StringIO()
        ths.write(self.data[:5])

-       # Rewrite the offsets for hdr_offsets > metadata original location
-       for tag in sh[:-1]:
+       # Rewrite the offsets for hdr_offsets > metadata offset
+       for tag in headers.keys():
            ths.write('c')
            ths.write(self.encode_vwi(len(tag)))
            ths.write(tag)
@@ -300,32 +294,18 @@ class MetadataUpdater(object):
                for block in self.topaz_headers[tag]['blocks']:
                    b = self.topaz_headers[tag]['blocks'][block]

-                   if b['offset'] < original_md_offset:
+                   if b['offset'] <= original_md_offset:
                        ths.write(self.encode_vwi(b['offset']))
                    else:
-                       ths.write(self.encode_vwi(b['offset'] - original_md_len))
+                       ths.write(self.encode_vwi(b['offset'] + delta))

-                   ths.write(self.encode_vwi(b['len_uncomp']))
+                   if tag == 'metadata':
+                       ths.write(self.encode_vwi(updated_md_len))
+                   else:
+                       ths.write(self.encode_vwi(b['len_uncomp']))
                    ths.write(self.encode_vwi(b['len_comp']))
            else:
                ths.write(self.encode_vwi(0))

-       # Adjust metadata offset to end
-       new_md_offset = (len(self.data) - self.base - original_md_len)
-
-       new_md_len = len_updated_metadata - 1 - len('metadata') - 1
-
-       # Write the metadata header
-       ths.write('c')
-       ths.write(self.encode_vwi(len('metadata')))
-       ths.write('metadata')
-       ths.write(self.encode_vwi(1))
-       ths.write(self.encode_vwi(new_md_offset))
-
-       ths.write(self.encode_vwi(new_md_len))
-       ths.write(self.encode_vwi(0))
-
-       self.sorted_headers = sh
        self.original_md_start = original_md_offset + self.base
        self.original_md_len = original_md_len
        return ths.getvalue().encode('iso-8859-1')
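The rewrite replaces the old sort-and-relocate scheme with a single delta: blocks at or before the original metadata offset keep their positions, and everything after it shifts by the change in metadata length. A simplified sketch of that arithmetic; the block dicts and numbers are invented for illustration, not the real Topaz header layout:

    original_md_offset = 1000
    original_md_len, updated_md_len = 200, 260
    delta = updated_md_len - original_md_len      # +60

    blocks = [{'offset': 500}, {'offset': 1000}, {'offset': 1400}]
    for b in blocks:
        if b['offset'] <= original_md_offset:
            print b['offset']                     # 500, 1000: unchanged
        else:
            print b['offset'] + delta             # 1460: shifted past the grown metadata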
@@ -364,8 +344,8 @@ class MetadataUpdater(object):
        self.stream.write(head)
        self.stream.write('d')
        self.stream.write(chunk1)
-       self.stream.write(chunk2)
        self.stream.write(updated_metadata)
+       self.stream.write(chunk2)

def get_metadata(stream):
    mu = MetadataUpdater(stream)
@@ -377,6 +357,21 @@ def set_metadata(stream, mi):
        return

if __name__ == '__main__':
-   #print get_metadata(open(sys.argv[1], 'rb'))
-   mi = MetaInformation(title="My New Title", authors=['Smith, John'])
-   set_metadata(open(sys.argv[1], 'rb'), mi)
+   if False:
+       # Test get_metadata()
+       print get_metadata(open(sys.argv[1], 'rb'))
+   else:
+       # Test set_metadata()
+       import cStringIO
+       data = open(sys.argv[1], 'rb')
+       stream = cStringIO.StringIO()
+       stream.write(data.read())
+       mi = MetaInformation(title="A Marvelously Long Title", authors=['Riker, Gregory; Riker, Charles'])
+       set_metadata(stream, mi)
+
+       # Write the result
+       tokens = sys.argv[1].rpartition('.')
+       updated_data = open(tokens[0]+'-updated' + '.' + tokens[2],'wb')
+       updated_data.write(stream.getvalue())
+       updated_data.close()

@@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe):
                self.log.error(_('Could not download cover: %s')%str(err))
                self.log.debug(traceback.format_exc())
        if cu is not None:
-           ext = cu.rpartition('.')[-1]
+           ext = cu.split('/')[-1].rpartition('.')[-1]
            if '?' in ext:
                ext = ''
-           ext = ext.lower() if ext else 'jpg'
+           ext = ext.lower() if ext and '/' not in ext else 'jpg'
            cpath = os.path.join(self.output_dir, 'cover.'+ext)
            if os.access(cu, os.R_OK):
                with open(cpath, 'wb') as cfile:
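The two changed lines guard against cover URLs where rpartition('.') latches onto a dot in a directory component instead of a file extension. A short demonstration with a hypothetical URL:

    cu = 'http://example.com/covers.dir/12345'
    print cu.rpartition('.')[-1]                   # old: 'dir/12345', a bogus extension
    print cu.split('/')[-1].rpartition('.')[-1]    # new: '12345', last path segment only
    # and the added "'/' not in ext" test makes any remaining slash fall back to 'jpg'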