Make news download more robust and fetch blogs in The Atlantic recipe

Kovid Goyal 2008-11-06 11:06:32 -08:00
parent 7fb21ac00d
commit dbe52fd1b3
2 changed files with 37 additions and 22 deletions

View File

@@ -16,7 +16,7 @@ class TheAtlantic(BasicNewsRecipe):
     INDEX = 'http://www.theatlantic.com/doc/current'

     remove_tags_before = dict(name='div', id='storytop')
-    remove_tags = [dict(name='div', id=['seealso', 'storybottom', 'footer'])]
+    remove_tags = [dict(name='div', id=['seealso', 'storybottom', 'footer', 'ad_banner_top', 'sidebar'])]
     no_stylesheets = True

     def parse_index(self):
@@ -35,8 +35,9 @@ class TheAtlantic(BasicNewsRecipe):
         for item in soup.findAll('div', attrs={'class':'item'}):
             a = item.find('a')
             if a and a.has_key('href'):
-                url = a['href']
-                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                url = a['href'].replace('/doc', 'doc/print')
+                if not url.startswith('http://'):
+                    url = 'http://www.theatlantic.com/'+url
                 title = self.tag_to_string(a)
                 byline = item.find(attrs={'class':'byline'})
                 date = self.tag_to_string(byline) if byline else ''
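The rewritten href handling copes with both the site-relative article links ('/doc/...') and the absolute blog links this commit adds. A minimal sketch of the same normalization as a standalone helper (the function name is mine, not part of the recipe):

def normalize_atlantic_url(href):
    # Relative article links like '/doc/200811/foo' become the
    # printer-friendly 'doc/print/200811/foo' (str.replace drops the
    # leading slash; the prefix below restores it).
    url = href.replace('/doc', 'doc/print')
    # Absolute links (e.g. blog posts) that contain no '/doc' segment
    # pass through unchanged and skip the prefix.
    if not url.startswith('http://'):
        url = 'http://www.theatlantic.com/' + url
    return url

# normalize_atlantic_url('/doc/200811/foo')
#   -> 'http://www.theatlantic.com/doc/print/200811/foo'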
@@ -48,5 +49,4 @@ class TheAtlantic(BasicNewsRecipe):
                     'description':description
                 })

         return [('Current Issue', articles)]
-

View File

@@ -10,6 +10,7 @@ UTF-8 encoding with any charset declarations removed.

 import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname
 from httplib import responses
+from contextlib import closing
 from calibre import setup_cli_handlers, browser, sanitize_file_name, \
                     relpath, LoggingInterface
@@ -48,6 +49,11 @@ def save_soup(soup, target):
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))

+class response(str):
+
+    def __init__(self, *args):
+        str.__init__(self, *args)
+        self.newurl = None

 class RecursiveFetcher(object, LoggingInterface):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
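The str subclass is the trick that lets fetch_url keep returning something string-like while also carrying the post-redirect URL, so existing callers that expect plain text keep working. A quick illustration of how such an object behaves (values invented):

data = response('<html>...</html>')
data.newurl = 'http://example.com/final'  # set once redirects resolve

len(data)              # behaves like any str
data.startswith('<')   # True
data.newurl            # the URL the server actually served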
@@ -134,13 +140,25 @@ class RecursiveFetcher(object, LoggingInterface):
     def fetch_url(self, url):
-        f = None
+        data = None
         self.log_debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
         try:
-            f = self.browser.open(url)
+            try:
+                with closing(self.browser.open(url)) as f:
+                    data = response(f.read())
+                    data.newurl = f.geturl()
+            except AttributeError:
+                time.sleep(2)
+                try:
+                    with closing(self.browser.open(url)) as f:
+                        data = response(f.read())
+                        data.newurl = f.geturl()
+                except AttributeError:
+                    data = response(urllib2.urlopen(url).read())
+                    data.newurl = f.geturl()
         except urllib2.URLError, err:
             if hasattr(err, 'code') and responses.has_key(err.code):
                 raise FetchError, responses[err.code]
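closing() guarantees the network handle is released even when read() raises partway through, and the nested AttributeError handlers give the mechanize browser one retry before falling back to plain urllib2. A stripped-down sketch of the same open-read-close-retry shape (retry count, delay, and the exception caught here are illustrative, not calibre's exact behaviour):

import time, urllib2
from contextlib import closing

def fetch(url, retries=1, delay=2):
    # Read the whole document inside closing() so the socket is
    # closed on success and on error alike; retry after a pause.
    for attempt in range(retries + 1):
        try:
            with closing(urllib2.urlopen(url)) as f:
                return f.read(), f.geturl()
        except urllib2.URLError:
            if attempt == retries:
                raise
            time.sleep(delay)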
@@ -149,12 +167,13 @@ class RecursiveFetcher(object, LoggingInterface):
                 time.sleep(1)
                 if hasattr(f, 'close'):
                     f.close()
-                f = self.browser.open(url)
+                with closing(self.browser.open(url)) as f:
+                    data = f.read()
             else:
                 raise err
         finally:
             self.last_fetch_at = time.time()
-        return f
+        return data

     def start_fetch(self, url):
@@ -196,7 +215,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     tag['href'] = self.stylemap[iurl]
                     continue
                 try:
-                    f = self.fetch_url(iurl)
+                    data = self.fetch_url(iurl)
                 except Exception, err:
                     self.log_debug('Could not fetch stylesheet %s', iurl)
                     self.log_debug('Error: %s', str(err), exc_info=True)
@@ -205,8 +224,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 with self.stylemap_lock:
                     self.stylemap[iurl] = stylepath
                 with open(stylepath, 'wb') as x:
-                    x.write(f.read())
-                f.close()
+                    x.write(data)
                 tag['href'] = stylepath
             else:
                 for ns in tag.findAll(text=True):
@@ -221,19 +239,17 @@ class RecursiveFetcher(object, LoggingInterface):
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                         try:
-                            f = self.fetch_url(iurl)
+                            data = self.fetch_url(iurl)
                         except Exception, err:
                             self.log_warning('Could not fetch stylesheet %s', iurl)
                             self.log_debug('Error: %s', str(err), exc_info=True)
-                            if hasattr(f, 'close'): f.close()
                             continue
                         c += 1
                         stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                         with self.stylemap_lock:
                             self.stylemap[iurl] = stylepath
                         with open(stylepath, 'wb') as x:
-                            x.write(f.read())
-                        f.close()
+                            x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))
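This branch rewrites url(...) references found inside inline <style> text rather than in <link> tags. Extracting those references with a regex looks roughly like this (the pattern is illustrative; calibre defines its own elsewhere in this file):

import re

CSS_URL = re.compile(r'url\(\s*[\'"]?([^\'")]+)[\'"]?\s*\)', re.IGNORECASE)

src = 'div { background: url("/img/bg.png") no-repeat }'
for m in CSS_URL.finditer(src):
    print m.group(1)   # -> /img/bg.png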
@@ -256,7 +272,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 tag['src'] = self.imagemap[iurl]
                 continue
             try:
-                f = self.fetch_url(iurl)
+                data = self.fetch_url(iurl)
             except Exception, err:
                 self.log_warning('Could not fetch image %s', iurl)
                 self.log_debug('Error: %s', str(err), exc_info=True)
@@ -269,8 +285,7 @@ class RecursiveFetcher(object, LoggingInterface):
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
-                x.write(f.read())
-            f.close()
+                x.write(data)
             tag['src'] = imgpath

     def absurl(self, baseurl, tag, key, filter=True):
@@ -337,9 +352,8 @@ class RecursiveFetcher(object, LoggingInterface):
                         os.mkdir(linkdiskpath)
                     try:
                         self.current_dir = linkdiskpath
-                        f = self.fetch_url(iurl)
-                        dsrc = f.read()
-                        f.close()
+                        dsrc = self.fetch_url(iurl)
+                        newbaseurl = dsrc.newurl
                         if len(dsrc) == 0 or \
                            len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                             raise ValueError('No content at URL %s'%iurl)
@@ -349,7 +363,7 @@ class RecursiveFetcher(object, LoggingInterface):
                             dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                         soup = self.get_soup(dsrc)
-                        newbaseurl = f.geturl()
+
                         base = soup.find('base', href=True)
                         if base is not None:
                             newbaseurl = base['href']
@@ -372,6 +386,7 @@ class RecursiveFetcher(object, LoggingInterface):
                         soup = self.postprocess_html_ext(soup,
                                 c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                 self.job_info)
+
                         if c==0 and recursion_level == 0:
                             self.called_first = True