From dbe52fd1b394613a175478560eb7aefa3fb2bf58 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 6 Nov 2008 11:06:32 -0800
Subject: [PATCH] Make news download more robust and fetch blogs in The
 Atlantic recipe

---
 src/calibre/web/feeds/recipes/atlantic.py |  8 ++--
 src/calibre/web/fetch/simple.py           | 51 +++++++++++++++--------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/src/calibre/web/feeds/recipes/atlantic.py b/src/calibre/web/feeds/recipes/atlantic.py
index 1eb59ddd88..8978925528 100644
--- a/src/calibre/web/feeds/recipes/atlantic.py
+++ b/src/calibre/web/feeds/recipes/atlantic.py
@@ -16,7 +16,7 @@ class TheAtlantic(BasicNewsRecipe):
     INDEX = 'http://www.theatlantic.com/doc/current'
     
     remove_tags_before = dict(name='div', id='storytop')
-    remove_tags        = [dict(name='div', id=['seealso', 'storybottom', 'footer'])]
+    remove_tags        = [dict(name='div', id=['seealso', 'storybottom', 'footer', 'ad_banner_top', 'sidebar'])]
     no_stylesheets     = True
     
     def parse_index(self):
@@ -35,8 +35,9 @@ class TheAtlantic(BasicNewsRecipe):
         for item in soup.findAll('div', attrs={'class':'item'}):
             a = item.find('a')
             if a and a.has_key('href'):
-                url = a['href']
-                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                url = a['href'].replace('/doc', 'doc/print')
+                if not url.startswith('http://'):
+                    url = 'http://www.theatlantic.com/'+url
                 title = self.tag_to_string(a)
                 byline = item.find(attrs={'class':'byline'})
                 date = self.tag_to_string(byline) if byline else ''
@@ -48,5 +49,4 @@ class TheAtlantic(BasicNewsRecipe):
                                  'description':description
                                 })
                 
-        
         return [('Current Issue', articles)]
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 6656ae5039..5d115ef5b1 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -10,6 +10,7 @@ UTF-8 encoding with any charset declarations removed.
 import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname
 from httplib import responses
+from contextlib import closing
 
 from calibre import setup_cli_handlers, browser, sanitize_file_name, \
                     relpath, LoggingInterface
@@ -48,6 +49,11 @@ def save_soup(soup, target):
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))
     
+class response(str):
+    
+    def __init__(self, *args):
+        str.__init__(self, *args)
+        self.newurl = None
 
 class RecursiveFetcher(object, LoggingInterface):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in 
@@ -134,13 +140,25 @@ class RecursiveFetcher(object, LoggingInterface):
         
 
     def fetch_url(self, url):
-        f = None
+        data = None
         self.log_debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at 
         if  delta < self.delay:
             time.sleep(delta)
         try:
-            f = self.browser.open(url)
+            try:
+                with closing(self.browser.open(url)) as f:
+                    data = response(f.read())
+                    data.newurl = f.geturl()
+            except AttributeError:
+                time.sleep(2)
+                try:
+                    with closing(self.browser.open(url)) as f:
+                        data = response(f.read())
+                        data.newurl = f.geturl()
+                except AttributeError:
+                    data = response(urllib2.urlopen(url).read())
+                    data.newurl = f.geturl()
         except urllib2.URLError, err:
             if hasattr(err, 'code') and responses.has_key(err.code):
                 raise FetchError, responses[err.code]
@@ -149,12 +167,13 @@ class RecursiveFetcher(object, LoggingInterface):
                 time.sleep(1)
                 if hasattr(f, 'close'):
                     f.close()
-                f = self.browser.open(url)
+                with closing(self.browser.open(url)) as f:
+                    data = f.read()
             else: 
                 raise err
         finally:
             self.last_fetch_at = time.time()
-        return f
+        return data
 
         
     def start_fetch(self, url):
@@ -196,7 +215,7 @@ class RecursiveFetcher(object, LoggingInterface):
                         tag['href'] = self.stylemap[iurl]
                         continue
                 try:
-                    f = self.fetch_url(iurl)
+                    data = self.fetch_url(iurl)
                 except Exception, err:
                     self.log_debug('Could not fetch stylesheet %s', iurl)
                     self.log_debug('Error: %s', str(err), exc_info=True)
@@ -205,8 +224,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 with self.stylemap_lock:
                     self.stylemap[iurl] = stylepath
                 with open(stylepath, 'wb') as x:
-                    x.write(f.read())
-                f.close()
+                    x.write(data)
                 tag['href'] = stylepath
             else:
                 for ns in tag.findAll(text=True):                    
@@ -221,19 +239,17 @@ class RecursiveFetcher(object, LoggingInterface):
                                 ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                 continue
                         try:
-                            f = self.fetch_url(iurl)
+                            data = self.fetch_url(iurl)
                         except Exception, err:
                             self.log_warning('Could not fetch stylesheet %s', iurl)
                             self.log_debug('Error: %s', str(err), exc_info=True)
-                            if hasattr(f, 'close'): f.close()
                             continue
                         c += 1
                         stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                         with self.stylemap_lock:
                             self.stylemap[iurl] = stylepath
                         with open(stylepath, 'wb') as x:
-                            x.write(f.read())
-                        f.close()
+                            x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))
                         
                         
@@ -256,7 +272,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
-                f = self.fetch_url(iurl)
+                data = self.fetch_url(iurl)
             except Exception, err:
                 self.log_warning('Could not fetch image %s', iurl)
                 self.log_debug('Error: %s', str(err), exc_info=True)
@@ -269,8 +285,7 @@ class RecursiveFetcher(object, LoggingInterface):
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
-                x.write(f.read())
-            f.close()
+                x.write(data)                
             tag['src'] = imgpath
 
     def absurl(self, baseurl, tag, key, filter=True): 
@@ -337,9 +352,8 @@ class RecursiveFetcher(object, LoggingInterface):
                     os.mkdir(linkdiskpath)
                 try:
                     self.current_dir = linkdiskpath
-                    f = self.fetch_url(iurl)
-                    dsrc = f.read()
-                    f.close()
+                    dsrc = self.fetch_url(iurl)
+                    newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
                        len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %s'%iurl)
@@ -349,7 +363,7 @@ class RecursiveFetcher(object, LoggingInterface):
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]
                     
                     soup = self.get_soup(dsrc)
-                    newbaseurl = f.geturl()
+                    
                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
@@ -372,6 +386,7 @@ class RecursiveFetcher(object, LoggingInterface):
                         soup = self.postprocess_html_ext(soup, 
                                 c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                 self.job_info)
+                        
                         if c==0 and recursion_level == 0:
                             self.called_first = True