Make news download more robust and fetch blogs in The Atlantic recipe

Kovid Goyal 2008-11-06 11:06:32 -08:00
parent 7fb21ac00d
commit dbe52fd1b3
2 changed files with 37 additions and 22 deletions

View File

@@ -16,7 +16,7 @@ class TheAtlantic(BasicNewsRecipe):
     INDEX = 'http://www.theatlantic.com/doc/current'
     remove_tags_before = dict(name='div', id='storytop')
-    remove_tags = [dict(name='div', id=['seealso', 'storybottom', 'footer'])]
+    remove_tags = [dict(name='div', id=['seealso', 'storybottom', 'footer', 'ad_banner_top', 'sidebar'])]
     no_stylesheets = True

     def parse_index(self):
@@ -35,8 +35,9 @@ class TheAtlantic(BasicNewsRecipe):
         for item in soup.findAll('div', attrs={'class':'item'}):
             a = item.find('a')
             if a and a.has_key('href'):
-                url = a['href']
-                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                url = a['href'].replace('/doc', 'doc/print')
+                if not url.startswith('http://'):
+                    url = 'http://www.theatlantic.com/'+url
                 title = self.tag_to_string(a)
                 byline = item.find(attrs={'class':'byline'})
                 date = self.tag_to_string(byline) if byline else ''
@@ -48,5 +49,4 @@ class TheAtlantic(BasicNewsRecipe):
                     'description':description
                 })

         return [('Current Issue', articles)]
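
The recipe change above rewrites each article link to its print-friendly page and only prepends the site prefix when the link is relative, so links that already arrive as absolute URLs (presumably the newly fetched blog entries) pass through untouched. A minimal standalone sketch of that logic, with a made-up helper name and example URLs:

def print_version(href):
    # Point the link at the print-friendly page.
    url = href.replace('/doc', 'doc/print')
    # Only relative magazine links need the site prefix; absolute URLs
    # are left alone.
    if not url.startswith('http://'):
        url = 'http://www.theatlantic.com/' + url
    return url

print(print_version('/doc/200811/example-article'))
# http://www.theatlantic.com/doc/print/200811/example-article
print(print_version('http://blogs.theatlantic.com/example/2008/11/post.php'))
# unchanged, since it is already absolute (hypothetical blog URL)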

View File

@@ -10,6 +10,7 @@ UTF-8 encoding with any charset declarations removed.
 import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname
 from httplib import responses
+from contextlib import closing
 from calibre import setup_cli_handlers, browser, sanitize_file_name, \
                     relpath, LoggingInterface
@@ -48,6 +49,11 @@ def save_soup(soup, target):
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))

+class response(str):
+
+    def __init__(self, *args):
+        str.__init__(self, *args)
+        self.newurl = None

 class RecursiveFetcher(object, LoggingInterface):

     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
@@ -134,13 +140,25 @@ class RecursiveFetcher(object, LoggingInterface):
     def fetch_url(self, url):
-        f = None
+        data = None
         self.log_debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
         try:
-            f = self.browser.open(url)
+            try:
+                with closing(self.browser.open(url)) as f:
+                    data = response(f.read())
+                    data.newurl = f.geturl()
+            except AttributeError:
+                time.sleep(2)
+                try:
+                    with closing(self.browser.open(url)) as f:
+                        data = response(f.read())
+                        data.newurl = f.geturl()
+                except AttributeError:
+                    data = response(urllib2.urlopen(url).read())
+                    data.newurl = f.geturl()
         except urllib2.URLError, err:
             if hasattr(err, 'code') and responses.has_key(err.code):
                 raise FetchError, responses[err.code]
@@ -149,12 +167,13 @@ class RecursiveFetcher(object, LoggingInterface):
                 time.sleep(1)
                 if hasattr(f, 'close'):
                     f.close()
-                f = self.browser.open(url)
+                with closing(self.browser.open(url)) as f:
+                    data = f.read()
             else:
                 raise err
         finally:
             self.last_fetch_at = time.time()
-        return f
+        return data

     def start_fetch(self, url):
@@ -196,7 +215,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     tag['href'] = self.stylemap[iurl]
                     continue
                 try:
-                    f = self.fetch_url(iurl)
+                    data = self.fetch_url(iurl)
                 except Exception, err:
                     self.log_debug('Could not fetch stylesheet %s', iurl)
                     self.log_debug('Error: %s', str(err), exc_info=True)
@@ -205,8 +224,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 with self.stylemap_lock:
                     self.stylemap[iurl] = stylepath
                 with open(stylepath, 'wb') as x:
-                    x.write(f.read())
-                f.close()
+                    x.write(data)
                 tag['href'] = stylepath
             else:
                 for ns in tag.findAll(text=True):
@@ -221,19 +239,17 @@ class RecursiveFetcher(object, LoggingInterface):
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                         try:
-                            f = self.fetch_url(iurl)
+                            data = self.fetch_url(iurl)
                         except Exception, err:
                             self.log_warning('Could not fetch stylesheet %s', iurl)
                             self.log_debug('Error: %s', str(err), exc_info=True)
-                            if hasattr(f, 'close'): f.close()
                             continue
                         c += 1
                         stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                         with self.stylemap_lock:
                             self.stylemap[iurl] = stylepath
                         with open(stylepath, 'wb') as x:
-                            x.write(f.read())
-                        f.close()
+                            x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))
@@ -256,7 +272,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 tag['src'] = self.imagemap[iurl]
                 continue
             try:
-                f = self.fetch_url(iurl)
+                data = self.fetch_url(iurl)
             except Exception, err:
                 self.log_warning('Could not fetch image %s', iurl)
                 self.log_debug('Error: %s', str(err), exc_info=True)
@@ -269,8 +285,7 @@ class RecursiveFetcher(object, LoggingInterface):
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
-                x.write(f.read())
-            f.close()
+                x.write(data)
             tag['src'] = imgpath

     def absurl(self, baseurl, tag, key, filter=True):
@@ -337,9 +352,8 @@ class RecursiveFetcher(object, LoggingInterface):
                     os.mkdir(linkdiskpath)
                 try:
                     self.current_dir = linkdiskpath
-                    f = self.fetch_url(iurl)
-                    dsrc = f.read()
-                    f.close()
+                    dsrc = self.fetch_url(iurl)
+                    newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
                        len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %s'%iurl)
@@ -349,7 +363,7 @@ class RecursiveFetcher(object, LoggingInterface):
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                     soup = self.get_soup(dsrc)
-                    newbaseurl = f.geturl()
+
                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
@@ -372,6 +386,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     soup = self.postprocess_html_ext(soup,
                             c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                             self.job_info)
+
                     if c==0 and recursion_level == 0:
                         self.called_first = True
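
Taken together, the fetcher changes move fetch_url() from returning an open file-like object (which every caller had to read and then close) to returning the downloaded bytes directly: the body is read eagerly inside a closing() block, the read is retried once on AttributeError with a plain urllib2 fallback, and the result is wrapped in the new response str subclass so the final post-redirect URL travels with the data in .newurl. A simplified standalone sketch of that contract, assuming Python 2 and plain urllib2 in place of the mechanize browser that calibre configures:

from contextlib import closing
import urllib2

class response(str):
    # A str subclass that also carries the final URL seen after any
    # redirects, so one return value gives callers both the page data
    # and its base URL.
    def __init__(self, *args):
        str.__init__(self, *args)
        self.newurl = None

def fetch_url(url):
    # Read the body eagerly and close the connection, instead of handing
    # an open file-like object back to the caller.
    with closing(urllib2.urlopen(url)) as f:
        data = response(f.read())
        data.newurl = f.geturl()
    return data

data = fetch_url('http://www.theatlantic.com/doc/current')
with open('index.html', 'wb') as out:
    out.write(data)        # data behaves like an ordinary str
print(data.newurl)         # base URL after any redirects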