From 132ac4b850dd2b8b47e4920b82539aae40c1e1c0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 29 Mar 2008 01:22:34 +0000 Subject: [PATCH] Don't download comment only HTML pages --- src/libprs500/web/fetch/simple.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py index 58c9feae23..e762062db3 100644 --- a/src/libprs500/web/fetch/simple.py +++ b/src/libprs500/web/fetch/simple.py @@ -321,7 +321,8 @@ class RecursiveFetcher(object): self.current_dir = linkdiskpath f = self.fetch_url(iurl) dsrc = f.read() - if len(dsrc) == 0: + if len(dsrc) == 0 or \ + len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0: raise ValueError('No content at URL %s'%iurl) if self.encoding is not None: dsrc = dsrc.decode(self.encoding, 'ignore')