IGN:News download: Fix logging during feed parsing

Kovid Goyal 2009-11-11 11:11:04 -07:00
parent 14156737ce
commit fb5634ab4a
4 changed files with 22 additions and 12 deletions

View File

@@ -6,6 +6,8 @@ __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
 infobae.com
 '''
 import re
+import urllib, urlparse
+
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Infobae(BasicNewsRecipe):
@@ -61,11 +63,11 @@ class Infobae(BasicNewsRecipe):
 #        return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
 
     def get_article_url(self, article):
-        import urllib, urlparse
-        parts = list(urlparse.urlparse(article.get('link')))
+        ans = article.get('link').encode('utf-8')
+        parts = list(urlparse.urlparse(ans))
         parts[2] = urllib.quote(parts[2])
         ans = urlparse.urlunparse(parts)
-        return ans
+        return ans.decode('utf-8')
 
     def preprocess_html(self, soup):
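
The hunk above fixes article links containing non-ASCII characters: the link is encoded to UTF-8 bytes before urllib.quote touches it, and decoded back to unicode at the end. A minimal standalone sketch of that logic, assuming Python 2 (the urllib/urlparse modules used here); the sample URL is invented, not from the commit:

    import urllib, urlparse

    def quote_article_url(link):
        # Encode to bytes first so urllib.quote never sees unicode
        ans = link.encode('utf-8')
        parts = list(urlparse.urlparse(ans))
        parts[2] = urllib.quote(parts[2])  # percent-encode only the path
        ans = urlparse.urlunparse(parts)
        return ans.decode('utf-8')  # percent-encoded, so pure ASCII now

    print quote_article_url(u'http://www.infobae.com/notas/nota espa\xf1ola.php')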

View File

@@ -97,7 +97,8 @@ class ZAOBAO(BasicNewsRecipe):
                 })
             pfeeds = feeds_from_index([(title, articles)], oldest_article=self.oldest_article,
-                    max_articles_per_feed=self.max_articles_per_feed)
+                    max_articles_per_feed=self.max_articles_per_feed,
+                    log=self.log)
             self.log.debug('adding %s to feed'%(title))
 
             for feed in pfeeds:
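
With this change an index-style recipe hands its own log object to feeds_from_index, so per-feed debug output goes through the recipe's logger rather than the old feeds2disk logger. A hedged sketch of the new call, assuming the function is importable from calibre.web.feeds (as the recipes use it) and with default_log standing in for a live recipe's self.log; the feed title and article fields are invented and only guess at the minimal parse_index contract:

    from calibre.web.feeds import feeds_from_index
    from calibre.utils.logging import default_log

    articles = [{'title': 'Example story', 'url': 'http://example.com/1',
                 'date': '', 'description': '', 'content': ''}]
    pfeeds = feeds_from_index([('Front Page', articles)],
                              oldest_article=7, max_articles_per_feed=100,
                              log=default_log)
    default_log.debug('adding %s to feed' % 'Front Page')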

View File

@@ -5,10 +5,11 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging, traceback, copy, re
+import time, traceback, copy, re
 from datetime import datetime
 
 from calibre.web.feeds.feedparser import parse
+from calibre.utils.logging import default_log
 from calibre import entity_to_unicode
 from lxml import html
@@ -87,11 +88,12 @@ Has content : %s
 
 class Feed(object):
 
-    def __init__(self, get_article_url=lambda item: item.get('link', None)):
+    def __init__(self, get_article_url=lambda item: item.get('link', None),
+            log=default_log):
         '''
         Parse a feed into articles.
         '''
-        self.logger = logging.getLogger('feeds2disk')
+        self.logger = log
         self.get_article_url = get_article_url
 
     def populate_from_feed(self, feed, title=None, oldest_article=7,
@@ -288,15 +290,18 @@ class FeedCollection(list):
 
 def feed_from_xml(raw_xml, title=None, oldest_article=7,
-                  max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
+                  max_articles_per_feed=100,
+                  get_article_url=lambda item: item.get('link', None),
+                  log=default_log):
     feed = parse(raw_xml)
-    pfeed = Feed(get_article_url=get_article_url)
+    pfeed = Feed(get_article_url=get_article_url, log=log)
     pfeed.populate_from_feed(feed, title=title,
                             oldest_article=oldest_article,
                             max_articles_per_feed=max_articles_per_feed)
     return pfeed
 
-def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
+def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
+        log=default_log):
     '''
     @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
 
     @return: A list of L{Feed} objects.
@@ -304,7 +309,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
     '''
     feeds = []
     for title, articles in index:
-        pfeed = Feed()
+        pfeed = Feed(log=log)
         pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                 max_articles_per_feed=max_articles_per_feed)
         feeds.append(pfeed)
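
The pattern throughout this file is the same: replace the module-global logging.getLogger('feeds2disk') with a logger the caller passes in, falling back to a shared default_log. A self-contained sketch of that dependency-injection pattern, using the stdlib logging module as a stand-in for calibre.utils.logging (all names below are illustrative, not calibre code):

    import logging

    default_log = logging.getLogger('feeds2disk')  # stand-in for calibre's default_log

    class Feed(object):
        def __init__(self, get_article_url=lambda item: item.get('link', None),
                     log=default_log):
            # Callers can thread their own logger through; anything that
            # does not care silently falls back to the shared default.
            self.logger = log
            self.get_article_url = get_article_url

    recipe_log = logging.getLogger('my_recipe')  # hypothetical caller-owned logger
    feed = Feed(log=recipe_log)
    feed.logger.debug('populated 0 articles')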

View File

@@ -704,7 +704,8 @@ class BasicNewsRecipe(Recipe):
         self.report_progress(0, _('Fetching feeds...'))
         try:
             feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
-                        max_articles_per_feed=self.max_articles_per_feed)
+                        max_articles_per_feed=self.max_articles_per_feed,
+                        log=self.log)
             self.report_progress(0, _('Got feeds from index page'))
         except NotImplementedError:
             feeds = self.parse_feeds()
@@ -1028,6 +1029,7 @@ class BasicNewsRecipe(Recipe):
             with closing(self.browser.open(url)) as f:
                 parsed_feeds.append(feed_from_xml(f.read(),
                                       title=title,
+                                      log=self.log,
                                       oldest_article=self.oldest_article,
                                       max_articles_per_feed=self.max_articles_per_feed,
                                       get_article_url=self.get_article_url))
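
End to end, one logger now flows from the recipe into every Feed the download creates. A hedged sketch of the final call site, assuming feed_from_xml is importable from calibre.web.feeds; the RSS document is a stub and default_log stands in for a running recipe's self.log:

    from calibre.web.feeds import feed_from_xml
    from calibre.utils.logging import default_log

    raw_xml = '''<rss version="2.0"><channel><title>Example</title>
    <item><title>Story</title><link>http://example.com/1</link></item>
    </channel></rss>'''

    feed = feed_from_xml(raw_xml, title='Example',
                         log=default_log,
                         oldest_article=7,
                         max_articles_per_feed=100,
                         get_article_url=lambda item: item.get('link', None))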