IGN: News download: Fix logging during feed parsing

This commit is contained in:
Kovid Goyal 2009-11-11 11:11:04 -07:00
parent 14156737ce
commit fb5634ab4a
4 changed files with 22 additions and 12 deletions

View File

@@ -6,6 +6,8 @@ __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
 infobae.com
 '''
 import re
+import urllib, urlparse
+
 from calibre.web.feeds.news import BasicNewsRecipe

 class Infobae(BasicNewsRecipe):
@@ -61,11 +63,11 @@ class Infobae(BasicNewsRecipe):
 #        return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id

     def get_article_url(self, article):
-        import urllib, urlparse
-        parts = list(urlparse.urlparse(article.get('link')))
+        ans = article.get('link').encode('utf-8')
+        parts = list(urlparse.urlparse(ans))
         parts[2] = urllib.quote(parts[2])
         ans = urlparse.urlunparse(parts)
-        return ans
+        return ans.decode('utf-8')

     def preprocess_html(self, soup):

View File

@@ -97,7 +97,8 @@ class ZAOBAO(BasicNewsRecipe):
                 })

             pfeeds = feeds_from_index([(title, articles)], oldest_article=self.oldest_article,
-                                      max_articles_per_feed=self.max_articles_per_feed)
+                                      max_articles_per_feed=self.max_articles_per_feed,
+                                      log=self.log)
             self.log.debug('adding %s to feed'%(title))
             for feed in pfeeds:

View File

@@ -5,10 +5,11 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging, traceback, copy, re
+import time, traceback, copy, re

 from datetime import datetime

 from calibre.web.feeds.feedparser import parse
+from calibre.utils.logging import default_log
 from calibre import entity_to_unicode
 from lxml import html
@@ -87,11 +88,12 @@ Has content : %s

 class Feed(object):

-    def __init__(self, get_article_url=lambda item: item.get('link', None)):
+    def __init__(self, get_article_url=lambda item: item.get('link', None),
+            log=default_log):
         '''
         Parse a feed into articles.
         '''
-        self.logger = logging.getLogger('feeds2disk')
+        self.logger = log
         self.get_article_url = get_article_url

     def populate_from_feed(self, feed, title=None, oldest_article=7,
@@ -288,15 +290,18 @@ class FeedCollection(list):


 def feed_from_xml(raw_xml, title=None, oldest_article=7,
-                  max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
+                  max_articles_per_feed=100,
+                  get_article_url=lambda item: item.get('link', None),
+                  log=default_log):
     feed = parse(raw_xml)
-    pfeed = Feed(get_article_url=get_article_url)
+    pfeed = Feed(get_article_url=get_article_url, log=log)
     pfeed.populate_from_feed(feed, title=title,
                             oldest_article=oldest_article,
                             max_articles_per_feed=max_articles_per_feed)
     return pfeed

-def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
+def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
+        log=default_log):
     '''
     @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
     @return: A list of L{Feed} objects.
@@ -304,7 +309,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
     '''
     feeds = []
     for title, articles in index:
-        pfeed = Feed()
+        pfeed = Feed(log=log)
         pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                                            max_articles_per_feed=max_articles_per_feed)
         feeds.append(pfeed)

View File

@@ -704,7 +704,8 @@ class BasicNewsRecipe(Recipe):
         self.report_progress(0, _('Fetching feeds...'))
         try:
             feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
-                                     max_articles_per_feed=self.max_articles_per_feed)
+                                     max_articles_per_feed=self.max_articles_per_feed,
+                                     log=self.log)
             self.report_progress(0, _('Got feeds from index page'))
         except NotImplementedError:
             feeds = self.parse_feeds()
@@ -1028,6 +1029,7 @@ class BasicNewsRecipe(Recipe):
             with closing(self.browser.open(url)) as f:
                 parsed_feeds.append(feed_from_xml(f.read(),
                                       title=title,
+                                      log=self.log,
                                       oldest_article=self.oldest_article,
                                       max_articles_per_feed=self.max_articles_per_feed,
                                       get_article_url=self.get_article_url))