From f79fd7a15c719426b428cd12cef865e2f261daa5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 22 Jan 2008 06:34:05 +0000
Subject: [PATCH] Implement support for full-content feeds via a FullContentProfile

---
 .../ebooks/lrf/web/profiles/__init__.py   | 117 ++++++++++++++++--
 .../ebooks/lrf/web/profiles/portfolio.py  |   8 +-
 2 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/src/libprs500/ebooks/lrf/web/profiles/__init__.py b/src/libprs500/ebooks/lrf/web/profiles/__init__.py
index 490bf29df6..33b605e60f 100644
--- a/src/libprs500/ebooks/lrf/web/profiles/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/__init__.py
@@ -153,9 +153,10 @@ class DefaultProfile(object):
 
         return index
 
-    def parse_feeds(self):
+    def parse_feeds(self, require_url=True):
         '''
         Create list of articles from a list of feeds.
+        @param require_url: If True, skip articles that do not have a link to an HTML page with the full article contents.
         @return: A dictionary whose keys are feed titles and whose values are each
         a list of dictionaries. Each list contains dictionaries of the form:
         {
@@ -163,6 +164,7 @@ class DefaultProfile(object):
         'url'         : URL of print version,
         'date'        : The publication date of the article as a string,
         'description' : A summary of the article
+        'content'     : The full article (can be an empty string). This is unused in DefaultProfile.
         }
         '''
         added_articles = {}
@@ -196,14 +198,25 @@ class DefaultProfile(object):
                         if url:
                             break
 
-                    if not url or not url.string:
+                    if require_url and (not url or not url.string):
                         self.logger.debug('Skipping article as it does not have a link url')
                         continue
+                    url = url.string if (url and url.string) else ''
+
+                    content = item.find('content:encoded')
+                    if not content:
+                        content = item.find('description')
+                    if content:
+                        content = self.process_html_description(content, strip_links=False)
+                    else:
+                        content = ''
+
                     d = {
                         'title'    : item.find('title').string,
-                        'url'      : self.print_version(url.string),
+                        'url'      : self.print_version(url),
                         'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
-                        'date'     : pubdate if self.use_pubdate else time.ctime()
+                        'date'     : pubdate if self.use_pubdate else time.ctime(),
+                        'content'  : content,
                         }
                     delta = time.time() - d['timestamp']
                     if not self.allow_duplicates:
@@ -240,13 +253,19 @@ class DefaultProfile(object):
         pass
 
     @classmethod
-    def process_html_description(cls, tag):
+    def process_html_description(cls, tag, strip_links=True):
         src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
+        match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
+        if match:
+            src = match.group(1)
+        else:
+            replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+            for e in replaced_entities:
+                ent = '&'+e+';'
+                src = src.replace(ent, unichr(name2codepoint[e]))
+        if strip_links:
+            src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
+        return src
 
     DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
@@ -301,3 +320,81 @@ class DefaultProfile(object):
 
         return args
 
+
+class FullContentProfile(DefaultProfile):
+
+    max_recursions = 3
+    summary_length = 500  # Max number of characters in the short description
+
+    article_counter = 0
+
+    def build_index(self):
+        '''Build an RSS based index.html'''
+        import os
+        articles = self.parse_feeds(require_url=False)
+
+        def build_sub_index(title, items):
+            ilist = ''
+            li = u'<li><a href="%(url)s">%(title)s</a> [%(date)s]<br/>\n'+\
+                 u'<div>%(description)s</div></li>\n'
+            for item in items:
+                content = item['content']
+                if not content:
+                    self.logger.debug('Skipping article as it has no content:%s'%item['title'])
+                    continue
+                item['description'] = item['description'][:self.summary_length]+'…'
+                self.article_counter = self.article_counter + 1
+                url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
+                item['url'] = url
+                open(url, 'wb').write((u'''\
+                <html>
+                <body>
+                <h2>%s</h2>
+                <div>
+                %s
+                </div>
+                </body>
+                </html>
+                '''%(item['title'], content)).encode('utf-8')
+                )
+                ilist += li%item
+            return u'''\
+            <html>
+            <body>
+            <h2>%(title)s</h2>
+            <ul>
+            %(items)s
+            </ul>
+            </body>
+            </html>
+            '''%dict(title=title, items=ilist.rstrip())
+
+        cnum = 0
+        clist = ''
+        categories = articles.keys()
+        categories.sort()
+        for category in categories:
+            cnum += 1
+            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
+            prefix = 'file:' if iswindows else ''
+            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
+            src = build_sub_index(category, articles[category])
+            open(cfile, 'wb').write(src.encode('utf-8'))
+
+        src = '''\
+        <html>
+        <body>
+        <h1>%(title)s</h1>
+        <div>%(date)s</div>
+        <ul>
+        %(categories)s
+        </ul>
+        </body>
+        </html>
+        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
+                 categories=clist, title=self.title)
+        index = os.path.join(self.temp_dir, 'index.html')
+        open(index, 'wb').write(src.encode('utf-8'))
+        return index
+
+
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/web/profiles/portfolio.py b/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
index a955402fdc..c57455ba80 100644
--- a/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
@@ -4,12 +4,11 @@
 '''
 '''
 
-from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.lrf.web.profiles import FullContentProfile
 
-class Portfolio(DefaultProfile):
+class Portfolio(FullContentProfile):
 
-    title = 'Portfolio'
-    max_recursions = 0
+    title = 'Portfolio'
     max_articles_per_feed = 50
     timefmt = ' [%a, %b %d, %Y]'
     html_description = True
@@ -17,7 +16,6 @@
     html2lrf_options = ['--ignore-tables']
     ##delay = 1
 
-    ## Don't grab articles more than 7 days old
     oldest_article = 30
 
     ## Comment out the feeds you don't want retrieved.
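
Usage notes:

With this patch, a full-content feed yields article dictionaries of the
shape documented in parse_feeds() above. For illustration, one possible
entry; all values below are invented:

    feeds = {
        'Top Stories': [              # feed title -> list of article dicts
            {
             'title'      : 'Example headline',
             'url'        : '',       # may be empty when require_url=False
             'timestamp'  : 1200981245.0,
             'date'       : 'Tue, 22 Jan 2008 06:34:05 +0000',
             'description': 'A short summary of the article',
             'content'    : '<p>The full article body, taken from the '
                            'content:encoded or description element.</p>',
            },
        ],
    }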
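The new description handling in process_html_description() can be
exercised on its own. This sketch re-creates the same logic as a
standalone function for current Python (a plain string argument instead
of a BeautifulSoup tag, chr instead of unichr, name2codepoint from
html.entities); process_description is a stand-in name, not the patched
API:

    import re
    from html.entities import name2codepoint

    def process_description(src, strip_links=True):
        # CDATA-wrapped descriptions already contain literal HTML, so only
        # the wrapper is removed; otherwise a few common entities are
        # replaced with their unicode characters, as in the patch.
        match = re.match(r'<!\[CDATA\[(.*)\]\]>', src.lstrip())
        if match:
            src = match.group(1)
        else:
            for e in ('amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo'):
                src = src.replace('&'+e+';', chr(name2codepoint[e]))
        if strip_links:
            # Replace each <a ...>text</a> pair with just its text.
            src = re.sub(r'<a.*?>(.*?)</a>', r'\1', src,
                         flags=re.IGNORECASE|re.DOTALL)
        return src

    print(process_description('<![CDATA[Read <a href="http://example.com">more</a>]]>'))
    # -> Read more
    print(process_description('Fish &amp; chips'))
    # -> Fish & chips

Note that parse_feeds() calls this with strip_links=False when building
the 'content' field, so links inside full articles are preserved.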
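Converting an existing profile mirrors the Portfolio change above: switch
the base class from DefaultProfile to FullContentProfile and drop the
max_recursions override. A minimal hypothetical sketch; the class name,
feed title, and URL are invented, and it assumes the profile supplies its
feeds via get_feeds() in the same way other libprs500 profiles do:

    from libprs500.ebooks.lrf.web.profiles import FullContentProfile

    class ExampleFullFeed(FullContentProfile):

        title = 'Example Full Feed'
        max_articles_per_feed = 50
        html_description = True
        oldest_article = 30

        def get_feeds(self):
            # One (section title, feed url) tuple per feed.
            return [('Example', 'http://feeds.example.com/full.rss')]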