Fix downloaded news in EPUB format causing reader resets

2025-12-17 18:45:04 -05:00 · 2008-12-08 11:37:32 -08:00 · 2008-12-08 11:37:32 -08:00 · 68f24caced
commit 68f24caced
parent 039572d937
5 changed files with 79 additions and 6 deletions
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -9,6 +9,7 @@ import time, logging, traceback, copy
 from datetime import datetime

 from calibre.web.feeds.feedparser import parse
+from lxml import html

 class Article(object):
    
@@ -19,6 +20,17 @@ class Article(object):
        self.id = id
        self.title = title.strip() if title else title
        self.url = url
+        if summary and not isinstance(summary, unicode):
+            summary = summary.decode('utf-8', 'replace')
+        if summary and '<' in summary:
+            try:
+                s = html.fragment_fromstring(summary, create_parent=True)
+                summary = html.tostring(s, method='text', encoding=unicode)
+            except:
+                print 'Failed to process article summary, deleting:'
+                print summary.encode('utf-8')
+                traceback.print_exc()
+                summary = u''
        self.summary = summary
        self.content = content
        self.date = published
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -586,9 +586,9 @@ class BasicNewsRecipe(object, LoggingInterface):
        if npos < 0:
            npos = pos
        ans = src[:npos+1]
-        if isinstance(ans, unicode):
-            return ans
+        if len(ans) < len(src):
            return ans+u'\u2026' if isinstance(ans, unicode) else ans + '...'
+        return ans

        
    
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -17,7 +17,7 @@ recipe_modules = [
           'blic', 'novosti', 'danas', 'vreme', 'times_online', 'the_scotsman',
           'nytimes_sub', 'security_watch', 'cyberpresse', 'st_petersburg_times',
           'clarin', 'financial_times', 'heise', 'le_monde', 'harpers', 'science_aas',
-           'science_news', 'the_nation', 'lrb'
+           'science_news', 'the_nation', 'lrb', 'harpers_full'
          ]

 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/harpers_full.py
+++ b/src/calibre/web/feeds/recipes/harpers_full.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+'''
+harpers.org - paid subscription/ printed issue articles
+This recipe only gets articles published in text format;
+images and PDFs are ignored
+'''
+
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Harpers_full(BasicNewsRecipe):  # Recipe for harpers.org printed-edition articles; declares needs_subscription and logs in with self.username/self.password.
+    title                 = u"Harper's Magazine - articles from printed edition"
+    __author__            = u'Darko Miletic'
+    description           = u"Harper's Magazine: Founded June 1850."
+    oldest_article        = 30
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+    simultaneous_downloads = 1
+    delay = 1
+    needs_subscription = True
+    INDEX = strftime('http://www.harpers.org/archive/%Y/%m')  # index page read by parse_index(); %Y/%m are expanded once, when the class body runs (presumably with the current date -- confirm against calibre.strftime)
+    LOGIN = 'http://www.harpers.org'  # opened by get_browser() for login; also prefixed to article hrefs in parse_index()
+    cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')  # likewise expanded once at class-definition time
+
+    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
+    remove_tags = [
+                     dict(name='table', attrs={'class':'rcnt'})
+                    ,dict(name='table', attrs={'class':'rcnt topline'})
+                  ]
+
+    def get_browser(self):  # Return a browser; when both credentials are set, open LOGIN and submit the login form first.
+        br = BasicNewsRecipe.get_browser()  # NOTE(review): called without self -- only valid if the base method is a classmethod/staticmethod; confirm
+        if self.username is not None and self.password is not None:
+            br.open(self.LOGIN)
+            br.select_form(nr=1)  # form picked by index; presumably the login form on the page -- verify
+            br['handle'  ] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+        
+    def parse_index(self):  # Build the index from the INDEX page; returns a one-element list [(feed title, [article dicts])].
+        articles = []
+        print 'Processing ' + self.INDEX
+        soup = self.index_to_soup(self.INDEX)
+        for item in soup.findAll('div', attrs={'class':'title'}):
+            text_link = item.parent.find('img',attrs={'alt':'Text'})            # keep only entries whose parent contains an <img alt="Text">
+            if text_link:
+                url   = self.LOGIN + item.a['href']  # assumes href is site-relative -- TODO confirm
+                title = item.a.contents[0]
+                date  = strftime(' %B %Y')  # not read from the page; the same formatted value is used for every article
+                articles.append({
+                                  'title'      :title
+                                 ,'date'       :date
+                                 ,'url'        :url
+                                 ,'description':''
+                                })
+        return [(soup.head.title.string, articles)]  # single feed named after the index page's <title>
--- a/src/calibre/web/feeds/templates.py
+++ b/src/calibre/web/feeds/templates.py
@@ -144,9 +144,9 @@ class FeedTemplate(Template):
            <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
                <a class="article" href="${article.url}">${article.title}</a>
                <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
-                <p class="article_decription" py:if="article.summary">
+                <div class="article_decription" py:if="article.summary">
                    ${Markup(cutoff(article.summary))}
-                </p>
+                </div>
            </li>
            </py:for>
        </ul>