beta 13. Also force Article.title to be unicode

Kovid Goyal 2009-07-14 08:41:38 -06:00
parent ece4adfab9
commit ff19d4dc76
3 changed files with 118 additions and 47 deletions

View File

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 __appname__ = 'calibre'
-__version__ = '0.6.0b12'
+__version__ = '0.6.0b13'
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 import re

View File

@@ -19,14 +19,14 @@ class Article(object):
     def __init__(self, id, title, url, author, summary, published, content):
         self.downloaded = False
         self.id = id
-        self.title = title.strip() if title else title
+        self._title = title.strip() if title else title
         try:
-            self.title = re.sub(r'&(\S+);',
-                    entity_to_unicode, self.title)
+            self._title = re.sub(r'&(\S+);',
+                    entity_to_unicode, self._title)
         except:
             pass
-        if not isinstance(self.title, unicode):
-            self.title = self.title.decode('utf-8', 'replace')
+        if not isinstance(self._title, unicode):
+            self._title = self._title.decode('utf-8', 'replace')
         self.url = url
         self.author = author
         if author and not isinstance(author, unicode):
@@ -50,6 +50,17 @@ class Article(object):
             self.utctime = datetime(*self.date[:6])
             self.localtime = self.utctime + self.time_offset

+    @dynamic_property
+    def title(self):
+        def fget(self):
+            t = self._title
+            if not isinstance(t, unicode) and hasattr(t, 'decode'):
+                t = t.decode('utf-8', 'replace')
+            return t
+        def fset(self, val):
+            self._title = val
+        return property(fget=fget, fset=fset)
+
     def __repr__(self):
         return \
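Note: the hunk above is the commit-message change — Article now stores the raw value in _title and exposes title as a property that decodes to unicode on access, so callers can no longer see a byte string. A minimal standalone sketch of the same decode-on-access idea, written with a plain Python 2 property instead of calibre's dynamic_property helper (the Headline class and sample bytes are illustrative, not calibre code):

    class Headline(object):
        # Hypothetical stand-in for Article: raw bytes live in _title,
        # the property hands out unicode.
        def __init__(self, title):
            self._title = title.strip() if title else title

        @property
        def title(self):
            t = self._title
            # Decode only if we were handed bytes; unicode passes through.
            if not isinstance(t, unicode) and hasattr(t, 'decode'):
                t = t.decode('utf-8', 'replace')
            return t

        @title.setter
        def title(self, val):
            self._title = val

    print Headline('Caf\xc3\xa9').title    # u'Caf\xe9'

Setting the value later (h.title = some_bytes) still round-trips through the getter, which is the point of routing reads through fget rather than decoding once in __init__.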

View File

@@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 nytimes.com
 '''
 import re
+from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
@@ -22,12 +23,13 @@ class NYTimes(BasicNewsRecipe):
     remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
                                 'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
                                 'columnGroup','entry-meta','entry-response module','jumpLink','nav',
-                                'columnGroup advertisementColumnGroup']}),
+                                'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
                    dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
                             'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
-                            'blog-header','searchForm','NYTLogo','insideNYTimes']),
+                            'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
+                            'adxLeaderboard']),
                    dict(name=['script', 'noscript', 'style','hr'])]
-    encoding = None
+    encoding = 'cp1252'
     no_stylesheets = True
     #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
     extra_css = '.headline {text-align:left;}\n\
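Note: the switch from encoding = None to encoding = 'cp1252' above matters because NYTimes markup mixes Windows-1252 punctuation (curly quotes and dashes in the 0x80-0x9f byte range) into the pages. Decoded as latin-1 those bytes become invisible C1 control characters, and as utf-8 they become replacement marks. A quick Python 2 illustration with made-up sample bytes:

    raw = 'The Times\x92 \x93quotes\x94'   # hypothetical cp1252-encoded input
    print raw.decode('cp1252')              # u'The Times\u2019 \u201cquotes\u201d'
    print raw.decode('latin-1')             # same bytes -> invisible C1 controls
    print raw.decode('utf-8', 'replace')    # invalid as utf-8 -> U+FFFD marks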
@@ -37,6 +39,8 @@ class NYTimes(BasicNewsRecipe):

     flatPeriodical = True
+    feed = None
+    ans = []

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -48,31 +52,76 @@ class NYTimes(BasicNewsRecipe):
             br.submit()
         return br

+    def index_to_soup(self, url_or_raw, raw=False):
+        '''
+        Convenience method that takes a URL to the index page and returns
+        a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
+        of it.
+        This is an OVERRIDE of the method provided in news.py to solve an encoding problem
+        with NYTimes index pages, which seem to be encoded in a wonderful blend of utf-8 and cp1252.
+        `url_or_raw`: Either a URL or the downloaded index page as a string
+        '''
+        def get_the_soup(docEncoding, url_or_raw, raw=False) :
+            if re.match(r'\w+://', url_or_raw):
+                f = self.browser.open(url_or_raw)
+                _raw = f.read()
+                f.close()
+                if not _raw:
+                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+            else:
+                _raw = url_or_raw
+            if raw:
+                return _raw
+            if not isinstance(_raw, unicode) and self.encoding:
+                _raw = _raw.decode(docEncoding, 'replace')
+            massage = list(BeautifulSoup.MARKUP_MASSAGE)
+            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+            return BeautifulSoup(_raw, markupMassage=massage)
+
+        # Entry point
+        soup = get_the_soup( self.encoding, url_or_raw )
+        contentType = soup.find(True, attrs={'http-equiv':'Content-Type'})
+        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
+        if docEncoding == '' :
+            docEncoding = self.encoding
+
+        if self.verbose :
+            self.log( " document encoding: '%s'" % docEncoding)
+        if docEncoding != self.encoding :
+            soup = get_the_soup(docEncoding, url_or_raw)
+
+        return soup
+
     def parse_index(self):
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
-
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-
         articles = {}
         ans = []

         if self.flatPeriodical :
-            feed = key = 'All Top Stories'
+            self.feed = key = 'All Top Stories'
             articles[key] = []
-            ans.append(key)
+            self.ans.append(key)
         else :
             key = None

+        '''
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+        '''
         sections = {
                      'arts'             : 'Arts',
                      'business'         : 'Business',
                      'editorials'       : 'Editorials',
                      'health'           : 'Health',
                      'magazine'         : 'Magazine',
                      'mediaadvertising' : 'Media & Advertising',
                      'newyorkregion'    : 'New York/Region',
                      'oped'             : 'Op-Ed',
                      'politics'         : 'Politics',
                      'science'          : 'Science',
                      'sports'           : 'Sports',
                      'technology'       : 'Technology',
                      'topstories'       : 'Top Stories',
@@ -81,8 +130,18 @@ class NYTimes(BasicNewsRecipe):
                      'world'            : 'World'
                    }

-        #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
-        excludeSectionKeywords = []
+        '''
+        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
+                                  'New York','Op-Ed','Politics','Science','Sports','Technology',
+                                  'Top Stories','Travel','U.S.','World']
+        '''
+        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
+                                  'New York','Politics','Science','Sports','Technology',
+                                  'Top Stories','Travel','U.S.','World']
+        #excludeSectionKeywords = []
+
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

         # Fetch the outer table
         table = soup.find('table')
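Note: the index_to_soup override above works in two passes — parse once with the recipe's declared encoding, read the charset= value back out of the page's own Content-Type meta tag, and re-parse only if the two disagree. A standalone sketch of just the sniffing step, using a regex where the recipe slices str(contentType) (the function name and default are illustrative):

    import re

    def sniff_charset(html, default='cp1252'):
        # Pull charset=... out of a Content-Type meta tag, if present.
        m = re.search(r'charset=([-\w]+)', html, re.IGNORECASE)
        return m.group(1) if m else default

    page = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    print sniff_charset(page)    # utf-8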
@@ -164,7 +223,7 @@ class NYTimes(BasicNewsRecipe):
                 if not self.flatPeriodical :
                     articles[key] = []
-                    ans.append(key)
+                    self.ans.append(key)

             # Get the bylines and descriptions
             if not skipThisSection :
@@ -192,7 +251,7 @@ class NYTimes(BasicNewsRecipe):
                     title = self.tag_to_string(a, use_alt=True)
                     if self.flatPeriodical :
                         # prepend the section name
-                        title = sections[section] + " : " + title
+                        title = sections[section] + " &middot; " + title
                     if not isinstance(title, unicode):
                         title = title.decode('utf-8', 'replace')
                     description = descriptions[i]
@@ -201,28 +260,43 @@ class NYTimes(BasicNewsRecipe):
                     else :
                         author = None

                     if self.verbose > 2 : self.log( "      title: %s" % title)
                     if self.verbose > 2 : self.log( "        url: %s" % url)
                     if self.verbose > 2 : self.log( "     author: %s" % author)
                     if self.verbose > 2 : self.log( "description: %s" % description)

                     if not self.flatPeriodical :
-                        feed = key
+                        self.feed = key

-                    if not articles.has_key(feed):
-                        if self.verbose > 2 : self.log( "adding %s to articles[]" % feed)
-                        articles[feed] = []
-                    if self.verbose > 2 : self.log( "     adding: %s to articles[%s]\n" % (title, feed))
-                    articles[feed].append(
+                    # Check for duplicates
+                    duplicateFound = False
+                    if self.flatPeriodical and len(articles[self.feed]) > 1:
+                        #print articles[self.feed]
+                        for article in articles[self.feed] :
+                            #print "comparing %s\n %s\n" % (url, article['url'])
+                            if url == article['url'] :
+                                duplicateFound = True
+                                break
+                        #print
+
+                        if duplicateFound:
+                            # Continue fetching, don't add this article
+                            print "        skipping duplicate %s" % article['url']
+                            continue
+
+                    if not articles.has_key(self.feed):
+                        if self.verbose > 2 : self.log( "adding %s to articles[]" % self.feed)
+                        articles[self.feed] = []
+                    if self.verbose > 2 : self.log( "     adding: %s to articles[%s]\n" % (title, self.feed))
+                    articles[self.feed].append(
                         dict(title=title, url=url, date=pubdate,
                             description=description, author=author, content=''))

-        ans = self.sort_index_by(ans, {'Top Stories':-1})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        self.ans = self.sort_index_by(self.ans, {'Top Stories':-1})
+        self.ans = [(key, articles[key]) for key in self.ans if articles.has_key(key)]
         #sys.exit(1)
-        return ans
+        return self.ans

     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
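Note: the duplicate check added above rescans articles[self.feed] for every candidate, which is quadratic over a large index page. Tracking seen URLs in a set gives the same skip behaviour with one lookup per article; a sketch under that assumption (add_article and seen_urls are hypothetical, not part of the recipe):

    seen_urls = set()

    def add_article(articles, feed, art):
        # File the article unless its URL was already seen.
        if art['url'] in seen_urls:
            print "        skipping duplicate %s" % art['url']
            return False
        seen_urls.add(art['url'])
        articles.setdefault(feed, []).append(art)
        return True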
@@ -286,17 +360,3 @@ class NYTimes(BasicNewsRecipe):
         return soup

-    def postprocess_book(self, oeb, opts, log) :
-
-        log( " ********** recipe.postprocess_book ********** ")
-        log( list(oeb.toc) )
-        log( "oeb: %s" % oeb.toc)
-        log( "opts: %s" % opts.verbose)
-
-        for sections in oeb.toc :
-            log( "section:")
-            for articleTOC in sections:
-                log( "      title: %s" % articleTOC.title)
-                log( "     author: %s" % articleTOC.author)
-                log( "description: %s" % articleTOC.description)
-                log( "       href: %s" % articleTOC.href)
-                log( "    content: %s" % oeb.manifest.hrefs[articleTOC.href])
-        return