Fix #7304 (New York Times Conversion Error)

Kovid Goyal 2010-10-26 09:59:57 -06:00
parent 7988560d75
commit 3fdde53502


@@ -4,149 +4,79 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
V5 - One picture per article, moved to top:
Headline
Image
Byline
Story
'''
import re, string, time
import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup
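# Charset sniffing hook (a note, not in the original comments): calibre's
# BasicNewsRecipe accepts a callable as its ``encoding`` attribute and hands
# it the raw page source. NYT pages that claim iso-8859-1 are really cp1252,
# so those are decoded as cp1252 instead. Wired up below via ``encoding = decode``.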
def decode(self, src):
enc = 'utf-8'
if 'iso-8859-1' in src:
enc = 'cp1252'
return src.decode(enc, 'ignore')
class NYTimes(BasicNewsRecipe):
title = 'The New York Times'
__author__ = 'GRiker'
title = u'New York Times'
__author__ = 'Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 7, 5)
requires_version = (0, 6, 36)
description = 'Daily news from the New York Times (subscription version)'
allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
'New York','Business Day','Science Times','Sports','Dining','Arts',
'Home','Styles','Sunday Business','Week In Review','Travel','Magazine',
'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion",
"T Women's Fashion"]
# List of sections to exclude
# To exclude a section, copy its name from the allSectionKeywords list above
# For example, to exclude 'Dining' and 'Weddings':
#excludeSectionKeywords = ['Dining','Weddings']
excludeSectionKeywords = []
# List of sections to include (test and debug only)
# By default, any sections in today's paper that are not listed in excludeSectionKeywords
# are downloaded. fetch_only specifies that only certain sections are to be downloaded.
# This should only be used for testing and debugging.
# For example, to download only 'The Front Page' section:
# fetch_only = set(['The Front Page'])
fetch_only = set([])
if fetch_only:
excludeSectionKeywords = list(set(allSectionKeywords) ^ fetch_only)
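# A quick sketch of the set algebra above: with
#   fetch_only = set(['The Front Page'])
# the symmetric difference allSectionKeywords ^ fetch_only is every section
# name except 'The Front Page', so only that section survives the filter.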
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
one_picture_per_article = True
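# The relocation itself happens in postprocess_html() below: all but the
# first inline image are extracted, and that first image is re-inserted
# into the 'columnGroup first' div just after the article headline.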
timefmt = ''
timefmt = ' [%b %d]'
needs_subscription = True
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':[
'articleFooter',
'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'doubleRule',
'dottedLine',
'entry-meta',
'entry-response module',
'icon enlargeThis',
'leftNavTabs',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
'post-tools',
'relatedSearchesModule',
'side_tool',
'singleAd',
'subNavigation clearfix',
'subNavigation tabContent active',
'subNavigation tabContent active clearfix',
]}),
dict(id=[
'adxLeaderboard',
'archive',
'articleExtras',
'articleInline',
'blog_sidebar',
'businessSearchBar',
'cCol',
'entertainmentSearchBar',
'footer',
'header',
'header_search',
'login',
'masthead',
'masthead-nav',
'memberTools',
'navigation',
'portfolioInline',
'relatedArticles',
'respond',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
dict(name=['script', 'noscript', 'style'])]
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
cover_margins = (18,18,'grey99')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
dict({'class':re.compile('^subNavigation')}),
dict({'class':re.compile('^leaderboard')}),
dict({'class':re.compile('^module')}),
dict({'class':'metaFootnote'}),
dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
'relatedArticles', 'relatedTopics', 'adxSponLink']),
dict(name=['script', 'noscript', 'style','form','hr'])]
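# Note on the re.compile entries above: a compiled pattern matches any tag
# whose class string merely starts with the given prefix (e.g.
# 'subNavigation tabContent active'), sparing us from enumerating every variant.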
encoding = decode
no_stylesheets = True
extra_css = '.headline {text-align: left;}\n \
.byline {font-family: monospace; \
text-align: left; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.dateline {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.timestamp {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.source {text-align: left;}\n \
.image {text-align: center;}\n \
.credit {text-align: right; \
font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.articleBody {text-align: left;}\n \
.authorId {text-align: left; \
font-style: italic;}\n '
extra_css = '''
.articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
.credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { font-size: small; }
.caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; }'''
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
try:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
raw = br.submit().read()
if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
raise Exception('Your username and password are incorrect')
#open('/t/log.html', 'wb').write(raw)
except:
self.log("\nFailed to login")
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
raw = br.submit().read()
if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
raise Exception('Your username and password are incorrect')
#open('/t/log.html', 'wb').write(raw)
return br
def get_masthead_url(self):
masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
#masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nMasthead unavailable")
masthead = None
return masthead
def get_cover_url(self):
cover = None
st = time.localtime()
@@ -162,316 +92,101 @@ class NYTimes(BasicNewsRecipe):
cover = None
return cover
def get_masthead_title(self):
return self.title
def dump_ans(self, ans):
total_article_count = 0
for section in ans :
if self.verbose:
self.log("section %s: %d articles" % (section[0], len(section[1])) )
for article in section[1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'),
article['url'].encode('mac-roman','replace')))
self.log( "Queued %d articles" % total_article_count )
def dump_hex(self, src, length=16):
''' Diagnostic '''
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
N=0; result=''
while src:
s,src = src[:length],src[length:]
hexa = ' '.join(["%02X"%ord(x) for x in s])
s = s.translate(FILTER)
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
N+=length
print result
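# Illustrative (hypothetical) output for self.dump_hex('<html>'):
# 0000 3C 68 74 6D 6C 3E                                   <html>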
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","&#8216;",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","&#8217;",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","&#8220;",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","&#8221;",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","&#8211;",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","&#8212;",fixed)
return fixed
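# Example: fixChars('\x93Fit to Print\x94') returns
# '&#8220;Fit to Print&#8221;' -- the stray cp1252 smart-quote bytes
# become HTML entities that every renderer understands.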
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
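# Example: massageNCXText('Arts &amp; Leisure') first resolves the entity
# to 'Arts & Leisure', then re-escapes it as 'Arts &#38; Leisure', a form
# the Kindle NCX table of contents renders correctly.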
def short_title(self):
return 'New York Times'
def parse_index(self):
self.encoding = 'cp1252'
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
self.encoding = decode
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
return ''.join(div.findAll(text=True, recursive=True)).strip()
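# recursive=True (the replacement) also collects text nested inside child
# tags; recursive=False only saw strings that were direct children of the
# div, missing section titles wrapped in inner markup.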
articles = {}
key = None
ans = []
# Find each instance of class="section-headline", class="story", class="story headline"
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}):
url_list = []
if div['class'] == 'section-headline':
key = string.capwords(feed_title(div))
if self.excludeSectionKeywords:
excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key):
self.log("Skipping section %s" % key)
continue
articles[key] = []
ans.append(key)
elif div['class'] in ['story', 'story headline'] :
a = div.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip())
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.massageNCXText(self.tag_to_string(summary, use_alt=False))
author = ''
authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
def handle_article(div):
a = div.find('a', href=True)
if not a:
return
url = re.sub(r'\?.*', '', a['href'])
if not url.startswith("http"):
return
if not url.endswith(".html"):
return
if 'podcast' in url:
return
url += '?pagewanted=all'
if url in url_list:
return
url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip()
#self.log("Title: %s" % title)
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
author = ''
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
else:
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
# Kill commas - Kindle switches to '&'
author = re.sub(',','',author)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
articles[feed] = []
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author,
content=''))
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
articles[feed] = []
if not 'podcasts' in url:
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author,
content=''))
ans = self.sort_index_by(ans, {'The Front Page':-1,
'Dining In, Dining Out':1,
'Obituaries':2})
# Find each instance of class="section-headline", class="story", class="story headline"
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['section-headline','sectionHeader']:
key = string.capwords(feed_title(div))
articles[key] = []
ans.append(key)
#self.log('Section: %s' % key)
elif div['class'] in ['story', 'story headline'] :
handle_article(div)
elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'):
handle_article(lidiv)
# ans = self.sort_index_by(ans, {'The Front Page':-1,
# 'Dining In, Dining Out':1,
# 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
self.dump_ans(ans)
return ans
def skip_ad_pages(self, soup):
# Skip ad pages served before actual article
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url)
return self.index_to_soup(url, raw=True)
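# Returning the raw bytes (raw=True) hands the real article back to calibre
# as though it were the originally fetched page, so the normal cleanup
# pipeline above still runs on it.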
def preprocess_html(self, soup):
return self.strip_anchors(soup)
kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag:
tagline = self.tag_to_string(kicker_tag)
#self.log("FOUND KICKER %s" % tagline)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
#self.log("Searching for photo")
if img_div:
img_div.extract()
#self.log("Photo deleted")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def postprocess_html(self,soup, first_fetch):
print "\npostprocess_html()\n"
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg after headline
cgFirst = soup.find(True, {'class':'columnGroup first'})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if isinstance(tag, Tag) and tag.get('class') == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker and kicker.contents and kicker.contents[0]:
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
use_alt=False)))
kicker.replaceWith(h3Tag)
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents and caption.contents[0]:
emTag = Tag(soup, "em")
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
emTag.insert(0, c)
#hrTag = Tag(soup, 'hr')
#hrTag['class'] = 'caption_divider'
hrTag = Tag(soup, 'div')
hrTag['class'] = 'divider'
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
if articleTag:
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup
def populate_article_metadata(self,article,soup,first):
'''
Extract author and description from article, add to article metadata
'''
def extract_author(soup):
byline = soup.find('meta',attrs={'name':['byl','CLMST']})
if byline :
author = byline['content']
else :
# Try for <div class="byline">
byline = soup.find('div', attrs={'class':'byline'})
if byline:
author = byline.renderContents()
else:
print soup.prettify()
return None
return author
def extract_description(soup):
description = soup.find('meta',attrs={'name':['description','description ']})
if description :
return self.massageNCXText(description['content'])
else:
# Take first paragraph of article
articlebody = soup.find('div',attrs={'id':'articlebody'})
if not articlebody:
# Try again with class instead of id
articlebody = soup.find('div',attrs={'class':'articlebody'})
if not articlebody:
print 'populate_article_metadata.extract_description(): Did not find <div id="articlebody">:'
print soup.prettify()
return None
paras = articlebody.findAll('p')
for p in paras:
if p.renderContents().strip():
return self.massageNCXText(self.tag_to_string(p,use_alt=False))
return None
if not article.author:
article.author = extract_author(soup)
if not article.summary:
article.summary = article.text_summary = extract_description(soup)
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('utf-8','replace'))
#a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
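# Usage sketch (not part of the recipe itself): save this file as
# nytimes.recipe and fetch today's paper from the command line with
#   ebook-convert nytimes.recipe output.epub --username NAME --password PW
# calibre runs parse_index(), downloads each queued article, and applies
# the preprocess/postprocess hooks above to every page.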