IGN:Tag release

This commit is contained in:
Kovid Goyal 2009-11-13 16:46:27 -07:00
parent 507348e16c
commit 6fd0a3100b
2 changed files with 40 additions and 38 deletions

View File

@ -6,7 +6,6 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
nytimes.com
'''
import re
import time
from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@ -17,7 +16,7 @@ class NYTimes(BasicNewsRecipe):
__author__ = 'GRiker'
language = _('English')
description = 'Top Stories from the New York Times'
# List of sections typically included in Top Stories. Use a keyword from the
# right column in the excludeSectionKeywords[] list to skip downloading that section
sections = {
@ -40,7 +39,7 @@ class NYTimes(BasicNewsRecipe):
'world' : 'World'
}
# By default, no sections are skipped.
# By default, no sections are skipped.
excludeSectionKeywords = []
# Add section keywords from the right column above to skip that section
@ -50,7 +49,7 @@ class NYTimes(BasicNewsRecipe):
# excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
# Fetch only Top Stories
# excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
# The maximum number of articles that will be downloaded
max_articles_per_feed = 40
@ -64,7 +63,7 @@ class NYTimes(BasicNewsRecipe):
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
'portfolioInline','articleInline','readerscomment',
'nytRating']}) ]
encoding = 'cp1252'
no_stylesheets = True
extra_css = '.headline {text-align: left;}\n \
@ -114,13 +113,13 @@ class NYTimes(BasicNewsRecipe):
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(docEncoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
return BeautifulSoup(_raw, markupMassage=massage)
# Entry point
soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
@ -131,7 +130,7 @@ class NYTimes(BasicNewsRecipe):
if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
soup = get_the_soup(docEncoding, url_or_raw)
return soup
@ -142,7 +141,7 @@ class NYTimes(BasicNewsRecipe):
feed = key = 'All Top Stories'
articles[key] = []
ans.append(key)
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
# Fetch the outer table
@ -188,7 +187,7 @@ class NYTimes(BasicNewsRecipe):
bylines = []
descriptions = []
pubdate = None
# Get the Section title
for (x,i) in enumerate(sectionblock.contents) :
skipThisSection = False
@ -210,14 +209,14 @@ class NYTimes(BasicNewsRecipe):
break
# Get the bylines and descriptions
if not skipThisSection :
if not skipThisSection :
lines = sectionblock.contents
contentStrings = []
for line in lines:
if not isinstance(line, Comment) and line.strip and line.strip() > "":
contentStrings.append(line.strip())
# Gather the byline/description pairs
bylines = []
descriptions = []
@ -226,7 +225,7 @@ class NYTimes(BasicNewsRecipe):
bylines.append(contentString)
else:
descriptions.append(contentString)
# Fetch the article titles and URLs
articleCount = len(sectionblock.findAll('span'))
for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
@ -241,7 +240,7 @@ class NYTimes(BasicNewsRecipe):
if not isinstance(title, unicode):
title = title.decode('utf-8', 'replace')
# Allow for unattributed, undescribed entries "Editor's Note"
# Allow for unattributed, undescribed entries "Editor's Note"
if i >= len(descriptions) :
description = None
else :
@ -259,10 +258,10 @@ class NYTimes(BasicNewsRecipe):
if url == article['url'] :
duplicateFound = True
break
if duplicateFound:
if duplicateFound:
# Continue fetching, don't add this article
continue
continue
if not articles.has_key(feed):
articles[feed] = []
@ -271,7 +270,7 @@ class NYTimes(BasicNewsRecipe):
description=description, author=author, content=''))
ans = self.sort_index_by(ans, {'Top Stories':-1})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
def strip_anchors(self,soup):
@ -287,7 +286,7 @@ class NYTimes(BasicNewsRecipe):
# refresh = soup.find('meta', {'http-equiv':'refresh'})
# if refresh is None:
# return self.strip_anchors(soup)
#
#
# content = refresh.get('content').partition('=')[2]
# raw = self.browser.open('http://www.nytimes.com'+content).read()
# soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
@ -297,7 +296,7 @@ class NYTimes(BasicNewsRecipe):
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
soup = self.strip_anchors(soup)
# Test for empty content
@ -308,7 +307,7 @@ class NYTimes(BasicNewsRecipe):
return soup
else:
print "no allowed content found, removing article"
raise StringError
raise Exception
def postprocess_html(self,soup, True):
@ -351,7 +350,7 @@ class NYTimes(BasicNewsRecipe):
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk is not None and dsk.has_key('content'):
@ -360,12 +359,12 @@ class NYTimes(BasicNewsRecipe):
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag is not None :
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag is not None :

View File

@ -4,9 +4,9 @@
#
msgid ""
msgstr ""
"Project-Id-Version: calibre 0.6.21\n"
"POT-Creation-Date: 2009-11-13 15:53+MST\n"
"PO-Revision-Date: 2009-11-13 15:53+MST\n"
"Project-Id-Version: calibre 0.6.22\n"
"POT-Creation-Date: 2009-11-13 16:05+MST\n"
"PO-Revision-Date: 2009-11-13 16:05+MST\n"
"Last-Translator: Automatically generated\n"
"Language-Team: LANGUAGE\n"
"MIME-Version: 1.0\n"
@ -53,6 +53,8 @@ msgstr ""
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:894
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pdb.py:39
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pdf.py:21
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pml.py:18
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pml.py:40
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/topaz.py:29
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/txt.py:14
#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/reader.py:44
@ -177,30 +179,31 @@ msgstr ""
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:170
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:181
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:192
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:214
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:204
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:225
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:235
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:245
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:236
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:246
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:256
msgid "Read metadata from %s files"
msgstr ""
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:204
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:215
msgid "Read metadata from ebooks in RAR archives"
msgstr ""
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:256
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:267
msgid "Read metadata from ebooks in ZIP archives"
msgstr ""
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:267
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:277
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:287
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:309
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:278
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:288
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:298
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:320
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:331
msgid "Set metadata in %s files"
msgstr ""
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:298
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:309
msgid "Set metadata from %s files"
msgstr ""