mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
IGN:Tag release
This commit is contained in:
parent
507348e16c
commit
6fd0a3100b
@ -6,7 +6,6 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
nytimes.com
|
nytimes.com
|
||||||
'''
|
'''
|
||||||
import re
|
import re
|
||||||
import time
|
|
||||||
from calibre import entity_to_unicode
|
from calibre import entity_to_unicode
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
|
||||||
@ -17,7 +16,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
__author__ = 'GRiker'
|
__author__ = 'GRiker'
|
||||||
language = _('English')
|
language = _('English')
|
||||||
description = 'Top Stories from the New York Times'
|
description = 'Top Stories from the New York Times'
|
||||||
|
|
||||||
# List of sections typically included in Top Stories. Use a keyword from the
|
# List of sections typically included in Top Stories. Use a keyword from the
|
||||||
# right column in the excludeSectionKeywords[] list to skip downloading that section
|
# right column in the excludeSectionKeywords[] list to skip downloading that section
|
||||||
sections = {
|
sections = {
|
||||||
@ -40,7 +39,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'world' : 'World'
|
'world' : 'World'
|
||||||
}
|
}
|
||||||
|
|
||||||
# By default, no sections are skipped.
|
# By default, no sections are skipped.
|
||||||
excludeSectionKeywords = []
|
excludeSectionKeywords = []
|
||||||
|
|
||||||
# Add section keywords from the right column above to skip that section
|
# Add section keywords from the right column above to skip that section
|
||||||
@ -50,7 +49,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
# excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
|
# excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
|
||||||
# Fetch only Top Stories
|
# Fetch only Top Stories
|
||||||
# excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
|
# excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
|
||||||
|
|
||||||
# The maximum number of articles that will be downloaded
|
# The maximum number of articles that will be downloaded
|
||||||
max_articles_per_feed = 40
|
max_articles_per_feed = 40
|
||||||
|
|
||||||
@ -64,7 +63,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
|
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
|
||||||
'portfolioInline','articleInline','readerscomment',
|
'portfolioInline','articleInline','readerscomment',
|
||||||
'nytRating']}) ]
|
'nytRating']}) ]
|
||||||
|
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
extra_css = '.headline {text-align: left;}\n \
|
extra_css = '.headline {text-align: left;}\n \
|
||||||
@ -114,13 +113,13 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
_raw = url_or_raw
|
_raw = url_or_raw
|
||||||
if raw:
|
if raw:
|
||||||
return _raw
|
return _raw
|
||||||
|
|
||||||
if not isinstance(_raw, unicode) and self.encoding:
|
if not isinstance(_raw, unicode) and self.encoding:
|
||||||
_raw = _raw.decode(docEncoding, 'replace')
|
_raw = _raw.decode(docEncoding, 'replace')
|
||||||
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
||||||
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
|
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
|
||||||
return BeautifulSoup(_raw, markupMassage=massage)
|
return BeautifulSoup(_raw, markupMassage=massage)
|
||||||
|
|
||||||
# Entry point
|
# Entry point
|
||||||
soup = get_the_soup( self.encoding, url_or_raw )
|
soup = get_the_soup( self.encoding, url_or_raw )
|
||||||
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
||||||
@ -131,7 +130,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if self.verbose > 2:
|
if self.verbose > 2:
|
||||||
self.log( " document encoding: '%s'" % docEncoding)
|
self.log( " document encoding: '%s'" % docEncoding)
|
||||||
if docEncoding != self.encoding :
|
if docEncoding != self.encoding :
|
||||||
soup = get_the_soup(docEncoding, url_or_raw)
|
soup = get_the_soup(docEncoding, url_or_raw)
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
@ -142,7 +141,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
feed = key = 'All Top Stories'
|
feed = key = 'All Top Stories'
|
||||||
articles[key] = []
|
articles[key] = []
|
||||||
ans.append(key)
|
ans.append(key)
|
||||||
|
|
||||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
||||||
|
|
||||||
# Fetch the outer table
|
# Fetch the outer table
|
||||||
@ -188,7 +187,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
bylines = []
|
bylines = []
|
||||||
descriptions = []
|
descriptions = []
|
||||||
pubdate = None
|
pubdate = None
|
||||||
|
|
||||||
# Get the Section title
|
# Get the Section title
|
||||||
for (x,i) in enumerate(sectionblock.contents) :
|
for (x,i) in enumerate(sectionblock.contents) :
|
||||||
skipThisSection = False
|
skipThisSection = False
|
||||||
@ -210,14 +209,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
break
|
break
|
||||||
|
|
||||||
# Get the bylines and descriptions
|
# Get the bylines and descriptions
|
||||||
if not skipThisSection :
|
if not skipThisSection :
|
||||||
lines = sectionblock.contents
|
lines = sectionblock.contents
|
||||||
contentStrings = []
|
contentStrings = []
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if not isinstance(line, Comment) and line.strip and line.strip() > "":
|
if not isinstance(line, Comment) and line.strip and line.strip() > "":
|
||||||
contentStrings.append(line.strip())
|
contentStrings.append(line.strip())
|
||||||
|
|
||||||
# Gather the byline/description pairs
|
# Gather the byline/description pairs
|
||||||
bylines = []
|
bylines = []
|
||||||
descriptions = []
|
descriptions = []
|
||||||
@ -226,7 +225,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
bylines.append(contentString)
|
bylines.append(contentString)
|
||||||
else:
|
else:
|
||||||
descriptions.append(contentString)
|
descriptions.append(contentString)
|
||||||
|
|
||||||
# Fetch the article titles and URLs
|
# Fetch the article titles and URLs
|
||||||
articleCount = len(sectionblock.findAll('span'))
|
articleCount = len(sectionblock.findAll('span'))
|
||||||
for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
|
for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
|
||||||
@ -241,7 +240,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if not isinstance(title, unicode):
|
if not isinstance(title, unicode):
|
||||||
title = title.decode('utf-8', 'replace')
|
title = title.decode('utf-8', 'replace')
|
||||||
|
|
||||||
# Allow for unattributed, undescribed entries "Editor's Note"
|
# Allow for unattributed, undescribed entries "Editor's Note"
|
||||||
if i >= len(descriptions) :
|
if i >= len(descriptions) :
|
||||||
description = None
|
description = None
|
||||||
else :
|
else :
|
||||||
@ -259,10 +258,10 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if url == article['url'] :
|
if url == article['url'] :
|
||||||
duplicateFound = True
|
duplicateFound = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if duplicateFound:
|
if duplicateFound:
|
||||||
# Continue fetching, don't add this article
|
# Continue fetching, don't add this article
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not articles.has_key(feed):
|
if not articles.has_key(feed):
|
||||||
articles[feed] = []
|
articles[feed] = []
|
||||||
@ -271,7 +270,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
description=description, author=author, content=''))
|
description=description, author=author, content=''))
|
||||||
|
|
||||||
ans = self.sort_index_by(ans, {'Top Stories':-1})
|
ans = self.sort_index_by(ans, {'Top Stories':-1})
|
||||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def strip_anchors(self,soup):
|
def strip_anchors(self,soup):
|
||||||
@ -287,7 +286,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
# refresh = soup.find('meta', {'http-equiv':'refresh'})
|
# refresh = soup.find('meta', {'http-equiv':'refresh'})
|
||||||
# if refresh is None:
|
# if refresh is None:
|
||||||
# return self.strip_anchors(soup)
|
# return self.strip_anchors(soup)
|
||||||
#
|
#
|
||||||
# content = refresh.get('content').partition('=')[2]
|
# content = refresh.get('content').partition('=')[2]
|
||||||
# raw = self.browser.open('http://www.nytimes.com'+content).read()
|
# raw = self.browser.open('http://www.nytimes.com'+content).read()
|
||||||
# soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
|
# soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||||
@ -297,7 +296,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
content = refresh.get('content').partition('=')[2]
|
content = refresh.get('content').partition('=')[2]
|
||||||
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
||||||
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
|
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||||
|
|
||||||
soup = self.strip_anchors(soup)
|
soup = self.strip_anchors(soup)
|
||||||
|
|
||||||
# Test for empty content
|
# Test for empty content
|
||||||
@ -308,7 +307,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
else:
|
else:
|
||||||
print "no allowed content found, removing article"
|
print "no allowed content found, removing article"
|
||||||
raise StringError
|
raise Exception
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
|
||||||
@ -351,7 +350,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
bTag = Tag(soup, "b")
|
bTag = Tag(soup, "b")
|
||||||
bTag.insert(0, subhead.contents[0])
|
bTag.insert(0, subhead.contents[0])
|
||||||
subhead.replaceWith(bTag)
|
subhead.replaceWith(bTag)
|
||||||
|
|
||||||
# Synthesize a section header
|
# Synthesize a section header
|
||||||
dsk = soup.find('meta', attrs={'name':'dsk'})
|
dsk = soup.find('meta', attrs={'name':'dsk'})
|
||||||
if dsk is not None and dsk.has_key('content'):
|
if dsk is not None and dsk.has_key('content'):
|
||||||
@ -360,12 +359,12 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
hTag.insert(0,NavigableString(dsk['content']))
|
hTag.insert(0,NavigableString(dsk['content']))
|
||||||
articleTag = soup.find(True, attrs={'id':'article'})
|
articleTag = soup.find(True, attrs={'id':'article'})
|
||||||
articleTag.insert(0,hTag)
|
articleTag.insert(0,hTag)
|
||||||
|
|
||||||
# Add class="articleBody" to <div> so we can format with CSS
|
# Add class="articleBody" to <div> so we can format with CSS
|
||||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||||
if divTag is not None :
|
if divTag is not None :
|
||||||
divTag['class'] = divTag['id']
|
divTag['class'] = divTag['id']
|
||||||
|
|
||||||
# Add class="authorId" to <div> so we can format with CSS
|
# Add class="authorId" to <div> so we can format with CSS
|
||||||
divTag = soup.find('div',attrs={'id':'authorId'})
|
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||||
if divTag is not None :
|
if divTag is not None :
|
||||||
|
@ -4,9 +4,9 @@
|
|||||||
#
|
#
|
||||||
msgid ""
|
msgid ""
|
||||||
msgstr ""
|
msgstr ""
|
||||||
"Project-Id-Version: calibre 0.6.21\n"
|
"Project-Id-Version: calibre 0.6.22\n"
|
||||||
"POT-Creation-Date: 2009-11-13 15:53+MST\n"
|
"POT-Creation-Date: 2009-11-13 16:05+MST\n"
|
||||||
"PO-Revision-Date: 2009-11-13 15:53+MST\n"
|
"PO-Revision-Date: 2009-11-13 16:05+MST\n"
|
||||||
"Last-Translator: Automatically generated\n"
|
"Last-Translator: Automatically generated\n"
|
||||||
"Language-Team: LANGUAGE\n"
|
"Language-Team: LANGUAGE\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
@ -53,6 +53,8 @@ msgstr ""
|
|||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:894
|
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:894
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pdb.py:39
|
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pdb.py:39
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pdf.py:21
|
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pdf.py:21
|
||||||
|
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pml.py:18
|
||||||
|
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/pml.py:40
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/topaz.py:29
|
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/topaz.py:29
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/txt.py:14
|
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/txt.py:14
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/reader.py:44
|
#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/reader.py:44
|
||||||
@ -177,30 +179,31 @@ msgstr ""
|
|||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:170
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:170
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:181
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:181
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:192
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:192
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:214
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:204
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:225
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:225
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:235
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:236
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:245
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:246
|
||||||
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:256
|
||||||
msgid "Read metadata from %s files"
|
msgid "Read metadata from %s files"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:204
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:215
|
||||||
msgid "Read metadata from ebooks in RAR archives"
|
msgid "Read metadata from ebooks in RAR archives"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:256
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:267
|
||||||
msgid "Read metadata from ebooks in ZIP archives"
|
msgid "Read metadata from ebooks in ZIP archives"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:267
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:278
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:277
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:288
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:287
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:298
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:309
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:320
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:320
|
||||||
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:331
|
||||||
msgid "Set metadata in %s files"
|
msgid "Set metadata in %s files"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:298
|
#: /home/kovid/work/calibre/src/calibre/customize/builtins.py:309
|
||||||
msgid "Set metadata from %s files"
|
msgid "Set metadata from %s files"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user