Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

commit ff19d4dc76 (parent ece4adfab9)

    beta 13. Also force Article.title to be unicode
@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 __appname__ = 'calibre'
-__version__ = '0.6.0b12'
+__version__ = '0.6.0b13'
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 
 import re
@@ -19,14 +19,14 @@ class Article(object):
     def __init__(self, id, title, url, author, summary, published, content):
         self.downloaded = False
         self.id = id
-        self.title = title.strip() if title else title
+        self._title = title.strip() if title else title
         try:
-            self.title = re.sub(r'&(\S+);',
-                    entity_to_unicode, self.title)
+            self._title = re.sub(r'&(\S+);',
+                    entity_to_unicode, self._title)
         except:
             pass
-        if not isinstance(self.title, unicode):
-            self.title = self.title.decode('utf-8', 'replace')
+        if not isinstance(self._title, unicode):
+            self._title = self._title.decode('utf-8', 'replace')
         self.url = url
         self.author = author
         if author and not isinstance(author, unicode):
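The re.sub above maps HTML entities in the raw title to unicode characters via calibre's entity_to_unicode. A rough standalone sketch of that step, with a hypothetical stand-in for the real helper (which also handles numeric and encoding-aware entities), in Python 2:

    import re

    def entity_to_unicode(match):
        # Minimal illustrative table; calibre's helper is far more complete
        entities = {'amp': u'&', 'eacute': u'\xe9'}
        return entities.get(match.group(1), match.group())

    title = 'Caf&eacute; &amp; Bar'
    print repr(re.sub(r'&(\S+);', entity_to_unicode, title))   # u'Caf\xe9 & Bar'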
@@ -50,6 +50,17 @@ class Article(object):
         self.utctime = datetime(*self.date[:6])
         self.localtime = self.utctime + self.time_offset
 
+    @dynamic_property
+    def title(self):
+        def fget(self):
+            t = self._title
+            if not isinstance(t, unicode) and hasattr(t, 'decode'):
+                t = t.decode('utf-8', 'replace')
+            return t
+        def fset(self, val):
+            self._title = val
+        return property(fget=fget, fset=fset)
+
 
     def __repr__(self):
         return \
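The new title property relies on calibre's dynamic_property decorator: the decorated function is evaluated once and the property object it returns becomes the class attribute, so fget re-decodes _title on every read and the attribute can never leak raw bytes. A minimal self-contained sketch of the pattern; the one-line dynamic_property below is an approximation of calibre's helper, and Demo is hypothetical (Python 2):

    def dynamic_property(func):
        # Approximation: call the wrapped function once and use the
        # property object it returns as the class attribute
        return func(None)

    class Demo(object):
        def __init__(self, title):
            self._title = title

        @dynamic_property
        def title(self):
            def fget(self):
                t = self._title
                if not isinstance(t, unicode) and hasattr(t, 'decode'):
                    t = t.decode('utf-8', 'replace')
                return t
            def fset(self, val):
                self._title = val
            return property(fget=fget, fset=fset)

    d = Demo('caf\xc3\xa9')     # UTF-8 bytes in
    print type(d.title)         # <type 'unicode'>
    d.title = 'more bytes'      # fset stores the raw value
    print type(d.title)         # still unicode: fget decodes on every read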
@@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 nytimes.com
 '''
 import re
+from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 
@@ -22,12 +23,13 @@ class NYTimes(BasicNewsRecipe):
     remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
         'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
         'columnGroup','entry-meta','entry-response module','jumpLink','nav',
-        'columnGroup advertisementColumnGroup']}),
+        'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
         dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
             'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
-            'blog-header','searchForm','NYTLogo','insideNYTimes']),
+            'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
+            'adxLeaderboard']),
         dict(name=['script', 'noscript', 'style','hr'])]
-    encoding = None
+    encoding = 'cp1252'
     no_stylesheets = True
     #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
     extra_css = '.headline {text-align:left;}\n\
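Switching encoding from None to 'cp1252' tells the download pipeline to decode pages as Windows-1252 rather than autodetecting. A quick illustration of why the distinction matters (Python 2):

    s = '\x92'                                 # Windows-1252 right single quote
    print repr(s.decode('cp1252'))             # u'\u2019'
    print repr(s.decode('utf-8', 'replace'))   # u'\ufffd' -- a lone 0x92 is invalid UTF-8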
@@ -37,6 +39,8 @@ class NYTimes(BasicNewsRecipe):
 
 
     flatPeriodical = True
+    feed = None
+    ans = []
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -48,31 +52,76 @@ class NYTimes(BasicNewsRecipe):
         br.submit()
         return br
 
+    def index_to_soup(self, url_or_raw, raw=False):
+        '''
+        Convenience method that takes a URL to the index page and returns
+        a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
+        of it.
+
+        This is an OVERRIDE of the method provided in news.py to solve an encoding problem
+        with NYTimes index pages which seem to be encoded in a wonderful blend
+
+        `url_or_raw`: Either a URL or the downloaded index page as a string
+        '''
+        def get_the_soup(docEncoding, url_or_raw, raw=False) :
+            if re.match(r'\w+://', url_or_raw):
+                f = self.browser.open(url_or_raw)
+                _raw = f.read()
+                f.close()
+                if not _raw:
+                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+            else:
+                _raw = url_or_raw
+            if raw:
+                return _raw
+
+            if not isinstance(_raw, unicode) and self.encoding:
+                _raw = _raw.decode(docEncoding, 'replace')
+            massage = list(BeautifulSoup.MARKUP_MASSAGE)
+            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+            return BeautifulSoup(_raw, markupMassage=massage)
+
+        # Entry point
+        soup = get_the_soup( self.encoding, url_or_raw )
+        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
+        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
+        if docEncoding == '' :
+            docEncoding = self.encoding
+
+        if self.verbose :
+            self.log( " document encoding: '%s'" % docEncoding)
+        if docEncoding != self.encoding :
+            soup = get_the_soup(docEncoding, url_or_raw)
+
+        return soup
+
     def parse_index(self):
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
-
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-
         articles = {}
 
-        ans = []
         if self.flatPeriodical :
-            feed = key = 'All Top Stories'
+            self.feed = key = 'All Top Stories'
             articles[key] = []
-            ans.append(key)
+            self.ans.append(key)
         else :
             key = None
 
+        '''
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+        '''
+
+
         sections = {
              'arts'             :   'Arts',
              'business'         :   'Business',
             'editorials'        :   'Editorials',
+            'health'            :   'Health',
             'magazine'          :   'Magazine',
             'mediaadvertising'  :   'Media & Advertising',
             'newyorkregion'     :   'New York/Region',
             'oped'              :   'Op-Ed',
             'politics'          :   'Politics',
+            'science'           :   'Science',
             'sports'            :   'Sports',
             'technology'        :   'Technology',
             'topstories'        :   'Top Stories',
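The entry point of the override above soups the page once with self.encoding, slices the real charset out of the str() of the <meta http-equiv="Content-Type"> tag, and re-soups if the two disagree. The sniff is plain string slicing; a standalone sketch with a hypothetical rendered tag (Python 2):

    contentType = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
    start = contentType.find('charset=') + len('charset=')
    docEncoding = contentType[start:contentType.rfind('"')]
    print docEncoding   # utf-8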
@@ -81,8 +130,18 @@ class NYTimes(BasicNewsRecipe):
             'world'             :   'World'
             }
 
-        #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
-        excludeSectionKeywords = []
+        '''
+        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
+                                  'New York','Op-Ed','Politics','Science','Sports','Technology',
+                                  'Top Stories','Travel','U.S.','World']
+        '''
+        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
+                                  'New York','Politics','Science','Sports','Technology',
+                                  'Top Stories','Travel','U.S.','World']
+
+        #excludeSectionKeywords = []
+
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
+
         # Fetch the outer table
         table = soup.find('table')
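The active excludeSectionKeywords list now names every section except Op-Ed, so the later section filtering (not shown in this hunk) should keep only Op-Ed articles. A hypothetical sketch of the kind of check that list implies; the matching rule here is an assumption, not code from the recipe (Python 2):

    excludeSectionKeywords = ['Arts', 'Business', 'Sports']   # abbreviated

    def skip_section(section_name):
        # Assumed rule: skip a section whose name contains any keyword
        for keyword in excludeSectionKeywords:
            if keyword in section_name:
                return True
        return False

    print skip_section('Sports')   # True
    print skip_section('Op-Ed')    # False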
@@ -164,7 +223,7 @@ class NYTimes(BasicNewsRecipe):
 
             if not self.flatPeriodical :
                 articles[key] = []
-                ans.append(key)
+                self.ans.append(key)
 
             # Get the bylines and descriptions
             if not skipThisSection :
@@ -192,7 +251,7 @@ class NYTimes(BasicNewsRecipe):
                     title = self.tag_to_string(a, use_alt=True)
                     if self.flatPeriodical :
                         # prepend the section name
-                        title = sections[section] + " : " + title
+                        title = sections[section] + " · " + title
                     if not isinstance(title, unicode):
                         title = title.decode('utf-8', 'replace')
                     description = descriptions[i]
@@ -201,28 +260,43 @@ class NYTimes(BasicNewsRecipe):
                     else :
                         author = None
 
 
                     if self.verbose > 2 : self.log( "      title: %s" % title)
                     if self.verbose > 2 : self.log( "        url: %s" % url)
                     if self.verbose > 2 : self.log( "     author: %s" % author)
                     if self.verbose > 2 : self.log( "description: %s" % description)
 
                     if not self.flatPeriodical :
-                        feed = key
+                        self.feed = key
 
-                    if not articles.has_key(feed):
-                        if self.verbose > 2 : self.log( "adding %s to articles[]" % feed)
-                        articles[feed] = []
-                    if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, feed))
-                    articles[feed].append(
+                    # Check for duplicates
+                    duplicateFound = False
+                    if self.flatPeriodical and len(articles[self.feed]) > 1:
+                        #print articles[self.feed]
+                        for article in articles[self.feed] :
+                            #print "comparing %s\n          %s\n" % (url, article['url'])
+                            if url == article['url'] :
+                                duplicateFound = True
+                                break
+                        #print
+
+                        if duplicateFound:
+                            # Continue fetching, don't add this article
+                            print "  skipping duplicate %s" % article['url']
+                            continue
+
+                    if not articles.has_key(self.feed):
+                        if self.verbose > 2 : self.log( "adding %s to articles[]" % self.feed)
+                        articles[self.feed] = []
+                    if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, self.feed))
+                    articles[self.feed].append(
                         dict(title=title, url=url, date=pubdate,
                              description=description, author=author, content=''))
 
-        ans = self.sort_index_by(ans, {'Top Stories':-1})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        self.ans = self.sort_index_by(self.ans, {'Top Stories':-1})
+        self.ans = [(key, articles[key]) for key in self.ans if articles.has_key(key)]
         #sys.exit(1)
 
-        return ans
+        return self.ans
 
     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
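The duplicate check added above is a linear scan over the articles already collected for the current feed, comparing URLs before appending. The same idea as a standalone sketch with made-up data (Python 2):

    articles = {'All Top Stories': [
        {'title': 'A', 'url': 'http://example.com/a'},
        {'title': 'B', 'url': 'http://example.com/b'},
    ]}

    def is_duplicate(feed, url):
        # Linear scan by URL, as in the recipe's parse_index
        for article in articles[feed]:
            if url == article['url']:
                return True
        return False

    print is_duplicate('All Top Stories', 'http://example.com/a')   # True
    print is_duplicate('All Top Stories', 'http://example.com/c')   # False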
@@ -286,17 +360,3 @@ class NYTimes(BasicNewsRecipe):
 
         return soup
 
-    def postprocess_book(self, oeb, opts, log) :
-        log( " ********** recipe.postprocess_book ********** ")
-        log( list(oeb.toc) )
-        log( "oeb: %s" % oeb.toc)
-        log( "opts: %s" % opts.verbose)
-        for sections in oeb.toc :
-            log( "section:")
-            for articleTOC in sections:
-                log( "      title: %s" % articleTOC.title)
-                log( "     author: %s" % articleTOC.author)
-                log( "description: %s" % articleTOC.description)
-                log( "       href: %s" % articleTOC.href)
-                log( "    content: %s" % oeb.manifest.hrefs[articleTOC.href])
-        return