Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Update New York Times Top Stories
commit 656ce3eac8 (parent 041ca66102)
@@ -5,7 +5,8 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import re, time
+import re
+import time
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@@ -14,7 +15,7 @@ class NYTimes(BasicNewsRecipe):
 
     title = 'New York Times Top Stories'
     __author__ = 'GRiker'
-    language = 'en'
+    language = _('English')
     description = 'Top Stories from the New York Times'
 
     # List of sections typically included in Top Stories. Use a keyword from the
@@ -39,9 +40,6 @@ class NYTimes(BasicNewsRecipe):
                  'world' : 'World'
                }
 
-    # By default, no sections are skipped.
-    excludeSectionKeywords = []
-
     # Add section keywords from the right column above to skip that section
     # For example, to skip sections containing the word 'Sports' or 'Dining', use:
     # excludeSectionKeywords = ['Sports', 'Dining']
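Note: excludeSectionKeywords works as a substring match against section names, so one keyword can suppress several related sections ("sections containing the word"). A minimal sketch of the check the recipe performs while scanning the index (the function name is illustrative, not from the recipe):

    def section_excluded(section_name, exclude_keywords):
        # 'Sports' excludes both 'Sports' and e.g. 'Sports Saturday'
        return any(kw in section_name for kw in exclude_keywords)

    assert section_excluded('Sports Saturday', ['Sports', 'Dining'])
    assert not section_excluded('World', ['Sports', 'Dining'])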
@@ -49,36 +47,138 @@ class NYTimes(BasicNewsRecipe):
     # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
     # Fetch only Top Stories
     # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
+    # By default, no sections are skipped.
+    excludeSectionKeywords = []
+
+    # one_picture_per_article specifies that calibre should only use the first image
+    # from an article (if one exists). If one_picture_per_article = True, the image
+    # will be moved to a location between the headline and the byline.
+    # If one_picture_per_article = False, all images from the article will be included
+    # and shown in their original location.
+    one_picture_per_article = True
 
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 40
 
     timefmt = ''
     needs_subscription = True
-    keep_only_tags = [ dict(attrs={ 'id':['article']}),
-                       dict(attrs={'class':['blog wrap']}) ]
+    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
 
-    remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
-                                         'inlineVideo left brightcove', 'entry-meta']}),
-                    dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
-                                       'portfolioInline','articleInline','readerscomment',
-                                       'nytRating']}) ]
+    remove_tags_before = dict(id='article')
+    remove_tags_after = dict(id='article')
+    remove_tags = [dict(attrs={'class':[
+                            'articleFooter',
+                            'articleTools',
+                            'columnGroup doubleRule',
+                            'columnGroup singleRule',
+                            'columnGroup last',
+                            'columnGroup last',
+                            'doubleRule',
+                            'dottedLine',
+                            'entry-meta',
+                            'icon enlargeThis',
+                            'leftNavTabs',
+                            'module box nav',
+                            'nextArticleLink',
+                            'nextArticleLink clearfix',
+                            'post-tools',
+                            'relatedSearchesModule',
+                            'side_tool',
+                            'singleAd',
+                            'subNavigation tabContent active clearfix',
+                            ]}),
+                   dict(id=[
+                            'adxLeaderboard',
+                            'archive',
+                            'articleExtras',
+                            'articleInline',
+                            'blog_sidebar',
+                            'cCol',
+                            'entertainmentSearchBar',
+                            'footer',
+                            'header',
+                            'header_search',
+                            'login',
+                            'masthead',
+                            'memberTools',
+                            'navigation',
+                            'portfolioInline',
+                            'relatedArticles',
+                            'side_search',
+                            'side_index',
+                            'side_tool',
+                            'toolsRight',
+                            ]),
+                   dict(name=['script', 'noscript', 'style'])]
 
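Note: this change swaps the keep_only_tags whitelist for remove_tags_before / remove_tags_after (trim everything outside the element with id="article") plus a much larger remove_tags blacklist applied inside it. A rough standalone illustration of the same pruning, written against the modern bs4 package rather than calibre's bundled BeautifulSoup:

    from bs4 import BeautifulSoup

    html = ('<div id="header">chrome</div>'
            '<div id="article">story <div class="singleAd">ad</div></div>'
            '<div id="footer">chrome</div>')
    soup = BeautifulSoup(html, 'html.parser')

    # what remove_tags_before/remove_tags_after achieve: keep the article subtree
    soup = BeautifulSoup(str(soup.find(id='article')), 'html.parser')

    # what remove_tags achieves: delete blacklisted classes/ids/names inside it
    for tag in soup.find_all(class_='singleAd'):
        tag.decompose()
    for tag in soup.find_all(['script', 'noscript', 'style']):
        tag.decompose()

    print(soup)  # <div id="article">story </div>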
-    encoding = 'cp1252'
     no_stylesheets = True
     extra_css = '.headline {text-align: left;}\n \
                  .byline {font-family: monospace; \
                           text-align: left; \
+                          margin-top: 0px; \
+                          margin-bottom: 0px;}\n \
+                 .dateline {font-size: small; \
+                            margin-top: 0px; \
+                            margin-bottom: 0px;}\n \
+                 .timestamp {font-size: small; \
+                             margin-top: 0px; \
                              margin-bottom: 0px;}\n \
-                 .timestamp {font-size: smaller;}\n \
                  .source {text-align: left;}\n \
                  .image {text-align: center;}\n \
                  .credit {text-align: right; \
-                          font-size: smaller;}\n \
+                          font-size: small; \
+                          margin-top: 0px; \
+                          margin-bottom: 0px;}\n \
                  .articleBody {text-align: left;}\n \
                  .authorId {text-align: left; \
                             font-style: italic;}\n '
 
+    def dump_ans(self, ans) :
+        total_article_count = 0
+        for section in ans :
+            if self.verbose:
+                self.log("section %s: %d articles" % (section[0], len(section[1])) )
+            for article in section[1]:
+                total_article_count += 1
+                if self.verbose:
+                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
+                          article['url'].encode('cp1252','replace')))
+        self.log( "Queued %d articles" % total_article_count )
+
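Note: dump_ans is a logging helper over the structure parse_index returns, a list of (section_title, [article_dict, ...]) tuples. The shape it expects, with placeholder values:

    ans = [
        ('All Top Stories', [
            dict(title='Sample headline', url='http://www.nytimes.com/sample',
                 date='', description='Sample summary', author='A. Byline',
                 content=''),
        ]),
    ]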
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","&#8216;",string)
+
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","&#8217;",fixed)
+
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","&#8220;",fixed)
+
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","&#8221;",fixed)
+
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","&#8211;",fixed)
+
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","&#8212;",fixed)
+
+        return fixed
+
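Note: \x91-\x97 are the Windows-1252 code points for curly quotes and dashes; fixChars rewrites them as numeric character references so they render reliably downstream (the references shown above were decoded to literal punctuation by the page extraction and are reconstructed here). The six substitutions can also be expressed as one table-driven pass; an equivalent sketch, not the recipe's code:

    CP1252_PUNCTUATION = {
        '\x91': '&#8216;',  # left single quote
        '\x92': '&#8217;',  # right single quote
        '\x93': '&#8220;',  # left double quote
        '\x94': '&#8221;',  # right double quote
        '\x96': '&#8211;',  # en dash
        '\x97': '&#8212;',  # em dash
    }

    def fix_chars(s):
        for raw, ref in CP1252_PUNCTUATION.items():
            s = s.replace(raw, ref)
        return s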
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            try:
+                br.open('http://www.nytimes.com/auth/login')
+                br.select_form(name='login')
+                br['USERID'] = self.username
+                br['PASSWORD'] = self.password
+                br.submit()
+            except:
+                self.log("\nFailed to login")
+        return br
+
     def get_cover_url(self):
         cover = None
         st = time.localtime()
@@ -94,26 +194,6 @@ class NYTimes(BasicNewsRecipe):
             cover = None
         return cover
 
-    def get_masthead_url(self):
-        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-        br = BasicNewsRecipe.get_browser()
-        try:
-            br.open(masthead)
-        except:
-            self.log("\nCover unavailable")
-            masthead = None
-        return masthead
-
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser()
-        if self.username is not None and self.password is not None:
-            br.open('http://www.nytimes.com/auth/login')
-            br.select_form(name='login')
-            br['USERID'] = self.username
-            br['PASSWORD'] = self.password
-            br.submit()
-        return br
-
     def index_to_soup(self, url_or_raw, raw=False):
         '''
         OVERRIDE of class method
@@ -138,6 +218,7 @@ class NYTimes(BasicNewsRecipe):
             return BeautifulSoup(_raw, markupMassage=massage)
 
         # Entry point
+        print "index_to_soup()"
         soup = get_the_soup( self.encoding, url_or_raw )
         contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
         docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@@ -151,6 +232,16 @@ class NYTimes(BasicNewsRecipe):
 
         return soup
 
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&' with '&#38;'
+            massaged = re.sub("&","&#38;", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
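Note: massageNCXText resolves HTML entities with BeautifulStoneSoup, then re-escapes the bare ampersands, since the Kindle NCX table of contents is XML and a literal '&' is invalid there (the extraction had decoded the '&#38;' reference into a no-op; it is reconstructed above). A rough modern-stdlib equivalent:

    import html
    import re

    def massage_ncx_text(description):
        if not description:
            return description
        massaged = html.unescape(description)   # resolve &amp;, &#8217;, ...
        return re.sub('&', '&#38;', massaged)   # re-escape for XML

    print(massage_ncx_text('Law &amp; Order'))  # Law &#38; Order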
     def parse_index(self):
         articles = {}
         ans = []
@@ -158,12 +249,14 @@ class NYTimes(BasicNewsRecipe):
         feed = key = 'All Top Stories'
         articles[key] = []
         ans.append(key)
+        self.log("Scanning 1 section ...")
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
         # Fetch the outer table
         table = soup.find('table')
         previousTable = table
+        contentTable = None
 
         # Find the deepest table containing the stories
         while True :
@@ -191,8 +284,9 @@ class NYTimes(BasicNewsRecipe):
                 continue
 
             skipThisSection = False
+            todays_article_count = 0
             # Within this table are <font face="times new roman, times, san serif"> entries
+            self.log("Fetching feed Top Stories")
             for tr in storyblock.findAllNext('tr'):
                 if tr.find('span') is not None :
 
@@ -244,6 +338,7 @@ class NYTimes(BasicNewsRecipe):
 
             # Fetch the article titles and URLs
             articleCount = len(sectionblock.findAll('span'))
+            todays_article_count += articleCount
             for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                 a = span.find('a', href=True)
                 url = re.sub(r'\?.*', '', a['href'])
@@ -277,6 +372,7 @@ class NYTimes(BasicNewsRecipe):
 
                 if duplicateFound:
                     # Continue fetching, don't add this article
+                    todays_article_count -= 1
                     continue
 
                 if not articles.has_key(feed):
@@ -284,11 +380,138 @@ class NYTimes(BasicNewsRecipe):
                 articles[feed].append(
                     dict(title=title, url=url, date=pubdate,
                          description=description, author=author, content=''))
+        # self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
 
         ans = self.sort_index_by(ans, {'Top Stories':-1})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        self.dump_ans(ans)
         return ans
 
+    def preprocess_html(self, soup):
+        return self.strip_anchors(soup)
+
+    def postprocess_html(self,soup, True):
+
+        if self.one_picture_per_article:
+            # Remove all images after first
+            largeImg = soup.find(True, {'class':'articleSpanImage'})
+            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+            if largeImg:
+                for inlineImg in inlineImgs:
+                    inlineImg.extract()
+            else:
+                if inlineImgs:
+                    firstImg = inlineImgs[0]
+                    for inlineImg in inlineImgs[1:]:
+                        inlineImg.extract()
+                    # Move firstImg after headline
+                    cgFirst = soup.find(True, {'class':'columnGroup first'})
+                    if cgFirst:
+                        # Strip all sibling NavigableStrings: noise
+                        navstrings = cgFirst.findAll(text=True, recursive=False)
+                        [ns.extract() for ns in navstrings]
+                        headline_found = False
+                        tag = cgFirst.find(True)
+                        insertLoc = 0
+                        while True:
+                            insertLoc += 1
+                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+                                headline_found = True
+                                break
+                            tag = tag.nextSibling
+                            if not tag:
+                                headline_found = False
+                                break
+                        if headline_found:
+                            cgFirst.insert(insertLoc,firstImg)
+                    else:
+                        self.log(">>> No class:'columnGroup first' found <<<")
+
+        # Change class="kicker" to <h3>
+        kicker = soup.find(True, {'class':'kicker'})
+        if kicker and kicker.contents[0]:
+            h3Tag = Tag(soup, "h3")
+            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
+                             use_alt=False)))
+            kicker.replaceWith(h3Tag)
+
+        # Change captions to italic -1
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            if caption and caption.contents[0]:
+                emTag = Tag(soup, "em")
+                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                mp_off = c.find("More Photos")
+                if mp_off >= 0:
+                    c = c[:mp_off]
+                emTag.insert(0, c)
+                hrTag = Tag(soup, 'hr')
+                #hrTag['style'] = "margin-top:0em;margin-bottom:0em"
+                emTag.insert(1, hrTag)
+                caption.replaceWith(emTag)
+
+        # Change <nyt_headline> to <h2>
+        h1 = soup.find('h1')
+        if h1:
+            headline = h1.find("nyt_headline")
+            if headline:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, self.fixChars(headline.contents[0]))
+                h1.replaceWith(tag)
+        else:
+            # Blog entry - replace headline, remove <hr> tags
+            headline = soup.find('title')
+            if headline:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, self.fixChars(headline.contents[0]))
+                soup.insert(0, tag)
+                hrs = soup.findAll('hr')
+                for hr in hrs:
+                    hr.extract()
+
+        # Change <h1> to <h3> - used in editorial blogs
+        masthead = soup.find("h1")
+        if masthead:
+            # Nuke the href
+            if masthead.a:
+                del(masthead.a['href'])
+            tag = Tag(soup, "h3")
+            tag.insert(0, self.fixChars(masthead.contents[0]))
+            masthead.replaceWith(tag)
+
+        # Change <span class="bold"> to <b>
+        for subhead in soup.findAll(True, {'class':'bold'}) :
+            if subhead.contents:
+                bTag = Tag(soup, "b")
+                bTag.insert(0, subhead.contents[0])
+                subhead.replaceWith(bTag)
+
+        # Synthesize a section header
+        dsk = soup.find('meta', attrs={'name':'dsk'})
+        if dsk and dsk.has_key('content'):
+            hTag = Tag(soup,'h3')
+            hTag['class'] = 'section'
+            hTag.insert(0,NavigableString(dsk['content']))
+            articleTag = soup.find(True, attrs={'id':'article'})
+            if articleTag:
+                articleTag.insert(0,hTag)
+
+        # Add class="articleBody" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'articleBody'})
+        if divTag:
+            divTag['class'] = divTag['id']
+
+        # Add class="authorId" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'authorId'})
+        if divTag and divTag.contents[0]:
+            tag = Tag(soup, "p")
+            tag['class'] = "authorId"
+            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+                             use_alt=False)))
+            divTag.replaceWith(tag)
+
+        return soup
+
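Note: with one_picture_per_article the new postprocess_html prefers the full-width 'articleSpanImage' and discards the inline images; failing that, it keeps only the first 'inlineImage module' and re-inserts it in 'columnGroup first' right after the 'articleHeadline' element. A compact bs4 sketch of that relocation (class names as in the diff, the markup is invented for the example):

    from bs4 import BeautifulSoup

    html = ('<div class="columnGroup first">'
            '<h1 class="articleHeadline">Headline</h1><p>byline</p></div>'
            '<div class="inlineImage module"><img src="a.jpg"/></div>'
            '<div class="inlineImage module"><img src="b.jpg"/></div>')
    soup = BeautifulSoup(html, 'html.parser')

    inline = soup.find_all(class_='inlineImage module')
    if soup.find(class_='articleSpanImage') is None and inline:
        for extra in inline[1:]:
            extra.decompose()                       # keep only the first image
        headline = soup.find(class_='articleHeadline')
        headline.insert_after(inline[0].extract())  # move it under the headline
    print(soup)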
     def strip_anchors(self,soup):
         paras = soup.findAll(True)
         for para in paras:
@@ -297,94 +520,3 @@ class NYTimes(BasicNewsRecipe):
                 if a.img is None:
                     a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup
-
-    def preprocess_html(self, soup):
-        # refresh = soup.find('meta', {'http-equiv':'refresh'})
-        # if refresh is None:
-        #     return self.strip_anchors(soup)
-        #
-        # content = refresh.get('content').partition('=')[2]
-        # raw = self.browser.open('http://www.nytimes.com'+content).read()
-        # soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
-        return self.strip_anchors(soup)
-
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is not None:
-            content = refresh.get('content').partition('=')[2]
-            raw = self.browser.open('http://www.nytimes.com'+content).read()
-            soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
-
-        soup = self.strip_anchors(soup)
-
-        # Test for empty content
-        body = soup.find('body')
-        tagCount = len(body.findAll(True))
-        if tagCount:
-            # print "%d tags in article" % tagCount
-            return soup
-        else:
-            print "no allowed content found, removing article"
-            raise Exception
-
-    def postprocess_html(self,soup, True):
-
-        # Change class="kicker" to <h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker is not None :
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, kicker.contents[0])
-            kicker.replaceWith(h3Tag)
-
-        # Change captions to italic -1
-        for caption in soup.findAll(True, {'class':'caption'}) :
-            if caption is not None:
-                emTag = Tag(soup, "em")
-                emTag.insert(0, caption.contents[0])
-                hrTag = Tag(soup, 'hr')
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
-
-        # Change <nyt_headline> to <h2>
-        headline = soup.find("nyt_headline")
-        if headline is not None :
-            tag = Tag(soup, "h2")
-            tag['class'] = "headline"
-            tag.insert(0, headline.contents[0])
-            soup.h1.replaceWith(tag)
-
-        # Change <h1> to <h3> - used in editorial blogs
-        masthead = soup.find("h1")
-        if masthead is not None :
-            # Nuke the href
-            if masthead.a is not None :
-                del(masthead.a['href'])
-            tag = Tag(soup, "h3")
-            tag.insert(0, masthead.contents[0])
-            soup.h1.replaceWith(tag)
-
-        # Change <span class="bold"> to <b>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            bTag = Tag(soup, "b")
-            bTag.insert(0, subhead.contents[0])
-            subhead.replaceWith(bTag)
-
-        # Synthesize a section header
-        dsk = soup.find('meta', attrs={'name':'dsk'})
-        if dsk is not None and dsk.has_key('content'):
-            hTag = Tag(soup,'h3')
-            hTag['class'] = 'section'
-            hTag.insert(0,NavigableString(dsk['content']))
-            articleTag = soup.find(True, attrs={'id':'article'})
-            articleTag.insert(0,hTag)
-
-        # Add class="articleBody" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'articleBody'})
-        if divTag is not None :
-            divTag['class'] = divTag['id']
-
-        # Add class="authorId" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'authorId'})
-        if divTag is not None :
-            divTag['class'] = divTag['id']
-
-        return soup

@@ -20,7 +20,7 @@ class ANDROID(USBMS):
     VENDOR_ID = {
         0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]},
         0x22b8 : { 0x41d9 : [0x216]},
-        0x18d1 : { 0x4e11 : [0x0100], 0x4e12: [0x0100]},
+        0x18d1 : { 0x4e11 : [0x0100, 0x226], 0x4e12: [0x0100, 0x226]},
         0x04e8 : { 0x681d : [0x0222]},
     }
     EBOOK_DIR_MAIN = ['wordplayer/calibretransfer', 'eBooks/import', 'Books']
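Note: in calibre's USB device tables the outer key is the vendor id, the inner key the product id, and the list holds the accepted bcdDevice revisions; this hunk adds revision 0x226 for Google's vendor id 0x18d1 so devices reporting newer firmware are still detected. A sketch of the membership test such a table supports:

    VENDOR_ID = {
        0x18d1: {0x4e11: [0x0100, 0x226], 0x4e12: [0x0100, 0x226]},
    }

    def matches(vendor_id, product_id, bcd):
        return bcd in VENDOR_ID.get(vendor_id, {}).get(product_id, [])

    print(matches(0x18d1, 0x4e11, 0x226))  # True with this commit applied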