#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
usatoday.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
import re

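# This recipe is normally run by calibre itself; a quick way to exercise
# changes from the command line (a sketch, assuming the file is saved as
# usatoday.recipe) is:
#
#   ebook-convert usatoday.recipe output.epub --test
#
# --test restricts the run to a couple of articles from the first couple of
# feeds, which keeps iteration fast.
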
class USAToday(BasicNewsRecipe):

    title = 'USA Today'
    __author__ = 'GRiker'
    oldest_article = 1
    timefmt = ''
    max_articles_per_feed = 20
    language = 'en'
    no_stylesheets = True
    extra_css = '.headline {text-align: left;}\n \
                 .byline {font-family: monospace; \
                          text-align: left; \
                          margin-bottom: 1em;}\n \
                 .image {text-align: center;}\n \
                 .caption {text-align: center; \
                           font-size: smaller; \
                           font-style: italic}\n \
                 .credit {text-align: right; \
                          margin-bottom: 0em; \
                          font-size: smaller;}\n \
                 .articleBody {text-align: left;}\n '
    conversion_options = { 'linearize_tables' : True }
    #simultaneous_downloads = 1

    feeds = [
        ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
        ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
        ('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
        ('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
        ('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
        ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
        ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
        ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
        ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
        ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
        ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
        ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
    ]

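    # Each entry above is a (section title, feed URL) pair; BasicNewsRecipe
    # downloads at most max_articles_per_feed articles no older than
    # oldest_article days from each feed.
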
    keep_only_tags = [dict(attrs={'class':[
                                  'byLine',
                                  'inside-copy',
                                  'inside-head',
                                  'inside-head2',
                                  'item',
                                  'item-block',
                                  'photo-container',
                                  ]}),
                      dict(id=[
                              'applyMainStoryPhoto',
                              'permalink',
                              ])]

    remove_tags = [dict(attrs={'class':[
                               'comments',
                               'jump',
                               'pagetools',
                               'post-attributes',
                               'tags',
                               ]}),
                   dict(id=[])]

    #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]

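    # Swapping the single-feed line above in for the full feed list is a
    # convenient way to exercise the postprocessing below on a small download.
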
    def dump_hex(self, src, length=16):
        ''' Diagnostic '''
        FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
        N=0; result=''
        while src:
            s,src = src[:length],src[length:]
            hexa = ' '.join(["%02X"%ord(x) for x in s])
            s = s.translate(FILTER)
            result += "%04X   %-*s   %s\n" % (N, length*3, hexa, s)
            N+=length
        print result

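    # Diagnostic helper only, never called during a normal run. For example,
    # self.dump_hex(raw[:32]) (raw being whatever byte string is under
    # inspection) prints rows of offset, hex values, and a printable-ASCII
    # gutter, roughly:
    #
    #   0000   3C 68 74 6D 6C 3E                                <html>
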
    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","&#8216;",string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92","&#8217;",fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93","&#8220;",fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94","&#8221;",fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96","&#8211;",fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97","&#8212;",fixed)

        return fixed

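    # \x91-\x97 are the Windows-1252 "smart punctuation" codepoints, which
    # sometimes arrive in feed text as raw bytes. fixChars maps them to the
    # equivalent numeric character references, e.g.
    # fixChars('\x93scare quotes\x94') -> '&#8220;scare quotes&#8221;'.
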
    def get_masthead_url(self):
        masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

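    # Returning None here is harmless: calibre falls back to a generated
    # default masthead when the recipe cannot supply one.
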
    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&","&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description

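    # The BeautifulStoneSoup pass resolves HTML entities to characters, then
    # the re.sub re-escapes every ampersand for the NCX XML; e.g.
    # massageNCXText('Q&amp;A') -> u'Q&#38;A'.
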
    def parse_feeds(self, *args, **kwargs):
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
        # Count articles for progress dialog
        article_count = 0
        for feed in parsed_feeds:
            article_count += len(feed)
        self.log("Queued %d articles" % article_count)
        return parsed_feeds

    def preprocess_html(self, soup):
        soup = self.strip_anchors(soup)
        return soup

    def postprocess_html(self, soup, first_fetch):

        # Remove navLinks <div class="inside-copy" style="padding-bottom:3px">
        navLinks = soup.find(True,{'style':'padding-bottom:3px'})
        if navLinks:
            navLinks.extract()

        # Remove <div class="inside-copy" style="margin-bottom:10px">
        gibberish = soup.find(True,{'style':'margin-bottom:10px'})
        if gibberish:
            gibberish.extract()

        # Change <inside-head> to <h2>
        headline = soup.find(True, {'class':['inside-head','inside-head2']})
        if not headline:
            headline = soup.find('h3')
        if headline:
            tag = Tag(soup, "h2")
            tag['class'] = "headline"
            tag.insert(0, headline.contents[0])
            headline.replaceWith(tag)
        else:
            print "unable to find headline:\n%s\n" % soup

        # Change byLine to byline, change commas to middot
        # Kindle renders commas in byline as '&'
        byline = soup.find(True, {'class':'byLine'})
        if byline:
            byline['class'] = 'byline'
            # Replace comma with middot
            byline.contents[0].replaceWith(re.sub(","," &middot;", byline.renderContents()))

        jumpout_punc_list = [':','?']
        # Remove the inline jumpouts in <div class="inside-copy">
        paras = soup.findAll(True, {'class':'inside-copy'})
        for para in paras:
            if re.match("<b>[\w\W]+ ",para.renderContents()):
                p = para.find('b')
                for punc in jumpout_punc_list:
                    punc_offset = p.contents[0].find(punc)
                    if punc_offset == -1:
                        continue
                    if punc_offset > 1:
                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
                            #print "extracting \n%s\n" % para.prettify()
                            para.extract()

        # Reset class for remaining
        paras = soup.findAll(True, {'class':'inside-copy'})
        for para in paras:
            para['class'] = 'articleBody'

        # Remove inline jumpouts in <p>
        paras = soup.findAll(['p'])
        for p in paras:
            if hasattr(p,'contents') and len(p.contents):
                for punc in jumpout_punc_list:
                    punc_offset = p.contents[0].find(punc)
                    if punc_offset == -1:
                        continue
                    if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
                        #print "evaluating %s\n" % p.contents[0][:punc_offset+1]
                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
                            #print "extracting \n%s\n" % p.prettify()
                            p.extract()

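        # Both jumpout passes share the same heuristic: a paragraph whose text
        # is entirely upper-case up to the first ':' or '?' is assumed to be a
        # "MORE COVERAGE:"-style cross-reference and is dropped.
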
        # Capture the first img, insert after headline
        imgs = soup.findAll('img')
        print "postprocess_html(): %d images" % len(imgs)
        if imgs:
            divTag = Tag(soup, 'div')
            divTag['class'] = 'image'
            body = soup.find('body')
            img = imgs[0]
            #print "img: \n%s\n" % img.prettify()

            # Table for photo and credit
            tableTag = Tag(soup,'table')

            # Photo
            trimgTag = Tag(soup, 'tr')
            tdimgTag = Tag(soup, 'td')
            tdimgTag.insert(0,img)
            trimgTag.insert(0,tdimgTag)
            tableTag.insert(0,trimgTag)

            # Credit
            trcreditTag = Tag(soup, 'tr')

            tdcreditTag = Tag(soup, 'td')
            tdcreditTag['class'] = 'credit'
            credit = soup.find('td',{'class':'photoCredit'})
            if credit:
                tdcreditTag.insert(0,NavigableString(credit.renderContents()))
            else:
                credit = img['credit']
                if credit:
                    tdcreditTag.insert(0,NavigableString(credit))
                else:
                    tdcreditTag.insert(0,NavigableString(''))

            trcreditTag.insert(0,tdcreditTag)
            tableTag.insert(1,trcreditTag)
            dtc = 0
            divTag.insert(dtc,tableTag)
            dtc += 1

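            # The single-column table above only pairs the photo with its
            # credit line; since conversion_options sets linearize_tables,
            # the conversion pipeline flattens it back into linear markup.
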
            if False:
                # Add the caption in the table
                tableCaptionTag = Tag(soup,'caption')
                tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
                tableTag.insert(1,tableCaptionTag)
                divTag.insert(dtc,tableTag)
                dtc += 1
                body.insert(1,divTag)
            else:
                # Add the caption below the table
                #print "Looking for caption in this soup:\n%s" % img.prettify()
                captionTag = Tag(soup,'p')
                captionTag['class'] = 'caption'
                if hasattr(img,'alt') and img['alt']:
                    captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
                    divTag.insert(dtc, captionTag)
                    dtc += 1
                else:
                    try:
                        captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
                        divTag.insert(dtc, captionTag)
                        dtc += 1
                    except:
                        pass

            hrTag = Tag(soup, 'hr')
            divTag.insert(dtc, hrTag)
            dtc += 1

# Delete <div id="applyMainStoryPhoto"
|
|
photoJunk = soup.find('div',{'id':'applyMainStoryPhoto'})
|
|
if photoJunk:
|
|
photoJunk.extract()
|
|
|
|
            # Insert img after headline
            tag = body.find(True)
            insertLoc = 0
            headline_found = False
            while True:
                # Scan the top-level tags
                insertLoc += 1
                if hasattr(tag,'class') and tag['class'] == 'headline':
                    headline_found = True
                    body.insert(insertLoc,divTag)
                    break
                tag = tag.nextSibling
                if not tag:
                    break

            if not headline_found:
                # Monolithic <div> - restructure
                tag = body.find(True)
                while True:
                    insertLoc += 1
                    try:
                        if hasattr(tag,'class') and tag['class'] == 'headline':
                            headline_found = True
                            tag.insert(insertLoc,divTag)
                            break
                    except:
                        pass
                    tag = tag.next
                    if not tag:
                        break

            # Yank out headline, img and caption
            headline = body.find('h2','headline')
            img = body.find('div','image')
            caption = body.find('p','caption')

            # body(0) is calibre_navbar
            # body(1) is <div class="item">

            btc = 1
            headline.extract()
            body.insert(1, headline)
            btc += 1
            if img:
                img.extract()
                body.insert(btc, img)
                btc += 1
            if caption:
                caption.extract()
                body.insert(btc, caption)
                btc += 1

            if len(imgs) > 1:
                if True:
                    [img.extract() for img in imgs[1:]]
                else:
                    # Format the remaining images
                    # This doesn't work yet
                    for img in imgs[1:]:
                        print "img:\n%s\n" % img.prettify()
                        divTag = Tag(soup, 'div')
                        divTag['class'] = 'image'

                        # Table for photo and credit
                        tableTag = Tag(soup,'table')

                        # Photo
                        trimgTag = Tag(soup, 'tr')
                        tdimgTag = Tag(soup, 'td')
                        tdimgTag.insert(0,img)
                        trimgTag.insert(0,tdimgTag)
                        tableTag.insert(0,trimgTag)

                        # Credit
                        trcreditTag = Tag(soup, 'tr')

                        tdcreditTag = Tag(soup, 'td')
                        tdcreditTag['class'] = 'credit'
                        try:
                            tdcreditTag.insert(0,NavigableString(img['credit']))
                        except:
                            tdcreditTag.insert(0,NavigableString(''))
                        trcreditTag.insert(0,tdcreditTag)
                        tableTag.insert(1,trcreditTag)
                        divTag.insert(0,tableTag)
                        soup.img.replaceWith(divTag)

        return soup

    def postprocess_book(self, oeb, opts, log):

        def extract_byline(href):
            # <meta name="byline" content=
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find('div',attrs={'class':'byline'})
            if byline:
                byline['class'] = 'byline'
                # Replace comma with middot
                byline.contents[0].replaceWith(re.sub(u",", u" &middot;",
                    byline.renderContents(encoding=None)))
                return byline.renderContents(encoding=None)
            else:
                paras = soup.findAll(text=True)
                for para in paras:
                    if para.startswith("Copyright"):
                        return para[len('Copyright xxxx '):para.find('.')]
                return None

        def extract_description(href):
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            description = soup.find('meta',attrs={'name':'description'})
            if description:
                return self.massageNCXText(description['content'])
            else:
                # Take first paragraph of article
                articleBody = soup.find('div',attrs={'id':['articleBody','item']})
                if articleBody:
                    paras = articleBody.findAll('p')
                    for p in paras:
                        if p.renderContents() > '':
                            return self.massageNCXText(self.tag_to_string(p,use_alt=False))
                else:
                    print "Didn't find <div id='articleBody'> in this soup:\n%s" % soup.prettify()
                return None

        # Method entry point here
        # A single-section TOC looks different than a multi-section TOC
        if oeb.toc.depth() == 2:
            for article in oeb.toc:
                if article.author is None:
                    article.author = extract_byline(article.href)
                if article.description is None:
                    article.description = extract_description(article.href)
        elif oeb.toc.depth() == 3:
            for section in oeb.toc:
                for article in section:
                    article.author = extract_byline(article.href)
                    '''
                    if article.author is None:
                        article.author = self.massageNCXText(extract_byline(article.href))
                    else:
                        article.author = self.massageNCXText(article.author)
                    '''
                    if article.description is None:
                        article.description = extract_description(article.href)

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

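    # strip_anchors flattens every <a> that does not wrap an image into plain
    # text, e.g. '<a href="/page2">Continued</a>' becomes 'Continued'; in-page
    # links add little in the finished e-book.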