mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Updated recipe for the NYT
This commit is contained in:
commit
118791052f
@ -4,44 +4,107 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
'''
|
'''
|
||||||
nytimes.com
|
nytimes.com
|
||||||
|
V5 - One picture per article, moved to top:
|
||||||
|
Headline
|
||||||
|
Image
|
||||||
|
Byline
|
||||||
|
Story
|
||||||
'''
|
'''
|
||||||
import string, re, time
|
import re, string, time
|
||||||
from calibre import strftime
|
from calibre import strftime
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
|
||||||
|
|
||||||
def decode(self, src):
|
|
||||||
enc = 'utf-8'
|
|
||||||
if 'iso-8859-1' in src:
|
|
||||||
enc = 'cp1252'
|
|
||||||
return src.decode(enc, 'ignore')
|
|
||||||
|
|
||||||
class NYTimes(BasicNewsRecipe):
|
class NYTimes(BasicNewsRecipe):
|
||||||
|
|
||||||
title = 'The New York Times (subscription)'
|
title = 'The New York Times'
|
||||||
__author__ = 'Kovid Goyal'
|
__author__ = 'GRiker'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
requires_version = (0, 6, 36)
|
|
||||||
|
|
||||||
description = 'Daily news from the New York Times (subscription version)'
|
description = 'Daily news from the New York Times (subscription version)'
|
||||||
timefmt = ' [%a, %b %d, %Y]'
|
allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
|
||||||
|
'New York','Business Day','Science Times','Sports','Dining','Arts',
|
||||||
|
'Home','Styles','Sunday Business','Week In Review','Travel','Magazine',
|
||||||
|
'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion",
|
||||||
|
"T Women's Fashion"]
|
||||||
|
excludeSectionKeywords = ['Dining','Weddings']
|
||||||
|
|
||||||
|
test_mode = False
|
||||||
|
if test_mode:
|
||||||
|
all = set(allSectionKeywords)
|
||||||
|
fetch_only = set(['The Front Page'])
|
||||||
|
excludeSectionKeywords = list(all ^ fetch_only)
|
||||||
|
|
||||||
|
use_one_picture_per_article = True
|
||||||
|
timefmt = ''
|
||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
remove_tags_before = dict(id='article')
|
remove_tags_before = dict(id='article')
|
||||||
remove_tags_after = dict(id='article')
|
remove_tags_after = dict(id='article')
|
||||||
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
|
remove_tags = [dict(attrs={'class':[
|
||||||
dict(id=['footer', 'toolsRight', 'articleInline',
|
'articleTools',
|
||||||
'navigation', 'archive', 'side_search', 'blog_sidebar',
|
'columnGroup doubleRule',
|
||||||
'side_tool', 'side_index', 'login', 'businessSearchBar',
|
'columnGroup last',
|
||||||
|
'doubleRule',
|
||||||
|
'entry-meta',
|
||||||
|
'icon enlargeThis',
|
||||||
|
'leftNavTabs',
|
||||||
|
'module box nav',
|
||||||
|
'nextArticleLink clearfix',
|
||||||
|
'post-tools',
|
||||||
|
'relatedSearchesModule',
|
||||||
|
'side_tool',
|
||||||
|
'singleAd',
|
||||||
|
]}),
|
||||||
|
dict(id=[
|
||||||
'adxLeaderboard',
|
'adxLeaderboard',
|
||||||
'relatedArticles', 'relatedTopics', 'adxSponLink']),
|
'archive',
|
||||||
|
'articleExtras',
|
||||||
|
'articleInline',
|
||||||
|
'blog_sidebar',
|
||||||
|
'cCol',
|
||||||
|
'entertainmentSearchBar',
|
||||||
|
'footer',
|
||||||
|
'header',
|
||||||
|
'header_search',
|
||||||
|
'login',
|
||||||
|
'masthead',
|
||||||
|
'memberTools',
|
||||||
|
'navigation',
|
||||||
|
'portfolioInline',
|
||||||
|
'relatedArticles',
|
||||||
|
'side_search',
|
||||||
|
'side_index',
|
||||||
|
'side_tool',
|
||||||
|
'toolsRight',
|
||||||
|
]),
|
||||||
dict(name=['script', 'noscript', 'style'])]
|
dict(name=['script', 'noscript', 'style'])]
|
||||||
encoding = decode
|
#encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
extra_css = 'h1 {font-face:sans-serif; font-size:2em; font-weight:bold;}\n.byline {font:monospace;}\n.bold {font-weight:bold;}'
|
extra_css = '.headline {text-align: left;}\n \
|
||||||
|
.byline {font-family: monospace; \
|
||||||
|
text-align: left; \
|
||||||
|
margin-top: 0px; \
|
||||||
|
margin-bottom: 0px;}\n \
|
||||||
|
.dateline {font-size: small; \
|
||||||
|
margin-top: 0px; \
|
||||||
|
margin-bottom: 0px;}\n \
|
||||||
|
.timestamp {font-size: small; \
|
||||||
|
margin-top: 0px; \
|
||||||
|
margin-bottom: 0px;}\n \
|
||||||
|
.source {text-align: left;}\n \
|
||||||
|
.image {text-align: center;}\n \
|
||||||
|
.credit {text-align: right; \
|
||||||
|
font-size: small; \
|
||||||
|
margin-top: 0px; \
|
||||||
|
margin-bottom: 0px;}\n \
|
||||||
|
.articleBody {text-align: left;}\n \
|
||||||
|
.authorId {text-align: left; \
|
||||||
|
font-style: italic;}\n '
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
if self.username is not None and self.password is not None:
|
if self.username is not None and self.password is not None:
|
||||||
|
try:
|
||||||
br.open('http://www.nytimes.com/auth/login')
|
br.open('http://www.nytimes.com/auth/login')
|
||||||
br.select_form(name='login')
|
br.select_form(name='login')
|
||||||
br['USERID'] = self.username
|
br['USERID'] = self.username
|
||||||
@ -50,18 +113,15 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
|
if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
|
||||||
raise Exception('Your username and password are incorrect')
|
raise Exception('Your username and password are incorrect')
|
||||||
#open('/t/log.html', 'wb').write(raw)
|
#open('/t/log.html', 'wb').write(raw)
|
||||||
|
except:
|
||||||
|
self.log("\nFailed to login")
|
||||||
|
|
||||||
return br
|
return br
|
||||||
|
|
||||||
def get_masthead_url(self):
|
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
||||||
masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
|
||||||
br = BasicNewsRecipe.get_browser()
|
|
||||||
try:
|
|
||||||
br.open(masthead)
|
|
||||||
except:
|
|
||||||
self.log("\nCover unavailable")
|
|
||||||
masthead = None
|
|
||||||
return masthead
|
|
||||||
|
|
||||||
|
def get_masthead_title(self):
|
||||||
|
return 'NYTimes GR Version'
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover = None
|
cover = None
|
||||||
@ -78,13 +138,66 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
cover = None
|
cover = None
|
||||||
return cover
|
return cover
|
||||||
|
|
||||||
def short_title(self):
|
def dump_ans(self, ans):
|
||||||
return 'NY Times'
|
total_article_count = 0
|
||||||
|
for section in ans :
|
||||||
|
if self.verbose:
|
||||||
|
self.log("section %s: %d articles" % (section[0], len(section[1])) )
|
||||||
|
for article in section[1]:
|
||||||
|
total_article_count += 1
|
||||||
|
if self.verbose:
|
||||||
|
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'),
|
||||||
|
article['url'].encode('mac-roman','replace')))
|
||||||
|
self.log( "Queued %d articles" % total_article_count )
|
||||||
|
|
||||||
|
def dump_hex(self, src, length=16):
|
||||||
|
''' Diagnostic '''
|
||||||
|
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
|
||||||
|
N=0; result=''
|
||||||
|
while src:
|
||||||
|
s,src = src[:length],src[length:]
|
||||||
|
hexa = ' '.join(["%02X"%ord(x) for x in s])
|
||||||
|
s = s.translate(FILTER)
|
||||||
|
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
|
||||||
|
N+=length
|
||||||
|
print result
|
||||||
|
|
||||||
|
def fixChars(self,string):
|
||||||
|
# Something's not right in the pipeline
|
||||||
|
#
|
||||||
|
|
||||||
|
# Replace lsquo (\x91)
|
||||||
|
fixed = re.sub("\x91","‘",string)
|
||||||
|
|
||||||
|
# Replace rsquo (\x92)
|
||||||
|
fixed = re.sub("\x92","’",fixed)
|
||||||
|
|
||||||
|
# Replace ldquo (\x93)
|
||||||
|
fixed = re.sub("\x93","“",fixed)
|
||||||
|
|
||||||
|
# Replace rdquo (\x94)
|
||||||
|
fixed = re.sub("\x94","”",fixed)
|
||||||
|
|
||||||
|
# Replace ndash (\x96)
|
||||||
|
fixed = re.sub("\x96","–",fixed)
|
||||||
|
|
||||||
|
# Replace mdash (\x97)
|
||||||
|
fixed = re.sub("\x97","—",fixed)
|
||||||
|
|
||||||
|
return fixed
|
||||||
|
|
||||||
|
def massageNCXText(self, description):
|
||||||
|
# Kindle TOC descriptions won't render certain characters
|
||||||
|
if description:
|
||||||
|
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||||
|
# Replace '&' with '&'
|
||||||
|
massaged = re.sub("&","&", massaged)
|
||||||
|
return self.fixChars(massaged)
|
||||||
|
else:
|
||||||
|
return description
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
self.encoding = 'cp1252'
|
|
||||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
|
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
|
||||||
self.encoding = decode
|
|
||||||
|
|
||||||
def feed_title(div):
|
def feed_title(div):
|
||||||
return ''.join(div.findAll(text=True, recursive=False)).strip()
|
return ''.join(div.findAll(text=True, recursive=False)).strip()
|
||||||
@ -92,18 +205,13 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
articles = {}
|
articles = {}
|
||||||
key = None
|
key = None
|
||||||
ans = []
|
ans = []
|
||||||
#allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
|
|
||||||
#'New York','Business Day','Sports','Dining','Arts','Home','Styles']
|
|
||||||
excludeSectionKeywords = ['Dining','Styles']
|
|
||||||
|
|
||||||
|
|
||||||
# Find each instance of class="section-headline", class="story", class="story headline"
|
# Find each instance of class="section-headline", class="story", class="story headline"
|
||||||
for div in soup.findAll(True,
|
for div in soup.findAll(True,
|
||||||
attrs={'class':['section-headline', 'story', 'story headline']}):
|
attrs={'class':['section-headline', 'story', 'story headline']}):
|
||||||
|
|
||||||
if div['class'] == 'section-headline':
|
if div['class'] == 'section-headline':
|
||||||
key = string.capwords(feed_title(div))
|
key = string.capwords(feed_title(div))
|
||||||
excluded = re.compile('|'.join(excludeSectionKeywords))
|
excluded = re.compile('|'.join(self.excludeSectionKeywords))
|
||||||
if excluded.search(key):
|
if excluded.search(key):
|
||||||
self.log("Skipping section %s" % key)
|
self.log("Skipping section %s" % key)
|
||||||
continue
|
continue
|
||||||
@ -117,13 +225,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
continue
|
continue
|
||||||
url = re.sub(r'\?.*', '', a['href'])
|
url = re.sub(r'\?.*', '', a['href'])
|
||||||
url += '?pagewanted=all'
|
url += '?pagewanted=all'
|
||||||
title = self.tag_to_string(a, use_alt=True).strip()
|
|
||||||
|
title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip())
|
||||||
|
|
||||||
description = ''
|
description = ''
|
||||||
pubdate = strftime('%a, %d %b')
|
pubdate = strftime('%a, %d %b')
|
||||||
summary = div.find(True, attrs={'class':'summary'})
|
summary = div.find(True, attrs={'class':'summary'})
|
||||||
if summary:
|
if summary:
|
||||||
description = self.tag_to_string(summary, use_alt=False)
|
description = self.massageNCXText(self.tag_to_string(summary, use_alt=False))
|
||||||
|
|
||||||
author = ''
|
author = ''
|
||||||
authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
|
authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
|
||||||
@ -133,6 +242,8 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||||
if authorAttribution:
|
if authorAttribution:
|
||||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||||
|
# Kill commas - Kindle switches to '&'
|
||||||
|
author = re.sub(',','',author)
|
||||||
|
|
||||||
feed = key if key is not None else 'Uncategorized'
|
feed = key if key is not None else 'Uncategorized'
|
||||||
if not articles.has_key(feed):
|
if not articles.has_key(feed):
|
||||||
@ -146,13 +257,208 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'Dining In, Dining Out':1,
|
'Dining In, Dining Out':1,
|
||||||
'Obituaries':2})
|
'Obituaries':2})
|
||||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||||
|
self.dump_ans(ans)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def strip_anchors(self,soup):
|
||||||
|
paras = soup.findAll(True)
|
||||||
|
for para in paras:
|
||||||
|
aTags = para.findAll('a')
|
||||||
|
for a in aTags:
|
||||||
|
if a.img is None:
|
||||||
|
a.replaceWith(a.renderContents().decode('utf-8','replace'))
|
||||||
|
#a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||||
|
return soup
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
'''
|
||||||
refresh = soup.find('meta', {'http-equiv':'refresh'})
|
refresh = soup.find('meta', {'http-equiv':'refresh'})
|
||||||
if refresh is None:
|
if refresh is None:
|
||||||
return soup
|
return soup
|
||||||
content = refresh.get('content').partition('=')[2]
|
content = refresh.get('content').partition('=')[2]
|
||||||
raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
|
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
||||||
return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||||
|
'''
|
||||||
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
|
def postprocess_html(self,soup, True):
|
||||||
|
|
||||||
|
if self.use_one_picture_per_article:
|
||||||
|
# Remove all images after first
|
||||||
|
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
||||||
|
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
|
||||||
|
if largeImg:
|
||||||
|
for inlineImg in inlineImgs:
|
||||||
|
inlineImg.extract()
|
||||||
|
else:
|
||||||
|
if inlineImgs:
|
||||||
|
firstImg = inlineImgs[0]
|
||||||
|
for inlineImg in inlineImgs[1:]:
|
||||||
|
inlineImg.extract()
|
||||||
|
# Move firstImg after headline
|
||||||
|
cgFirst = soup.find(True, {'class':'columnGroup first'})
|
||||||
|
if cgFirst:
|
||||||
|
# Strip all sibling NavigableStrings: noise
|
||||||
|
navstrings = cgFirst.findAll(text=True, recursive=False)
|
||||||
|
[ns.extract() for ns in navstrings]
|
||||||
|
headline_found = False
|
||||||
|
tag = cgFirst.find(True)
|
||||||
|
insertLoc = 0
|
||||||
|
while True:
|
||||||
|
insertLoc += 1
|
||||||
|
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
|
||||||
|
headline_found = True
|
||||||
|
break
|
||||||
|
tag = tag.nextSibling
|
||||||
|
if not tag:
|
||||||
|
headline_found = False
|
||||||
|
break
|
||||||
|
if headline_found:
|
||||||
|
cgFirst.insert(insertLoc,firstImg)
|
||||||
|
else:
|
||||||
|
self.log(">>> No class:'columnGroup first' found <<<")
|
||||||
|
# Change class="kicker" to <h3>
|
||||||
|
kicker = soup.find(True, {'class':'kicker'})
|
||||||
|
if kicker and kicker.contents[0]:
|
||||||
|
h3Tag = Tag(soup, "h3")
|
||||||
|
h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
|
||||||
|
use_alt=False)))
|
||||||
|
kicker.replaceWith(h3Tag)
|
||||||
|
|
||||||
|
# Change captions to italic -1
|
||||||
|
for caption in soup.findAll(True, {'class':'caption'}) :
|
||||||
|
if caption and caption.contents[0]:
|
||||||
|
emTag = Tag(soup, "em")
|
||||||
|
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
||||||
|
mp_off = c.find("More Photos")
|
||||||
|
if mp_off >= 0:
|
||||||
|
c = c[:mp_off]
|
||||||
|
emTag.insert(0, c)
|
||||||
|
hrTag = Tag(soup, 'hr')
|
||||||
|
#hrTag['style'] = "margin-top:0em;margin-bottom:0em"
|
||||||
|
emTag.insert(1, hrTag)
|
||||||
|
caption.replaceWith(emTag)
|
||||||
|
|
||||||
|
# Change <nyt_headline> to <h2>
|
||||||
|
h1 = soup.find('h1')
|
||||||
|
if h1:
|
||||||
|
headline = h1.find("nyt_headline")
|
||||||
|
if headline:
|
||||||
|
tag = Tag(soup, "h2")
|
||||||
|
tag['class'] = "headline"
|
||||||
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
|
h1.replaceWith(tag)
|
||||||
|
else:
|
||||||
|
# Blog entry - replace headline, remove <hr> tags
|
||||||
|
headline = soup.find('title')
|
||||||
|
if headline:
|
||||||
|
tag = Tag(soup, "h2")
|
||||||
|
tag['class'] = "headline"
|
||||||
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
|
soup.insert(0, tag)
|
||||||
|
hrs = soup.findAll('hr')
|
||||||
|
for hr in hrs:
|
||||||
|
hr.extract()
|
||||||
|
|
||||||
|
# Change <h1> to <h3> - used in editorial blogs
|
||||||
|
masthead = soup.find("h1")
|
||||||
|
if masthead:
|
||||||
|
# Nuke the href
|
||||||
|
if masthead.a:
|
||||||
|
del(masthead.a['href'])
|
||||||
|
tag = Tag(soup, "h3")
|
||||||
|
tag.insert(0, self.fixChars(masthead.contents[0]))
|
||||||
|
masthead.replaceWith(tag)
|
||||||
|
|
||||||
|
# Change <span class="bold"> to <b>
|
||||||
|
for subhead in soup.findAll(True, {'class':'bold'}) :
|
||||||
|
if subhead.contents:
|
||||||
|
bTag = Tag(soup, "b")
|
||||||
|
bTag.insert(0, subhead.contents[0])
|
||||||
|
subhead.replaceWith(bTag)
|
||||||
|
|
||||||
|
# Synthesize a section header
|
||||||
|
dsk = soup.find('meta', attrs={'name':'dsk'})
|
||||||
|
if dsk and dsk.has_key('content'):
|
||||||
|
hTag = Tag(soup,'h3')
|
||||||
|
hTag['class'] = 'section'
|
||||||
|
hTag.insert(0,NavigableString(dsk['content']))
|
||||||
|
articleTag = soup.find(True, attrs={'id':'article'})
|
||||||
|
if articleTag:
|
||||||
|
articleTag.insert(0,hTag)
|
||||||
|
|
||||||
|
# Add class="articleBody" to <div> so we can format with CSS
|
||||||
|
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||||
|
if divTag:
|
||||||
|
divTag['class'] = divTag['id']
|
||||||
|
|
||||||
|
# Add class="authorId" to <div> so we can format with CSS
|
||||||
|
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||||
|
if divTag and divTag.contents[0]:
|
||||||
|
tag = Tag(soup, "p")
|
||||||
|
tag['class'] = "authorId"
|
||||||
|
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
||||||
|
use_alt=False)))
|
||||||
|
divTag.replaceWith(tag)
|
||||||
|
|
||||||
|
return soup
|
||||||
|
|
||||||
|
def postprocess_book(self, oeb, opts, log) :
|
||||||
|
|
||||||
|
def extract_byline(href) :
|
||||||
|
# <meta name="byline" content=
|
||||||
|
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
|
||||||
|
byline = soup.find('meta',attrs={'name':['byl','CLMST']})
|
||||||
|
if byline :
|
||||||
|
author = byline['content']
|
||||||
|
else :
|
||||||
|
# Try for <div class="byline">
|
||||||
|
byline = soup.find('div', attrs={'class':'byline'})
|
||||||
|
if byline:
|
||||||
|
author = byline.renderContents()
|
||||||
|
else:
|
||||||
|
print "couldn't find byline in %s" % href
|
||||||
|
print soup.prettify()
|
||||||
|
return None
|
||||||
|
# Kill commas - Kindle switches to '&'
|
||||||
|
return re.sub(',','',author)
|
||||||
|
|
||||||
|
def extract_description(href) :
|
||||||
|
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
|
||||||
|
description = soup.find('meta',attrs={'name':['description','description ']})
|
||||||
|
if description :
|
||||||
|
# print repr(description['content'])
|
||||||
|
# print self.massageNCXText(description['content'])
|
||||||
|
return self.massageNCXText(description['content'])
|
||||||
|
else:
|
||||||
|
# Take first paragraph of article
|
||||||
|
articleBody = soup.find('div',attrs={'id':'articleBody'})
|
||||||
|
if not articleBody:
|
||||||
|
# Try again with class instead of id
|
||||||
|
articleBody = soup.find('div',attrs={'class':'articleBody'})
|
||||||
|
if not articleBody:
|
||||||
|
print 'postprocess_book.extract_description(): Did not find <div id="articleBody">:'
|
||||||
|
print soup.prettify()
|
||||||
|
return None
|
||||||
|
paras = articleBody.findAll('p')
|
||||||
|
for p in paras:
|
||||||
|
if p.renderContents() > '' :
|
||||||
|
return self.massageNCXText(self.tag_to_string(p,use_alt=False))
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Method entry point here
|
||||||
|
# Single section toc looks different than multi-section tocs
|
||||||
|
if oeb.toc.depth() == 2 :
|
||||||
|
for article in oeb.toc :
|
||||||
|
if article.author is None :
|
||||||
|
article.author = extract_byline(article.href)
|
||||||
|
if article.description is None :
|
||||||
|
article.description = extract_description(article.href).decode('utf-8')
|
||||||
|
elif oeb.toc.depth() == 3 :
|
||||||
|
for section in oeb.toc :
|
||||||
|
for article in section :
|
||||||
|
if article.author is None :
|
||||||
|
article.author = extract_byline(article.href)
|
||||||
|
if article.description is None :
|
||||||
|
article.description = extract_description(article.href)
|
||||||
|
|
||||||
|
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre\n"
|
"Project-Id-Version: calibre\n"
|
||||||
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 07:01+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:45+0000\n"
|
||||||
"Last-Translator: Kovid Goyal <Unknown>\n"
|
"Last-Translator: Kovid Goyal <Unknown>\n"
|
||||||
"Language-Team: Arabic <ar@li.org>\n"
|
"Language-Team: Arabic <ar@li.org>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-13 04:48+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: de\n"
|
"Project-Id-Version: de\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 09:01+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:48+0000\n"
|
||||||
"Last-Translator: S. Dorscht <Unknown>\n"
|
"Last-Translator: Kovid Goyal <Unknown>\n"
|
||||||
"Language-Team: American English <kde-i18n-doc@lists.kde.org>\n"
|
"Language-Team: American English <kde-i18n-doc@lists.kde.org>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
"Generated-By: pygettext.py 1.5\n"
|
"Generated-By: pygettext.py 1.5\n"
|
||||||
|
|
||||||
|
@ -11,13 +11,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: es\n"
|
"Project-Id-Version: es\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-13 12:12+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:49+0000\n"
|
||||||
"Last-Translator: Jellby <Unknown>\n"
|
"Last-Translator: Kovid Goyal <Unknown>\n"
|
||||||
"Language-Team: Spanish\n"
|
"Language-Team: Spanish\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-14 05:01+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:45+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
||||||
|
@ -7,13 +7,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre 0.4.22\n"
|
"Project-Id-Version: calibre 0.4.22\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 20:07+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:47+0000\n"
|
||||||
"Last-Translator: Vincent C. <Unknown>\n"
|
"Last-Translator: Kovid Goyal <Unknown>\n"
|
||||||
"Language-Team: fr\n"
|
"Language-Team: fr\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
"Generated-By: pygettext.py 1.5\n"
|
"Generated-By: pygettext.py 1.5\n"
|
||||||
|
|
||||||
|
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre\n"
|
"Project-Id-Version: calibre\n"
|
||||||
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-14 14:29+0000\n"
|
"PO-Revision-Date: 2010-03-16 01:01+0000\n"
|
||||||
"Last-Translator: Miguel Anxo Bouzada <mbouzada@gmail.com>\n"
|
"Last-Translator: Miguel Anxo Bouzada <mbouzada@gmail.com>\n"
|
||||||
"Language-Team: Galician <gl@li.org>\n"
|
"Language-Team: Galician <gl@li.org>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-15 04:47+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
||||||
|
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre\n"
|
"Project-Id-Version: calibre\n"
|
||||||
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 06:58+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:52+0000\n"
|
||||||
"Last-Translator: Kovid Goyal <Unknown>\n"
|
"Last-Translator: Kovid Goyal <Unknown>\n"
|
||||||
"Language-Team: Latvian <ivars_a@inbox.lv>\n"
|
"Language-Team: Latvian <ivars_a@inbox.lv>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
"X-Poedit-Country: LATVIA\n"
|
"X-Poedit-Country: LATVIA\n"
|
||||||
"X-Poedit-Language: Latvian\n"
|
"X-Poedit-Language: Latvian\n"
|
||||||
|
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre\n"
|
"Project-Id-Version: calibre\n"
|
||||||
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 23:15+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:48+0000\n"
|
||||||
"Last-Translator: Øyvind Øritsland <Unknown>\n"
|
"Last-Translator: Kovid Goyal <Unknown>\n"
|
||||||
"Language-Team: Norwegian Bokmal <nb@li.org>\n"
|
"Language-Team: Norwegian Bokmal <nb@li.org>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-14 05:01+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
||||||
|
@ -7,13 +7,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre 0.4.55\n"
|
"Project-Id-Version: calibre 0.4.55\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 07:15+0000\n"
|
"PO-Revision-Date: 2010-03-16 01:01+0000\n"
|
||||||
"Last-Translator: Kovid Goyal <Unknown>\n"
|
"Last-Translator: Kovid Goyal <Unknown>\n"
|
||||||
"Language-Team: American English <kde-i18n-doc@lists.kde.org>\n"
|
"Language-Team: American English <kde-i18n-doc@lists.kde.org>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
"X-Poedit-Country: RUSSIAN FEDERATION\n"
|
"X-Poedit-Country: RUSSIAN FEDERATION\n"
|
||||||
"X-Poedit-Language: Russian\n"
|
"X-Poedit-Language: Russian\n"
|
||||||
|
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre\n"
|
"Project-Id-Version: calibre\n"
|
||||||
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 06:57+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:47+0000\n"
|
||||||
"Last-Translator: Besnik <besnik@programeshqip.org>\n"
|
"Last-Translator: Besnik <besnik@programeshqip.org>\n"
|
||||||
"Language-Team: Albanian <sq@li.org>\n"
|
"Language-Team: Albanian <sq@li.org>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-13 04:48+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:43+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre\n"
|
"Project-Id-Version: calibre\n"
|
||||||
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-12 07:15+0000\n"
|
"PO-Revision-Date: 2010-03-16 01:01+0000\n"
|
||||||
"Last-Translator: Thruth Wang <wanglihao@gmail.com>\n"
|
"Last-Translator: Thruth Wang <wanglihao@gmail.com>\n"
|
||||||
"Language-Team: Simplified Chinese <wanglihao@gmail.com>\n"
|
"Language-Team: Simplified Chinese <wanglihao@gmail.com>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-13 04:50+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:45+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
"X-Poedit-Country: CHINA\n"
|
"X-Poedit-Country: CHINA\n"
|
||||||
"X-Poedit-Language: Chinese\n"
|
"X-Poedit-Language: Chinese\n"
|
||||||
@ -6929,7 +6929,7 @@ msgstr "打印预览"
|
|||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:294
|
#: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:294
|
||||||
msgid "Connecting to dict.org to lookup: <b>%s</b>…"
|
msgid "Connecting to dict.org to lookup: <b>%s</b>…"
|
||||||
msgstr "正在链接 dict.org 查询:"
|
msgstr "正在链接 dict.org 查询:<b>%s</b>"
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:393
|
#: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:393
|
||||||
msgid "Choose ebook"
|
msgid "Choose ebook"
|
||||||
|
@ -8,13 +8,13 @@ msgstr ""
|
|||||||
"Project-Id-Version: calibre\n"
|
"Project-Id-Version: calibre\n"
|
||||||
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
|
||||||
"PO-Revision-Date: 2010-03-13 03:27+0000\n"
|
"PO-Revision-Date: 2010-03-16 00:51+0000\n"
|
||||||
"Last-Translator: Chao-Hsiung Liao <j_h_liau@yahoo.com.tw>\n"
|
"Last-Translator: Chao-Hsiung Liao <j_h_liau@yahoo.com.tw>\n"
|
||||||
"Language-Team: Chinese (traditional)\n"
|
"Language-Team: Chinese (traditional)\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2010-03-14 05:01+0000\n"
|
"X-Launchpad-Export-Date: 2010-03-16 04:45+0000\n"
|
||||||
"X-Generator: Launchpad (build Unknown)\n"
|
"X-Generator: Launchpad (build Unknown)\n"
|
||||||
"Language: zh_TW\n"
|
"Language: zh_TW\n"
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user