KG updates

This commit is contained in:
GRiker 2010-03-19 06:36:03 -07:00
commit 77f6df628d
21 changed files with 4238 additions and 2639 deletions

View File

@ -6,43 +6,66 @@ __docformat__ = 'restructuredtext en'
'''
http://www.news.com.au/dailytelegraph/
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DailyTelegraph(BasicNewsRecipe):
title = u'Daily Telegraph'
__author__ = u'AprilHare'
__author__ = u'Adrian G.'
language = 'en_AU'
description = u'News from down under'
oldest_article = 2
max_articles_per_feed = 10
remove_tags_before = dict(name='div', attrs={'class':'article-title'})
remove_tags = [dict(attrs={'class':['article-source', 'article-tools']})]
remove_tags_after = dict(attrs={'class':re.compile('share-article')})
description = u'Daily Telegraph News'
oldest_article = 5
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
no_javascript = True
timefmt = ' [%A, %d %B, %Y]'
encoding = 'utf-8'
keep_only_tags = [dict(name='div', attrs ={'id':'story'})]
extra_css = '''
h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
.cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
.articleBody{font-family:Arial,Helvetica,sans-serif; color:black;font-size:small;}
.cT-imageLandscape{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:x-small;}
.source{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:xx-small;}
#content{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
.pageprint{font-family:Arial,Helvetica,sans-serif;font-size:small;}
#bylineDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
.featurePic-wide{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
#idfeaturepic{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
h3{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
h2{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
h4{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
h5{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
'''
remove_tags = [
dict(name='div', attrs ={'id':['comments','story-related-coverage']}),
dict(name='div', attrs ={'class':['story-header-tools','story-footer','story-extras','story-related']}),
dict(name='div', attrs ={'class':['promo-image','story-extras story-extras-2']}),
dict(name='div', attrs ={'class':['assistive sidebar-jump']})
]
feeds = [
(u'Top Stories', u'http://feeds.news.com.au/public/rss/2.0/dtele_top_stories_253.xml'),
(u'National News', u'http://feeds.news.com.au/public/rss/2.0/dtele_national_news_202.xml'),
(u'World News', u'http://feeds.news.com.au/public/rss/2.0/dtele_world_news_204.xml'),
(u'NSW and ACT', u'http://feeds.news.com.au/public/rss/2.0/dtele_nswact_225.xml'),
(u'Arts', u'http://feeds.news.com.au/public/rss/2.0/dtele_art_444.xml'),
(u'Business News', u'http://feeds.news.com.au/public/rss/2.0/dtele_business_226.xml'),
(u'Entertainment News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_news_201.xml'),
(u'Lifestyle News', u'http://feeds.news.com.au/public/rss/2.0/dtele_lifestyle_227.xml'),
(u'Music', u'http://feeds.news.com.au/public/rss/2.0/dtele_music_441.xml'),
(u'Property Confidential', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_confidential_463.xml'),
(u'Property - Your Space', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_yourspace_462.xml'),
(u'Confidential News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_confidential_252.xml'),
(u'Confidential Biographies', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_biographies_491.xml'),
(u'Confidential Galleries', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_galleries_483.xml'),
(u'Confidential In-depth', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_indepth_490.xml'),
(u'Confidential ShowBuzz', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_showbuzz_485.xml'),
(u'Sport', u'http://feeds.news.com.au/public/rss/2.0/dtele_sport_203.xml'),
(u'AFL', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_afl_341.xml'),
(u'Cricket', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_cricket_343.xml'),
(u'Horse Racing', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_horseracing_686.xml'),
(u'NRL', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_nrl_345.xml'),
(u'Rugby Union', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_rugby_union_342.xml'),
(u'Soccer', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_soccer_344.xml')
(u'Top Stories', u'http://feeds.news.com.au/public/rss/2.0/dtele_top_stories_253.xml'),
(u'National News', u'http://feeds.news.com.au/public/rss/2.0/dtele_national_news_202.xml'),
(u'World News', u'http://feeds.news.com.au/public/rss/2.0/dtele_world_news_204.xml'),
(u'NSW and ACT', u'http://feeds.news.com.au/public/rss/2.0/dtele_nswact_225.xml'),
(u'Arts', u'http://feeds.news.com.au/public/rss/2.0/dtele_art_444.xml'),
(u'Business News', u'http://feeds.news.com.au/public/rss/2.0/dtele_business_226.xml'),
(u'Entertainment News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_news_201.xml'),
(u'Lifestyle News', u'http://feeds.news.com.au/public/rss/2.0/dtele_lifestyle_227.xml'),
(u'Music', u'http://feeds.news.com.au/public/rss/2.0/dtele_music_441.xml'),
(u'Property Confidential', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_confidential_463.xml'),
(u'Property - Your Space', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_yourspace_462.xml'),
(u'Confidential News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_confidential_252.xml'),
(u'Confidential Biographies', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_biographies_491.xml'),
(u'Confidential Galleries', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_galleries_483.xml'),
]

View File

@ -9,16 +9,16 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Pagina12(BasicNewsRecipe):
title = 'Elsevier.nl'
__author__ = 'Darko Miletic'
description = 'News from Denmark'
description = 'News from Holland'
publisher = 'elsevier.nl'
category = 'news, politics, Denmark'
category = 'news, politics, Holland'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'nl'
country = 'NL'
country = 'NL'
remove_empty_feeds = True
masthead_url = 'http://www.elsevier.nl/static/elsevier/stdimg/logo.gif'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '
@ -29,7 +29,7 @@ class Pagina12(BasicNewsRecipe):
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = dict(attrs={'id':'artikel_container'})
remove_tags_before = dict(attrs={'id':'breadcrumb_container'})
remove_tags_after = dict(attrs={'class':'author_link'})
@ -50,7 +50,7 @@ class Pagina12(BasicNewsRecipe):
,(u'Cultuur & Televisie' , u'http://www.elsevier.nl/web/RSS/Cultuur-Televisie-RSS.htm?output=xml')
,(u'Society' , u'http://www.elsevier.nl/web/RSS/Society-RSS.htm?output=xml' )
,(u'Internet&/Gadgets' , u'http://www.elsevier.nl/web/RSS/Internet-Gadgets-RSS.htm?output=xml' )
,(u'Comentaren' , u'http://www.elsevier.nl/web/RSS/Commentaren-RSS.htm?output=xml' )
,(u'Comentaren' , u'http://www.elsevier.nl/web/RSS/Commentaren-RSS.htm?output=xml' )
]
def print_version(self, url):
@ -58,8 +58,8 @@ class Pagina12(BasicNewsRecipe):
def get_article_url(self, article):
return article.get('guid', None).rpartition('?')[0]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
return soup

View File

@ -4,65 +4,136 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
V5 - One picture per article, moved to top:
Headline
Image
Byline
Story
'''
import string, re, time
import re, string, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
def decode(self, src):
enc = 'utf-8'
if 'iso-8859-1' in src:
enc = 'cp1252'
return src.decode(enc, 'ignore')
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
class NYTimes(BasicNewsRecipe):
title = 'The New York Times (subscription)'
__author__ = 'Kovid Goyal'
title = 'The New York Times'
__author__ = 'GRiker'
language = 'en'
requires_version = (0, 6, 36)
description = 'Daily news from the New York Times (subscription version)'
timefmt = ' [%a, %b %d, %Y]'
allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
'New York','Business Day','Science Times','Sports','Dining','Arts',
'Home','Styles','Sunday Business','Week In Review','Travel','Magazine',
'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion",
"T Women's Fashion"]
# List of sections to exclude
# To add a section, copy the section name from the allSectionKeywords list above
# For example, to exclude 'Dining' and 'Weddings':
# excludeSectionKeywords = ['Dining','Weddings']
excludeSectionKeywords = []
# List of sections to include (test and debug only)
# By default, any sections in today's paper that are not listed in excludeSectionKeywords
# are downloaded. fetch_only specifies that only certain sections are to be downloaded.
# This should only be used for testing and debugging.
# For example, to download only 'The Front Page' section:
# fetch_only = set(['The Front Page'])
fetch_only = set([])
if fetch_only:
excludeSectionKeywords = list(set(allSectionKeywords) ^ fetch_only)
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
one_picture_per_article = True
timefmt = ''
needs_subscription = True
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline',
'navigation', 'archive', 'side_search', 'blog_sidebar',
'side_tool', 'side_index', 'login', 'businessSearchBar',
'adxLeaderboard',
'relatedArticles', 'relatedTopics', 'adxSponLink']),
remove_tags = [dict(attrs={'class':[
'articleTools',
'columnGroup doubleRule',
'columnGroup last',
'doubleRule',
'dottedLine',
'entry-meta',
'icon enlargeThis',
'leftNavTabs',
'module box nav',
'nextArticleLink clearfix',
'post-tools',
'relatedSearchesModule',
'side_tool',
'singleAd',
]}),
dict(id=[
'adxLeaderboard',
'archive',
'articleExtras',
'articleInline',
'blog_sidebar',
'cCol',
'entertainmentSearchBar',
'footer',
'header',
'header_search',
'login',
'masthead',
'memberTools',
'navigation',
'portfolioInline',
'relatedArticles',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
dict(name=['script', 'noscript', 'style'])]
encoding = decode
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
no_stylesheets = True
extra_css = 'h1 {font-face:sans-serif; font-size:2em; font-weight:bold;}\n.byline {font:monospace;}\n.bold {font-weight:bold;}'
extra_css = '.headline {text-align: left;}\n \
.byline {font-family: monospace; \
text-align: left; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.dateline {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.timestamp {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.source {text-align: left;}\n \
.image {text-align: center;}\n \
.credit {text-align: right; \
font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.articleBody {text-align: left;}\n \
.authorId {text-align: left; \
font-style: italic;}\n '
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
raw = br.submit().read()
if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
raise Exception('Your username and password are incorrect')
#open('/t/log.html', 'wb').write(raw)
try:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
raw = br.submit().read()
if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
raise Exception('Your username and password are incorrect')
#open('/t/log.html', 'wb').write(raw)
except:
self.log("\nFailed to login")
return br
def get_masthead_url(self):
masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nCover unavailable")
masthead = None
return masthead
def get_cover_url(self):
cover = None
st = time.localtime()
@ -78,13 +149,66 @@ class NYTimes(BasicNewsRecipe):
cover = None
return cover
def short_title(self):
return 'NY Times'
def get_masthead_title(self):
return 'NYTimes GR Version'
def dump_ans(self, ans):
total_article_count = 0
for section in ans :
if self.verbose:
self.log("section %s: %d articles" % (section[0], len(section[1])) )
for article in section[1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'),
article['url'].encode('mac-roman','replace')))
self.log( "Queued %d articles" % total_article_count )
def dump_hex(self, src, length=16):
''' Diagnostic '''
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
N=0; result=''
while src:
s,src = src[:length],src[length:]
hexa = ' '.join(["%02X"%ord(x) for x in s])
s = s.translate(FILTER)
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
N+=length
print result
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","&#8216;",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","&#8217;",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","&#8220;",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","&#8221;",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","&#8211;",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","&#8212;",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_index(self):
self.encoding = 'cp1252'
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
self.encoding = decode
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
@ -92,18 +216,13 @@ class NYTimes(BasicNewsRecipe):
articles = {}
key = None
ans = []
#allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
#'New York','Business Day','Sports','Dining','Arts','Home','Styles']
excludeSectionKeywords = ['Dining','Styles']
# Find each instance of class="section-headline", class="story", class="story headline"
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}):
if div['class'] == 'section-headline':
key = string.capwords(feed_title(div))
excluded = re.compile('|'.join(excludeSectionKeywords))
excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key):
self.log("Skipping section %s" % key)
continue
@ -117,13 +236,14 @@ class NYTimes(BasicNewsRecipe):
continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True).strip()
title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip())
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
description = self.massageNCXText(self.tag_to_string(summary, use_alt=False))
author = ''
authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
@ -133,6 +253,8 @@ class NYTimes(BasicNewsRecipe):
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
# Kill commas - Kindle switches to '&'
author = re.sub(',','',author)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
@ -146,13 +268,208 @@ class NYTimes(BasicNewsRecipe):
'Dining In, Dining Out':1,
'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
self.dump_ans(ans)
return ans
def preprocess_html(self, soup):
'''
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
raw = self.browser.open('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
'''
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg after headline
cgFirst = soup.find(True, {'class':'columnGroup first'})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker and kicker.contents[0]:
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
use_alt=False)))
kicker.replaceWith(h3Tag)
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
emTag = Tag(soup, "em")
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
emTag.insert(0, c)
hrTag = Tag(soup, 'hr')
#hrTag['style'] = "margin-top:0em;margin-bottom:0em"
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
if articleTag:
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup
def postprocess_book(self, oeb, opts, log) :
def extract_byline(href) :
# <meta name="byline" content=
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
byline = soup.find('meta',attrs={'name':['byl','CLMST']})
if byline :
author = byline['content']
else :
# Try for <div class="byline">
byline = soup.find('div', attrs={'class':'byline'})
if byline:
author = byline.renderContents()
else:
print "couldn't find byline in %s" % href
print soup.prettify()
return None
# Kill commas - Kindle switches to '&'
return re.sub(',','',author)
def extract_description(href) :
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
description = soup.find('meta',attrs={'name':['description','description ']})
if description :
# print repr(description['content'])
# print self.massageNCXText(description['content'])
return self.massageNCXText(description['content'])
else:
# Take first paragraph of article
articleBody = soup.find('div',attrs={'id':'articleBody'})
if not articleBody:
# Try again with class instead of id
articleBody = soup.find('div',attrs={'class':'articleBody'})
if not articleBody:
print 'postprocess_book.extract_description(): Did not find <div id="articleBody">:'
print soup.prettify()
return None
paras = articleBody.findAll('p')
for p in paras:
if p.renderContents() > '' :
return self.massageNCXText(self.tag_to_string(p,use_alt=False))
return None
# Method entry point here
# Single section toc looks different than multi-section tocs
if oeb.toc.depth() == 2 :
for article in oeb.toc :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href).decode('utf-8')
elif oeb.toc.depth() == 3 :
for section in oeb.toc :
for article in section :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href)
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('utf-8','replace'))
#a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup

View File

@ -0,0 +1,34 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
oilprice.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class OilPrice(BasicNewsRecipe):
title = 'Oil Price'
__author__ = 'Darko Miletic'
description = 'The nr. 1 source for Oil Price Information'
publisher = 'oilprice.com'
category = 'news, oil, politics, world, usa'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'en'
country = 'US'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name='div',attrs={'class':'banner'})]
keep_only_tags = [dict(name='div',attrs={'id':'storyContent'})]
remove_tags_after = dict(attrs={'id':'KonaBody'})
feeds = [(u'Articles', u'http://www.oilprice.com/rss.xml')]

View File

@ -0,0 +1,45 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1268409464(BasicNewsRecipe):
title = u'The Sun'
__author__ = 'Chaz Ralph'
description = 'News from The Sun'
oldest_article = 1
max_articles_per_feed = 100
language = 'en'
no_stylesheets = True
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
encoding= 'iso-8859-1'
remove_javascript = True
keep_only_tags = [
dict(name='div', attrs={'class':'medium-centered'})
,dict(name='div', attrs={'class':'article'})
,dict(name='div', attrs={'class':'clear-left'})
,dict(name='div', attrs={'class':'text-center'})
]
remove_tags = [
dict(name='div', attrs={'class':'slideshow'})
,dict(name='div', attrs={'class':'float-left'})
,dict(name='div', attrs={'class':'ltbx-slideshow ltbx-btn-ss'})
,dict(name='a', attrs={'class':'add_a_comment'})
,dict(name='div', attrs={'id':'vxFlashPlayerContent'})
,dict(name='div', attrs={'id':'k1006094r1c1t5w380h529'})
,dict(name='div', attrs={'id':'tum_login_form_container'})
,dict(name='div', attrs={'class':'discHeader'})
,dict(name='div', attrs={'class':'margin-bottom-neg-2'})
]
feeds = [(u'News', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article312900.ece')
,(u'Sport', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247732.ece')
,(u'Football', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247739.ece')
,(u'Gizmo', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247829.ece')
,(u'Bizarre', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247767.ece')]
def print_version(self, url):
return re.sub(r'\?OTC-RSS&ATTR=[-a-zA-Z]+', '?print=yes', url)

View File

@ -0,0 +1,49 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
import re
from calibre.web.feeds.news import BasicNewsRecipe
class WashingtonTimes(BasicNewsRecipe):
title = 'Washington Times'
max_articles_per_feed = 15
language = 'en'
__author__ = 'Kos Semonski'
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
(r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
(r'<script.*?>.*?</script>', lambda match : ''),
(r'<body onload=.*?>.*?<a href="http://www.upi.com">', lambda match : '<body style="font: 8pt arial;">'),
##(r'<div class=\'headerDIV\'><h2><a style="color: #990000;" href="http://www.washingtontimes.com/NewsTrack/Top_News/">Top News</a></h2></div>.*?<br clear="all">', lambda match : ''),
(r'<script src="http://www.g.*?>.*?</body>', lambda match : ''),
(r'<span style="font: 16pt arial', lambda match : '<span style="font: 12pt arial'),
]
]
def get_feeds(self):
return [ (u'Headlines', u'http://www.washingtontimes.com/rss/headlines/news/headlines/'),
(u'Newsmakers', u'http://www.washingtontimes.com/rss/headlines/news/newsmakers/'),
(u'National', u'http://www.washingtontimes.com/rss/headlines/news/national/'),
(u'World', u'http://www.washingtontimes.com/rss/headlines/news/world/'),
(u'Editor Favs', u'http://www.washingtontimes.com/rss/headlines/news/editor-favorites/'),
(u'Editorials', u'http://www.washingtontimes.com/rss/headlines/opinion/editorials/'),
(u'Cartoons', u'http://www.washingtontimes.com/rss/headlines/opinion/cartoons/'),
(u'Business', u'http://www.washingtontimes.com/rss/headlines/news/business/'),
(u'Technology', u'http://www.washingtontimes.com/rss/headlines/news/technology/'),
(u'Security', u'http://www.washingtontimes.com/rss/headlines/news/security/'),
(u'Politics', u'http://www.washingtontimes.com/rss/headlines/news/politics/'),
(u'Congress', u'http://www.washingtontimes.com/rss/headlines/news/congress/'),
]
def print_version(self, url):
return url + '/print/'

View File

@ -181,7 +181,7 @@ def main():
cols = 80
parser = OptionParser(usage="usage: %prog [options] command args\n\ncommand "+
"is one of: info, books, df, ls, cp, mkdir, touch, cat, rm, eject\n\n"+
"is one of: info, books, df, ls, cp, mkdir, touch, cat, rm, eject, test_file\n\n"+
"For help on a particular command: %prog command", version=__appname__+" version: " + __version__)
parser.add_option("--log-packets", help="print out packet stream to stdout. "+\
"The numbers in the left column are byte offsets that allow the packet size to be read off easily.",
@ -335,6 +335,20 @@ def main():
parser.print_help()
return 1
dev.touch(args[0])
elif command == 'test_file':
parser = OptionParser(usage=("usage: %prog test_file path\n"
'Open device, copy file psecified by path to device and '
'then eject device.'))
options, args = parser.parse_args(args)
if len(args) != 1:
parser.print_help()
return 1
path = args[0]
from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(open(path, 'rb'), path.rpartition('.')[-1].lower())
print dev.upload_books([args[0]], [os.path.basename(args[0])],
end_session=False, metadata=[mi])
dev.eject()
else:
parser.print_help()
if getattr(dev, 'handle', False): dev.close()

View File

@ -92,17 +92,20 @@ class EPUBOutput(OutputFormatPlugin):
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="calibre:cover" content="true" />
<title>Cover</title>
<style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style>
</head>
<body>
<div>
<img src="%s" alt="cover" style="height: 100%%" />
</div>
<svg version="1.1" xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
width="100%%" height="100%%" viewBox="0 0 600 800"
preserveAspectRatio="xMidYMid meet">
<image width="600" height="800" xlink:href="%s"/>
</svg>
</body>
</html>
'''

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: calibre\n"
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 07:01+0000\n"
"PO-Revision-Date: 2010-03-16 00:45+0000\n"
"Last-Translator: Kovid Goyal <Unknown>\n"
"Language-Team: Arabic <ar@li.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-13 04:48+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43

File diff suppressed because it is too large Load Diff

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: de\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 09:01+0000\n"
"Last-Translator: S. Dorscht <Unknown>\n"
"PO-Revision-Date: 2010-03-16 00:48+0000\n"
"Last-Translator: Kovid Goyal <Unknown>\n"
"Language-Team: American English <kde-i18n-doc@lists.kde.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
"Generated-By: pygettext.py 1.5\n"

View File

@ -11,13 +11,13 @@ msgstr ""
"Project-Id-Version: es\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-13 12:12+0000\n"
"Last-Translator: Jellby <Unknown>\n"
"PO-Revision-Date: 2010-03-16 00:49+0000\n"
"Last-Translator: Kovid Goyal <Unknown>\n"
"Language-Team: Spanish\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-14 05:01+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:45+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43

View File

@ -7,13 +7,13 @@ msgstr ""
"Project-Id-Version: calibre 0.4.22\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 20:07+0000\n"
"Last-Translator: Vincent C. <Unknown>\n"
"PO-Revision-Date: 2010-03-16 00:47+0000\n"
"Last-Translator: Kovid Goyal <Unknown>\n"
"Language-Team: fr\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
"Generated-By: pygettext.py 1.5\n"

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: calibre\n"
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-14 14:29+0000\n"
"PO-Revision-Date: 2010-03-16 01:01+0000\n"
"Last-Translator: Miguel Anxo Bouzada <mbouzada@gmail.com>\n"
"Language-Team: Galician <gl@li.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-15 04:47+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: calibre\n"
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 06:58+0000\n"
"PO-Revision-Date: 2010-03-16 00:52+0000\n"
"Last-Translator: Kovid Goyal <Unknown>\n"
"Language-Team: Latvian <ivars_a@inbox.lv>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
"X-Poedit-Country: LATVIA\n"
"X-Poedit-Language: Latvian\n"

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: calibre\n"
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 23:15+0000\n"
"Last-Translator: Øyvind Øritsland <Unknown>\n"
"PO-Revision-Date: 2010-03-16 00:48+0000\n"
"Last-Translator: Kovid Goyal <Unknown>\n"
"Language-Team: Norwegian Bokmal <nb@li.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-14 05:01+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43

View File

@ -7,13 +7,13 @@ msgstr ""
"Project-Id-Version: calibre 0.4.55\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 07:15+0000\n"
"PO-Revision-Date: 2010-03-16 01:01+0000\n"
"Last-Translator: Kovid Goyal <Unknown>\n"
"Language-Team: American English <kde-i18n-doc@lists.kde.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-13 04:49+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:44+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
"X-Poedit-Country: RUSSIAN FEDERATION\n"
"X-Poedit-Language: Russian\n"

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: calibre\n"
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 06:57+0000\n"
"PO-Revision-Date: 2010-03-16 00:47+0000\n"
"Last-Translator: Besnik <besnik@programeshqip.org>\n"
"Language-Team: Albanian <sq@li.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-13 04:48+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:43+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
#: /home/kovid/work/calibre/src/calibre/customize/__init__.py:43

File diff suppressed because it is too large Load Diff

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: calibre\n"
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-12 07:15+0000\n"
"PO-Revision-Date: 2010-03-16 01:01+0000\n"
"Last-Translator: Thruth Wang <wanglihao@gmail.com>\n"
"Language-Team: Simplified Chinese <wanglihao@gmail.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-13 04:50+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:45+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
"X-Poedit-Country: CHINA\n"
"X-Poedit-Language: Chinese\n"
@ -6929,7 +6929,7 @@ msgstr "打印预览"
#: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:294
msgid "Connecting to dict.org to lookup: <b>%s</b>&hellip;"
msgstr "正在链接 dict.org 查询:"
msgstr "正在链接 dict.org 查询:<b>%s</b>"
#: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:393
msgid "Choose ebook"

View File

@ -8,13 +8,13 @@ msgstr ""
"Project-Id-Version: calibre\n"
"Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
"POT-Creation-Date: 2010-03-12 07:36+0000\n"
"PO-Revision-Date: 2010-03-13 03:27+0000\n"
"PO-Revision-Date: 2010-03-16 00:51+0000\n"
"Last-Translator: Chao-Hsiung Liao <j_h_liau@yahoo.com.tw>\n"
"Language-Team: Chinese (traditional)\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2010-03-14 05:01+0000\n"
"X-Launchpad-Export-Date: 2010-03-16 04:45+0000\n"
"X-Generator: Launchpad (build Unknown)\n"
"Language: zh_TW\n"