GwR revisions - cdeType PDOC tag

This commit is contained in:
GRiker 2010-03-27 04:10:59 -07:00
commit f7108d173b
23 changed files with 901 additions and 210 deletions

View File

@ -22,3 +22,7 @@ src/cssutils/stylesheets/.svn/
 src/odf/.svn
 tags
 nbproject/
+*.mdproj
+*.pidb
+*.sln
+*.userprefs

Binary file not shown (new image, 400 B).

Binary file not shown (new image, 995 B).

View File

@ -0,0 +1,63 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Franco Venturi <fventuri at comcast.net>'
'''
spectrum.ieee.org
'''

from calibre.web.feeds.news import BasicNewsRecipe
from string import capwords
from urlparse import urljoin

class IEEESpectrum(BasicNewsRecipe):
    title = 'IEEE Spectrum'
    __author__ = 'Franco Venturi'
    description = 'Electronics News from IEEE'
    publisher = 'IEEE'
    category = 'news, electronics, IT, computer science'
    oldest_article = 32
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    index = 'http://spectrum.ieee.org/magazine/'
    masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png'

    remove_javascript = True
    remove_tags = [dict(name={'script':True, 'object':True})]
    remove_attributes = ['height','width','alt']
    keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})]

    def parse_index(self):
        soup = self.index_to_soup(self.index)

        img = soup.find('img', image='cover.gif', src=True)
        if img is not None:
            self.cover_url = 'http://spectrum.ieee.org'+img['src']

        content = soup.find(id='gnrlContent')
        title = content.find(attrs={'class':'style4'}).string.strip()
        date = ' '.join(title.split()[0:2])
        self.timefmt = ' [' + date + ']'

        contents = []
        for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}):
            if tag['class'] == 'style2':
                contents.append((capwords(tag.renderContents().strip()), []))
            elif tag['class'] == 'lstngTitle':
                url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0'
                contents[-1][1].append({'title': tag.renderContents().strip(),
                                        'url': url,
                                        'date': date,
                                        'description': '',
                                        'content': ''
                                       })
            elif tag['class'] == 'lstngBody':
                contents[-1][1][-1]['description'] = tag.renderContents().strip()

        return contents

    def preprocess_html(self, soup):
        for a in soup.findAll('a'):
            if not a['href'].lower().startswith('http'):
                a['href'] = urljoin(self.index, a['href'])
        return soup

View File

@ -0,0 +1,36 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
nypost.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class NYPost(BasicNewsRecipe):
    title = 'New York Post'
    __author__ = 'Darko Miletic'
    description = 'Daily newspaper'
    publisher = 'NYP Holdings, Inc.'
    category = 'news, politics, USA'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    masthead_url = 'http://www.nypost.com/rw/SysConfig/WebPortal/nypost/images/nyp_logo_230x32.gif'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : language
                        }

    keep_only_tags = [dict(name='div', attrs={'id':'story'})]

    feeds = [(u'Articles', u'http://www.nypost.com/rss/all_section.xml')]

    def print_version(self, url):
        return url.replace('nypost.com/p/','nypost.com/f/print/')

View File

@ -5,7 +5,8 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import re, time
+import re
+import time

 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@ -14,7 +15,7 @@ class NYTimes(BasicNewsRecipe):
     title = 'New York Times Top Stories'
     __author__ = 'GRiker'
-    language = 'en'
+    language = _('English')
     description = 'Top Stories from the New York Times'
# List of sections typically included in Top Stories. Use a keyword from the # List of sections typically included in Top Stories. Use a keyword from the
@ -39,9 +40,6 @@ class NYTimes(BasicNewsRecipe):
               'world' : 'World'
             }

-    # By default, no sections are skipped.
-    excludeSectionKeywords = []
-
     # Add section keywords from the right column above to skip that section
     # For example, to skip sections containing the word 'Sports' or 'Dining', use:
     # excludeSectionKeywords = ['Sports', 'Dining']
@ -49,36 +47,138 @@ class NYTimes(BasicNewsRecipe):
     # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
     # Fetch only Top Stories
     # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']

+    # By default, no sections are skipped.
+    excludeSectionKeywords = []
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
one_picture_per_article = True
# The maximum number of articles that will be downloaded # The maximum number of articles that will be downloaded
max_articles_per_feed = 40 max_articles_per_feed = 40
timefmt = '' timefmt = ''
needs_subscription = True needs_subscription = True
keep_only_tags = [ dict(attrs={ 'id':['article']}), masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
dict(attrs={'class':['blog wrap']}) ]
remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix', remove_tags_before = dict(id='article')
'inlineVideo left brightcove', 'entry-meta']}), remove_tags_after = dict(id='article')
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles', remove_tags = [dict(attrs={'class':[
'portfolioInline','articleInline','readerscomment', 'articleFooter',
'nytRating']}) ] 'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'columnGroup last',
'doubleRule',
'dottedLine',
'entry-meta',
'icon enlargeThis',
'leftNavTabs',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
'post-tools',
'relatedSearchesModule',
'side_tool',
'singleAd',
'subNavigation tabContent active clearfix',
]}),
dict(id=[
'adxLeaderboard',
'archive',
'articleExtras',
'articleInline',
'blog_sidebar',
'cCol',
'entertainmentSearchBar',
'footer',
'header',
'header_search',
'login',
'masthead',
'memberTools',
'navigation',
'portfolioInline',
'relatedArticles',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
no_stylesheets = True no_stylesheets = True
extra_css = '.headline {text-align: left;}\n \ extra_css = '.headline {text-align: left;}\n \
.byline {font-family: monospace; \ .byline {font-family: monospace; \
text-align: left; \ text-align: left; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.dateline {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.timestamp {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \ margin-bottom: 0px;}\n \
.timestamp {font-size: smaller;}\n \
.source {text-align: left;}\n \ .source {text-align: left;}\n \
.image {text-align: center;}\n \ .image {text-align: center;}\n \
.credit {text-align: right; \ .credit {text-align: right; \
font-size: smaller;}\n \ font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.articleBody {text-align: left;}\n \ .articleBody {text-align: left;}\n \
.authorId {text-align: left; \ .authorId {text-align: left; \
font-style: italic;}\n ' font-style: italic;}\n '
def dump_ans(self, ans) :
total_article_count = 0
for section in ans :
if self.verbose:
self.log("section %s: %d articles" % (section[0], len(section[1])) )
for article in section[1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
self.log( "Queued %d articles" % total_article_count )
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","&#8216;",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","&#8217;",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","&#8220;",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","&#8221;",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","&#8211;",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","&#8212;",fixed)
return fixed
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
try:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
except:
self.log("\nFailed to login")
return br
def get_cover_url(self): def get_cover_url(self):
cover = None cover = None
st = time.localtime() st = time.localtime()
@ -94,26 +194,6 @@ class NYTimes(BasicNewsRecipe):
cover = None cover = None
return cover return cover
def get_masthead_url(self):
masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nCover unavailable")
masthead = None
return masthead
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
def index_to_soup(self, url_or_raw, raw=False): def index_to_soup(self, url_or_raw, raw=False):
''' '''
OVERRIDE of class method OVERRIDE of class method
@ -138,6 +218,7 @@ class NYTimes(BasicNewsRecipe):
return BeautifulSoup(_raw, markupMassage=massage) return BeautifulSoup(_raw, markupMassage=massage)
# Entry point # Entry point
print "index_to_soup()"
soup = get_the_soup( self.encoding, url_or_raw ) soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@ -151,6 +232,16 @@ class NYTimes(BasicNewsRecipe):
return soup return soup
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_index(self): def parse_index(self):
articles = {} articles = {}
ans = [] ans = []
@ -158,12 +249,14 @@ class NYTimes(BasicNewsRecipe):
feed = key = 'All Top Stories' feed = key = 'All Top Stories'
articles[key] = [] articles[key] = []
ans.append(key) ans.append(key)
self.log("Scanning 1 section ...")
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
# Fetch the outer table # Fetch the outer table
table = soup.find('table') table = soup.find('table')
previousTable = table previousTable = table
contentTable = None
# Find the deepest table containing the stories # Find the deepest table containing the stories
while True : while True :
@ -191,8 +284,9 @@ class NYTimes(BasicNewsRecipe):
continue continue
skipThisSection = False skipThisSection = False
todays_article_count = 0
# Within this table are <font face="times new roman, times, san serif"> entries # Within this table are <font face="times new roman, times, san serif"> entries
self.log("Fetching feed Top Stories")
for tr in storyblock.findAllNext('tr'): for tr in storyblock.findAllNext('tr'):
if tr.find('span') is not None : if tr.find('span') is not None :
@ -244,6 +338,7 @@ class NYTimes(BasicNewsRecipe):
# Fetch the article titles and URLs # Fetch the article titles and URLs
articleCount = len(sectionblock.findAll('span')) articleCount = len(sectionblock.findAll('span'))
todays_article_count += articleCount
for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) : for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
a = span.find('a', href=True) a = span.find('a', href=True)
url = re.sub(r'\?.*', '', a['href']) url = re.sub(r'\?.*', '', a['href'])
@ -277,6 +372,7 @@ class NYTimes(BasicNewsRecipe):
if duplicateFound: if duplicateFound:
# Continue fetching, don't add this article # Continue fetching, don't add this article
todays_article_count -= 1
continue continue
if not articles.has_key(feed): if not articles.has_key(feed):
@ -284,11 +380,138 @@ class NYTimes(BasicNewsRecipe):
articles[feed].append( articles[feed].append(
dict(title=title, url=url, date=pubdate, dict(title=title, url=url, date=pubdate,
description=description, author=author, content='')) description=description, author=author, content=''))
# self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
ans = self.sort_index_by(ans, {'Top Stories':-1}) ans = self.sort_index_by(ans, {'Top Stories':-1})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
self.dump_ans(ans)
return ans return ans
def preprocess_html(self, soup):
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg after headline
cgFirst = soup.find(True, {'class':'columnGroup first'})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker and kicker.contents[0]:
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
use_alt=False)))
kicker.replaceWith(h3Tag)
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
emTag = Tag(soup, "em")
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
emTag.insert(0, c)
hrTag = Tag(soup, 'hr')
#hrTag['style'] = "margin-top:0em;margin-bottom:0em"
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
if articleTag:
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup
def strip_anchors(self,soup): def strip_anchors(self,soup):
paras = soup.findAll(True) paras = soup.findAll(True)
for para in paras: for para in paras:
@ -297,94 +520,3 @@ class NYTimes(BasicNewsRecipe):
if a.img is None: if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace')) a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup
def preprocess_html(self, soup):
# refresh = soup.find('meta', {'http-equiv':'refresh'})
# if refresh is None:
# return self.strip_anchors(soup)
#
# content = refresh.get('content').partition('=')[2]
# raw = self.browser.open('http://www.nytimes.com'+content).read()
# soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
return self.strip_anchors(soup)
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is not None:
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
soup = self.strip_anchors(soup)
# Test for empty content
body = soup.find('body')
tagCount = len(body.findAll(True))
if tagCount:
# print "%d tags in article" % tagCount
return soup
else:
print "no allowed content found, removing article"
raise Exception
def postprocess_html(self,soup, True):
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker is not None :
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, kicker.contents[0])
kicker.replaceWith(h3Tag)
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
if caption is not None:
emTag = Tag(soup, "em")
emTag.insert(0, caption.contents[0])
hrTag = Tag(soup, 'hr')
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Change <nyt_headline> to <h2>
headline = soup.find("nyt_headline")
if headline is not None :
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, headline.contents[0])
soup.h1.replaceWith(tag)
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead is not None :
# Nuke the href
if masthead.a is not None :
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, masthead.contents[0])
soup.h1.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk is not None and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag is not None :
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag is not None :
divTag['class'] = divTag['id']
return soup

View File

@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe):
     # List of sections to exclude
     # To add a section, copy the section name from the allSectionKeywords list above
     # For example, to exclude 'Dining' and 'Weddings':
-    # excludeSectionKeywords = ['Dining','Weddings']
+    #excludeSectionKeywords = ['Dining','Weddings']
     excludeSectionKeywords = []

     # List of sections to include (test and debug only)
@ -56,20 +56,25 @@ class NYTimes(BasicNewsRecipe):
     remove_tags_before = dict(id='article')
     remove_tags_after = dict(id='article')
     remove_tags = [dict(attrs={'class':[
+                            'articleFooter',
                             'articleTools',
                             'columnGroup doubleRule',
+                            'columnGroup singleRule',
                             'columnGroup last',
+                            'columnGroup last',
                             'doubleRule',
                             'dottedLine',
                             'entry-meta',
                             'icon enlargeThis',
                             'leftNavTabs',
                             'module box nav',
+                            'nextArticleLink',
                             'nextArticleLink clearfix',
                             'post-tools',
                             'relatedSearchesModule',
                             'side_tool',
                             'singleAd',
+                            'subNavigation tabContent active clearfix',
                             ]}),
                    dict(id=[
                             'adxLeaderboard',
@ -222,11 +227,11 @@ class NYTimes(BasicNewsRecipe):
             if div['class'] == 'section-headline':
                 key = string.capwords(feed_title(div))
-                excluded = re.compile('|'.join(self.excludeSectionKeywords))
-                if excluded.search(key):
-                    self.log("Skipping section %s" % key)
-                    continue
+                if self.excludeSectionKeywords:
+                    excluded = re.compile('|'.join(self.excludeSectionKeywords))
+                    if excluded.search(key):
+                        self.log("Skipping section %s" % key)
+                        continue

                 articles[key] = []
                 ans.append(key)
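Note on the hunk above: with the old code, an empty excludeSectionKeywords list produced an empty regular expression, and an empty pattern matches every string, so every section was skipped. A minimal standalone sketch of the pitfall (plain Python, not calibre code):

import re

keywords = []                                    # the default: exclude nothing
always_matches = re.compile('|'.join(keywords))  # '|'.join([]) == '' -> matches everything
assert always_matches.search('Sports') is not None

keywords = ['Sports', 'Dining']                  # guarded path: only compile when non-empty
excluded = re.compile('|'.join(keywords))
assert excluded.search('Dining Out') is not None
assert excluded.search('World News') is None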

View File

@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe):
     use_embedded_content = False
     simultaneous_downloads = 1
     encoding = 'ISO-8859-1'
-    lang = 'en-UK'
     remove_javascript = True
-    language = 'en'
+    language = 'en_GB'

     recursions = 9
     match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']

View File

@ -0,0 +1,47 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
tulsaworld.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TulsaWorld(BasicNewsRecipe):
    title = 'Tulsa World'
    __author__ = 'Darko Miletic'
    description = 'Find breaking news, local news, Oklahoma weather, sports, business, entertainment, lifestyle, opinion, government, movies, books, jobs, education, blogs, video & multimedia.'
    publisher = 'World Publishing Co.'
    category = 'Tulsa World, tulsa world, daily newspaper, breaking news, stories, articles, news, local, weather, coverage, editorial, government, education, community, sports, business, entertainment, lifestyle, opinion, multimedia, media, blogs, consumer, OU, OSU, TU, ORU, football, basketball, school, schools, sudoku, movie reviews, stocks, classified ads, classifieds, books, job, jobs, careers, real estate, home, homes, Oklahoma, northeastern, reviews, auto, autos, archives, forecasts, Sooners, Cowboys, Hurricane, Golden Eagles, NFL, NBA, MLB, pro football, scores, college basketball, college football, college baseball, sports columns, fashion and style, associated press, regional news coverage, health, obituaries, politics, political news, Jenks, Union, Owasso, Tulsa, Booker T. Washington, Trojans, Rams, Hornets, video, photography, photos, images, games, search, the picker, predictions, satellite, family, food, teens, polls, births, celebrations, death notices, divorces, marriages, obituaries, audio, podcasts.'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    country = 'US'
    remove_empty_feeds = True
    masthead_url = 'http://www.tulsaworld.com/images/TW_logo-blue-footer.jpg'
    extra_css = ' body{font-family: Arial,Verdana,sans-serif } img{margin-bottom: 0.4em} .articleHeadline{font-size: xx-large; font-weight: bold} .articleKicker{font-size: x-large; font-weight: bold} .articleByline,.articleDate{font-size: small} .leadp{font-size: 1.1em} '

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : language
                        , 'linearize_tables' : True
                        }

    keep_only_tags = [dict(name='div',attrs={'id':['ctl00_body1_ArticleControl_divArticleText','ctl00_BodyContent_ArticleControl_divArticleText']})]

    feeds = [
              (u'News' , u'http://www.tulsaworld.com/site/rss.aspx?group=1')
             ,(u'Business', u'http://www.tulsaworld.com/site/rss.aspx?group=5')
             ,(u'Opinion' , u'http://www.tulsaworld.com/site/rss.aspx?group=7')
            ]

    def get_article_url(self, article):
        return article.get('link', None).rpartition('&rss')[0]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)

View File

@ -7,62 +7,430 @@ usatoday.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
import re import re
class USAToday(BasicNewsRecipe): class USAToday(BasicNewsRecipe):
title = 'USA Today' title = 'USA Today'
timefmt = ' [%d %b %Y]' __author__ = 'GRiker'
__author__ = 'Kovid Goyal and Sujata Raman' oldest_article = 1
timefmt = ''
max_articles_per_feed = 20 max_articles_per_feed = 20
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
extra_css = ''' extra_css = '.headline {text-align: left;}\n \
.inside-head{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } .byline {font-family: monospace; \
.inside-head2{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } text-align: left; \
.inside-head3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } margin-bottom: 1em;}\n \
h3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold; } .image {text-align: center;}\n \
h4{font-family:Arial,Helvetica,sans-serif; font-size:x-small; font-weight:bold; } .caption {text-align: center; \
.side-by-side{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} font-size: smaller; \
#byLineTag{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} font-style: italic}\n \
.inside-copy{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left;} .credit {text-align: right; \
.caption{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} margin-bottom: 0em; \
li{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;} font-size: smaller;}\n \
.vatext{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;} .articleBody {text-align: left;}\n '
.vaTextBold{font-family:Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold; color:#666666;}
'''
remove_tags = [
{'class':['tagListLabel','piped-taglist-string','socialcontainer','social-wrapper',]},
{'id':['topSocialButtons']},
]
conversion_options = { 'linearize_tables' : True } conversion_options = { 'linearize_tables' : True }
#simultaneous_downloads = 1
preprocess_regexps = [
(re.compile(r'<BODY.*?<!--Article Goes Here-->', re.IGNORECASE | re.DOTALL), lambda match : '<BODY>'),
(re.compile(r'<!--Article End-->.*?</BODY>', re.IGNORECASE | re.DOTALL), lambda match : '</BODY>'),
]
feeds = [ feeds = [
('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'), ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'), ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'), ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'), ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'), ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
] ]
keep_only_tags = [dict(attrs={'class':[
'byLine',
'inside-copy',
'inside-head',
'inside-head2',
'item',
'item-block',
'photo-container',
]}),
dict(id=[
'applyMainStoryPhoto',
'permalink',
])]
## Getting the print version remove_tags = [dict(attrs={'class':[
'comments',
'jump',
'pagetools',
'post-attributes',
'tags',
]}),
dict(id=[])]
def print_version(self, url): #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]
return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
def dump_hex(self, src, length=16):
''' Diagnostic '''
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
N=0; result=''
while src:
s,src = src[:length],src[length:]
hexa = ' '.join(["%02X"%ord(x) for x in s])
s = s.translate(FILTER)
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
N+=length
print result
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","&#8216;",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","&#8217;",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","&#8220;",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","&#8221;",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","&#8211;",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","&#8212;",fixed)
return fixed
def get_masthead_url(self):
masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nCover unavailable")
masthead = None
return masthead
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_feeds(self, *args, **kwargs):
parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
# Count articles for progress dialog
content_feeds = []
article_count = 0
for feed in parsed_feeds:
article_count += len(feed)
self.log( "Queued %d articles" % article_count)
return parsed_feeds
def preprocess_html(self, soup):
soup = self.strip_anchors(soup)
return soup
def postprocess_html(self, soup, first_fetch): def postprocess_html(self, soup, first_fetch):
for t in soup.findAll(['table', 'tr', 'td']):
t.name = 'div' # Remove navLinks <div class="inside-copy" style="padding-bottom:3px">
navLinks = soup.find(True,{'style':'padding-bottom:3px'})
if navLinks:
navLinks.extract()
# Remove <div class="inside-copy" style="margin-bottom:10px">
gibberish = soup.find(True,{'style':'margin-bottom:10px'})
if gibberish:
gibberish.extract()
# Change <inside-head> to <h2>
headline = soup.find(True, {'class':['inside-head','inside-head2']})
if not headline:
headline = soup.find('h3')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, headline.contents[0])
headline.replaceWith(tag)
else:
print "unable to find headline:\n%s\n" % soup
# Change byLine to byline, change commas to middot
# Kindle renders commas in byline as '&'
byline = soup.find(True, {'class':'byLine'})
if byline:
byline['class'] = 'byline'
# Replace comma with middot
byline.contents[0].replaceWith(re.sub(","," &middot;", byline.renderContents()))
jumpout_punc_list = [':','?']
# Remove the inline jumpouts in <div class="inside-copy">
paras = soup.findAll(True, {'class':'inside-copy'})
for para in paras:
if re.match("<b>[\w\W]+ ",para.renderContents()):
p = para.find('b')
for punc in jumpout_punc_list:
punc_offset = p.contents[0].find(punc)
if punc_offset == -1:
continue
if punc_offset > 1:
if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
#print "extracting \n%s\n" % para.prettify()
para.extract()
# Reset class for remaining
paras = soup.findAll(True, {'class':'inside-copy'})
for para in paras:
para['class'] = 'articleBody'
# Remove inline jumpouts in <p>
paras = soup.findAll(['p'])
for p in paras:
if hasattr(p,'contents') and len(p.contents):
for punc in jumpout_punc_list:
punc_offset = p.contents[0].find(punc)
if punc_offset == -1:
continue
if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
#print "evaluating %s\n" % p.contents[0][:punc_offset+1]
if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
#print "extracting \n%s\n" % p.prettify()
p.extract()
# Capture the first img, insert after headline
imgs = soup.findAll('img')
print "postprocess_html(): %d images" % len(imgs)
if imgs:
divTag = Tag(soup, 'div')
divTag['class'] = 'image'
body = soup.find('body')
img = imgs[0]
#print "img: \n%s\n" % img.prettify()
# Table for photo and credit
tableTag = Tag(soup,'table')
# Photo
trimgTag = Tag(soup, 'tr')
tdimgTag = Tag(soup, 'td')
tdimgTag.insert(0,img)
trimgTag.insert(0,tdimgTag)
tableTag.insert(0,trimgTag)
# Credit
trcreditTag = Tag(soup, 'tr')
tdcreditTag = Tag(soup, 'td')
tdcreditTag['class'] = 'credit'
credit = soup.find('td',{'class':'photoCredit'})
if credit:
tdcreditTag.insert(0,NavigableString(credit.renderContents()))
else:
credit = img['credit']
if credit:
tdcreditTag.insert(0,NavigableString(credit))
else:
tdcreditTag.insert(0,NavigableString(''))
trcreditTag.insert(0,tdcreditTag)
tableTag.insert(1,trcreditTag)
dtc = 0
divTag.insert(dtc,tableTag)
dtc += 1
if False:
# Add the caption in the table
tableCaptionTag = Tag(soup,'caption')
tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
tableTag.insert(1,tableCaptionTag)
divTag.insert(dtc,tableTag)
dtc += 1
body.insert(1,divTag)
else:
# Add the caption below the table
#print "Looking for caption in this soup:\n%s" % img.prettify()
captionTag = Tag(soup,'p')
captionTag['class'] = 'caption'
if hasattr(img,'alt') and img['alt']:
captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
divTag.insert(dtc, captionTag)
dtc += 1
else:
try:
captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
divTag.insert(dtc, captionTag)
dtc += 1
except:
pass
hrTag = Tag(soup, 'hr')
divTag.insert(dtc, hrTag)
dtc += 1
# Delete <div id="applyMainStoryPhoto"
photoJunk = soup.find('div',{'id':'applyMainStoryPhoto'})
if photoJunk:
photoJunk.extract()
# Insert img after headline
tag = body.find(True)
insertLoc = 0
headline_found = False
while True:
# Scan the top-level tags
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'headline':
headline_found = True
body.insert(insertLoc,divTag)
break
tag = tag.nextSibling
if not tag:
break
if not headline_found:
# Monolithic <div> - restructure
insert_loc = 0
tag = body.find(True)
while True:
insertLoc += 1
try:
if hasattr(tag,'class') and tag['class'] == 'headline':
headline_found = True
tag.insert(insertLoc,divTag)
break
except:
pass
tag = tag.next
if not tag:
break
# Yank out headline, img and caption
headline = body.find('h2','headline')
img = body.find('div','image')
caption = body.find('p''class')
# body(0) is calibre_navbar
# body(1) is <div class="item">
btc = 1
headline.extract()
body.insert(1, headline)
btc += 1
if img:
img.extract()
body.insert(btc, img)
btc += 1
if caption:
caption.extract()
body.insert(btc, caption)
btc += 1
if len(imgs) > 1:
if True:
[img.extract() for img in imgs[1:]]
else:
# Format the remaining images
# This doesn't work yet
for img in imgs[1:]:
print "img:\n%s\n" % img.prettify()
divTag = Tag(soup, 'div')
divTag['class'] = 'image'
# Table for photo and credit
tableTag = Tag(soup,'table')
# Photo
trimgTag = Tag(soup, 'tr')
tdimgTag = Tag(soup, 'td')
tdimgTag.insert(0,img)
trimgTag.insert(0,tdimgTag)
tableTag.insert(0,trimgTag)
# Credit
trcreditTag = Tag(soup, 'tr')
tdcreditTag = Tag(soup, 'td')
tdcreditTag['class'] = 'credit'
try:
tdcreditTag.insert(0,NavigableString(img['credit']))
except:
tdcreditTag.insert(0,NavigableString(''))
trcreditTag.insert(0,tdcreditTag)
tableTag.insert(1,trcreditTag)
divTag.insert(0,tableTag)
soup.img.replaceWith(divTag)
return soup
def postprocess_book(self, oeb, opts, log) :
def extract_byline(href) :
# <meta name="byline" content=
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
byline = soup.find('div',attrs={'class':'byline'})
if byline:
byline['class'] = 'byline'
# Replace comma with middot
byline.contents[0].replaceWith(re.sub(","," &middot;", byline.renderContents()))
return byline.renderContents()
else :
paras = soup.findAll(text=True)
for para in paras:
if para.startswith("Copyright"):
return para[len('Copyright xxxx '):para.find('.')]
return None
def extract_description(href) :
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
description = soup.find('meta',attrs={'name':'description'})
if description :
return self.massageNCXText(description['content'])
else:
# Take first paragraph of article
articleBody = soup.find('div',attrs={'id':['articleBody','item']})
if articleBody:
paras = articleBody.findAll('p')
for p in paras:
if p.renderContents() > '' :
return self.massageNCXText(self.tag_to_string(p,use_alt=False))
else:
print "Didn't find <div id='articleBody'> in this soup:\n%s" % soup.prettify()
return None
# Method entry point here
# Single section toc looks different than multi-section tocs
if oeb.toc.depth() == 2 :
for article in oeb.toc :
if article.author is None :
article.author = extract_byline(article.href)
if article.description is None :
article.description = extract_description(article.href)
elif oeb.toc.depth() == 3 :
for section in oeb.toc :
for article in section :
article.author = extract_byline(article.href)
'''
if article.author is None :
article.author = self.massageNCXText(extract_byline(article.href))
else:
article.author = self.massageNCXText(article.author)
'''
if article.description is None :
article.description = extract_description(article.href)
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup

View File

@ -20,7 +20,7 @@ class ANDROID(USBMS):
     VENDOR_ID = {
             0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]},
             0x22b8 : { 0x41d9 : [0x216]},
-            0x18d1 : { 0x4e11 : [0x0100], 0x4e12: [0x0100]},
+            0x18d1 : { 0x4e11 : [0x0100, 0x226], 0x4e12: [0x0100, 0x226]},
             0x04e8 : { 0x681d : [0x0222]},
             }
     EBOOK_DIR_MAIN = ['wordplayer/calibretransfer', 'eBooks/import', 'Books']

View File

@ -226,11 +226,19 @@ class BookList(_BookList):
             for item in collections:
                 item = item.strip()
                 mitem = getattr(mi, item, None)
+                titems = []
                 if mitem:
                     if isinstance(mitem, list):
-                        tags.extend(mitem)
+                        titems = mitem
                     else:
-                        tags.append(mitem)
+                        titems = [mitem]
+                if item == 'tags' and titems:
+                    litems = []
+                    for i in titems:
+                        if not i.strip().startswith('[') and not i.strip().endswith(']'):
+                            litems.append(i)
+                    titems = litems
+                tags.extend(titems)
             if tags:
                 tags = list(set(tags))
                 if hasattr(mi, 'tag_order'):
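The practical effect of the new titems filtering is that bracketed marker tags (for example the '[kindle_pdoc]' tag used elsewhere in this commit) no longer become device collections. A simplified standalone sketch of just the filter; the function name and sample tags are made up for illustration:

def collection_tags(tags):
    # Keep only tags that do not look like bracketed marker tags such as '[kindle_pdoc]'
    return [t for t in tags
            if not t.strip().startswith('[') and not t.strip().endswith(']')]

# collection_tags(['Fiction', '[kindle_pdoc]', ' [New] '])  ->  ['Fiction']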

View File

@ -150,7 +150,8 @@ class PRS505(CLI, Device):
         for location in locations:
             info = metadata.next()
             path = location[0]
-            blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
+            oncard = location[3]
+            blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0

             if self._main_prefix and path.startswith(self._main_prefix):
                 name = path.replace(self._main_prefix, '')
@ -166,7 +167,11 @@ class PRS505(CLI, Device):
             opts = self.settings()
             collections = opts.extra_customization.split(',') if opts.extra_customization else []
-            booklists[blist].add_book(info, name, collections, *location[1:-1])
+            booklist = booklists[blist]
+            if not hasattr(booklist, 'add_book'):
+                raise ValueError(('Incorrect upload location %s. Did you choose the'
+                    ' correct card A or B, to send books to?')%oncard)
+            booklist.add_book(info, name, collections, *location[1:-1])
         fix_ids(*booklists)
def delete_books(self, paths, end_session=True): def delete_books(self, paths, end_session=True):

View File

@ -230,14 +230,25 @@ class HTMLPreProcessor(object):
         end_rules = []
         if getattr(self.extra_opts, 'remove_header', None):
-            end_rules.append(
-                (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
-            )
+            try:
+                end_rules.append(
+                    (re.compile(self.extra_opts.header_regex), lambda match : '')
+                )
+            except:
+                import traceback
+                print 'Failed to parse remove_header regexp'
+                traceback.print_exc()

         if getattr(self.extra_opts, 'remove_footer', None):
-            end_rules.append(
-                (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
-            )
+            try:
+                end_rules.append(
+                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
+                )
+            except:
+                import traceback
+                print 'Failed to parse remove_footer regexp'
+                traceback.print_exc()

         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
             if length:
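The try/except wrappers exist because header_regex and footer_regex are user-supplied options; a malformed pattern should be reported instead of aborting the whole conversion. A minimal standalone illustration of the failure mode being caught (the diff itself uses a bare except and prints the traceback):

import re

user_pattern = r'(<b>Page \d+'        # unbalanced parenthesis, as a user might type
try:
    rule = (re.compile(user_pattern), lambda match: '')
except re.error:
    rule = None                       # log the bad pattern and carry on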

View File

@ -230,8 +230,8 @@ class FB2MLizer(object):
                 if '://' in href:
                     fb2_text.append('<a xlink:href="%s">' % href)
                 else:
-                    if '#' not in href:
-                        href += '#'
+                    if href.startswith('#'):
+                        href = href[1:]
                     if href not in self.link_hrefs.keys():
                         self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
                     href = self.link_hrefs[href]

View File

@ -12,6 +12,7 @@ __docformat__ = 'restructuredtext en'
 from struct import pack, unpack
 from cStringIO import StringIO

+from calibre.ebooks.conversion.config import load_defaults
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN
 from calibre.ebooks.mobi.langcodes import iana2mobi
@ -350,15 +351,10 @@ class MetadataUpdater(object):
         subjects = '; '.join(mi.tags)
         update_exth_record((105, subjects.encode(self.codec, 'replace')))

-        # >>> Begin patch for ticket #4652 <<<
-        kindle_doc_types = set([u'[kindle_ebok]',u'[kindle_pdoc]'])
-        doc_type = list(kindle_doc_types.intersection(set(mi.tags)))[0]
-        if doc_type:
-            if doc_type == '[kindle_ebok]':
-                update_exth_record((501,str('EBOK')))
-            elif doc_type == '[kindle_pdoc]':
-                update_exth_record((501, str('PDOC')))
-        # >>> End patch
+        prefs = load_defaults('mobi_output')
+        kindle_pdoc = prefs.get('personal_doc', None)
+        if kindle_pdoc in mi.tags:
+            update_exth_record((501, str('PDOC')))

         if mi.pubdate:
             update_exth_record((106, str(mi.pubdate).encode(self.codec, 'replace')))
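Net effect of the hunk above: instead of looking for hard-coded '[kindle_ebok]'/'[kindle_pdoc]' tags, the updater reads the personal_doc tag configured for the MOBI output plugin (default '[kindle_pdoc]', per the new option further down) and, when the book carries that tag, writes EXTH record 501 (cdeType) as 'PDOC' so the Kindle files the book under personal documents. A simplified sketch of just the decision, with the EXTH write abstracted away:

def cde_type_record(book_tags, personal_doc_tag='[kindle_pdoc]'):
    # Return the EXTH (record_id, value) pair to write, or None to leave cdeType alone.
    if personal_doc_tag and personal_doc_tag in book_tags:
        return (501, 'PDOC')
    return None

# cde_type_record(['News', '[kindle_pdoc]'])  ->  (501, 'PDOC')
# cde_type_record(['News'])                   ->  None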

View File

@ -4,7 +4,7 @@ __copyright__ = '2010, Greg Riker <griker@hotmail.com>'
 __docformat__ = 'restructuredtext en'
 ''' Read/write metadata from Amazon's topaz format '''
-import os, StringIO, sys
+import StringIO, sys
 from struct import pack

 from calibre.ebooks.metadata import MetaInformation
@ -83,7 +83,7 @@ class MetadataUpdater(object):
         sig = self.data[:4]
         if not sig.startswith('TPZ'):
-            raise ValueError("'%s': unexpected Topaz signature '%s'" % (os.path.basename(stream.name),self.data[:4]))
+            raise ValueError("'%s': Not a Topaz file" % getattr(stream, 'name', 'Unnamed stream'))
         offset = 4
         self.header_records, consumed = self.decode_vwi(self.data[offset:offset+4])
@ -92,13 +92,13 @@ class MetadataUpdater(object):
         # First integrity test - metadata header
         if not 'metadata' in self.topaz_headers:
-            raise ValueError("'%s': Topaz metadata record missing" % os.path.basename(stream.name))
+            raise ValueError("'%s': Invalid Topaz format - no metadata record" % getattr(stream, 'name', 'Unnamed stream'))

         # Second integrity test - metadata body
         md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
         md_offset += self.base
         if self.data[md_offset+1:md_offset+9] != 'metadata':
-            raise ValueError("'%s': damaged Topaz metadata record" % os.path.basename(stream.name))
+            raise ValueError("'%s': Damaged metadata record" % getattr(stream, 'name', 'Unnamed stream'))

     def book_length(self):
         ''' convenience method for retrieving book length '''

View File

@ -36,6 +36,9 @@ class MOBIOutput(OutputFormatPlugin):
             recommended_value=False, level=OptionRecommendation.LOW,
             help=_('Disable compression of the file contents.')
         ),
+        OptionRecommendation(name='personal_doc', recommended_value='[kindle_pdoc]',
+            help=_('Tag marking book to be filed with Personal Docs')
+        ),
     ])

     def check_for_periodical(self):

View File

@ -168,9 +168,9 @@ class BookHeader(object):
                 try:
                     self.exth.mi.language = mobi2iana(langid, sublangid)
                 except:
-                    self.log.exception('Unknown language code')
+                    self.log.exception("'%s': Unknown language code" % getattr(stream, 'name', 'Unnamed stream'))
             except:
-                self.log.exception('Invalid EXTH header')
+                self.log.exception("'%s': Invalid EXTH header" % getattr(stream, 'name', 'Unnamed stream'))
                 self.exth_flag = 0
@ -833,7 +833,7 @@ def get_metadata(stream):
     try:
         im = PILImage.open(buf)
     except:
-        log.exception("Failed to read MOBI cover: '%s'" % os.path.basename(stream.name))
+        log.exception("'%s': Failed to read MOBI cover" % getattr(stream, 'name', 'Unnamed stream'))
     else:
         obuf = cStringIO.StringIO()
         im.convert('RGB').save(obuf, format='JPEG')

View File

@ -260,8 +260,8 @@ class PMLMLizer(object):
                         href += '#'
                     if href not in self.link_hrefs.keys():
                         self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
-                    href = self.link_hrefs[href]
-                    text.append('\\q="#%s"' % href)
+                    href = '#%s' % self.link_hrefs[href]
+                    text.append('\\q="%s"' % href)
                     tags.append('q')

             # Anchor ids

View File

@ -24,7 +24,7 @@ class PluginWidget(Widget, Ui_Form):
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent, 'mobi_output',
                 ['prefer_author_sort', 'rescale_images', 'toc_title',
-                    'dont_compress', 'no_inline_toc', 'masthead_font']
+                    'dont_compress', 'no_inline_toc', 'masthead_font','personal_doc']
                 )
         self.db, self.book_id = db, book_id

View File

@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>421</width>
-    <height>300</height>
+    <width>521</width>
+    <height>331</height>
    </rect>
   </property>
   <property name="windowTitle">
@ -64,13 +64,27 @@
   <item>
    <widget class="QLabel" name="label_2">
     <property name="text">
-     <string>Masthead font:</string>
+     <string>Periodical masthead font:</string>
     </property>
    </widget>
   </item>
   <item>
    <widget class="QComboBox" name="opt_masthead_font"/>
   </item>
<item>
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QLabel" name="label_3">
<property name="text">
<string>Personal Doc tag:</string>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="opt_personal_doc"/>
</item>
</layout>
</item>
   <item>
    <spacer name="verticalSpacer">
     <property name="orientation">
@ -79,7 +93,7 @@
     <property name="sizeHint" stdset="0">
      <size>
       <width>20</width>
-      <height>55</height>
+      <height>40</height>
      </size>
     </property>
    </spacer>

View File

@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe):
                 self.log.error(_('Could not download cover: %s')%str(err))
                 self.log.debug(traceback.format_exc())
         if cu is not None:
-            ext = cu.rpartition('.')[-1]
+            ext = cu.split('/')[-1].rpartition('.')[-1]
             if '?' in ext:
                 ext = ''
-            ext = ext.lower() if ext else 'jpg'
+            ext = ext.lower() if ext and '/' not in ext else 'jpg'
             cpath = os.path.join(self.output_dir, 'cover.'+ext)
             if os.access(cu, os.R_OK):
                 with open(cpath, 'wb') as cfile:
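The change above derives the extension from the last path component only, so dots in the host or directory names can no longer leak path separators into the cover file name. A quick illustration with a made-up cover URL:

cu = 'http://www.example.com/images/current/cover'

old_ext = cu.rpartition('.')[-1]                  # 'com/images/current/cover'
new_ext = cu.split('/')[-1].rpartition('.')[-1]   # 'cover'

# The old expression produced a path like 'cover.com/images/current/cover',
# which cannot be opened for writing; the added '/' check is a further guard
# before falling back to 'jpg'.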