Merge from trunk

Sengian 2010-11-02 21:35:21 +01:00
commit f61daece95
21 changed files with 1267 additions and 405 deletions

View File

@@ -25,15 +25,15 @@ class Fudzilla(BasicNewsRecipe):
remove_tags_before = dict(name='div', attrs={'class':['padding']})
remove_tags = [dict(name='td', attrs={'class':['left','right']}),
dict(name='div', attrs={'id':['toolbar','buttons']}),
dict(name='div', attrs={'class':['artbannersxtd','back_button']}),
dict(name='span', attrs={'class':['pathway']}),
dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}),
dict(name='table', attrs={'class':['headlines']}),
dict(name='div', attrs={'id':['toolbar','buttons']}),
dict(name='div', attrs={'class':['artbannersxtd','back_button']}),
dict(name='span', attrs={'class':['pathway']}),
dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}),
dict(name='table', attrs={'class':['headlines']}),
]
feeds = [
(u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')
(u'Posts', u'http://www.fudzilla.com/?format=feed')
]
preprocess_regexps = [

View File

@@ -5,62 +5,59 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import re
import time
from calibre import entity_to_unicode
import re, string, time
from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
Comment, BeautifulStoneSoup
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class NYTimes(BasicNewsRecipe):
title = 'New York Times Top Stories'
__author__ = 'GRiker'
language = 'en'
requires_version = (0, 7, 5)
description = 'Top Stories from the New York Times'
# set headlinesOnly to True for the headlines-only version
headlinesOnly = True
# List of sections typically included in Top Stories. Use a keyword from the
# right column in the excludeSectionKeywords[] list to skip downloading that section
sections = {
'arts' : 'Arts',
'business' : 'Business',
'diningwine' : 'Dining & Wine',
'editorials' : 'Editorials',
'health' : 'Health',
'magazine' : 'Magazine',
'mediaadvertising' : 'Media & Advertising',
'newyorkregion' : 'New York/Region',
'oped' : 'Op-Ed',
'politics' : 'Politics',
'science' : 'Science',
'sports' : 'Sports',
'technology' : 'Technology',
'topstories' : 'Top Stories',
'travel' : 'Travel',
'us' : 'U.S.',
'world' : 'World'
}
# includeSections: List of sections to include. If empty, all sections found will be included.
# Otherwise, only the sections named will be included. For example,
#
# includeSections = ['Politics','Sports']
#
# would cause only the Politics and Sports sections to be included.
# Add section keywords from the right column above to skip that section
# For example, to skip sections containing the word 'Sports' or 'Dining', use:
# excludeSectionKeywords = ['Sports', 'Dining']
# Fetch only Business and Technology
# excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
# Fetch only Top Stories
# excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
# By default, no sections are skipped.
excludeSectionKeywords = []
includeSections = [] # by default, all sections included
# excludeSections: List of sections to exclude. If empty, all sections found will be included.
# Otherwise, the sections named will be excluded. For example,
#
# excludeSections = ['Politics','Sports']
#
# would cause the Politics and Sports sections to be excluded. This parameter can be used
# in conjunction with includeSections, although in most cases using one or the other, but
# not both, is sufficient.
excludeSections = []
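# Illustrative only (not part of this commit): a tweaked copy of the recipe
# that fetches just Politics and Sports while dropping Op-Ed would set
#
#   includeSections = ['Politics','Sports']
#   excludeSections = ['Op-Ed']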
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
one_picture_per_article = True
# The maximum number of articles that will be downloaded
max_articles_per_feed = 40
max_articles_per_feed = 100
if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
else:
title='New York Times'
description = 'Today\'s New York Times'
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 7, 5)
timefmt = ''
needs_subscription = True
@@ -82,6 +79,7 @@ class NYTimes(BasicNewsRecipe):
'entry-response module',
'icon enlargeThis',
'leftNavTabs',
'metaFootnote',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
@@ -89,12 +87,13 @@ class NYTimes(BasicNewsRecipe):
'relatedSearchesModule',
'side_tool',
'singleAd',
'subNavigation clearfix',
'subNavigation tabContent active',
'subNavigation tabContent active clearfix',
re.compile('^subNavigation'),
re.compile('^leaderboard'),
re.compile('^module'),
]}),
dict(id=[
'adxLeaderboard',
'adxSponLink',
'archive',
'articleExtras',
'articleInline',
@@ -105,87 +104,98 @@ class NYTimes(BasicNewsRecipe):
'footer',
'header',
'header_search',
'inlineBox',
'login',
'masthead',
'masthead-nav',
'memberTools',
'navigation',
'portfolioInline',
'readerReviews',
'readerReviewsCount',
'relatedArticles',
'relatedTopics',
'respond',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
dict(name=['script', 'noscript', 'style'])]
dict(name=['script', 'noscript', 'style','form','hr'])]
no_stylesheets = True
extra_css = '.headline {text-align: left;}\n \
.byline {font-family: monospace; \
text-align: left; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.dateline {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.timestamp {font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.source {text-align: left;}\n \
.image {text-align: center;}\n \
.credit {text-align: right; \
font-size: small; \
margin-top: 0px; \
margin-bottom: 0px;}\n \
.articleBody {text-align: left;}\n \
.authorId {text-align: left; \
font-style: italic;}\n '
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { text-align: left; font-size: small; }
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; }
.articleBody { }
.authorId {text-align: left; }
.image {text-align: center;}
.source {text-align: left; }'''
def dump_ans(self, ans) :
def filter_ans(self, ans) :
total_article_count = 0
for section in ans :
idx = 0
idx_max = len(ans)-1
while idx <= idx_max:
if self.includeSections != []:
if ans[idx][0] not in self.includeSections:
print "SECTION NOT INCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if ans[idx][0] in self.excludeSections:
print "SECTION EXCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if self.verbose:
self.log("section %s: %d articles" % (section[0], len(section[1])) )
for article in section[1]:
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1
self.log( "Queued %d articles" % total_article_count )
return ans
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","&#8216;",string)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","&#8217;",fixed)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","&#8220;",fixed)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","&#8221;",fixed)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","&#8211;",fixed)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","&#8212;",fixed)
fixed = re.sub("\x97","—",fixed)
return fixed
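# Hypothetical before/after sketch (not part of the commit): with this change
# self.fixChars('It\x92s a \x93quoted\x94 phrase') yields the Unicode string
# u'It\u2019s a \u201cquoted\u201d phrase' directly, instead of text littered
# with numeric HTML entities.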
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
try:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
except:
self.log("\nFailed to login")
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
raw = br.submit().read()
if 'Please try again' in raw:
raise Exception('Your username and password are incorrect')
return br
def skip_ad_pages(self, soup):
@@ -213,6 +223,9 @@ class NYTimes(BasicNewsRecipe):
cover = None
return cover
def short_title(self):
return self.title
def index_to_soup(self, url_or_raw, raw=False):
'''
OVERRIDE of class method
@@ -255,157 +268,184 @@ class NYTimes(BasicNewsRecipe):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
# Replace '&' with '&amp;'
massaged = re.sub("&","&amp;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_index(self):
def parse_todays_index(self):
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=True)).strip()
articles = {}
key = None
ans = []
url_list = []
def handle_article(div):
a = div.find('a', href=True)
if not a:
return
url = re.sub(r'\?.*', '', a['href'])
if not url.startswith("http"):
return
if not url.endswith(".html"):
return
if 'podcast' in url:
return
if '/video/' in url:
return
url += '?pagewanted=all'
if url in url_list:
return
url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
author = ''
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
else:
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
ans.append(feed)
articles[feed] = []
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author,
content=''))
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
# Find each article
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['section-headline','sectionHeader']:
key = string.capwords(feed_title(div))
key = key.replace('Op-ed','Op-Ed')
key = key.replace('U.s.','U.S.')
elif div['class'] in ['story', 'story headline'] :
handle_article(div)
elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'):
handle_article(lidiv)
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return self.filter_ans(ans)
def parse_headline_index(self):
articles = {}
ans = []
feed = key = 'All Top Stories'
articles[key] = []
ans.append(key)
self.log("Scanning 1 section ...")
url_list = []
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
# Fetch the outer table
table = soup.find('table')
previousTable = table
# Fetch the content table
content_table = soup.find('table',{'id':'content'})
if content_table is None:
self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
return None
# Find the deepest table containing the stories
while True :
table = table.find('table')
if table.find(text=re.compile('top stories start')) :
previousTable = table
continue
else :
table = previousTable
break
# Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections
# There are multiple subtables, find the one containing the stories
for block in table.findAll('table') :
if block.find(text=re.compile('top stories start')) :
table = block
break
else :
continue
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
for div_sec in td_col.findAll('div',recursive=False):
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
section_name = re.sub(r'^ *$','',section_name)
if section_name == '':
continue
section_name=string.capwords(section_name)
if section_name == 'U.s.':
section_name = 'U.S.'
elif section_name == 'Op-ed':
section_name = 'Op-Ed'
pubdate = strftime('%a, %d %b')
# Again there are multiple subtables, find the one containing the stories
for storyblock in table.findAll('table') :
if storyblock.find(text=re.compile('top stories start')) :
break
else :
continue
skipThisSection = False
todays_article_count = 0
# Within this table are <font face="times new roman, times, san serif"> entries
self.log("Fetching feed Top Stories")
for tr in storyblock.findAllNext('tr'):
if tr.find('span') is not None :
sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
'times new roman,times, sans serif',
'times new roman, times, sans serif']})
section = None
bylines = []
descriptions = []
pubdate = None
# Get the Section title
for (x,i) in enumerate(sectionblock.contents) :
skipThisSection = False
# Extract the section title
if ('Comment' in str(i.__class__)) :
if 'start(name=' in i :
section = i[i.find('=')+1:-2]
if not self.sections.has_key(section) :
skipThisSection = True
search_div = div_sec
for next_tag in h6_sec_name.findNextSiblings(True):
if next_tag.__class__.__name__ == 'Tag':
if next_tag.name == 'div':
search_div = next_tag
break
# Check for excluded section
if len(self.excludeSectionKeywords):
key = self.sections[section]
excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key) or articles.has_key(key):
skipThisSection = True
break
# Get the bylines and descriptions
if not skipThisSection :
lines = sectionblock.contents
contentStrings = []
for line in lines:
if not isinstance(line, Comment) and line.strip and line.strip() > "":
contentStrings.append(line.strip())
# Gather the byline/description pairs
bylines = []
descriptions = []
for contentString in contentStrings:
if contentString[0:3] == 'By ' and contentString[3].isupper() :
bylines.append(contentString)
# Get the articles
for h3_item in search_div.findAll('h3'):
byline = h3_item.h6
if byline is not None:
author = self.tag_to_string(byline,use_alt=False)
else:
descriptions.append(contentString)
# Fetch the article titles and URLs
articleCount = len(sectionblock.findAll('span'))
todays_article_count += articleCount
for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
a = span.find('a', href=True)
author = ''
a = h3_item.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
if not url.startswith("http"):
continue
if not url.endswith(".html"):
continue
if 'podcast' in url:
continue
if 'video' in url:
continue
url += '?pagewanted=all'
if url in url_list:
continue
url_list.append(url)
self.log("URL %s" % url)
title = self.tag_to_string(a, use_alt=True).strip()
desc = h3_item.find('p')
if desc is not None:
description = self.tag_to_string(desc,use_alt=False)
else:
description = ''
if not articles.has_key(section_name):
ans.append(section_name)
articles[section_name] = []
articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
title = self.tag_to_string(a, use_alt=True)
# prepend the section name
title = self.sections[section] + " &middot; " + title
if not isinstance(title, unicode):
title = title.decode('utf-8', 'replace')
# Allow for unattributed, undescribed entries "Editor's Note"
if i >= len(descriptions) :
description = None
else :
description = descriptions[i]
if len(bylines) == articleCount :
author = bylines[i]
else :
author = None
# Check for duplicates
duplicateFound = False
if len(articles[feed]) > 1:
for article in articles[feed] :
if url == article['url'] :
duplicateFound = True
break
if duplicateFound:
# Continue fetching, don't add this article
todays_article_count -= 1
continue
if not articles.has_key(feed):
articles[feed] = []
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author, content=''))
# self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
ans = self.sort_index_by(ans, {'Top Stories':-1})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
self.dump_ans(ans)
return ans
return self.filter_ans(ans)
def parse_index(self):
if self.headlinesOnly:
return self.parse_headline_index()
else:
return self.parse_todays_index()
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots
tagline = self.tag_to_string(kicker_tag)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
if img_div:
img_div.extract()
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
@@ -422,8 +462,9 @@ class NYTimes(BasicNewsRecipe):
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg after headline
cgFirst = soup.find(True, {'class':'columnGroup first'})
# Move firstImg before article body
#article_body = soup.find(True, {'id':'articleBody'})
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
@@ -443,30 +484,18 @@ class NYTimes(BasicNewsRecipe):
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker and kicker.contents[0]:
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
use_alt=False)))
kicker.replaceWith(h3Tag)
self.log(">>> No class:'columnGroup first' found <<<")
# Change captions to italic -1
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
emTag = Tag(soup, "em")
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
emTag.insert(0, c)
#hrTag = Tag(soup, 'hr')
#hrTag['class'] = 'caption_divider'
hrTag = Tag(soup, 'div')
hrTag['class'] = 'divider'
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
cTag.insert(0, c)
caption.replaceWith(cTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
@@ -506,17 +535,6 @@ class NYTimes(BasicNewsRecipe):
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
# Synthesize a section header
dsk = soup.find('meta', attrs={'name':'dsk'})
if dsk and dsk.has_key('content'):
hTag = Tag(soup,'h3')
hTag['class'] = 'section'
hTag.insert(0,NavigableString(dsk['content']))
articleTag = soup.find(True, attrs={'id':'article'})
if articleTag:
articleTag.insert(0,hTag)
# Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
@@ -532,11 +550,3 @@ class NYTimes(BasicNewsRecipe):
return soup
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
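For orientation, a standalone sketch of what strip_anchors does, assuming the BeautifulSoup 3 API used throughout these recipes and a hypothetical markup sample:
from calibre.ebooks.BeautifulSoup import BeautifulSoup
soup = BeautifulSoup('<p>See <a href="/story.html">this story</a>.</p>')
# replace every imageless <a> with its rendered contents, exactly as the
# method above does
for a in soup.findAll('a'):
    if a.img is None:
        a.replaceWith(a.renderContents().decode('cp1252','replace'))
print soup   # -> <p>See this story.</p>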

View File

@@ -5,52 +5,186 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re, time
from calibre import strftime
import re, string, time
from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
def decode(self, src):
enc = 'utf-8'
if 'iso-8859-1' in src:
enc = 'cp1252'
return src.decode(enc, 'ignore')
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class NYTimes(BasicNewsRecipe):
title = u'New York Times'
__author__ = 'Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 6, 36)
# set headlinesOnly to True for the headlines-only version
headlinesOnly = False
description = 'Daily news from the New York Times (subscription version)'
timefmt = ' [%b %d]'
# includeSections: List of sections to include. If empty, all sections found will be included.
# Otherwise, only the sections named will be included. For example,
#
# includeSections = ['Politics','Sports']
#
# would cause only the Politics and Sports sections to be included.
includeSections = [] # by default, all sections included
# excludeSections: List of sections to exclude. If empty, all sections found will be included.
# Otherwise, the sections named will be excluded. For example,
#
# excludeSections = ['Politics','Sports']
#
# would cause the Politics and Sports sections to be excluded. This parameter can be used
# in conjunction with includeSections, although in most cases using one or the other, but
# not both, is sufficient.
excludeSections = []
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
one_picture_per_article = True
# The maximum number of articles that will be downloaded
max_articles_per_feed = 100
if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
else:
title='New York Times'
description = 'Today\'s New York Times'
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 7, 5)
timefmt = ''
needs_subscription = True
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
cover_margins = (18,18,'grey99')
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
dict({'class':re.compile('^subNavigation')}),
dict({'class':re.compile('^leaderboard')}),
dict({'class':re.compile('^module')}),
dict({'class':'metaFootnote'}),
dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
'relatedArticles', 'relatedTopics', 'adxSponLink']),
remove_tags = [dict(attrs={'class':[
'articleFooter',
'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'columnGroup last',
'doubleRule',
'dottedLine',
'entry-meta',
'entry-response module',
'icon enlargeThis',
'leftNavTabs',
'metaFootnote',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
'post-tools',
'relatedSearchesModule',
'side_tool',
'singleAd',
re.compile('^subNavigation'),
re.compile('^leaderboard'),
re.compile('^module'),
]}),
dict(id=[
'adxLeaderboard',
'adxSponLink',
'archive',
'articleExtras',
'articleInline',
'blog_sidebar',
'businessSearchBar',
'cCol',
'entertainmentSearchBar',
'footer',
'header',
'header_search',
'inlineBox',
'login',
'masthead',
'masthead-nav',
'memberTools',
'navigation',
'portfolioInline',
'readerReviews',
'readerReviewsCount',
'relatedArticles',
'relatedTopics',
'respond',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
dict(name=['script', 'noscript', 'style','form','hr'])]
encoding = decode
no_stylesheets = True
extra_css = '''
.articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
.credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { font-size: small; }
.caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; }'''
.timestamp { text-align: left; font-size: small; }
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; }
.articleBody { }
.authorId {text-align: left; }
.image {text-align: center;}
.source {text-align: left; }'''
def filter_ans(self, ans) :
total_article_count = 0
idx = 0
idx_max = len(ans)-1
while idx <= idx_max:
if self.includeSections != []:
if ans[idx][0] not in self.includeSections:
print "SECTION NOT INCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if ans[idx][0] in self.excludeSections:
print "SECTION EXCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if self.verbose:
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1
self.log( "Queued %d articles" % total_article_count )
return ans
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
return fixed
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@@ -60,22 +194,19 @@ class NYTimes(BasicNewsRecipe):
br['USERID'] = self.username
br['PASSWORD'] = self.password
raw = br.submit().read()
if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
if 'Please try again' in raw:
raise Exception('Your username and password are incorrect')
#open('/t/log.html', 'wb').write(raw)
return br
def get_masthead_url(self):
masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
#masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nMasthead unavailable")
masthead = None
return masthead
def skip_ad_pages(self, soup):
# Skip ad pages served before actual article
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url)
return self.index_to_soup(url, raw=True)
def get_cover_url(self):
cover = None
@@ -93,12 +224,57 @@ class NYTimes(BasicNewsRecipe):
return cover
def short_title(self):
return 'New York Times'
return self.title
def parse_index(self):
self.encoding = 'cp1252'
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
self.encoding = decode
def index_to_soup(self, url_or_raw, raw=False):
'''
OVERRIDE of class method
deals with various page encodings between index and articles
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw):
f = self.browser.open(url_or_raw)
_raw = f.read()
f.close()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(docEncoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
return BeautifulSoup(_raw, markupMassage=massage)
# Entry point
print "index_to_soup()"
soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
if docEncoding == '' :
docEncoding = self.encoding
if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
return soup
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&amp;'
massaged = re.sub("&","&amp;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_todays_index(self):
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=True)).strip()
@@ -119,12 +295,13 @@ class NYTimes(BasicNewsRecipe):
return
if 'podcast' in url:
return
if '/video/' in url:
return
url += '?pagewanted=all'
if url in url_list:
return
url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip()
#self.log("Title: %s" % title)
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
@@ -140,6 +317,7 @@ class NYTimes(BasicNewsRecipe):
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
ans.append(feed)
articles[feed] = []
articles[feed].append(
dict(title=title, url=url, date=pubdate,
@@ -147,46 +325,228 @@ class NYTimes(BasicNewsRecipe):
content=''))
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
# Find each instance of class="section-headline", class="story", class="story headline"
# Find each article
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['section-headline','sectionHeader']:
key = string.capwords(feed_title(div))
articles[key] = []
ans.append(key)
#self.log('Section: %s' % key)
key = key.replace('Op-ed','Op-Ed')
key = key.replace('U.s.','U.S.')
elif div['class'] in ['story', 'story headline'] :
handle_article(div)
elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'):
handle_article(lidiv)
# ans = self.sort_index_by(ans, {'The Front Page':-1,
# 'Dining In, Dining Out':1,
# 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return self.filter_ans(ans)
def parse_headline_index(self):
articles = {}
ans = []
url_list = []
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
# Fetch the content table
content_table = soup.find('table',{'id':'content'})
if content_table is None:
self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
return None
# Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
for div_sec in td_col.findAll('div',recursive=False):
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
section_name = re.sub(r'^ *$','',section_name)
if section_name == '':
continue
section_name=string.capwords(section_name)
if section_name == 'U.s.':
section_name = 'U.S.'
elif section_name == 'Op-ed':
section_name = 'Op-Ed'
pubdate = strftime('%a, %d %b')
search_div = div_sec
for next_tag in h6_sec_name.findNextSiblings(True):
if next_tag.__class__.__name__ == 'Tag':
if next_tag.name == 'div':
search_div = next_tag
break
# Get the articles
for h3_item in search_div.findAll('h3'):
byline = h3_item.h6
if byline is not None:
author = self.tag_to_string(byline,use_alt=False)
else:
author = ''
a = h3_item.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
if not url.startswith("http"):
continue
if not url.endswith(".html"):
continue
if 'podcast' in url:
continue
if 'video' in url:
continue
url += '?pagewanted=all'
if url in url_list:
continue
url_list.append(url)
self.log("URL %s" % url)
title = self.tag_to_string(a, use_alt=True).strip()
desc = h3_item.find('p')
if desc is not None:
description = self.tag_to_string(desc,use_alt=False)
else:
description = ''
if not articles.has_key(section_name):
ans.append(section_name)
articles[section_name] = []
articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return self.filter_ans(ans)
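# For orientation, the page structure this parser assumes (hypothetical
# sample, inferred from the lookups above): <table id="content"> holds
# <td id="...Column"> cells; each cell contains <h6 style="text-transform:
# uppercase">SECTION</h6> headings followed by a <div> of <h3> story items,
# where each <h3> carries an <a href>, an optional <h6> byline and a <p>
# summary.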
def parse_index(self):
if self.headlinesOnly:
return self.parse_headline_index()
else:
return self.parse_todays_index()
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
return ans
def preprocess_html(self, soup):
kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag:
if kicker_tag: # remove Op_Ed author head shots
tagline = self.tag_to_string(kicker_tag)
#self.log("FOUND KICKER %s" % tagline)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
#self.log("Searching for photo")
if img_div:
img_div.extract()
#self.log("Photo deleted")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
#article_body = soup.find(True, {'id':'articleBody'})
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup

View File

@@ -6,22 +6,25 @@ Fetch Die Zeit.
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class ZeitDe(BasicNewsRecipe):
title = 'ZEIT Online'
description = 'ZEIT Online'
title = 'Zeit Online'
description = 'Zeit Online'
language = 'de'
lang = 'de_DE'
__author__ = 'Martin Pitt, Sujata Raman and Ingo Paschke'
use_embedded_content = False
__author__ = 'Martin Pitt, Sujata Raman, Ingo Paschke and Marc Toensing'
max_articles_per_feed = 40
remove_empty_feeds = True
no_stylesheets = True
no_javascript = True
encoding = 'utf-8'
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':["response","pagination block","pagenav","inline link", "copyright"] }),
dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }),
dict(name='div', attrs={'id':["place_5","place_4","comments"]})
]
keep_only_tags = [dict(id=['main'])]
feeds = [
('Seite 1', 'http://newsfeed.zeit.de/index_xml'),
@@ -40,43 +43,15 @@ class ZeitDe(BasicNewsRecipe):
('Sport', 'http://newsfeed.zeit.de/sport/index'),
]
extra_css = '''
.supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:small;}
.title{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
.caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
.quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
.quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
.headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
.inline{float:left;margin-top:0;margin-right:15px;position:relative;width:180px; }
img.inline{float:none}
.intertitle{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small;font-weight:700}
.ebinfobox{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small;list-style-type:none;float:right;margin-top:0;border-left-style:solid;border-left-width:1px;padding-left:10px;}
.infobox {border-style: solid; border-width: 1px;padding:8px;}
.infobox dt {font-weight:700;}
'''
extra_css = '.reaktion,.taglist,.comments,.reponse,.responsetitle,.responsebody,.reponse,.inline,.date{display:none;}li.date{display:block}'
#filter_regexps = [r'ad.de.doubleclick.net/']
keep_only_tags = [
dict(name='div', attrs={'class':["article"]}) ,
dict(name='ul', attrs={'class':["tools"]}) ,
]
remove_tags = [
dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'),
dict(name='div', attrs={'class':["pagination block","pagenav","inline link", "copyright"] }),
dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }),
dict(name='div', attrs={'id':["place_5","place_4","comments"]})
]
remove_attributes = ['style', 'font']
def get_article_url(self, article):
ans = article.get('link',None)
ans += "?page=all"
ans += "?page=all&print=true"
if 'video' in ans or 'quiz' in ans :
if 'video' in ans or 'quiz' in ans or 'blog' in ans :
ans = None
return ans
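# Hypothetical examples of the new behaviour (not part of the commit):
#   http://www.zeit.de/politik/foo -> http://www.zeit.de/politik/foo?page=all&print=true
#   http://www.zeit.de/video/bar   -> dropped, get_article_url returns None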
@@ -86,25 +61,3 @@ class ZeitDe(BasicNewsRecipe):
return inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
except:
return 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag)
title = soup.find('h2', attrs={'class':'title'})
if title is None:
print "no title"
return soup
info = Tag(soup,'ul',[('class','ebinfobox')])
tools = soup.find('ul', attrs={'class':'tools'})
#author = tools.find('li','author first')
for tag in ['author first', 'date', 'date first', 'author', 'source']:
line = tools.find('li', tag)
if line:
info.insert(0,line)
title.parent.insert(0,info)
tools.extract()
return soup

View File

@@ -0,0 +1,60 @@
body{
margin:0px;
padding: 0.5em;
background-color:#F6F3E9;
font-size:12px;
font-family:Arial, Helvetica, sans-serif;
}
.calibreMeta{
background-color:#39322B;
color:white;
padding:10px;
}
.calibreMeta a, .calibreEbNav a, .calibreEbNavTop a, .calibreToc a{
color:white;
}
.calibreMeta h1{
margin:0px;
font-size:18px;
background-color:#39322B;
}
.calibreEbookContent{
padding:20px;
}
.calibreEbNav, .calibreEbNavTop{
clear:both;
background-color:#39322B;
color:white;
padding:10px;
text-align:center;
}
.calibreEbNavTop{
margin-bottom:20px;
}
.calibreEbNav a, .calibreEbNavTop a{
padding:0px 5px;
}
.calibreTocIndex{
line-height:18px;
}
.calibreToc{
float:left;
margin:20px;
width:300px;
background-color:#39322B;
color:white;
padding:10px;
}
.calibreEbookContent{
width:600px;
float:left;
}

View File

@@ -0,0 +1,74 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
${head_content}$
<link href="${cssLink}$" type="text/css" rel="stylesheet" />
</head>
<body>
<div class="calibreMeta">
<div class="calibreMetaTitle">
${pos1=1}$
${for title in meta.titles():}$
${if pos1:}$
<h1>
<a href="${tocUrl}$">${print title}$</a>
</h1>
${:else:}$
<div class="calibreMetaSubtitle">${print title}$</div>
${:endif}$
${pos1=0}$
${:endfor}$
</div>
<div class="calibreMetaAuthor">
${print ', '.join(meta.creators())}$
</div>
</div>
<div class="calibreMain">
<div class="calibreEbookContent">
${if prevLink or nextLink:}$
<div class="calibreEbNavTop">
${if prevLink:}$
<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:else:}$
<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:endif}$
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
${:endif}$
</div>
${:endif}$
${ebookContent}$
</div>
${if has_toc:}$
<div class="calibreToc">
<h2><a href="${tocUrl}$">${print _('Table of contents'),}$</a></h2>
${print toc()}$
</div>
${:endif}$
<div class="calibreEbNav">
${if prevLink:}$
<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:else:}$
<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:endif}$
<a href="${tocUrl}$" class="calibreAHome">${print _('start'),}$</a>
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
${:endif}$
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,61 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
<link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
<title>${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$</title>
${for item in meta:}$
<meta ${print 'name="DC.'+item['name']+'"',}$ ${print 'content="'+item['value']+'"',}$ />
${:endfor}$
<link href="${cssLink}$" type="text/css" rel="stylesheet" />
</head>
<body>
<div class="calibreMeta">
<div class="calibreMetaTitle">
${pos1=1}$
${for title in meta.titles():}$
${if pos1:}$
<h1>
<a href="${tocUrl}$">${print title}$</a>
</h1>
${:else:}$
<div class="calibreMetaSubtitle">${print title}$</div>
${:endif}$
${pos1=0}$
${:endfor}$
</div>
<div class="calibreMetaAuthor">
${print ', '.join(meta.creators()),}$
</div>
</div>
<div class="calibreMain">
<div class="calibreEbookContent">
${if has_toc:}$
<div class="calibreTocIndex">
<h2>${print _('Table of contents'),}$</h2>
${toc}$
</div>
${:else:}$
<h2>${print _('No table of contents present'),}$</h2>
<div><strong><a href="${nextLink}$">${print _('begin to read'),}$</a></strong></div>
${:endif}$
</div>
<div class="calibreEbNav">
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
${:endif}$
</div>
</div>
</body>
</html>

View File

@@ -89,7 +89,7 @@ class Server(Command):
t = telnetlib.Telnet('localhost', 4242)
t.read_until("repl>")
t.write('BrowserReload();')
print t.read_until("repl>")
t.read_until("repl>")
t.close()
except:
print 'Failed to reload browser'

View File

@@ -446,6 +446,7 @@ from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.html.output import HTMLOutput
from calibre.ebooks.snb.output import SNBOutput
from calibre.customize.profiles import input_profiles, output_profiles
@@ -525,6 +526,7 @@ plugins += [
RTFOutput,
TCROutput,
TXTOutput,
HTMLOutput,
SNBOutput,
]
# Order here matters. The first matched device is the one used.
@@ -893,4 +895,3 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
Email, Server, Plugins, Tweaks, Misc]
#}}}

View File

@@ -0,0 +1,33 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import namespace, barename, DC11_NS
class EasyMeta(object):
def __init__(self, meta):
self.meta = meta
def __iter__(self):
meta = self.meta
for item_name in meta.items:
for item in meta[item_name]:
if namespace(item.term) == DC11_NS:
yield { 'name': barename(item.term), 'value': item.value }
def __len__(self):
count = 0
for item in self:
count = count+1
return count
def titles(self):
for item in self.meta['title']:
yield item.value
def creators(self):
for item in self.meta['creator']:
yield item.value
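A minimal usage sketch for EasyMeta; oeb_book is a stand-in for the book object a conversion pipeline would supply:
from calibre.ebooks.html.meta import EasyMeta
meta = EasyMeta(oeb_book.metadata)
for item in meta:   # every Dublin Core 1.1 term in the book's metadata
    print '%s: %s' % (item['name'], item['value'])
print 'by', ', '.join(meta.creators())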

View File

@@ -0,0 +1,201 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
import os, re, shutil
from os.path import dirname, abspath, relpath, exists
from lxml import etree
from templite import Templite
from calibre.ebooks.oeb.base import element
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.zipfile import ZipFile
from urllib import unquote
from calibre.ebooks.html.meta import EasyMeta
class HTMLOutput(OutputFormatPlugin):
name = 'HTML Output'
author = 'Fabian Grassl'
file_type = 'zip'
options = set([
OptionRecommendation(name='template_css',
help=_('CSS file used for the output instead of the default file')),
OptionRecommendation(name='template_html_index',
help=_('Template used for generation of the html index file instead of the default file')),
OptionRecommendation(name='template_html',
help=_('Template used for the generation of the html contents of the book instead of the default file')),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated ZIP file to the directory of the generated ZIP file')
),
])
recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)])
def generate_toc(self, oeb_book, ref_url, output_dir):
'''
Generate table of contents
'''
with CurrentDir(output_dir):
def build_node(current_node, parent=None):
if parent is None:
parent = etree.Element('ul')
elif len(current_node.nodes):
parent = element(parent, ('ul'))
for node in current_node.nodes:
point = element(parent, 'li')
href = relpath(abspath(unquote(node.href)), dirname(ref_url))
link = element(point, 'a', href=href)
title = node.title
if title:
title = re.sub(r'\s+', ' ', title)
link.text=title
build_node(node, point)
return parent
wrap = etree.Element('div')
wrap.append(build_node(oeb_book.toc))
return wrap
def generate_html_toc(self, oeb_book, ref_url, output_dir):
root = self.generate_toc(oeb_book, ref_url, output_dir)
return etree.tostring(root, pretty_print=True, encoding='utf-8',
xml_declaration=True)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
# read template files
if opts.template_html_index is not None:
template_html_index_data = open(opts.template_html_index, 'rb').read()
else:
template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)
if opts.template_html is not None:
template_html_data = open(opts.template_html, 'rb').read()
else:
template_html_data = P('templates/html_export_default.tmpl', data=True)
if opts.template_css is not None:
template_css_data = open(opts.template_css, 'rb').read()
else:
template_css_data = P('templates/html_export_default.css', data=True)
template_html_index_data = template_html_index_data.decode('utf-8')
template_html_data = template_html_data.decode('utf-8')
template_css_data = template_css_data.decode('utf-8')
self.log = log
self.opts = opts
meta = EasyMeta(oeb_book.metadata)
tempdir = PersistentTemporaryDirectory()
output_file = os.path.join(tempdir,
os.path.basename(re.sub(r'\.zip', '', output_path)+'.html'))
output_dir = re.sub(r'\.html', '', output_file)+'_files'
if not exists(output_dir):
os.makedirs(output_dir)
css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
with open(css_path, 'wb') as f:
f.write(template_css_data.encode('utf-8'))
with open(output_file, 'wb') as f:
html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
templite = Templite(template_html_index_data)
nextLink = oeb_book.spine[0].href
nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
cssLink = relpath(abspath(css_path), dirname(output_file))
tocUrl = relpath(output_file, dirname(output_file))
t = templite.render(has_toc=bool(oeb_book.toc.count()),
toc=html_toc, meta=meta, nextLink=nextLink,
tocUrl=tocUrl, cssLink=cssLink)
f.write(t)
with CurrentDir(output_dir):
for item in oeb_book.manifest:
path = abspath(unquote(item.href))
dir = dirname(path)
if not exists(dir):
os.makedirs(dir)
if item.spine_position is not None:
with open(path, 'wb') as f:
pass
else:
with open(path, 'wb') as f:
f.write(str(item))
item.unload_data_from_memory(memory=path)
for item in oeb_book.spine:
path = abspath(unquote(item.href))
dir = dirname(path)
root = item.data.getroottree()
# get & clean HTML <HEAD>-data
head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
head_content = etree.tostring(head, pretty_print=True, encoding='utf-8')
head_content = re.sub(r'\<\/?head.*\>', '', head_content)
head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
# get & clean HTML <BODY>-data
body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8')
ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
# generate link to next page
if item.spine_position+1 < len(oeb_book.spine):
nextLink = oeb_book.spine[item.spine_position+1].href
nextLink = relpath(abspath(nextLink), dir)
else:
nextLink = None
# generate link to previous page
if item.spine_position > 0:
prevLink = oeb_book.spine[item.spine_position-1].href
prevLink = relpath(abspath(prevLink), dir)
else:
prevLink = None
cssLink = relpath(abspath(css_path), dir)
tocUrl = relpath(output_file, dir)
# render template
templite = Templite(template_html_data)
toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
t = templite.render(ebookContent=ebook_content,
prevLink=prevLink, nextLink=nextLink,
has_toc=bool(oeb_book.toc.count()), toc=toc,
tocUrl=tocUrl, head_content=head_content,
meta=meta, cssLink=cssLink)
# write html to file
with open(path, 'wb') as f:
f.write(t)
item.unload_data_from_memory(memory=path)
zfile = ZipFile(output_path, "w")
zfile.add_dir(output_dir)
if opts.extract_to:
if os.path.exists(opts.extract_to):
shutil.rmtree(opts.extract_to)
os.makedirs(opts.extract_to)
zfile.extractall(opts.extract_to)
self.log('Zip file extracted to', opts.extract_to)
zfile.close()
# cleanup temp dir
shutil.rmtree(tempdir)
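Since OutputFormatPlugin options surface as ebook-convert switches, a plausible invocation of this plugin looks like the line below; the flag spellings are inferred from the OptionRecommendation names above, so verify them against ebook-convert --help:
ebook-convert book.epub book.zip --template-css my.css --extract-to ./site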

View File

@@ -112,13 +112,12 @@ def get_metadata(br, asin, mi):
def main(args=sys.argv):
# Test xisbn
#print get_social_metadata('Learning Python', None, None, '8324616489')
#print
print get_social_metadata('Learning Python', None, None, '8324616489')
print
# Test sophisticated comment formatting
print get_social_metadata('Angels & Demons', None, None, '9781416580829')
print
return
# Random tests
print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')

View File

@@ -275,7 +275,15 @@ class MobiMLizer(object):
# <mbp:frame-set/> does not exist lalalala
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
return
id_ = elem.get('id', None)
if id_:
# Keep anchors so people can use display:none
# to generate hidden TOCs
elem.clear()
elem.text = None
elem.set('id', id_)
else:
return
tag = barename(elem.tag)
istate = copy.copy(istates[-1])
istate.rendered = False
@@ -406,6 +414,12 @@ class MobiMLizer(object):
parent = bstate.para if bstate.inline is None else bstate.inline
if parent is not None:
vtag = etree.SubElement(parent, XHTML(vtag))
# Add anchors
for child in vbstate.body:
if child is not vbstate.para:
vtag.append(child)
else:
break
for child in vbstate.para:
vtag.append(child)
return
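The net effect, sketched with hypothetical markup: a block such as <div style="display:none" id="hidden-toc">...</div> is no longer dropped outright. Its children are cleared, but an empty element carrying the original id survives, so internal links and NCX entries that target that id (a hidden table of contents, for instance) still resolve in the generated MOBI.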

View File

@@ -49,5 +49,3 @@ class OEBOutput(OutputFormatPlugin):
with open(path, 'wb') as f:
f.write(str(item))
item.unload_data_from_memory(memory=path)

View File

@@ -101,11 +101,12 @@ class SNBMLizer(object):
subitem = ''
bodyTree = trees[subitem].find(".//body")
for line in output.splitlines():
if not line.find(CALIBRE_SNB_PRE_TAG) == 0:
pos = line.find(CALIBRE_SNB_PRE_TAG)
if pos == -1:
line = line.strip(u' \t\n\r\u3000')
else:
etree.SubElement(bodyTree, "text").text = \
etree.CDATA(line[len(CALIBRE_SNB_PRE_TAG):])
etree.CDATA(line[pos+len(CALIBRE_SNB_PRE_TAG):])
continue
if len(line) != 0:
if line.find(CALIBRE_SNB_IMG_TAG) == 0:

View File

@@ -35,7 +35,6 @@ class ViewAction(InterfaceAction):
self.qaction.setMenu(self.view_menu)
ac.triggered.connect(self.view_specific_format, type=Qt.QueuedConnection)
def location_selected(self, loc):
enabled = loc == 'library'
for action in list(self.view_menu.actions())[1:]:
@@ -134,6 +133,9 @@ class ViewAction(InterfaceAction):
rows = self.gui.current_view().selectionModel().selectedRows()
self._view_books(rows)
def view_triggered(self, index):
self._view_books([index])
def view_specific_book(self, index):
self._view_books([index])

View File

@@ -28,6 +28,8 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name, sync, fmt_options, conne
if log is None:
log = Log()
from calibre.library import db
from calibre.utils.config import prefs
prefs.refresh()
db = db()
db.catalog_plugin_on_device_temp_mapping = dbspec

View File

@@ -50,6 +50,8 @@ class BooksView(QTableView): # {{{
def __init__(self, parent, modelcls=BooksModel):
QTableView.__init__(self, parent)
self.setEditTriggers(self.SelectedClicked|self.EditKeyPressed)
self.drag_allowed = True
self.setDragEnabled(True)
self.setDragDropOverwriteMode(False)
@@ -98,6 +100,8 @@ class BooksView(QTableView): # {{{
self._model.about_to_be_sorted.connect(self.about_to_be_sorted)
self._model.sorting_done.connect(self.sorting_done)
self.doubleClicked.connect(parent.iactions['View'].view_triggered)
# Column Header Context Menu {{{
def column_header_context_handler(self, action=None, column=None):
if not action or not column:

View File

@@ -128,7 +128,7 @@ class ContentServer(object):
if want_mobile:
return self.mobile()
return self.browse_toplevel()
return self.browse_catalog()
def old(self, **kwargs):
return self.static('index.html').replace('{prefix}',

View File

@@ -338,6 +338,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes
* - Keyboard Shortcut
- Action
* - :kbd:`F2 (Enter in OS X)`
- Edit the metadata of the currently selected field in the book list.
* - :kbd:`A`
- Add Books
* - :kbd:`C`

src/templite/__init__.py (new file)
View File

@@ -0,0 +1,87 @@
#!/usr/bin/env python
#
# Templite+
# A light-weight, fully functional, general purpose templating engine
#
# Copyright (c) 2009 joonis new media
# Author: Thimo Kraemer <thimo.kraemer@joonis.de>
#
# Based on Templite - Tomer Filiba
# http://code.activestate.com/recipes/496702/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
import sys, re
class Templite(object):
auto_emit = re.compile('(^[\'\"])|(^[a-zA-Z0-9_\[\]\'\"]+$)')
def __init__(self, template, start='${', end='}$'):
if len(start) != 2 or len(end) != 2:
raise ValueError('each delimiter must be two characters long')
delimiter = re.compile('%s(.*?)%s' % (re.escape(start), re.escape(end)), re.DOTALL)
offset = 0
tokens = []
for i, part in enumerate(delimiter.split(template)):
part = part.replace('\\'.join(list(start)), start)
part = part.replace('\\'.join(list(end)), end)
if i % 2 == 0:
if not part: continue
part = part.replace('\\', '\\\\').replace('"', '\\"')
part = '\t' * offset + 'emit("""%s""")' % part
else:
part = part.rstrip()
if not part: continue
if part.lstrip().startswith(':'):
if not offset:
raise SyntaxError('no block statement to terminate: ${%s}$' % part)
offset -= 1
part = part.lstrip()[1:]
if not part.endswith(':'): continue
elif self.auto_emit.match(part.lstrip()):
part = 'emit(%s)' % part.lstrip()
lines = part.splitlines()
margin = min(len(l) - len(l.lstrip()) for l in lines if l.strip())
part = '\n'.join('\t' * offset + l[margin:] for l in lines)
if part.endswith(':'):
offset += 1
tokens.append(part)
if offset:
raise SyntaxError('%i block statement(s) not terminated' % offset)
self.__code = compile('\n'.join(tokens), '<templite %r>' % template[:20], 'exec')
def render(self, __namespace=None, **kw):
"""
renders the template according to the given namespace.
__namespace - a dictionary serving as a namespace for evaluation
**kw - keyword arguments which are added to the namespace
"""
namespace = {}
if __namespace: namespace.update(__namespace)
if kw: namespace.update(kw)
namespace['emit'] = self.write
__stdout = sys.stdout
sys.stdout = self
self.__output = []
eval(self.__code, namespace)
sys.stdout = __stdout
return ''.join(self.__output)
def write(self, *args):
for a in args:
self.__output.append(str(a))
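For orientation, a small usage sketch of the engine; the template string is hypothetical, while the ${...}$ delimiters, the auto-emit rule and the block syntax come from the class above:
from templite import Templite
t = Templite('Hello ${name}$!${if polite:}$ ${print greeting,}$${:endif}$')
print t.render(name='World', polite=True, greeting='Nice to meet you.')
# -> Hello World! Nice to meet you.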