Merge from trunk

Author: Sengian
Date:   2010-11-02 21:35:21 +01:00
Commit: f61daece95
21 changed files with 1267 additions and 405 deletions


@@ -25,15 +25,15 @@ class Fudzilla(BasicNewsRecipe):
    remove_tags_before = dict(name='div', attrs={'class':['padding']})
    remove_tags = [dict(name='td', attrs={'class':['left','right']}),
                   dict(name='div', attrs={'id':['toolbar','buttons']}),
                   dict(name='div', attrs={'class':['artbannersxtd','back_button']}),
                   dict(name='span', attrs={'class':['pathway']}),
                   dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}),
                   dict(name='table', attrs={'class':['headlines']}),
                  ]
    feeds = [
-        (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')
+        (u'Posts', u'http://www.fudzilla.com/?format=feed')
        ]
    preprocess_regexps = [


@@ -5,62 +5,59 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
-import re
-import time
-from calibre import entity_to_unicode
+import re, string, time
+from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
-    Comment, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

-    title = 'New York Times Top Stories'
-    __author__ = 'GRiker'
-    language = 'en'
-    requires_version = (0, 7, 5)
-    description = 'Top Stories from the New York Times'
+    # set headlinesOnly to True for the headlines-only version
+    headlinesOnly = True

-    # List of sections typically included in Top Stories. Use a keyword from the
-    # right column in the excludeSectionKeywords[] list to skip downloading that section
-    sections = {
-         'arts'             : 'Arts',
-         'business'         : 'Business',
-         'diningwine'       : 'Dining & Wine',
-         'editorials'       : 'Editorials',
-         'health'           : 'Health',
-         'magazine'         : 'Magazine',
-         'mediaadvertising' : 'Media & Advertising',
-         'newyorkregion'    : 'New York/Region',
-         'oped'             : 'Op-Ed',
-         'politics'         : 'Politics',
-         'science'          : 'Science',
-         'sports'           : 'Sports',
-         'technology'       : 'Technology',
-         'topstories'       : 'Top Stories',
-         'travel'           : 'Travel',
-         'us'               : 'U.S.',
-         'world'            : 'World'
-         }
-
-    # Add section keywords from the right column above to skip that section
-    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
-    # excludeSectionKeywords = ['Sports', 'Dining']
-    # Fetch only Business and Technology
-    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
-    # Fetch only Top Stories
-    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
-    # By default, no sections are skipped.
-    excludeSectionKeywords = []
+    # includeSections: List of sections to include. If empty, all sections found will be included.
+    # Otherwise, only the sections named will be included. For example,
+    #
+    #    includeSections = ['Politics','Sports']
+    #
+    # would cause only the Politics and Sports sections to be included.
+
+    includeSections = []  # by default, all sections included
+
+    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
+    # Otherwise, the sections named will be excluded. For example,
+    #
+    #    excludeSections = ['Politics','Sports']
+    #
+    # would cause the Politics and Sports sections to be excluded. This parameter can be used
+    # in conjunction with includeSections although in most cases using one or the other, but
+    # not both, is sufficient.
+
+    excludeSections = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = True

    # The maximum number of articles that will be downloaded
-    max_articles_per_feed = 40
+    max_articles_per_feed = 100

+    if headlinesOnly:
+        title = 'New York Times Headlines'
+        description = 'Headlines from the New York Times'
+    else:
+        title = 'New York Times'
+        description = 'Today\'s New York Times'
+
+    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
+    language = 'en'
+    requires_version = (0, 7, 5)

    timefmt = ''
    needs_subscription = True
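The includeSections/excludeSections pair replaces the old fixed sections map and the excludeSectionKeywords mechanism: filtering now happens on whatever section names are actually found at download time. A minimal sketch of how a user copy of the recipe might be customized (the section names here are illustrative, not a fixed list):

    # Hypothetical user customization of the recipe above: fetch only two
    # sections of the headlines edition and keep all inline images.
    class MyNYTimes(NYTimes):
        headlinesOnly = True
        includeSections = ['World', 'Politics']   # only these sections survive
        excludeSections = []                      # nothing extra to drop
        one_picture_per_article = False
        max_articles_per_feed = 50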
@@ -82,6 +79,7 @@ class NYTimes(BasicNewsRecipe):
            'entry-response module',
            'icon enlargeThis',
            'leftNavTabs',
+            'metaFootnote',
            'module box nav',
            'nextArticleLink',
            'nextArticleLink clearfix',
@@ -89,12 +87,13 @@ class NYTimes(BasicNewsRecipe):
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
-            'subNavigation clearfix',
-            'subNavigation tabContent active',
-            'subNavigation tabContent active clearfix',
+            re.compile('^subNavigation'),
+            re.compile('^leaderboard'),
+            re.compile('^module'),
            ]}),
        dict(id=[
            'adxLeaderboard',
+            'adxSponLink',
            'archive',
            'articleExtras',
            'articleInline',
@@ -105,87 +104,98 @@ class NYTimes(BasicNewsRecipe):
            'footer',
            'header',
            'header_search',
+            'inlineBox',
            'login',
            'masthead',
            'masthead-nav',
            'memberTools',
            'navigation',
            'portfolioInline',
+            'readerReviews',
+            'readerReviewsCount',
            'relatedArticles',
+            'relatedTopics',
            'respond',
            'side_search',
            'side_index',
            'side_tool',
            'toolsRight',
            ]),
-        dict(name=['script', 'noscript', 'style'])]
+        dict(name=['script', 'noscript', 'style','form','hr'])]

    no_stylesheets = True
-    extra_css = '.headline {text-align: left;}\n \
-                 .byline {font-family: monospace; \
-                          text-align: left; \
-                          margin-top: 0px; \
-                          margin-bottom: 0px;}\n \
-                 .dateline {font-size: small; \
-                            margin-top: 0px; \
-                            margin-bottom: 0px;}\n \
-                 .timestamp {font-size: small; \
-                             margin-top: 0px; \
-                             margin-bottom: 0px;}\n \
-                 .source {text-align: left;}\n \
-                 .image {text-align: center;}\n \
-                 .credit {text-align: right; \
-                          font-size: small; \
-                          margin-top: 0px; \
-                          margin-bottom: 0px;}\n \
-                 .articleBody {text-align: left;}\n \
-                 .authorId {text-align: left; \
-                            font-style: italic;}\n '
+    extra_css = '''
+                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .timestamp { text-align: left; font-size: small; }
+                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                a:link {text-decoration: none; }
+                .articleBody { }
+                .authorId {text-align: left; }
+                .image {text-align: center;}
+                .source {text-align: left; }'''
-    def dump_ans(self, ans) :
+    def filter_ans(self, ans) :
        total_article_count = 0
-        for section in ans :
+        idx = 0
+        idx_max = len(ans)-1
+        while idx <= idx_max:
+            if self.includeSections != []:
+                if ans[idx][0] not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",ans[idx][0]
+                    del ans[idx]
+                    idx_max = idx_max-1
+                    continue
+            if ans[idx][0] in self.excludeSections:
+                print "SECTION EXCLUDED: ",ans[idx][0]
+                del ans[idx]
+                idx_max = idx_max-1
+                continue
            if self.verbose:
-                self.log("section %s: %d articles" % (section[0], len(section[1])) )
-            for article in section[1]:
+                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+            for article in ans[idx][1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                          article['url'].encode('cp1252','replace')))
+            idx = idx+1
        self.log( "Queued %d articles" % total_article_count )
+        return ans
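filter_ans deletes non-matching sections from the (section, article-list) tuples in place, so the surviving index keeps its original order and the same list object is returned to the caller. The include/exclude rule itself, restated as a list-comprehension sketch (not the recipe's code, just an equivalent formulation):

    # Equivalent sketch of the filtering rule applied by filter_ans above:
    def filter_sections(ans, include, exclude):
        if include:
            ans = [sec for sec in ans if sec[0] in include]
        return [sec for sec in ans if sec[0] not in exclude]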
    def fixChars(self,string):
        # Replace lsquo (\x91)
-        fixed = re.sub("\x91","&#8216;",string)
+        fixed = re.sub("\x91","‘",string)
        # Replace rsquo (\x92)
-        fixed = re.sub("\x92","&#8217;",fixed)
+        fixed = re.sub("\x92","’",fixed)
        # Replace ldquo (\x93)
-        fixed = re.sub("\x93","&#8220;",fixed)
+        fixed = re.sub("\x93","“",fixed)
        # Replace rdquo (\x94)
-        fixed = re.sub("\x94","&#8221;",fixed)
+        fixed = re.sub("\x94","”",fixed)
        # Replace ndash (\x96)
-        fixed = re.sub("\x96","&#8211;",fixed)
+        fixed = re.sub("\x96","–",fixed)
        # Replace mdash (\x97)
-        fixed = re.sub("\x97","&#8212;",fixed)
+        fixed = re.sub("\x97","—",fixed)
        return fixed
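fixChars repairs stray Windows-1252 punctuation bytes (0x91-0x97) that survive in NYT markup, mapping each to its proper Unicode character; the new code emits the characters directly instead of HTML entities. The same table in dictionary form, as an equivalent sketch:

    # The cp1252 punctuation repaired by fixChars above, as a lookup table:
    CP1252_PUNCT = {
        u'\x91': u'\u2018',  # left single quotation mark
        u'\x92': u'\u2019',  # right single quotation mark
        u'\x93': u'\u201c',  # left double quotation mark
        u'\x94': u'\u201d',  # right double quotation mark
        u'\x96': u'\u2013',  # en dash
        u'\x97': u'\u2014',  # em dash
    }

    def fix_chars(text):
        for bad, good in CP1252_PUNCT.items():
            text = text.replace(bad, good)
        return text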
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
-            try:
-                br.open('http://www.nytimes.com/auth/login')
-                br.select_form(name='login')
-                br['USERID'] = self.username
-                br['PASSWORD'] = self.password
-                br.submit()
-            except:
-                self.log("\nFailed to login")
+            br.open('http://www.nytimes.com/auth/login')
+            br.select_form(name='login')
+            br['USERID'] = self.username
+            br['PASSWORD'] = self.password
+            raw = br.submit().read()
+            if 'Please try again' in raw:
+                raise Exception('Your username and password are incorrect')
        return br

    def skip_ad_pages(self, soup):
@@ -213,6 +223,9 @@ class NYTimes(BasicNewsRecipe):
            cover = None
        return cover

+    def short_title(self):
+        return self.title
+
    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
@@ -255,157 +268,184 @@ class NYTimes(BasicNewsRecipe):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&","&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description
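The round trip in massageNCXText is deliberate: BeautifulStoneSoup first collapses every HTML entity in the description to a literal character, and the one XML-unsafe survivor, '&', is then re-escaped so the Kindle NCX stays well-formed. For example (assumed input):

    # Assumed example of the entity round trip in massageNCXText above:
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    desc = 'Law &amp; Order returns'
    flat = unicode(BeautifulStoneSoup(desc,
            convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # flat == u'Law & Order returns'
    # re.sub then yields u'Law &#38; Order returns'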
-    def parse_index(self):
+    def parse_todays_index(self):
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=True)).strip()
articles = {}
key = None
ans = []
url_list = []
def handle_article(div):
a = div.find('a', href=True)
if not a:
return
url = re.sub(r'\?.*', '', a['href'])
if not url.startswith("http"):
return
if not url.endswith(".html"):
return
if 'podcast' in url:
return
if '/video/' in url:
return
url += '?pagewanted=all'
if url in url_list:
return
url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
author = ''
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
else:
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
ans.append(feed)
articles[feed] = []
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author,
content=''))
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
# Find each article
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['section-headline','sectionHeader']:
key = string.capwords(feed_title(div))
key = key.replace('Op-ed','Op-Ed')
key = key.replace('U.s.','U.S.')
elif div['class'] in ['story', 'story headline'] :
handle_article(div)
elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'):
handle_article(lidiv)
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return self.filter_ans(ans)
    def parse_headline_index(self):

        articles = {}
        ans = []
        url_list = []

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

-        feed = key = 'All Top Stories'
-        articles[key] = []
-        ans.append(key)
-        self.log("Scanning 1 section ...")
-
-        # Fetch the outer table
-        table = soup.find('table')
-        previousTable = table
-
-        # Find the deepest table containing the stories
-        while True :
-            table = table.find('table')
-            if table.find(text=re.compile('top stories start')) :
-                previousTable = table
-                continue
-            else :
-                table = previousTable
-                break
-
-        # There are multiple subtables, find the one containing the stories
-        for block in table.findAll('table') :
-            if block.find(text=re.compile('top stories start')) :
-                table = block
-                break
-            else :
-                continue
-
-        # Again there are multiple subtables, find the one containing the stories
-        for storyblock in table.findAll('table') :
-            if storyblock.find(text=re.compile('top stories start')) :
-                break
-            else :
-                continue
-
-        skipThisSection = False
-        todays_article_count = 0
-        # Within this table are <font face="times new roman, times, san serif"> entries
-        self.log("Fetching feed Top Stories")
-        for tr in storyblock.findAllNext('tr'):
-            if tr.find('span') is not None :
-
-                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
-                                                            'times new roman,times, sans serif',
-                                                            'times new roman, times, sans serif']})
-                section = None
-                bylines = []
-                descriptions = []
-                pubdate = None
-
-                # Get the Section title
-                for (x,i) in enumerate(sectionblock.contents) :
-                    skipThisSection = False
-                    # Extract the section title
-                    if ('Comment' in str(i.__class__)) :
-                        if 'start(name=' in i :
-                            section = i[i.find('=')+1:-2]
-
-                            if not self.sections.has_key(section) :
-                                skipThisSection = True
-                                break
-
-                            # Check for excluded section
-                            if len(self.excludeSectionKeywords):
-                                key = self.sections[section]
-                                excluded = re.compile('|'.join(self.excludeSectionKeywords))
-                                if excluded.search(key) or articles.has_key(key):
-                                    skipThisSection = True
-                                    break
-
-                # Get the bylines and descriptions
-                if not skipThisSection :
-                    lines = sectionblock.contents
-                    contentStrings = []
-
-                    for line in lines:
-                        if not isinstance(line, Comment) and line.strip and line.strip() > "":
-                            contentStrings.append(line.strip())
-
-                    # Gather the byline/description pairs
-                    bylines = []
-                    descriptions = []
-                    for contentString in contentStrings:
-                        if contentString[0:3] == 'By ' and contentString[3].isupper() :
-                            bylines.append(contentString)
-                        else:
-                            descriptions.append(contentString)
-
-                    # Fetch the article titles and URLs
-                    articleCount = len(sectionblock.findAll('span'))
-                    todays_article_count += articleCount
-                    for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
-                        a = span.find('a', href=True)
-                        url = re.sub(r'\?.*', '', a['href'])
-                        url += '?pagewanted=all'
-
-                        title = self.tag_to_string(a, use_alt=True)
-                        # prepend the section name
-                        title = self.sections[section] + " &middot; " + title
-
-                        if not isinstance(title, unicode):
-                            title = title.decode('utf-8', 'replace')
-
-                        # Allow for unattributed, undescribed entries "Editor's Note"
-                        if i >= len(descriptions) :
-                            description = None
-                        else :
-                            description = descriptions[i]
-
-                        if len(bylines) == articleCount :
-                            author = bylines[i]
-                        else :
-                            author = None
-
-                        # Check for duplicates
-                        duplicateFound = False
-                        if len(articles[feed]) > 1:
-                            for article in articles[feed] :
-                                if url == article['url'] :
-                                    duplicateFound = True
-                                    break
-
-                        if duplicateFound:
-                            # Continue fetching, don't add this article
-                            todays_article_count -= 1
-                            continue
-
-                        if not articles.has_key(feed):
-                            articles[feed] = []
-                        articles[feed].append(
-                            dict(title=title, url=url, date=pubdate,
-                                description=description, author=author, content=''))
-
-        # self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
-        ans = self.sort_index_by(ans, {'Top Stories':-1})

        # Fetch the content table
        content_table = soup.find('table',{'id':'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None

        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections

        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)
                    if section_name == '':
                        continue
                    section_name = string.capwords(section_name)
                    if section_name == 'U.s.':
                        section_name = 'U.S.'
                    elif section_name == 'Op-ed':
                        section_name = 'Op-Ed'
                    pubdate = strftime('%a, %d %b')

                    search_div = div_sec
                    for next_tag in h6_sec_name.findNextSiblings(True):
                        if next_tag.__class__.__name__ == 'Tag':
                            if next_tag.name == 'div':
                                search_div = next_tag
                            break

                    # Get the articles
                    for h3_item in search_div.findAll('h3'):
                        byline = h3_item.h6
                        if byline is not None:
                            author = self.tag_to_string(byline,use_alt=False)
                        else:
                            author = ''
                        a = h3_item.find('a', href=True)
                        if not a:
                            continue
                        url = re.sub(r'\?.*', '', a['href'])
                        if not url.startswith("http"):
                            continue
                        if not url.endswith(".html"):
                            continue
                        if 'podcast' in url:
                            continue
                        if 'video' in url:
                            continue
                        url += '?pagewanted=all'
                        if url in url_list:
                            continue
                        url_list.append(url)
                        self.log("URL %s" % url)
                        title = self.tag_to_string(a, use_alt=True).strip()
                        desc = h3_item.find('p')
                        if desc is not None:
                            description = self.tag_to_string(desc,use_alt=False)
                        else:
                            description = ''
                        if not articles.has_key(section_name):
                            ans.append(section_name)
                            articles[section_name] = []
                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        self.dump_ans(ans)
-        return ans
+        return self.filter_ans(ans)

    def parse_index(self):
        if self.headlinesOnly:
            return self.parse_headline_index()
        else:
            return self.parse_todays_index()

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

    def preprocess_html(self, soup):

        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag: # remove Op-Ed author head shots
            tagline = self.tag_to_string(kicker_tag)
            if tagline == 'Op-Ed Columnist':
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):
@@ -422,8 +462,9 @@ class NYTimes(BasicNewsRecipe):
                firstImg = inlineImgs[0]
                for inlineImg in inlineImgs[1:]:
                    inlineImg.extract()
-                # Move firstImg after headline
-                cgFirst = soup.find(True, {'class':'columnGroup first'})
+                # Move firstImg before article body
+                #article_body = soup.find(True, {'id':'articleBody'})
+                cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
                if cgFirst:
                    # Strip all sibling NavigableStrings: noise
                    navstrings = cgFirst.findAll(text=True, recursive=False)
@@ -443,30 +484,18 @@ class NYTimes(BasicNewsRecipe):
                if headline_found:
                    cgFirst.insert(insertLoc,firstImg)
                else:
                    self.log(">>> No class:'columnGroup first' found <<<")

-        # Change class="kicker" to <h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker and kicker.contents[0]:
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
-                use_alt=False)))
-            kicker.replaceWith(h3Tag)
-
-        # Change captions to italic -1
+        # Change captions to italic
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption and caption.contents[0]:
-                emTag = Tag(soup, "em")
+                cTag = Tag(soup, "p", [("class", "caption")])
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
-                emTag.insert(0, c)
-                #hrTag = Tag(soup, 'hr')
-                #hrTag['class'] = 'caption_divider'
-                hrTag = Tag(soup, 'div')
-                hrTag['class'] = 'divider'
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
+                cTag.insert(0, c)
+                caption.replaceWith(cTag)

        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
@@ -506,17 +535,6 @@ class NYTimes(BasicNewsRecipe):
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

-        # Synthesize a section header
-        dsk = soup.find('meta', attrs={'name':'dsk'})
-        if dsk and dsk.has_key('content'):
-            hTag = Tag(soup,'h3')
-            hTag['class'] = 'section'
-            hTag.insert(0,NavigableString(dsk['content']))
-            articleTag = soup.find(True, attrs={'id':'article'})
-            if articleTag:
-                articleTag.insert(0,hTag)
-
-        # Add class="articleBody" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']
@@ -532,11 +550,3 @@ class NYTimes(BasicNewsRecipe):
        return soup

-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
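Either variant of this recipe can be exercised without the calibre GUI: ebook-convert accepts a .recipe file directly, e.g. ebook-convert nytimes.recipe out.epub --username ... --password ... (the credential flags are standard recipe-input options and are only consulted because needs_subscription is set); flipping headlinesOnly then selects which parse path runs.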


@@ -5,52 +5,186 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
-import string, re, time
-from calibre import strftime
+import re, string, time
+from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

-def decode(self, src):
-    enc = 'utf-8'
-    if 'iso-8859-1' in src:
-        enc = 'cp1252'
-    return src.decode(enc, 'ignore')

class NYTimes(BasicNewsRecipe):

-    title = u'New York Times'
-    __author__ = 'Kovid Goyal/Nick Redding'
-    language = 'en'
-    requires_version = (0, 6, 36)
-    description = 'Daily news from the New York Times (subscription version)'
-    timefmt = ' [%b %d]'
+    # set headlinesOnly to True for the headlines-only version
+    headlinesOnly = False
+
+    # includeSections: List of sections to include. If empty, all sections found will be included.
+    # Otherwise, only the sections named will be included. For example,
+    #
+    #    includeSections = ['Politics','Sports']
+    #
+    # would cause only the Politics and Sports sections to be included.
includeSections = [] # by default, all sections included
# excludeSections: List of sections to exclude. If empty, all sections found will be included.
# Otherwise, the sections named will be excluded. For example,
#
# excludeSections = ['Politics','Sports']
#
# would cause the Politics and Sports sections to be excluded. This parameter can be used
    # in conjunction with includeSections although in most cases using one or the other, but
# not both, is sufficient.
excludeSections = []
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
one_picture_per_article = True
# The maximum number of articles that will be downloaded
max_articles_per_feed = 100
if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
else:
title='New York Times'
description = 'Today\'s New York Times'
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 7, 5)
timefmt = ''
    needs_subscription = True
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
cover_margins = (18,18,'grey99')
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
-        'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
-        'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
-        dict({'class':re.compile('^subNavigation')}),
-        dict({'class':re.compile('^leaderboard')}),
-        dict({'class':re.compile('^module')}),
-        dict({'class':'metaFootnote'}),
-        dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
-            'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
-            'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
-            'relatedArticles', 'relatedTopics', 'adxSponLink']),
+    remove_tags = [dict(attrs={'class':[
+        'articleFooter',
+        'articleTools',
+        'columnGroup doubleRule',
+        'columnGroup singleRule',
+        'columnGroup last',
+        'columnGroup last',
+        'doubleRule',
+        'dottedLine',
+        'entry-meta',
+        'entry-response module',
+        'icon enlargeThis',
+        'leftNavTabs',
+        'metaFootnote',
+        'module box nav',
+        'nextArticleLink',
+        'nextArticleLink clearfix',
+        'post-tools',
+        'relatedSearchesModule',
+        'side_tool',
+        'singleAd',
+        re.compile('^subNavigation'),
+        re.compile('^leaderboard'),
+        re.compile('^module'),
+        ]}),
+        dict(id=[
+            'adxLeaderboard',
+            'adxSponLink',
+            'archive',
+            'articleExtras',
+            'articleInline',
+            'blog_sidebar',
+            'businessSearchBar',
+            'cCol',
+            'entertainmentSearchBar',
+            'footer',
+            'header',
+            'header_search',
+            'inlineBox',
+            'login',
+            'masthead',
+            'masthead-nav',
+            'memberTools',
+            'navigation',
+            'portfolioInline',
+            'readerReviews',
+            'readerReviewsCount',
+            'relatedArticles',
+            'relatedTopics',
+            'respond',
+            'side_search',
+            'side_index',
+            'side_tool',
+            'toolsRight',
+            ]),
        dict(name=['script', 'noscript', 'style','form','hr'])]
-    encoding = decode

    no_stylesheets = True
    extra_css = '''
-                .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
-                .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .timestamp { font-size: small; }
-                .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                a:link {text-decoration: none; }'''
+                .timestamp { text-align: left; font-size: small; }
+                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                a:link {text-decoration: none; }
+                .articleBody { }
+                .authorId {text-align: left; }
+                .image {text-align: center;}
+                .source {text-align: left; }'''
def filter_ans(self, ans) :
total_article_count = 0
idx = 0
idx_max = len(ans)-1
while idx <= idx_max:
if self.includeSections != []:
if ans[idx][0] not in self.includeSections:
print "SECTION NOT INCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if ans[idx][0] in self.excludeSections:
print "SECTION EXCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if self.verbose:
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1
self.log( "Queued %d articles" % total_article_count )
return ans
def fixChars(self,string):
# Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
return fixed
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

@@ -60,22 +194,19 @@ class NYTimes(BasicNewsRecipe):
            br['USERID'] = self.username
            br['PASSWORD'] = self.password
            raw = br.submit().read()
-            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
+            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
-            #open('/t/log.html', 'wb').write(raw)
        return br

-    def get_masthead_url(self):
-        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-        #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
-        br = BasicNewsRecipe.get_browser()
-        try:
-            br.open(masthead)
-        except:
-            self.log("\nMasthead unavailable")
-            masthead = None
-        return masthead
+    def skip_ad_pages(self, soup):
+        # Skip ad pages served before actual article
+        skip_tag = soup.find(True, {'name':'skip'})
+        if skip_tag is not None:
+            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.warn("Skipping ad to article at '%s'" % url)
+            return self.index_to_soup(url, raw=True)

    def get_cover_url(self):
        cover = None
@@ -93,12 +224,57 @@ class NYTimes(BasicNewsRecipe):
        return cover

    def short_title(self):
-        return 'New York Times'
+        return self.title

-    def parse_index(self):
-        self.encoding = 'cp1252'
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
-        self.encoding = decode
+    def index_to_soup(self, url_or_raw, raw=False):
+        '''
+        OVERRIDE of class method
+        deals with various page encodings between index and articles
+        '''
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw):
f = self.browser.open(url_or_raw)
_raw = f.read()
f.close()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(docEncoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
return BeautifulSoup(_raw, markupMassage=massage)
# Entry point
print "index_to_soup()"
soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
if docEncoding == '' :
docEncoding = self.encoding
if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
return soup
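The override fetches the page once with the recipe's default encoding, reads the charset out of the page's own Content-Type meta tag, and refetches only if the two disagree. The string slicing operates on the rendered tag; with an illustrative input:

    # Illustrative input for the charset extraction in index_to_soup above:
    contentType = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    docEncoding = contentType[contentType.find('charset=') + len('charset='):contentType.rfind('"')]
    # docEncoding == 'utf-8'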
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def parse_todays_index(self):
        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=True)).strip()
@@ -119,12 +295,13 @@ class NYTimes(BasicNewsRecipe):
                return
            if 'podcast' in url:
                return
+            if '/video/' in url:
+                return
            url += '?pagewanted=all'
            if url in url_list:
                return
            url_list.append(url)
            title = self.tag_to_string(a, use_alt=True).strip()
-            #self.log("Title: %s" % title)
            description = ''
            pubdate = strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
@@ -140,6 +317,7 @@ class NYTimes(BasicNewsRecipe):
                    author = self.tag_to_string(authorAttribution, use_alt=False)
            feed = key if key is not None else 'Uncategorized'
            if not articles.has_key(feed):
+                ans.append(feed)
                articles[feed] = []
            articles[feed].append(
                dict(title=title, url=url, date=pubdate,
@@ -147,46 +325,228 @@ class NYTimes(BasicNewsRecipe):
                    content=''))

+        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
-        # Find each instance of class="section-headline", class="story", class="story headline"
+        # Find each article
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                key = string.capwords(feed_title(div))
-                articles[key] = []
-                ans.append(key)
-                #self.log('Section: %s' % key)
+                key = key.replace('Op-ed','Op-Ed')
+                key = key.replace('U.s.','U.S.')
            elif div['class'] in ['story', 'story headline'] :
                handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)

-#        ans = self.sort_index_by(ans, {'The Front Page':-1,
-#                        'Dining In, Dining Out':1,
-#                        'Obituaries':2})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return self.filter_ans(ans)
def parse_headline_index(self):
articles = {}
ans = []
url_list = []
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
# Fetch the content table
content_table = soup.find('table',{'id':'content'})
if content_table is None:
self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
return None
# Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
for div_sec in td_col.findAll('div',recursive=False):
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
section_name = re.sub(r'^ *$','',section_name)
if section_name == '':
continue
section_name=string.capwords(section_name)
if section_name == 'U.s.':
section_name = 'U.S.'
elif section_name == 'Op-ed':
section_name = 'Op-Ed'
pubdate = strftime('%a, %d %b')
search_div = div_sec
for next_tag in h6_sec_name.findNextSiblings(True):
if next_tag.__class__.__name__ == 'Tag':
if next_tag.name == 'div':
search_div = next_tag
break
# Get the articles
for h3_item in search_div.findAll('h3'):
byline = h3_item.h6
if byline is not None:
                            author = self.tag_to_string(byline,use_alt=False)
else:
author = ''
a = h3_item.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
if not url.startswith("http"):
continue
if not url.endswith(".html"):
continue
if 'podcast' in url:
continue
if 'video' in url:
continue
url += '?pagewanted=all'
if url in url_list:
continue
url_list.append(url)
self.log("URL %s" % url)
title = self.tag_to_string(a, use_alt=True).strip()
desc = h3_item.find('p')
if desc is not None:
description = self.tag_to_string(desc,use_alt=False)
else:
description = ''
if not articles.has_key(section_name):
ans.append(section_name)
articles[section_name] = []
articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return self.filter_ans(ans)
def parse_index(self):
if self.headlinesOnly:
return self.parse_headline_index()
else:
return self.parse_todays_index()
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
-        return ans

    def preprocess_html(self, soup):

        kicker_tag = soup.find(attrs={'class':'kicker'})
-        if kicker_tag:
+        if kicker_tag: # remove Op-Ed author head shots
            tagline = self.tag_to_string(kicker_tag)
-            #self.log("FOUND KICKER %s" % tagline)
            if tagline=='Op-Ed Columnist':
                img_div = soup.find('div','inlineImage module')
-                #self.log("Searching for photo")
                if img_div:
                    img_div.extract()
-                    #self.log("Photo deleted")
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+        return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
#article_body = soup.find(True, {'id':'articleBody'})
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup


@@ -6,22 +6,25 @@ Fetch Die Zeit.
'''
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

class ZeitDe(BasicNewsRecipe):

-    title = 'ZEIT Online'
-    description = 'ZEIT Online'
+    title = 'Zeit Online'
+    description = 'Zeit Online'
    language = 'de'
-    lang = 'de_DE'
-    __author__ = 'Martin Pitt, Sujata Raman and Ingo Paschke'
-    use_embedded_content = False
+    __author__ = 'Martin Pitt, Sujata Raman, Ingo Paschke and Marc Toensing'
    max_articles_per_feed = 40
-    no_stylesheets = True
-    no_javascript = True
-    encoding = 'utf-8'
+    remove_empty_feeds = True
+
+    remove_tags = [
+        dict(name='iframe'),
+        dict(name='div', attrs={'class':["response","pagination block","pagenav","inline link", "copyright"] }),
+        dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }),
+        dict(name='div', attrs={'id':["place_5","place_4","comments"]})
+    ]
+
+    keep_only_tags = [dict(id=['main'])]
    feeds = [
        ('Seite 1', 'http://newsfeed.zeit.de/index_xml'),
@@ -40,43 +43,15 @@ class ZeitDe(BasicNewsRecipe):
        ('Sport', 'http://newsfeed.zeit.de/sport/index'),
    ]

-    extra_css = '''
-        .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-        .excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:small;}
-        .title{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
-        .caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-        .copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-        .article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
-        .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
-        .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
-        .headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
-        .inline{float:left;margin-top:0;margin-right:15px;position:relative;width:180px; }
-        img.inline{float:none}
-        .intertitle{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small;font-weight:700}
-        .ebinfobox{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small;list-style-type:none;float:right;margin-top:0;border-left-style:solid;border-left-width:1px;padding-left:10px;}
-        .infobox {border-style: solid; border-width: 1px;padding:8px;}
-        .infobox dt {font-weight:700;}
-    '''
+    extra_css = '.reaktion,.taglist,.comments,.reponse,.responsetitle,.responsebody,.reponse,.inline,.date{display:none;}li.date{display:block}'

    #filter_regexps = [r'ad.de.doubleclick.net/']

-    keep_only_tags = [
-        dict(name='div', attrs={'class':["article"]}) ,
-        dict(name='ul', attrs={'class':["tools"]}) ,
-    ]
-    remove_tags = [
-        dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'),
-        dict(name='div', attrs={'class':["pagination block","pagenav","inline link", "copyright"] }),
-        dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }),
-        dict(name='div', attrs={'id':["place_5","place_4","comments"]})
-    ]
-    remove_attributes = ['style', 'font']

    def get_article_url(self, article):
        ans = article.get('link',None)
-        ans += "?page=all"
-        if 'video' in ans or 'quiz' in ans :
+        ans += "?page=all&print=true"
+        if 'video' in ans or 'quiz' in ans or 'blog' in ans :
            ans = None
        return ans
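get_article_url now rewrites every feed link to the single-page print view and drops items whose links point at videos, quizzes, or blogs. With an assumed feed entry:

    # Assumed feed item, to illustrate get_article_url above:
    article = {'link': 'http://www.zeit.de/politik/beispiel-artikel'}
    # returned URL: 'http://www.zeit.de/politik/beispiel-artikel?page=all&print=true'
    # a link containing 'video', 'quiz' or 'blog' yields None and is skipped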
@@ -86,25 +61,3 @@ class ZeitDe(BasicNewsRecipe):
            return inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
        except:
            return 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'

-    def preprocess_html(self, soup):
-        soup.html['xml:lang'] = self.lang
-        soup.html['lang'] = self.lang
-        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
-        soup.head.insert(0,mtag)
-        title = soup.find('h2', attrs={'class':'title'})
-        if title is None:
-            print "no title"
-            return soup
-        info = Tag(soup,'ul',[('class','ebinfobox')])
-        tools = soup.find('ul', attrs={'class':'tools'})
-        #author = tools.find('li','author first')
-        for tag in ['author first', 'date', 'date first', 'author', 'source']:
-            line = tools.find('li', tag)
-            if line:
-                info.insert(0,line)
-        title.parent.insert(0,info)
-        tools.extract()
-        return soup


@@ -0,0 +1,60 @@
body{
margin:0px;
padding: 0.5em;
background-color:#F6F3E9;
font-size:12px;
font-family:Arial, Helvetica, sans-serif;
}
.calibreMeta{
background-color:#39322B;
color:white;
padding:10px;
}
.calibreMeta a, .calibreEbNav a, .calibreEbNavTop a, .calibreToc a{
color:white;
}
.calibreMeta h1{
margin:0px;
font-size:18px;
background-color:#39322B;
}
.calibreEbookContent{
padding:20px;
}
.calibreEbNav, .calibreEbNavTop{
clear:both;
background-color:#39322B;
color:white;
padding:10px;
text-align:center;
}
.calibreEbNavTop{
margin-bottom:20px;
}
.calibreEbNav a, .calibreEbNavTop a{
padding:0px 5px;
}
.calibreTocIndex{
line-height:18px;
}
.calibreToc{
float:left;
margin:20px;
width:300px;
background-color:#39322B;
color:white;
padding:10px;
}
.calibreEbookContent{
width:600px;
float:left;
}


@@ -0,0 +1,74 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
${head_content}$
<link href="${cssLink}$" type="text/css" rel="stylesheet" />
</head>
<body>
<div class="calibreMeta">
<div class="calibreMetaTitle">
${pos1=1}$
${for title in meta.titles():}$
${if pos1:}$
<h1>
<a href="${tocUrl}$">${print title}$</a>
</h1>
${:else:}$
<div class="calibreMetaSubtitle">${print title}$</div>
${:endif}$
${pos1=0}$
${:endfor}$
</div>
<div class="calibreMetaAuthor">
${print ', '.join(meta.creators())}$
</div>
</div>
<div class="calibreMain">
<div class="calibreEbookContent">
${if prevLink or nextLink:}$
<div class="calibreEbNavTop">
${if prevLink:}$
<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:else:}$
<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:endif}$
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
${:endif}$
</div>
${:endif}$
${ebookContent}$
</div>
${if has_toc:}$
<div class="calibreToc">
<h2><a href="${tocUrl}$">${print _('Table of contents'),}$</a></h2>
${print toc()}$
</div>
${:endif}$
<div class="calibreEbNav">
${if prevLink:}$
<a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:else:}$
<a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
${:endif}$
<a href="${tocUrl}$" class="calibreAHome">${print _('start'),}$</a>
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
${:endif}$
</div>
</div>
</body>
</html>
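The ${...}$ markers in this template are Templite syntax: a block may hold a bare expression (emitted into the output), a statement such as ${pos1=0}$, or flow control written as ${if cond:}$ / ${:else:}$ / ${:endif}$ and ${for x in xs:}$ / ${:endfor}$, with ${print ...}$ writing to the output stream. A minimal rendering sketch, assuming the default delimiters of the bundled Templite:

    from templite import Templite

    # Minimal sketch of how the templates above are rendered:
    t = Templite(u'${if nextLink:}$<a href="${nextLink}$">next</a>${:else:}$end${:endif}$')
    html = t.render(nextLink=u'chapter2.html')
    # html == u'<a href="chapter2.html">next</a>'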


@@ -0,0 +1,61 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
<link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
<title>${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$</title>
${for item in meta:}$
<meta ${print 'name="DC.'+item['name']+'"',}$ ${print 'content="'+item['value']+'"',}$ />
${:endfor}$
<link href="${cssLink}$" type="text/css" rel="stylesheet" />
</head>
<body>
<div class="calibreMeta">
<div class="calibreMetaTitle">
${pos1=1}$
${for title in meta.titles():}$
${if pos1:}$
<h1>
<a href="${tocUrl}$">${print title}$</a>
</h1>
${:else:}$
<div class="calibreMetaSubtitle">${print title}$</div>
${:endif}$
${pos1=0}$
${:endfor}$
</div>
<div class="calibreMetaAuthor">
${print ', '.join(meta.creators()),}$
</div>
</div>
<div class="calibreMain">
<div class="calibreEbookContent">
${if has_toc:}$
<div class="calibreTocIndex">
<h2>${print _('Table of contents'),}$</h2>
${toc}$
</div>
${:else:}$
<h2>${print _('No table of contents present'),}$</h2>
<div><strong><a href="${nextLink}$">${print _('begin to read'),}$</a></strong></div>
${:endif}$
</div>
<div class="calibreEbNav">
${if nextLink:}$
<a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
${:endif}$
</div>
</div>
</body>
</html>


@@ -89,7 +89,7 @@ class Server(Command):
            t = telnetlib.Telnet('localhost', 4242)
            t.read_until("repl>")
            t.write('BrowserReload();')
-            print t.read_until("repl>")
+            t.read_until("repl>")
            t.close()
        except:
            print 'Failed to reload browser'


@@ -446,6 +446,7 @@ from calibre.ebooks.rb.output import RBOutput
from calibre.ebooks.rtf.output import RTFOutput
from calibre.ebooks.tcr.output import TCROutput
from calibre.ebooks.txt.output import TXTOutput
+from calibre.ebooks.html.output import HTMLOutput
from calibre.ebooks.snb.output import SNBOutput

from calibre.customize.profiles import input_profiles, output_profiles

@@ -525,6 +526,7 @@ plugins += [
    RTFOutput,
    TCROutput,
    TXTOutput,
+    HTMLOutput,
    SNBOutput,
]
# Order here matters. The first matched device is the one used.

@@ -893,4 +895,3 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
        Email, Server, Plugins, Tweaks, Misc]
#}}}


@@ -0,0 +1,33 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.oeb.base import namespace, barename, DC11_NS
class EasyMeta(object):
def __init__(self, meta):
self.meta = meta
def __iter__(self):
meta = self.meta
for item_name in meta.items:
for item in meta[item_name]:
if namespace(item.term) == DC11_NS:
yield { 'name': barename(item.term), 'value': item.value }
def __len__(self):
count = 0
for item in self:
count = count+1
return count
def titles(self):
for item in self.meta['title']:
yield item.value
def creators(self):
for item in self.meta['creator']:
yield item.value
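EasyMeta narrows the OEB metadata bag to the Dublin Core terms the templates care about: iteration yields name/value dicts, while titles() and creators() yield bare values. A usage sketch (oeb_book stands in for whatever the conversion pipeline provides):

    # Sketch of how the templates consume EasyMeta (see the output plugin below):
    meta = EasyMeta(oeb_book.metadata)
    main_title = ', '.join(meta.titles())     # feeds the <h1> in the page header
    authors = ', '.join(meta.creators())      # the calibreMetaAuthor block
    for item in meta:                         # every Dublin Core name/value pair
        print 'DC.%s = %s' % (item['name'], item['value'])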


@@ -0,0 +1,201 @@
from __future__ import with_statement

__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'

import os, re, shutil

from os.path import dirname, abspath, relpath, exists

from lxml import etree
from templite import Templite

from calibre.ebooks.oeb.base import element
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.zipfile import ZipFile
from urllib import unquote
from calibre.ebooks.html.meta import EasyMeta

class HTMLOutput(OutputFormatPlugin):

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'

    options = set([
        OptionRecommendation(name='template_css',
            help=_('CSS file used for the output instead of the default file')),
        OptionRecommendation(name='template_html_index',
            help=_('Template used for generation of the html index file instead of the default file')),
        OptionRecommendation(name='template_html',
            help=_('Template used for the generation of the html contents of the book instead of the default file')),
        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated ZIP file to the directory of the generated ZIP file')),
    ])

    recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)])

    def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate a nested <ul> table of contents, with links relative to ref_url
        '''
        with CurrentDir(output_dir):
            def build_node(current_node, parent=None):
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, 'ul')
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(abspath(unquote(node.href)), dirname(ref_url))
                    link = element(point, 'a', href=href)
                    title = node.title
                    if title:
                        title = re.sub(r'\s+', ' ', title)
                    link.text = title
                    build_node(node, point)
                return parent
            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root, pretty_print=True, encoding='utf-8',
                xml_declaration=True)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        # read template files, falling back to the default resources
        if opts.template_html_index is not None:
            template_html_index_data = open(opts.template_html_index, 'rb').read()
        else:
            template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)

        if opts.template_html is not None:
            template_html_data = open(opts.template_html, 'rb').read()
        else:
            template_html_data = P('templates/html_export_default.tmpl', data=True)

        if opts.template_css is not None:
            template_css_data = open(opts.template_css, 'rb').read()
        else:
            template_css_data = P('templates/html_export_default.css', data=True)

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = PersistentTemporaryDirectory()
        output_file = os.path.join(tempdir,
                os.path.basename(re.sub(r'\.zip', '', output_path)+'.html'))
        output_dir = re.sub(r'\.html', '', output_file)+'_files'

        if not exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
            cssLink = relpath(abspath(css_path), dirname(output_file))
            tocUrl = relpath(output_file, dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                    toc=html_toc, meta=meta, nextLink=nextLink,
                    tocUrl=tocUrl, cssLink=cssLink)
            f.write(t)

        with CurrentDir(output_dir):
            for item in oeb_book.manifest:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                if not exists(dir):
                    os.makedirs(dir)
                if item.spine_position is not None:
                    # spine items are rendered through the template in the
                    # loop below; just create the empty file here
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(str(item))
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head, pretty_print=True, encoding='utf-8')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)

                # generate link to next page
                if item.spine_position+1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position+1].href
                    nextLink = relpath(abspath(nextLink), dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position-1].href
                    prevLink = relpath(abspath(prevLink), dir)
                else:
                    prevLink = None

                cssLink = relpath(abspath(css_path), dir)
                tocUrl = relpath(output_file, dir)

                # render template
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
                t = templite.render(ebookContent=ebook_content,
                        prevLink=prevLink, nextLink=nextLink,
                        has_toc=bool(oeb_book.toc.count()), toc=toc,
                        tocUrl=tocUrl, head_content=head_content,
                        meta=meta, cssLink=cssLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t)
                item.unload_data_from_memory(memory=path)

        zfile = ZipFile(output_path, "w")
        zfile.add_dir(output_dir)
        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)
        zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
@@ -112,13 +112,12 @@ def get_metadata(br, asin, mi):

 def main(args=sys.argv):
     # Test xisbn
-    #print get_social_metadata('Learning Python', None, None, '8324616489')
-    #print
+    print get_social_metadata('Learning Python', None, None, '8324616489')
+    print

     # Test sophisticated comment formatting
     print get_social_metadata('Angels & Demons', None, None, '9781416580829')
     print
-    return

     # Random tests
     print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')
@@ -275,7 +275,15 @@ class MobiMLizer(object):
         # <mbp:frame-set/> does not exist lalalala
         if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
-            return
+            id_ = elem.get('id', None)
+            if id_:
+                # Keep anchors so people can use display:none
+                # to generate hidden TOCs
+                elem.clear()
+                elem.text = None
+                elem.set('id', id_)
+            else:
+                return
         tag = barename(elem.tag)
         istate = copy.copy(istates[-1])
         istate.rendered = False
@@ -406,6 +414,12 @@ class MobiMLizer(object):
             parent = bstate.para if bstate.inline is None else bstate.inline
             if parent is not None:
                 vtag = etree.SubElement(parent, XHTML(vtag))
+                # Add anchors
+                for child in vbstate.body:
+                    if child is not vbstate.para:
+                        vtag.append(child)
+                    else:
+                        break
                 for child in vbstate.para:
                     vtag.append(child)
                 return
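For illustration, the kind of input the early-return fix is aimed at (the markup below is hypothetical, not from the commit):

    hidden_toc = '''<div style="display: none">
    <a id="toc"></a>
    <a id="chapter-1">Chapter 1</a>
    </div>'''
    # Before this change, MobiMLizer returned early on display:none and the
    # whole subtree vanished, so links to #chapter-1 broke in the MOBI output.
    # Now any element carrying an id is emptied but kept, so the anchors
    # still resolve while the content stays invisible.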
@@ -49,5 +49,3 @@ class OEBOutput(OutputFormatPlugin):
                 with open(path, 'wb') as f:
                     f.write(str(item))
                 item.unload_data_from_memory(memory=path)
-
-
@@ -101,11 +101,12 @@ class SNBMLizer(object):
             subitem = ''
         bodyTree = trees[subitem].find(".//body")
         for line in output.splitlines():
-            if not line.find(CALIBRE_SNB_PRE_TAG) == 0:
+            pos = line.find(CALIBRE_SNB_PRE_TAG)
+            if pos == -1:
                 line = line.strip(u' \t\n\r\u3000')
             else:
                 etree.SubElement(bodyTree, "text").text = \
-                    etree.CDATA(line[len(CALIBRE_SNB_PRE_TAG):])
+                    etree.CDATA(line[pos+len(CALIBRE_SNB_PRE_TAG):])
                 continue
             if len(line) != 0:
                 if line.find(CALIBRE_SNB_IMG_TAG) == 0:
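In short, the old prefix test only matched the pre-tag marker at column zero; the new code finds it anywhere in the line. A two-line illustration (the marker value here is made up, only its role matters):

    CALIBRE_SNB_PRE_TAG = '<calibre_snb_pre>'          # illustrative value
    line = u'\u3000' + CALIBRE_SNB_PRE_TAG + u'text'   # marker not at position 0
    print line.find(CALIBRE_SNB_PRE_TAG) == 0          # False: old test skipped this line
    pos = line.find(CALIBRE_SNB_PRE_TAG)
    print line[pos + len(CALIBRE_SNB_PRE_TAG):]        # u'text': new code recovers the payload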
@@ -35,7 +35,6 @@ class ViewAction(InterfaceAction):
         self.qaction.setMenu(self.view_menu)
         ac.triggered.connect(self.view_specific_format, type=Qt.QueuedConnection)

-
     def location_selected(self, loc):
         enabled = loc == 'library'
         for action in list(self.view_menu.actions())[1:]:
@@ -134,6 +133,9 @@ class ViewAction(InterfaceAction):
         rows = self.gui.current_view().selectionModel().selectedRows()
         self._view_books(rows)

+    def view_triggered(self, index):
+        self._view_books([index])
+
     def view_specific_book(self, index):
         self._view_books([index])
@@ -28,6 +28,8 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name, sync, fmt_options, conne
     if log is None:
         log = Log()
     from calibre.library import db
+    from calibre.utils.config import prefs
+    prefs.refresh()
     db = db()
     db.catalog_plugin_on_device_temp_mapping = dbspec

@@ -50,6 +50,8 @@ class BooksView(QTableView): # {{{

     def __init__(self, parent, modelcls=BooksModel):
         QTableView.__init__(self, parent)
+        self.setEditTriggers(self.SelectedClicked|self.EditKeyPressed)
+
         self.drag_allowed = True
         self.setDragEnabled(True)
         self.setDragDropOverwriteMode(False)
@@ -98,6 +100,8 @@ class BooksView(QTableView): # {{{
         self._model.about_to_be_sorted.connect(self.about_to_be_sorted)
         self._model.sorting_done.connect(self.sorting_done)

+        self.doubleClicked.connect(parent.iactions['View'].view_triggered)
+
     # Column Header Context Menu {{{
     def column_header_context_handler(self, action=None, column=None):
         if not action or not column:
@@ -128,7 +128,7 @@ class ContentServer(object):
         if want_mobile:
             return self.mobile()

-        return self.browse_toplevel()
+        return self.browse_catalog()

     def old(self, **kwargs):
         return self.static('index.html').replace('{prefix}',
@@ -338,6 +338,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes

     * - Keyboard Shortcut
       - Action
+    * - :kbd:`F2 (Enter in OS X)`
+      - Edit the metadata of the currently selected field in the book list.
     * - :kbd:`A`
       - Add Books
     * - :kbd:`C`

src/templite/__init__.py Normal file
@@ -0,0 +1,87 @@
#!/usr/bin/env python
#
#   Templite+
#   A light-weight, fully functional, general purpose templating engine
#
#   Copyright (c) 2009 joonis new media
#   Author: Thimo Kraemer <thimo.kraemer@joonis.de>
#
#   Based on Templite - Tomer Filiba
#   http://code.activestate.com/recipes/496702/
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#   MA 02110-1301, USA.
#

import sys, re

class Templite(object):

    auto_emit = re.compile('(^[\'\"])|(^[a-zA-Z0-9_\[\]\'\"]+$)')

    def __init__(self, template, start='${', end='}$'):
        if len(start) != 2 or len(end) != 2:
            raise ValueError('each delimiter must be two characters long')
        delimiter = re.compile('%s(.*?)%s' % (re.escape(start), re.escape(end)), re.DOTALL)
        offset = 0
        tokens = []
        for i, part in enumerate(delimiter.split(template)):
            part = part.replace('\\'.join(list(start)), start)
            part = part.replace('\\'.join(list(end)), end)
            if i % 2 == 0:
                # literal text: escape it and wrap it in an emit() call
                if not part: continue
                part = part.replace('\\', '\\\\').replace('"', '\\"')
                part = '\t' * offset + 'emit("""%s""")' % part
            else:
                # embedded statement or expression
                part = part.rstrip()
                if not part: continue
                if part.lstrip().startswith(':'):
                    if not offset:
                        raise SyntaxError('no block statement to terminate: ${%s}$' % part)
                    offset -= 1
                    part = part.lstrip()[1:]
                    if not part.endswith(':'): continue
                elif self.auto_emit.match(part.lstrip()):
                    part = 'emit(%s)' % part.lstrip()
                lines = part.splitlines()
                margin = min(len(l) - len(l.lstrip()) for l in lines if l.strip())
                part = '\n'.join('\t' * offset + l[margin:] for l in lines)
                if part.endswith(':'):
                    offset += 1
            tokens.append(part)
        if offset:
            raise SyntaxError('%i block statement(s) not terminated' % offset)
        self.__code = compile('\n'.join(tokens), '<templite %r>' % template[:20], 'exec')

    def render(self, __namespace=None, **kw):
        """
        renders the template according to the given namespace.
        __namespace - a dictionary serving as a namespace for evaluation
        **kw - keyword arguments which are added to the namespace
        """
        namespace = {}
        if __namespace: namespace.update(__namespace)
        if kw: namespace.update(kw)
        namespace['emit'] = self.write

        __stdout = sys.stdout
        sys.stdout = self
        self.__output = []
        eval(self.__code, namespace)
        sys.stdout = __stdout
        return ''.join(self.__output)

    def write(self, *args):
        for a in args:
            self.__output.append(str(a))
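A short usage sketch of the `${ }$` syntax this engine compiles (the template text is illustrative): block statements end with a colon and are closed by `${:}$`, and bare names are auto-emitted:

    from templite import Templite

    t = Templite('''<ul>
    ${for item in items:}$<li>${item}$</li>
    ${:}$</ul>''')
    print t.render(items=['one', 'two'])
    # -> <ul>, one <li> per entry, </ul>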