mirror of https://github.com/kovidgoyal/calibre.git

commit f61daece95: Merge from trunk
@@ -25,15 +25,15 @@ class Fudzilla(BasicNewsRecipe):
     remove_tags_before = dict(name='div', attrs={'class':['padding']})

     remove_tags = [dict(name='td', attrs={'class':['left','right']}),
                    dict(name='div', attrs={'id':['toolbar','buttons']}),
                    dict(name='div', attrs={'class':['artbannersxtd','back_button']}),
                    dict(name='span', attrs={'class':['pathway']}),
                    dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}),
                    dict(name='table', attrs={'class':['headlines']}),
                   ]

     feeds = [
-              (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')
+              (u'Posts', u'http://www.fudzilla.com/?format=feed')
             ]

     preprocess_regexps = [
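Note: the only functional change to the Fudzilla recipe is the feed URL, which moves from the old index.php com_rss endpoint to the site's ?format=feed address. A quick standalone way to sanity-check that such a URL still serves a parseable feed; this sketch assumes the third-party feedparser package, which the recipe itself does not use:

import feedparser

feed = feedparser.parse('http://www.fudzilla.com/?format=feed')
print(feed.feed.get('title'), '-', len(feed.entries), 'entries')
for entry in feed.entries[:3]:
    print(entry.title, '->', entry.link)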
@@ -5,62 +5,59 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import re
-import time
-from calibre import entity_to_unicode
+import re, string, time
+from calibre import entity_to_unicode, strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
-                Comment, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

 class NYTimes(BasicNewsRecipe):

-    title = 'New York Times Top Stories'
-    __author__ = 'GRiker'
-    language = 'en'
-    requires_version = (0, 7, 5)
-    description = 'Top Stories from the New York Times'
-
-    # List of sections typically included in Top Stories. Use a keyword from the
-    # right column in the excludeSectionKeywords[] list to skip downloading that section
-    sections = {
-         'arts'             :   'Arts',
-         'business'         :   'Business',
-         'diningwine'       :   'Dining & Wine',
-         'editorials'       :   'Editorials',
-         'health'           :   'Health',
-         'magazine'         :   'Magazine',
-         'mediaadvertising' :   'Media & Advertising',
-         'newyorkregion'    :   'New York/Region',
-         'oped'             :   'Op-Ed',
-         'politics'         :   'Politics',
-         'science'          :   'Science',
-         'sports'           :   'Sports',
-         'technology'       :   'Technology',
-         'topstories'       :   'Top Stories',
-         'travel'           :   'Travel',
-         'us'               :   'U.S.',
-         'world'            :   'World'
-         }
-
-    # Add section keywords from the right column above to skip that section
-    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
-    # excludeSectionKeywords = ['Sports', 'Dining']
-    # Fetch only Business and Technology
-    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
-    # Fetch only Top Stories
-    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
-    # By default, no sections are skipped.
-    excludeSectionKeywords = []
+    # set headlinesOnly to True for the headlines-only version
+    headlinesOnly = True
+
+    # includeSections: List of sections to include. If empty, all sections found will be included.
+    # Otherwise, only the sections named will be included. For example,
+    #
+    #    includeSections = ['Politics','Sports']
+    #
+    # would cause only the Politics and Sports sections to be included.
+
+    includeSections = []  # by default, all sections included
+
+    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
+    # Otherwise, the sections named will be excluded. For example,
+    #
+    #    excludeSections = ['Politics','Sports']
+    #
+    # would cause the Politics and Sports sections to be excluded. This parameter can be used
+    # in conjunction with includeSections although in most cases using one or the other, but
+    # not both, is sufficient.
+
+    excludeSections = []

     # one_picture_per_article specifies that calibre should only use the first image
     # from an article (if one exists). If one_picture_per_article = True, the image
     # will be moved to a location between the headline and the byline.
     # If one_picture_per_article = False, all images from the article will be included
     # and shown in their original location.
     one_picture_per_article = True

     # The maximum number of articles that will be downloaded
-    max_articles_per_feed = 40
+    max_articles_per_feed = 100
+
+    if headlinesOnly:
+        title='New York Times Headlines'
+        description = 'Headlines from the New York Times'
+    else:
+        title='New York Times'
+        description = 'Today\'s New York Times'
+
+    __author__  = 'GRiker/Kovid Goyal/Nick Redding'
+    language = 'en'
+    requires_version = (0, 7, 5)

     timefmt = ''
     needs_subscription = True
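Note: the rewrite drops the old keyword-regex mechanism (excludeSectionKeywords matched against a hard-coded sections table) in favor of exact-name filtering on sections discovered at parse time. The semantics, restated as a standalone sketch (hypothetical helper, not code from the commit):

def keep_section(name, include, exclude):
    # An explicit include list wins first; anything on the exclude list is then dropped.
    if include and name not in include:
        return False
    return name not in exclude

# With includeSections = [] and excludeSections = ['Politics', 'Sports']:
print(keep_section('Arts', [], ['Politics', 'Sports']))    # True
print(keep_section('Sports', [], ['Politics', 'Sports']))  # False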
@@ -82,6 +79,7 @@ class NYTimes(BasicNewsRecipe):
                     'entry-response module',
                     'icon enlargeThis',
                     'leftNavTabs',
+                    'metaFootnote',
                     'module box nav',
                     'nextArticleLink',
                     'nextArticleLink clearfix',
@@ -89,12 +87,13 @@ class NYTimes(BasicNewsRecipe):
                     'relatedSearchesModule',
                     'side_tool',
                     'singleAd',
-                    'subNavigation clearfix',
-                    'subNavigation tabContent active',
-                    'subNavigation tabContent active clearfix',
+                    re.compile('^subNavigation'),
+                    re.compile('^leaderboard'),
+                    re.compile('^module'),
                     ]}),
                dict(id=[
                     'adxLeaderboard',
+                    'adxSponLink',
                     'archive',
                     'articleExtras',
                     'articleInline',
@@ -105,87 +104,98 @@ class NYTimes(BasicNewsRecipe):
                     'footer',
                     'header',
                     'header_search',
+                    'inlineBox',
                     'login',
                     'masthead',
                     'masthead-nav',
                     'memberTools',
                     'navigation',
                     'portfolioInline',
+                    'readerReviews',
+                    'readerReviewsCount',
                     'relatedArticles',
+                    'relatedTopics',
                     'respond',
                     'side_search',
                     'side_index',
                     'side_tool',
                     'toolsRight',
                     ]),
-               dict(name=['script', 'noscript', 'style'])]
+               dict(name=['script', 'noscript', 'style','form','hr'])]

     no_stylesheets = True
-    extra_css = '.headline  {text-align: left;}\n \
-                 .byline    {font-family: monospace; \
-                             text-align: left; \
-                             margin-top: 0px; \
-                             margin-bottom: 0px;}\n \
-                 .dateline  {font-size: small; \
-                             margin-top: 0px; \
-                             margin-bottom: 0px;}\n \
-                 .timestamp {font-size: small; \
-                             margin-top: 0px; \
-                             margin-bottom: 0px;}\n \
-                 .source    {text-align: left;}\n \
-                 .image     {text-align: center;}\n \
-                 .credit    {text-align: right; \
-                             font-size: small; \
-                             margin-top: 0px; \
-                             margin-bottom: 0px;}\n \
-                 .articleBody {text-align: left;}\n \
-                 .authorId  {text-align: left; \
-                             font-style: italic;}\n '
+    extra_css = '''
+                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .timestamp { text-align: left; font-size: small; }
+                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                a:link {text-decoration: none; }
+                .articleBody { }
+                .authorId {text-align: left; }
+                .image {text-align: center;}
+                .source {text-align: left; }'''

-    def dump_ans(self, ans) :
+    def filter_ans(self, ans) :
         total_article_count = 0
-        for section in ans :
+        idx = 0
+        idx_max = len(ans)-1
+        while idx <= idx_max:
+            if self.includeSections != []:
+                if ans[idx][0] not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",ans[idx][0]
+                    del ans[idx]
+                    idx_max = idx_max-1
+                    continue
+            if ans[idx][0] in self.excludeSections:
+                print "SECTION EXCLUDED: ",ans[idx][0]
+                del ans[idx]
+                idx_max = idx_max-1
+                continue
             if self.verbose:
-                self.log("section %s: %d articles" % (section[0], len(section[1])) )
-            for article in section[1]:
+                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+            for article in ans[idx][1]:
                 total_article_count += 1
                 if self.verbose:
                     self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                               article['url'].encode('cp1252','replace')))
+            idx = idx+1

         self.log( "Queued %d articles" % total_article_count )
+        return ans

     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","‘",string)

         # Replace rsquo (\x92)
         fixed = re.sub("\x92","’",fixed)

         # Replace ldquo (\x93)
         fixed = re.sub("\x93","“",fixed)

         # Replace rdquo (\x94)
         fixed = re.sub("\x94","”",fixed)

         # Replace ndash (\x96)
         fixed = re.sub("\x96","–",fixed)

         # Replace mdash (\x97)
         fixed = re.sub("\x97","—",fixed)

         return fixed

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
-            try:
-                br.open('http://www.nytimes.com/auth/login')
-                br.select_form(name='login')
-                br['USERID']   = self.username
-                br['PASSWORD'] = self.password
-                br.submit()
-            except:
-                self.log("\nFailed to login")
+            br.open('http://www.nytimes.com/auth/login')
+            br.select_form(name='login')
+            br['USERID']   = self.username
+            br['PASSWORD'] = self.password
+            raw = br.submit().read()
+            if 'Please try again' in raw:
+                raise Exception('Your username and password are incorrect')
         return br

     def skip_ad_pages(self, soup):
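Note: filter_ans (replacing dump_ans, which only logged) prunes the list of (section, articles) tuples in place, which is why it keeps idx and idx_max by hand: del shifts the remaining entries left, so the index must not advance after a deletion. The same pattern in isolation, with made-up data:

ans = [('Arts', ['a1']), ('Sports', ['s1', 's2']), ('World', ['w1'])]
exclude = ['Sports']

idx, idx_max = 0, len(ans) - 1
while idx <= idx_max:
    if ans[idx][0] in exclude:
        del ans[idx]     # the next entry slides into slot idx,
        idx_max -= 1     # so shrink the bound and do not advance idx
        continue
    idx += 1

print(ans)  # [('Arts', ['a1']), ('World', ['w1'])]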
@@ -213,6 +223,9 @@ class NYTimes(BasicNewsRecipe):
             cover = None
         return cover

+    def short_title(self):
+        return self.title
+
     def index_to_soup(self, url_or_raw, raw=False):
         '''
         OVERRIDE of class method
@@ -255,157 +268,184 @@ class NYTimes(BasicNewsRecipe):
         # Kindle TOC descriptions won't render certain characters
         if description:
             massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
             # Replace '&amp;' with '&'
             massaged = re.sub("&amp;","&", massaged)
             return self.fixChars(massaged)
         else:
             return description

-    def parse_index(self):
+    def parse_todays_index(self):
+
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=True)).strip()
+
+        articles = {}
+        key = None
+        ans = []
+        url_list = []
+
+        def handle_article(div):
+            a = div.find('a', href=True)
+            if not a:
+                return
+            url = re.sub(r'\?.*', '', a['href'])
+            if not url.startswith("http"):
+                return
+            if not url.endswith(".html"):
+                return
+            if 'podcast' in url:
+                return
+            if '/video/' in url:
+                return
+            url += '?pagewanted=all'
+            if url in url_list:
+                return
+            url_list.append(url)
+            title = self.tag_to_string(a, use_alt=True).strip()
+            description = ''
+            pubdate = strftime('%a, %d %b')
+            summary = div.find(True, attrs={'class':'summary'})
+            if summary:
+                description = self.tag_to_string(summary, use_alt=False)
+            author = ''
+            authorAttribution = div.find(True, attrs={'class':'byline'})
+            if authorAttribution:
+                author = self.tag_to_string(authorAttribution, use_alt=False)
+            else:
+                authorAttribution = div.find(True, attrs={'class':'byline'})
+                if authorAttribution:
+                    author = self.tag_to_string(authorAttribution, use_alt=False)
+            feed = key if key is not None else 'Uncategorized'
+            if not articles.has_key(feed):
+                ans.append(feed)
+                articles[feed] = []
+            articles[feed].append(
+                            dict(title=title, url=url, date=pubdate,
+                                description=description, author=author,
+                                content=''))
+
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
+        # Find each article
+        for div in soup.findAll(True,
+            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+
+            if div['class'] in ['section-headline','sectionHeader']:
+                key = string.capwords(feed_title(div))
+                key = key.replace('Op-ed','Op-Ed')
+                key = key.replace('U.s.','U.S.')
+            elif div['class'] in ['story', 'story headline'] :
+                handle_article(div)
+            elif div['class'] == 'headlinesOnly multiline flush':
+                for lidiv in div.findAll('li'):
+                    handle_article(lidiv)
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return self.filter_ans(ans)
+
+    def parse_headline_index(self):

         articles = {}
         ans = []
-
-        feed = key = 'All Top Stories'
-        articles[key] = []
-        ans.append(key)
-        self.log("Scanning 1 section ...")
+        url_list = []

         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

-        # Fetch the outer table
-        table = soup.find('table')
-        previousTable = table
-
-        # Find the deepest table containing the stories
-        while True :
-            table = table.find('table')
-            if table.find(text=re.compile('top stories start')) :
-                previousTable = table
-                continue
-            else :
-                table = previousTable
-                break
-
-        # There are multiple subtables, find the one containing the stories
-        for block in table.findAll('table') :
-            if block.find(text=re.compile('top stories start')) :
-                table = block
-                break
-            else :
-                continue
-
-        # Again there are multiple subtables, find the one containing the stories
-        for storyblock in table.findAll('table') :
-            if storyblock.find(text=re.compile('top stories start')) :
-                break
-            else :
-                continue
-
-        skipThisSection = False
-        todays_article_count = 0
-        # Within this table are <font face="times new roman, times, san serif"> entries
-        self.log("Fetching feed Top Stories")
-        for tr in storyblock.findAllNext('tr'):
-            if tr.find('span') is not None :
-
-                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
-                                                            'times new roman,times, sans serif',
-                                                            'times new roman, times, sans serif']})
-                section = None
-                bylines = []
-                descriptions = []
-                pubdate = None
-
-                # Get the Section title
-                for (x,i) in enumerate(sectionblock.contents) :
-                    skipThisSection = False
-                    # Extract the section title
-                    if ('Comment' in str(i.__class__)) :
-                        if 'start(name=' in i :
-                            section = i[i.find('=')+1:-2]
-
-                            if not self.sections.has_key(section) :
-                                skipThisSection = True
-                                break
-
-                            # Check for excluded section
-                            if len(self.excludeSectionKeywords):
-                                key = self.sections[section]
-                                excluded = re.compile('|'.join(self.excludeSectionKeywords))
-                                if excluded.search(key) or articles.has_key(key):
-                                    skipThisSection = True
-                                    break
-
-                # Get the bylines and descriptions
-                if not skipThisSection :
-                    lines = sectionblock.contents
-                    contentStrings = []
-
-                    for line in lines:
-                        if not isinstance(line, Comment) and line.strip and line.strip() > "":
-                            contentStrings.append(line.strip())
-
-                    # Gather the byline/description pairs
-                    bylines = []
-                    descriptions = []
-                    for contentString in contentStrings:
-                        if contentString[0:3] == 'By ' and contentString[3].isupper() :
-                            bylines.append(contentString)
-                        else:
-                            descriptions.append(contentString)
-
-                    # Fetch the article titles and URLs
-                    articleCount = len(sectionblock.findAll('span'))
-                    todays_article_count += articleCount
-                    for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
-                        a = span.find('a', href=True)
-                        url = re.sub(r'\?.*', '', a['href'])
-                        url += '?pagewanted=all'
-
-                        title = self.tag_to_string(a, use_alt=True)
-                        # prepend the section name
-                        title = self.sections[section] + " &middot; " + title
-
-                        if not isinstance(title, unicode):
-                            title = title.decode('utf-8', 'replace')
-
-                        # Allow for unattributed, undescribed entries "Editor's Note"
-                        if i >= len(descriptions) :
-                            description = None
-                        else :
-                            description = descriptions[i]
-
-                        if len(bylines) == articleCount :
-                            author = bylines[i]
-                        else :
-                            author = None
-
-                        # Check for duplicates
-                        duplicateFound = False
-                        if len(articles[feed]) > 1:
-                            for article in articles[feed] :
-                                if url == article['url'] :
-                                    duplicateFound = True
-                                    break
-
-                            if duplicateFound:
-                                # Continue fetching, don't add this article
-                                todays_article_count -= 1
-                                continue
-
-                        if not articles.has_key(feed):
-                            articles[feed] = []
-                        articles[feed].append(
-                            dict(title=title, url=url, date=pubdate,
-                                description=description, author=author, content=''))
-        # self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
-
-        ans = self.sort_index_by(ans, {'Top Stories':-1})
+        # Fetch the content table
+        content_table = soup.find('table',{'id':'content'})
+        if content_table is None:
+            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
+            return None
+
+        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections
+
+        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
+            for div_sec in td_col.findAll('div',recursive=False):
+                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+                    section_name = re.sub(r'^ *$','',section_name)
+                    if section_name == '':
+                        continue
+                    section_name=string.capwords(section_name)
+                    if section_name == 'U.s.':
+                        section_name = 'U.S.'
+                    elif section_name == 'Op-ed':
+                        section_name = 'Op-Ed'
+                    pubdate = strftime('%a, %d %b')
+
+                    search_div = div_sec
+                    for next_tag in h6_sec_name.findNextSiblings(True):
+                        if next_tag.__class__.__name__ == 'Tag':
+                            if next_tag.name == 'div':
+                                search_div = next_tag
+                            break
+
+                    # Get the articles
+                    for h3_item in search_div.findAll('h3'):
+                        byline = h3_item.h6
+                        if byline is not None:
+                            author = self.tag_to_string(byline,use_alt=False)
+                        else:
+                            author = ''
+                        a = h3_item.find('a', href=True)
+                        if not a:
+                            continue
+                        url = re.sub(r'\?.*', '', a['href'])
+                        if not url.startswith("http"):
+                            continue
+                        if not url.endswith(".html"):
+                            continue
+                        if 'podcast' in url:
+                            continue
+                        if 'video' in url:
+                            continue
+                        url += '?pagewanted=all'
+                        if url in url_list:
+                            continue
+                        url_list.append(url)
+                        self.log("URL %s" % url)
+                        title = self.tag_to_string(a, use_alt=True).strip()
+                        desc = h3_item.find('p')
+                        if desc is not None:
+                            description = self.tag_to_string(desc,use_alt=False)
+                        else:
+                            description = ''
+                        if not articles.has_key(section_name):
+                            ans.append(section_name)
+                            articles[section_name] = []
+                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        self.dump_ans(ans)
-        return ans
+        return self.filter_ans(ans)
+
+    def parse_index(self):
+        if self.headlinesOnly:
+            return self.parse_headline_index()
+        else:
+            return self.parse_todays_index()
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup

     def preprocess_html(self, soup):

+        kicker_tag = soup.find(attrs={'class':'kicker'})
+        if kicker_tag: # remove Op_Ed author head shots
+            tagline = self.tag_to_string(kicker_tag)
+            if tagline=='Op-Ed Columnist':
+                img_div = soup.find('div','inlineImage module')
+                if img_div:
+                    img_div.extract()
         return self.strip_anchors(soup)

     def postprocess_html(self,soup, True):
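Note: both new index parsers push every candidate link through the same filter: strip the query string, require an absolute URL ending in .html, reject podcast and video links, append ?pagewanted=all for the single-page view, and deduplicate through url_list. A standalone restatement of handle_article's URL handling (hypothetical function name):

import re

def clean_article_url(href, seen):
    url = re.sub(r'\?.*', '', href)      # drop any query string
    if not url.startswith('http') or not url.endswith('.html'):
        return None
    if 'podcast' in url or '/video/' in url:
        return None
    url += '?pagewanted=all'             # request the single-page version
    if url in seen:
        return None                      # already queued
    seen.append(url)
    return url

seen = []
print(clean_article_url('http://www.nytimes.com/2010/07/09/world/x.html?hp', seen))
print(clean_article_url('http://www.nytimes.com/2010/07/09/world/x.html?ref=us', seen))  # duplicate: None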
@@ -422,8 +462,9 @@ class NYTimes(BasicNewsRecipe):
                     firstImg = inlineImgs[0]
                     for inlineImg in inlineImgs[1:]:
                         inlineImg.extract()
-                    # Move firstImg after headline
-                    cgFirst = soup.find(True, {'class':'columnGroup first'})
+                    # Move firstImg before article body
+                    #article_body = soup.find(True, {'id':'articleBody'})
+                    cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
                     if cgFirst:
                         # Strip all sibling NavigableStrings: noise
                         navstrings = cgFirst.findAll(text=True, recursive=False)
@@ -443,30 +484,18 @@ class NYTimes(BasicNewsRecipe):
                         if headline_found:
                             cgFirst.insert(insertLoc,firstImg)
                         else:
                             self.log(">>> No class:'columnGroup first' found <<<")
-        # Change class="kicker" to <h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker and kicker.contents[0]:
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
-                                 use_alt=False)))
-            kicker.replaceWith(h3Tag)

-        # Change captions to italic -1
+        # Change captions to italic
         for caption in soup.findAll(True, {'class':'caption'}) :
             if caption and caption.contents[0]:
-                emTag = Tag(soup, "em")
+                cTag = Tag(soup, "p", [("class", "caption")])
                 c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                 mp_off = c.find("More Photos")
                 if mp_off >= 0:
                     c = c[:mp_off]
-                emTag.insert(0, c)
-                #hrTag = Tag(soup, 'hr')
-                #hrTag['class'] = 'caption_divider'
-                hrTag = Tag(soup, 'div')
-                hrTag['class'] = 'divider'
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
+                cTag.insert(0, c)
+                caption.replaceWith(cTag)

         # Change <nyt_headline> to <h2>
         h1 = soup.find('h1')
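Note: captions are no longer wrapped in a bare <em> plus a divider <div>; the text is rebuilt as <p class="caption"> so the new .caption rule in extra_css handles the styling. With the BeautifulSoup 3 API that calibre bundles, Tag takes its attributes as a list of (name, value) pairs, which is what the [("class", "caption")] argument is. A minimal sketch, assuming it runs where calibre's bundled BeautifulSoup is importable:

from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

soup = BeautifulSoup('<div class="caption">A photo caption More Photos</div>')
caption = soup.find(True, {'class':'caption'})

cTag = Tag(soup, 'p', [('class', 'caption')])
c = caption.string.strip()
mp_off = c.find('More Photos')   # trim the trailing "More Photos" link text
if mp_off >= 0:
    c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
print(soup)   # <p class="caption">A photo caption </p>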
@@ -506,17 +535,6 @@ class NYTimes(BasicNewsRecipe):
                 bTag.insert(0, subhead.contents[0])
                 subhead.replaceWith(bTag)

-        # Synthesize a section header
-        dsk = soup.find('meta', attrs={'name':'dsk'})
-        if dsk and dsk.has_key('content'):
-            hTag = Tag(soup,'h3')
-            hTag['class'] = 'section'
-            hTag.insert(0,NavigableString(dsk['content']))
-            articleTag = soup.find(True, attrs={'id':'article'})
-            if articleTag:
-                articleTag.insert(0,hTag)
-
-        # Add class="articleBody" to <div> so we can format with CSS
         divTag = soup.find('div',attrs={'id':'articleBody'})
         if divTag:
             divTag['class'] = divTag['id']
@@ -532,11 +550,3 @@ class NYTimes(BasicNewsRecipe):

         return soup

-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
@@ -5,52 +5,186 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import string, re, time
-from calibre import strftime
+import re, string, time
+from calibre import entity_to_unicode, strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-def decode(self, src):
-    enc = 'utf-8'
-    if 'iso-8859-1' in src:
-        enc = 'cp1252'
-    return src.decode(enc, 'ignore')
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

 class NYTimes(BasicNewsRecipe):

-    title = u'New York Times'
-    __author__ = 'Kovid Goyal/Nick Redding'
-    language = 'en'
-    requires_version = (0, 6, 36)
-
-    description = 'Daily news from the New York Times (subscription version)'
-    timefmt = ' [%b %d]'
+    # set headlinesOnly to True for the headlines-only version
+    headlinesOnly = False
+
+    # includeSections: List of sections to include. If empty, all sections found will be included.
+    # Otherwise, only the sections named will be included. For example,
+    #
+    #    includeSections = ['Politics','Sports']
+    #
+    # would cause only the Politics and Sports sections to be included.
+
+    includeSections = []  # by default, all sections included
+
+    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
+    # Otherwise, the sections named will be excluded. For example,
+    #
+    #    excludeSections = ['Politics','Sports']
+    #
+    # would cause the Politics and Sports sections to be excluded. This parameter can be used
+    # in conjunction with includeSections although in most cases using one or the other, but
+    # not both, is sufficient.
+
+    excludeSections = []
+
+    # one_picture_per_article specifies that calibre should only use the first image
+    # from an article (if one exists). If one_picture_per_article = True, the image
+    # will be moved to a location between the headline and the byline.
+    # If one_picture_per_article = False, all images from the article will be included
+    # and shown in their original location.
+    one_picture_per_article = True
+
+    # The maximum number of articles that will be downloaded
+    max_articles_per_feed = 100
+
+    if headlinesOnly:
+        title='New York Times Headlines'
+        description = 'Headlines from the New York Times'
+    else:
+        title='New York Times'
+        description = 'Today\'s New York Times'
+
+    __author__  = 'GRiker/Kovid Goyal/Nick Redding'
+    language = 'en'
+    requires_version = (0, 7, 5)
+
+    timefmt = ''
     needs_subscription = True
+    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+    cover_margins = (18,18,'grey99')

     remove_tags_before = dict(id='article')
     remove_tags_after  = dict(id='article')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
-        'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
-        'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
-        dict({'class':re.compile('^subNavigation')}),
-        dict({'class':re.compile('^leaderboard')}),
-        dict({'class':re.compile('^module')}),
-        dict({'class':'metaFootnote'}),
-        dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
-            'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
-            'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
-            'relatedArticles', 'relatedTopics', 'adxSponLink']),
+    remove_tags = [dict(attrs={'class':[
+                            'articleFooter',
+                            'articleTools',
+                            'columnGroup doubleRule',
+                            'columnGroup singleRule',
+                            'columnGroup last',
+                            'columnGroup  last',
+                            'doubleRule',
+                            'dottedLine',
+                            'entry-meta',
+                            'entry-response module',
+                            'icon enlargeThis',
+                            'leftNavTabs',
+                            'metaFootnote',
+                            'module box nav',
+                            'nextArticleLink',
+                            'nextArticleLink clearfix',
+                            'post-tools',
+                            'relatedSearchesModule',
+                            'side_tool',
+                            'singleAd',
+                            re.compile('^subNavigation'),
+                            re.compile('^leaderboard'),
+                            re.compile('^module'),
+                            ]}),
+                   dict(id=[
+                            'adxLeaderboard',
+                            'adxSponLink',
+                            'archive',
+                            'articleExtras',
+                            'articleInline',
+                            'blog_sidebar',
+                            'businessSearchBar',
+                            'cCol',
+                            'entertainmentSearchBar',
+                            'footer',
+                            'header',
+                            'header_search',
+                            'inlineBox',
+                            'login',
+                            'masthead',
+                            'masthead-nav',
+                            'memberTools',
+                            'navigation',
+                            'portfolioInline',
+                            'readerReviews',
+                            'readerReviewsCount',
+                            'relatedArticles',
+                            'relatedTopics',
+                            'respond',
+                            'side_search',
+                            'side_index',
+                            'side_tool',
+                            'toolsRight',
+                            ]),
                    dict(name=['script', 'noscript', 'style','form','hr'])]
-    encoding = decode
     no_stylesheets = True
     extra_css = '''
-                .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
-                .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                 .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .timestamp { font-size: small; }
-                .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                a:link {text-decoration: none; }'''
+                .timestamp { text-align: left; font-size: small; }
+                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                a:link {text-decoration: none; }
+                .articleBody { }
+                .authorId {text-align: left; }
+                .image {text-align: center;}
+                .source {text-align: left; }'''
+
+    def filter_ans(self, ans) :
+        total_article_count = 0
+        idx = 0
+        idx_max = len(ans)-1
+        while idx <= idx_max:
+            if self.includeSections != []:
+                if ans[idx][0] not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",ans[idx][0]
+                    del ans[idx]
+                    idx_max = idx_max-1
+                    continue
+            if ans[idx][0] in self.excludeSections:
+                print "SECTION EXCLUDED: ",ans[idx][0]
+                del ans[idx]
+                idx_max = idx_max-1
+                continue
+            if self.verbose:
+                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+            for article in ans[idx][1]:
+                total_article_count += 1
+                if self.verbose:
+                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
+                              article['url'].encode('cp1252','replace')))
+            idx = idx+1
+
+        self.log( "Queued %d articles" % total_article_count )
+        return ans
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+
+        return fixed

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -60,22 +194,19 @@ class NYTimes(BasicNewsRecipe):
             br['USERID']   = self.username
             br['PASSWORD'] = self.password
             raw = br.submit().read()
-            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
+            if 'Please try again' in raw:
                 raise Exception('Your username and password are incorrect')
-        #open('/t/log.html', 'wb').write(raw)
         return br

-    def get_masthead_url(self):
-        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-        #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
-        br = BasicNewsRecipe.get_browser()
-        try:
-            br.open(masthead)
-        except:
-            self.log("\nMasthead unavailable")
-            masthead = None
-        return masthead
+    def skip_ad_pages(self, soup):
+        # Skip ad pages served before actual article
+        skip_tag = soup.find(True, {'name':'skip'})
+        if skip_tag is not None:
+            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.warn("Skipping ad to article at '%s'" % url)
+            return self.index_to_soup(url, raw=True)

     def get_cover_url(self):
         cover = None
@@ -93,12 +224,57 @@ class NYTimes(BasicNewsRecipe):
         return cover

     def short_title(self):
-        return 'New York Times'
+        return self.title

-    def parse_index(self):
-        self.encoding = 'cp1252'
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
-        self.encoding = decode
+    def index_to_soup(self, url_or_raw, raw=False):
+        '''
+        OVERRIDE of class method
+        deals with various page encodings between index and articles
+        '''
+        def get_the_soup(docEncoding, url_or_raw, raw=False) :
+            if re.match(r'\w+://', url_or_raw):
+                f = self.browser.open(url_or_raw)
+                _raw = f.read()
+                f.close()
+                if not _raw:
+                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+            else:
+                _raw = url_or_raw
+            if raw:
+                return _raw
+
+            if not isinstance(_raw, unicode) and self.encoding:
+                _raw = _raw.decode(docEncoding, 'replace')
+            massage = list(BeautifulSoup.MARKUP_MASSAGE)
+            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+            return BeautifulSoup(_raw, markupMassage=massage)
+
+        # Entry point
+        print "index_to_soup()"
+        soup = get_the_soup( self.encoding, url_or_raw )
+        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
+        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
+        if docEncoding == '' :
+            docEncoding = self.encoding
+
+        if self.verbose > 2:
+            self.log( "  document encoding: '%s'" % docEncoding)
+        if docEncoding != self.encoding :
+            soup = get_the_soup(docEncoding, url_or_raw)
+
+        return soup
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&amp;' with '&'
+            massaged = re.sub("&amp;","&", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def parse_todays_index(self):

         def feed_title(div):
             return ''.join(div.findAll(text=True, recursive=True)).strip()
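Note: the index_to_soup override makes two passes: parse once with the recipe's default encoding, read the charset the page itself declares in its <meta http-equiv="Content-Type"> tag, and re-fetch the soup if the document claims something different. The string slicing it uses to pull out the charset can be restated with a regex; a standalone sketch with a hypothetical helper name:

import re

def sniff_charset(meta_tag, default='cp1252'):
    # e.g. '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />'
    m = re.search(r'charset=([\w-]+)', meta_tag)
    return m.group(1) if m else default

print(sniff_charset('<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />'))
print(sniff_charset('<meta content="text/html" />'))  # falls back to cp1252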
@@ -119,12 +295,13 @@ class NYTimes(BasicNewsRecipe):
                 return
             if 'podcast' in url:
                 return
+            if '/video/' in url:
+                return
             url += '?pagewanted=all'
             if url in url_list:
                 return
             url_list.append(url)
             title = self.tag_to_string(a, use_alt=True).strip()
-            #self.log("Title: %s" % title)
             description = ''
             pubdate = strftime('%a, %d %b')
             summary = div.find(True, attrs={'class':'summary'})
@@ -140,6 +317,7 @@ class NYTimes(BasicNewsRecipe):
                     author = self.tag_to_string(authorAttribution, use_alt=False)
             feed = key if key is not None else 'Uncategorized'
             if not articles.has_key(feed):
+                ans.append(feed)
                 articles[feed] = []
             articles[feed].append(
                             dict(title=title, url=url, date=pubdate,
@@ -147,46 +325,228 @@ class NYTimes(BasicNewsRecipe):
                                 content=''))

+        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

-        # Find each instance of class="section-headline", class="story", class="story headline"
+        # Find each article
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

             if div['class'] in ['section-headline','sectionHeader']:
                 key = string.capwords(feed_title(div))
-                articles[key] = []
-                ans.append(key)
-                #self.log('Section: %s' % key)
+                key = key.replace('Op-ed','Op-Ed')
+                key = key.replace('U.s.','U.S.')

             elif div['class'] in ['story', 'story headline'] :
                 handle_article(div)
             elif div['class'] == 'headlinesOnly multiline flush':
                 for lidiv in div.findAll('li'):
                     handle_article(lidiv)

-        # ans = self.sort_index_by(ans, {'The Front Page':-1,
-        #                                'Dining In, Dining Out':1,
-        #                                'Obituaries':2})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+        return self.filter_ans(ans)
+
+    def parse_headline_index(self):
+
+        articles = {}
+        ans = []
+        url_list = []
+
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
+
+        # Fetch the content table
+        content_table = soup.find('table',{'id':'content'})
+        if content_table is None:
+            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
+            return None
+
+        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections
+
+        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
+            for div_sec in td_col.findAll('div',recursive=False):
+                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+                    section_name = re.sub(r'^ *$','',section_name)
+                    if section_name == '':
+                        continue
+                    section_name=string.capwords(section_name)
+                    if section_name == 'U.s.':
+                        section_name = 'U.S.'
+                    elif section_name == 'Op-ed':
+                        section_name = 'Op-Ed'
+                    pubdate = strftime('%a, %d %b')
+
+                    search_div = div_sec
+                    for next_tag in h6_sec_name.findNextSiblings(True):
+                        if next_tag.__class__.__name__ == 'Tag':
+                            if next_tag.name == 'div':
+                                search_div = next_tag
+                            break
+
+                    # Get the articles
+                    for h3_item in search_div.findAll('h3'):
+                        byline = h3_item.h6
+                        if byline is not None:
+                            author = self.tag_to_string(byline,use_alt=False)
+                        else:
+                            author = ''
+                        a = h3_item.find('a', href=True)
+                        if not a:
+                            continue
+                        url = re.sub(r'\?.*', '', a['href'])
+                        if not url.startswith("http"):
+                            continue
+                        if not url.endswith(".html"):
+                            continue
+                        if 'podcast' in url:
+                            continue
+                        if 'video' in url:
+                            continue
+                        url += '?pagewanted=all'
+                        if url in url_list:
+                            continue
+                        url_list.append(url)
+                        self.log("URL %s" % url)
+                        title = self.tag_to_string(a, use_alt=True).strip()
+                        desc = h3_item.find('p')
+                        if desc is not None:
+                            description = self.tag_to_string(desc,use_alt=False)
+                        else:
+                            description = ''
+                        if not articles.has_key(section_name):
+                            ans.append(section_name)
+                            articles[section_name] = []
+                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return self.filter_ans(ans)
+
+    def parse_index(self):
+        if self.headlinesOnly:
+            return self.parse_headline_index()
+        else:
+            return self.parse_todays_index()
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup

     def preprocess_html(self, soup):

         kicker_tag = soup.find(attrs={'class':'kicker'})
-        if kicker_tag:
+        if kicker_tag: # remove Op_Ed author head shots
             tagline = self.tag_to_string(kicker_tag)
-            #self.log("FOUND KICKER %s" % tagline)
             if tagline=='Op-Ed Columnist':
                 img_div = soup.find('div','inlineImage module')
-                #self.log("Searching for photo")
                 if img_div:
                     img_div.extract()
-                    #self.log("Photo deleted")
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+        return self.strip_anchors(soup)
+
+    def postprocess_html(self,soup, True):
+
+        if self.one_picture_per_article:
+            # Remove all images after first
+            largeImg = soup.find(True, {'class':'articleSpanImage'})
+            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+            if largeImg:
+                for inlineImg in inlineImgs:
+                    inlineImg.extract()
+            else:
+                if inlineImgs:
+                    firstImg = inlineImgs[0]
+                    for inlineImg in inlineImgs[1:]:
+                        inlineImg.extract()
+                    # Move firstImg before article body
+                    #article_body = soup.find(True, {'id':'articleBody'})
+                    cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
+                    if cgFirst:
+                        # Strip all sibling NavigableStrings: noise
+                        navstrings = cgFirst.findAll(text=True, recursive=False)
+                        [ns.extract() for ns in navstrings]
+                        headline_found = False
+                        tag = cgFirst.find(True)
+                        insertLoc = 0
+                        while True:
+                            insertLoc += 1
+                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+                                headline_found = True
+                                break
+                            tag = tag.nextSibling
+                            if not tag:
+                                headline_found = False
+                                break
+                        if headline_found:
+                            cgFirst.insert(insertLoc,firstImg)
+                        else:
+                            self.log(">>> No class:'columnGroup first' found <<<")
+
+        # Change captions to italic
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            if caption and caption.contents[0]:
+                cTag = Tag(soup, "p", [("class", "caption")])
+                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                mp_off = c.find("More Photos")
+                if mp_off >= 0:
+                    c = c[:mp_off]
+                cTag.insert(0, c)
+                caption.replaceWith(cTag)
+
+        # Change <nyt_headline> to <h2>
+        h1 = soup.find('h1')
+        if h1:
+            headline = h1.find("nyt_headline")
+            if headline:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, self.fixChars(headline.contents[0]))
+                h1.replaceWith(tag)
+        else:
+            # Blog entry - replace headline, remove <hr> tags
+            headline = soup.find('title')
+            if headline:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, self.fixChars(headline.contents[0]))
+                soup.insert(0, tag)
+                hrs = soup.findAll('hr')
+                for hr in hrs:
+                    hr.extract()
+
+        # Change <h1> to <h3> - used in editorial blogs
+        masthead = soup.find("h1")
+        if masthead:
+            # Nuke the href
+            if masthead.a:
+                del(masthead.a['href'])
+            tag = Tag(soup, "h3")
+            tag.insert(0, self.fixChars(masthead.contents[0]))
+            masthead.replaceWith(tag)
+
+        # Change <span class="bold"> to <b>
+        for subhead in soup.findAll(True, {'class':'bold'}) :
+            if subhead.contents:
+                bTag = Tag(soup, "b")
+                bTag.insert(0, subhead.contents[0])
+                subhead.replaceWith(bTag)
+
+        divTag = soup.find('div',attrs={'id':'articleBody'})
+        if divTag:
+            divTag['class'] = divTag['id']
+
+        # Add class="authorId" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'authorId'})
+        if divTag and divTag.contents[0]:
+            tag = Tag(soup, "p")
+            tag['class'] = "authorId"
+            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+                             use_alt=False)))
+            divTag.replaceWith(tag)
+
+        return soup
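Note: parse_headline_index replaces the old brittle hunt through nested <table> elements (keyed off 'top stories start' comments) with a direct walk of the table with id="content": each <td id="...Column..."> holds <div> blocks, every section is announced by an uppercase-styled <h6>, and the articles beneath it are <h3> items. The traversal shape, sketched against made-up HTML with the standalone BeautifulSoup 4 package (the recipe itself uses calibre's bundled BeautifulSoup 3, whose findAll calls have the same shape):

import re
from bs4 import BeautifulSoup

html = '''<table id="content"><tr><td id="leftColumn">
<div><h6 style="text-transform: uppercase">World</h6>
<div><h3><a href="http://www.nytimes.com/2010/07/09/world/a.html">Story A</a></h3></div>
</div></td></tr></table>'''

soup = BeautifulSoup(html, 'html.parser')
content_table = soup.find('table', {'id': 'content'})
for td_col in content_table.find_all('td', {'id': re.compile('Column')}):
    for div_sec in td_col.find_all('div', recursive=False):
        for h6 in div_sec.find_all('h6', {'style': re.compile('text-transform: *uppercase')}):
            section = h6.get_text().strip()
            for h3 in div_sec.find_all('h3'):
                a = h3.find('a', href=True)
                print(section, '->', a['href'])   # World -> http://www.nytimes.com/...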
@@ -6,22 +6,25 @@ Fetch Die Zeit.
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class ZeitDe(BasicNewsRecipe):

-    title = 'ZEIT Online'
-    description = 'ZEIT Online'
+    title = 'Zeit Online'
+    description = 'Zeit Online'
     language = 'de'
-    lang = 'de_DE'

-    __author__ = 'Martin Pitt, Sujata Raman and Ingo Paschke'
-    use_embedded_content = False
+    __author__ = 'Martin Pitt, Sujata Raman, Ingo Paschke and Marc Toensing'
     max_articles_per_feed = 40
-    remove_empty_feeds = True
-    no_stylesheets = True
-    no_javascript = True
-    encoding = 'utf-8'
+    remove_tags = [
+        dict(name='iframe'),
+        dict(name='div', attrs={'class':["response","pagination block","pagenav","inline link", "copyright"] }),
+        dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }),
+        dict(name='div', attrs={'id':["place_5","place_4","comments"]})
+    ]
+
+    keep_only_tags = [dict(id=['main'])]

     feeds = [
         ('Seite 1', 'http://newsfeed.zeit.de/index_xml'),
@@ -40,43 +43,15 @@ class ZeitDe(BasicNewsRecipe):
         ('Sport', 'http://newsfeed.zeit.de/sport/index'),
     ]

-    extra_css = '''
-        .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-        .excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:small;}
-        .title{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
-        .caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-        .copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-        .article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
-        .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
-        .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
-        .headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
-        .inline{float:left;margin-top:0;margin-right:15px;position:relative;width:180px; }
-        img.inline{float:none}
-        .intertitle{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small;font-weight:700}
-        .ebinfobox{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small;list-style-type:none;float:right;margin-top:0;border-left-style:solid;border-left-width:1px;padding-left:10px;}
-        .infobox {border-style: solid; border-width: 1px;padding:8px;}
-        .infobox dt {font-weight:700;}
-    '''
+    extra_css = '.reaktion,.taglist,.comments,.reponse,.responsetitle,.responsebody,.reponse,.inline,.date{display:none;}li.date{display:block}'

     #filter_regexps = [r'ad.de.doubleclick.net/']

-    keep_only_tags = [
-        dict(name='div', attrs={'class':["article"]}) ,
-        dict(name='ul', attrs={'class':["tools"]}) ,
-    ]
-    remove_tags = [
-        dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'),
-        dict(name='div', attrs={'class':["pagination block","pagenav","inline link", "copyright"] }),
-        dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }),
-        dict(name='div', attrs={'id':["place_5","place_4","comments"]})
-    ]
-
-    remove_attributes = ['style', 'font']
-
     def get_article_url(self, article):
         ans = article.get('link',None)
-        ans += "?page=all"
+        ans += "?page=all&print=true"

-        if 'video' in ans or 'quiz' in ans :
+        if 'video' in ans or 'quiz' in ans or 'blog' in ans :
             ans = None
         return ans

@ -86,25 +61,3 @@ class ZeitDe(BasicNewsRecipe):
|
|||||||
return inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
|
return inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
|
||||||
except:
|
except:
|
||||||
return 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
|
return 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
soup.html['xml:lang'] = self.lang
|
|
||||||
soup.html['lang'] = self.lang
|
|
||||||
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
|
|
||||||
soup.head.insert(0,mtag)
|
|
||||||
title = soup.find('h2', attrs={'class':'title'})
|
|
||||||
if title is None:
|
|
||||||
print "no title"
|
|
||||||
return soup
|
|
||||||
info = Tag(soup,'ul',[('class','ebinfobox')])
|
|
||||||
tools = soup.find('ul', attrs={'class':'tools'})
|
|
||||||
#author = tools.find('li','author first')
|
|
||||||
for tag in ['author first', 'date', 'date first', 'author', 'source']:
|
|
||||||
line = tools.find('li', tag)
|
|
||||||
if line:
|
|
||||||
info.insert(0,line)
|
|
||||||
title.parent.insert(0,info)
|
|
||||||
tools.extract()
|
|
||||||
return soup
|
|
||||||
|
|
||||||
|
|
||||||
|
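The reworked `get_article_url` above asks Zeit for a single-page, printer-friendly view and now also skips blog posts. A quick sketch of the resulting behaviour; the dict stands in for the parsed feed entry and the URL is invented:

```python
# invented feed entry; BasicNewsRecipe passes a parsed feed item here
article = {'link': 'http://www.zeit.de/politik/beispiel-artikel'}

ans = article.get('link', None)
ans += "?page=all&print=true"            # single-page, printer-friendly view
if 'video' in ans or 'quiz' in ans or 'blog' in ans:
    ans = None                           # skip non-article content
print ans  # -> http://www.zeit.de/politik/beispiel-artikel?page=all&print=true
```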
new file: resources/templates/html_export_default.css (60 lines)
@@ -0,0 +1,60 @@
body{
	margin:0px;
	padding: 0.5em;
	background-color:#F6F3E9;
	font-size:12px;
	font-family:Arial, Helvetica, sans-serif;
}

.calibreMeta{
	background-color:#39322B;
	color:white;
	padding:10px;
}

.calibreMeta a, .calibreEbNav a, .calibreEbNavTop a, .calibreToc a{
	color:white;
}

.calibreMeta h1{
	margin:0px;
	font-size:18px;
	background-color:#39322B;
}

.calibreEbookContent{
	padding:20px;
}

.calibreEbNav, .calibreEbNavTop{
	clear:both;
	background-color:#39322B;
	color:white;
	padding:10px;
	text-align:center;
}

.calibreEbNavTop{
	margin-bottom:20px;
}

.calibreEbNav a, .calibreEbNavTop a{
	padding:0px 5px;
}

.calibreTocIndex{
	line-height:18px;
}

.calibreToc{
	float:left;
	margin:20px;
	width:300px;
	background-color:#39322B;
	color:white;
	padding:10px;
}
.calibreEbookContent{
	width:600px;
	float:left;
}
new file: resources/templates/html_export_default.tmpl (74 lines)
@@ -0,0 +1,74 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    ${head_content}$

    <link href="${cssLink}$" type="text/css" rel="stylesheet" />

  </head>
  <body>

    <div class="calibreMeta">
      <div class="calibreMetaTitle">
        ${pos1=1}$
        ${for title in meta.titles():}$
          ${if pos1:}$
            <h1>
              <a href="${tocUrl}$">${print title}$</a>
            </h1>
          ${:else:}$
            <div class="calibreMetaSubtitle">${print title}$</div>
          ${:endif}$
          ${pos1=0}$
        ${:endfor}$
      </div>
      <div class="calibreMetaAuthor">
        ${print ', '.join(meta.creators())}$
      </div>
    </div>

    <div class="calibreMain">

      <div class="calibreEbookContent">
        ${if prevLink or nextLink:}$
          <div class="calibreEbNavTop">
            ${if prevLink:}$
              <a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
            ${:else:}$
              <a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
            ${:endif}$

            ${if nextLink:}$
              <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
            ${:endif}$
          </div>
        ${:endif}$

        ${ebookContent}$
      </div>

      ${if has_toc:}$
        <div class="calibreToc">
          <h2><a href="${tocUrl}$">${print _('Table of contents'),}$</a></h2>
          ${print toc()}$
        </div>
      ${:endif}$

      <div class="calibreEbNav">
        ${if prevLink:}$
          <a href="${prevLink}$" class="calibreAPrev">${print _('previous page'),}$</a>
        ${:else:}$
          <a href="${tocUrl}$" class="calibreAPrev">${print _('previous page'),}$</a>
        ${:endif}$

        <a href="${tocUrl}$" class="calibreAHome">${print _('start'),}$</a>

        ${if nextLink:}$
          <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
        ${:endif}$
      </div>

    </div>

  </body>
</html>
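The `${...}$` spans in this template are not a separate macro language; they are Python, executed by the small Templite engine added further down in this commit (`src/templite/__init__.py`). A minimal sketch of the block syntax the template relies on; `has_toc` is an invented stand-in for the variable the output plugin passes in:

```python
from templite import Templite

# block statements end with ':' and are closed by '${:endif}$' / '${:endfor}$'
t = Templite('${if has_toc:}$<div class="calibreToc">...</div>${:endif}$')
print t.render(has_toc=True)    # -> <div class="calibreToc">...</div>
print t.render(has_toc=False)   # -> (nothing)
```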
new file: resources/templates/html_export_default_index.tmpl (61 lines)
@@ -0,0 +1,61 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

    <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
    <link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />

    <title>${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$</title>

    ${for item in meta:}$
      <meta ${print 'name="DC.'+item['name']+'"',}$ ${print 'content="'+item['value']+'"',}$ />
    ${:endfor}$

    <link href="${cssLink}$" type="text/css" rel="stylesheet" />
  </head>
  <body>

    <div class="calibreMeta">
      <div class="calibreMetaTitle">
        ${pos1=1}$
        ${for title in meta.titles():}$
          ${if pos1:}$
            <h1>
              <a href="${tocUrl}$">${print title}$</a>
            </h1>
          ${:else:}$
            <div class="calibreMetaSubtitle">${print title}$</div>
          ${:endif}$
          ${pos1=0}$
        ${:endfor}$
      </div>
      <div class="calibreMetaAuthor">
        ${print ', '.join(meta.creators()),}$
      </div>
    </div>

    <div class="calibreMain">
      <div class="calibreEbookContent">

        ${if has_toc:}$
          <div class="calibreTocIndex">
            <h2>${print _('Table of contents'),}$</h2>
            ${toc}$
          </div>
        ${:else:}$
          <h2>${print _('No table of contents present'),}$</h2>
          <div><strong><a href="${nextLink}$">${print _('begin to read'),}$</a></strong></div>
        ${:endif}$

      </div>

      <div class="calibreEbNav">
        ${if nextLink:}$
          <a href="${nextLink}$" class="calibreANext">${print _('next page'),}$</a>
        ${:endif}$
      </div>
    </div>

  </body>
</html>
@@ -89,7 +89,7 @@ class Server(Command):
             t = telnetlib.Telnet('localhost', 4242)
             t.read_until("repl>")
             t.write('BrowserReload();')
-            print t.read_until("repl>")
+            t.read_until("repl>")
             t.close()
         except:
             print 'Failed to reload browser'
@@ -446,6 +446,7 @@ from calibre.ebooks.rb.output import RBOutput
 from calibre.ebooks.rtf.output import RTFOutput
 from calibre.ebooks.tcr.output import TCROutput
 from calibre.ebooks.txt.output import TXTOutput
+from calibre.ebooks.html.output import HTMLOutput
 from calibre.ebooks.snb.output import SNBOutput
 
 from calibre.customize.profiles import input_profiles, output_profiles
@@ -525,6 +526,7 @@ plugins += [
     RTFOutput,
     TCROutput,
     TXTOutput,
+    HTMLOutput,
     SNBOutput,
 ]
 # Order here matters. The first matched device is the one used.
@@ -893,4 +895,3 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
     Email, Server, Plugins, Tweaks, Misc]
 
 #}}}
-
new file: src/calibre/ebooks/html/meta.py (33 lines)
@@ -0,0 +1,33 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'


from calibre.ebooks.oeb.base import namespace, barename, DC11_NS

class EasyMeta(object):

    def __init__(self, meta):
        self.meta = meta

    def __iter__(self):
        meta = self.meta
        for item_name in meta.items:
            for item in meta[item_name]:
                if namespace(item.term) == DC11_NS:
                    yield { 'name': barename(item.term), 'value': item.value }

    def __len__(self):
        count = 0
        for item in self:
            count = count+1
        return count

    def titles(self):
        for item in self.meta['title']:
            yield item.value

    def creators(self):
        for item in self.meta['creator']:
            yield item.value
new file: src/calibre/ebooks/html/output.py (201 lines)
@@ -0,0 +1,201 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'

import os, re, shutil

from os.path import dirname, abspath, relpath, exists

from lxml import etree
from templite import Templite

from calibre.ebooks.oeb.base import element
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.zipfile import ZipFile

from urllib import unquote

from calibre.ebooks.html.meta import EasyMeta

class HTMLOutput(OutputFormatPlugin):

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'

    options = set([
        OptionRecommendation(name='template_css',
            help=_('CSS file used for the output instead of the default file')),

        OptionRecommendation(name='template_html_index',
            help=_('Template used for generation of the html index file instead of the default file')),

        OptionRecommendation(name='template_html',
            help=_('Template used for the generation of the html contents of the book instead of the default file')),

        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated ZIP file to the directory of the generated ZIP file')
        ),
    ])

    recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)])

    def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate table of contents
        '''
        with CurrentDir(output_dir):
            def build_node(current_node, parent=None):
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, ('ul'))
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(abspath(unquote(node.href)), dirname(ref_url))
                    link = element(point, 'a', href=href)
                    title = node.title
                    if title:
                        title = re.sub(r'\s+', ' ', title)
                    link.text=title
                    build_node(node, point)
                return parent
            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root, pretty_print=True, encoding='utf-8',
                xml_declaration=True)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):

        # read template files
        if opts.template_html_index is not None:
            template_html_index_data = open(opts.template_html_index, 'rb').read()
        else:
            template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)

        if opts.template_html is not None:
            template_html_data = open(opts.template_html, 'rb').read()
        else:
            template_html_data = P('templates/html_export_default.tmpl', data=True)

        if opts.template_css is not None:
            template_css_data = open(opts.template_css, 'rb').read()
        else:
            template_css_data = P('templates/html_export_default.css', data=True)

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = PersistentTemporaryDirectory()
        output_file = os.path.join(tempdir,
                os.path.basename(re.sub(r'\.zip', '', output_path)+'.html'))
        output_dir = re.sub(r'\.html', '', output_file)+'_files'

        if not exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
            cssLink = relpath(abspath(css_path), dirname(output_file))
            tocUrl = relpath(output_file, dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                    toc=html_toc, meta=meta, nextLink=nextLink,
                    tocUrl=tocUrl, cssLink=cssLink)
            f.write(t)

        with CurrentDir(output_dir):
            for item in oeb_book.manifest:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                if not exists(dir):
                    os.makedirs(dir)
                if item.spine_position is not None:
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(str(item))
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head, pretty_print=True, encoding='utf-8')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)

                # generate link to next page
                if item.spine_position+1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position+1].href
                    nextLink = relpath(abspath(nextLink), dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position-1].href
                    prevLink = relpath(abspath(prevLink), dir)
                else:
                    prevLink = None

                cssLink = relpath(abspath(css_path), dir)
                tocUrl = relpath(output_file, dir)

                # render template
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
                t = templite.render(ebookContent=ebook_content,
                        prevLink=prevLink, nextLink=nextLink,
                        has_toc=bool(oeb_book.toc.count()), toc=toc,
                        tocUrl=tocUrl, head_content=head_content,
                        meta=meta, cssLink=cssLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t)
                item.unload_data_from_memory(memory=path)

        zfile = ZipFile(output_path, "w")
        zfile.add_dir(output_dir)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)

        zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
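Taken together, `convert` above writes an index page plus one XHTML file per spine item into a `*_files` directory next to the index, zips everything, and can optionally unpack the ZIP again. Since the plugin registers `file_type = 'zip'` and is added to the builtins list in the hunk above, it should be reachable through the regular conversion pipeline, e.g. `ebook-convert book.epub book.zip --extract-to /tmp/book_html`; the flag names are assumed from calibre's usual mapping of option names to command-line switches, and the file paths are invented.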
@@ -112,13 +112,12 @@ def get_metadata(br, asin, mi):
 
 def main(args=sys.argv):
     # Test xisbn
-    #print get_social_metadata('Learning Python', None, None, '8324616489')
+    print get_social_metadata('Learning Python', None, None, '8324616489')
-    #print
+    print
 
     # Test sophisticated comment formatting
     print get_social_metadata('Angels & Demons', None, None, '9781416580829')
     print
-    return
 
     # Random tests
     print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')
@@ -275,7 +275,15 @@ class MobiMLizer(object):
         # <mbp:frame-set/> does not exist lalalala
         if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                 or style['visibility'] == 'hidden':
-            return
+            id_ = elem.get('id', None)
+            if id_:
+                # Keep anchors so people can use display:none
+                # to generate hidden TOCs
+                elem.clear()
+                elem.text = None
+                elem.set('id', id_)
+            else:
+                return
         tag = barename(elem.tag)
         istate = copy.copy(istates[-1])
         istate.rendered = False
@@ -406,6 +414,12 @@ class MobiMLizer(object):
             parent = bstate.para if bstate.inline is None else bstate.inline
             if parent is not None:
                 vtag = etree.SubElement(parent, XHTML(vtag))
+                # Add anchors
+                for child in vbstate.body:
+                    if child is not vbstate.para:
+                        vtag.append(child)
+                    else:
+                        break
                 for child in vbstate.para:
                     vtag.append(child)
                 return
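The new branch above keeps hidden elements alive as empty anchors instead of dropping them, so that `display:none` TOC targets still resolve. The same idea in isolation with lxml; the markup is an invented example:

```python
from lxml import etree

# a hidden element whose id is the target of TOC links elsewhere
elem = etree.fromstring('<div id="toc-anchor" style="display:none">hidden entry</div>')
id_ = elem.get('id', None)
if id_:
    elem.clear()                 # drop text, children and attributes...
    elem.text = None
    elem.set('id', id_)          # ...but restore the id so links still resolve
print etree.tostring(elem)       # -> <div id="toc-anchor"/>
```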
@@ -49,5 +49,3 @@ class OEBOutput(OutputFormatPlugin):
             with open(path, 'wb') as f:
                 f.write(str(item))
             item.unload_data_from_memory(memory=path)
-
-
@@ -101,11 +101,12 @@ class SNBMLizer(object):
                 subitem = ''
             bodyTree = trees[subitem].find(".//body")
             for line in output.splitlines():
-                if not line.find(CALIBRE_SNB_PRE_TAG) == 0:
+                pos = line.find(CALIBRE_SNB_PRE_TAG)
+                if pos == -1:
                     line = line.strip(u' \t\n\r\u3000')
                 else:
                     etree.SubElement(bodyTree, "text").text = \
-                        etree.CDATA(line[len(CALIBRE_SNB_PRE_TAG):])
+                        etree.CDATA(line[pos+len(CALIBRE_SNB_PRE_TAG):])
                     continue
                 if len(line) != 0:
                     if line.find(CALIBRE_SNB_IMG_TAG) == 0:
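The fix above replaces a truthiness test on `str.find` with an explicit position: `find` returns -1 when the marker is absent, so the old `not line.find(tag) == 0` also fired for markers sitting mid-line, and the CDATA slice then has to start at `pos` rather than 0. A small illustration; the tag value is a stand-in, the real constant lives in the SNB code:

```python
CALIBRE_SNB_PRE_TAG = u'snbpre'              # stand-in value for the real marker
line = u'  indented snbpre preformatted text'

pos = line.find(CALIBRE_SNB_PRE_TAG)
print pos                                        # -> 11: marker present, but not at column 0
print repr(line[pos+len(CALIBRE_SNB_PRE_TAG):])  # -> u' preformatted text'
```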
@@ -35,7 +35,6 @@ class ViewAction(InterfaceAction):
         self.qaction.setMenu(self.view_menu)
         ac.triggered.connect(self.view_specific_format, type=Qt.QueuedConnection)
-
 
     def location_selected(self, loc):
         enabled = loc == 'library'
         for action in list(self.view_menu.actions())[1:]:
@@ -134,6 +133,9 @@ class ViewAction(InterfaceAction):
         rows = self.gui.current_view().selectionModel().selectedRows()
         self._view_books(rows)
 
+    def view_triggered(self, index):
+        self._view_books([index])
+
     def view_specific_book(self, index):
         self._view_books([index])
@@ -28,6 +28,8 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name, sync, fmt_options, conne
     if log is None:
         log = Log()
     from calibre.library import db
+    from calibre.utils.config import prefs
+    prefs.refresh()
     db = db()
     db.catalog_plugin_on_device_temp_mapping = dbspec
@@ -50,6 +50,8 @@ class BooksView(QTableView): # {{{
     def __init__(self, parent, modelcls=BooksModel):
         QTableView.__init__(self, parent)
+
+        self.setEditTriggers(self.SelectedClicked|self.EditKeyPressed)
 
         self.drag_allowed = True
         self.setDragEnabled(True)
         self.setDragDropOverwriteMode(False)
@@ -98,6 +100,8 @@ class BooksView(QTableView): # {{{
         self._model.about_to_be_sorted.connect(self.about_to_be_sorted)
         self._model.sorting_done.connect(self.sorting_done)
+
+        self.doubleClicked.connect(parent.iactions['View'].view_triggered)
 
     # Column Header Context Menu {{{
     def column_header_context_handler(self, action=None, column=None):
         if not action or not column:
@@ -128,7 +128,7 @@ class ContentServer(object):
         if want_mobile:
             return self.mobile()
 
-        return self.browse_toplevel()
+        return self.browse_catalog()
 
     def old(self, **kwargs):
         return self.static('index.html').replace('{prefix}',
@@ -338,6 +338,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes
 
     * - Keyboard Shortcut
       - Action
+    * - :kbd:`F2 (Enter in OS X)`
+      - Edit the metadata of the currently selected field in the book list.
     * - :kbd:`A`
       - Add Books
     * - :kbd:`C`
new file: src/templite/__init__.py (87 lines)
@@ -0,0 +1,87 @@
#!/usr/bin/env python
#
# Templite+
# A light-weight, fully functional, general purpose templating engine
#
# Copyright (c) 2009 joonis new media
# Author: Thimo Kraemer <thimo.kraemer@joonis.de>
#
# Based on Templite - Tomer Filiba
# http://code.activestate.com/recipes/496702/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#

import sys, re

class Templite(object):
    auto_emit = re.compile('(^[\'\"])|(^[a-zA-Z0-9_\[\]\'\"]+$)')

    def __init__(self, template, start='${', end='}$'):
        if len(start) != 2 or len(end) != 2:
            raise ValueError('each delimiter must be two characters long')
        delimiter = re.compile('%s(.*?)%s' % (re.escape(start), re.escape(end)), re.DOTALL)
        offset = 0
        tokens = []
        for i, part in enumerate(delimiter.split(template)):
            part = part.replace('\\'.join(list(start)), start)
            part = part.replace('\\'.join(list(end)), end)
            if i % 2 == 0:
                if not part: continue
                part = part.replace('\\', '\\\\').replace('"', '\\"')
                part = '\t' * offset + 'emit("""%s""")' % part
            else:
                part = part.rstrip()
                if not part: continue
                if part.lstrip().startswith(':'):
                    if not offset:
                        raise SyntaxError('no block statement to terminate: ${%s}$' % part)
                    offset -= 1
                    part = part.lstrip()[1:]
                    if not part.endswith(':'): continue
                elif self.auto_emit.match(part.lstrip()):
                    part = 'emit(%s)' % part.lstrip()
                lines = part.splitlines()
                margin = min(len(l) - len(l.lstrip()) for l in lines if l.strip())
                part = '\n'.join('\t' * offset + l[margin:] for l in lines)
                if part.endswith(':'):
                    offset += 1
            tokens.append(part)
        if offset:
            raise SyntaxError('%i block statement(s) not terminated' % offset)
        self.__code = compile('\n'.join(tokens), '<templite %r>' % template[:20], 'exec')

    def render(self, __namespace=None, **kw):
        """
        renders the template according to the given namespace.
        __namespace - a dictionary serving as a namespace for evaluation
        **kw - keyword arguments which are added to the namespace
        """
        namespace = {}
        if __namespace: namespace.update(__namespace)
        if kw: namespace.update(kw)
        namespace['emit'] = self.write

        __stdout = sys.stdout
        sys.stdout = self
        self.__output = []
        eval(self.__code, namespace)
        sys.stdout = __stdout
        return ''.join(self.__output)

    def write(self, *args):
        for a in args:
            self.__output.append(str(a))
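A minimal usage sketch of the engine above (the template string is invented): literal text is emitted verbatim, bare names auto-emit, and `emit()` collects output instead of writing to the real stdout.

```python
from templite import Templite

# bare names like ${name}$ match auto_emit and become emit(name);
# quoted expressions such as ${"*"}$ are emitted the same way
t = Templite('Hello ${name}$!\n${for i in range(2):}$${"*"}$${:endfor}$')
print t.render(name='World')   # -> Hello World!
                               #    **
```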