sync with Kovid's branch

Commit 9e6c33961d by Tomasz Długosz, 2012-12-06 22:31:55 +01:00
109 changed files with 53404 additions and 48427 deletions


@@ -20,6 +20,7 @@ class Aksiyon (BasicNewsRecipe):
     auto_cleanup = True
     cover_img_url = 'http://www.aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
     masthead_url = 'http://aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
+    ignore_duplicate_articles = { 'title', 'url' }
     remove_empty_feeds= True
     feeds = [
              ( u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'),
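
For reference, ignore_duplicate_articles is a stock calibre recipe option rather than something specific to this recipe. A minimal sketch of a recipe using the options touched above (the class name and feed URL are illustrative placeholders, not taken from this repository):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):  # hypothetical recipe, for illustration only
        title = 'Example Feed'
        auto_cleanup = True                           # let calibre strip page furniture heuristically
        remove_empty_feeds = True                     # drop feeds that return no articles
        ignore_duplicate_articles = {'title', 'url'}  # skip articles repeating an already-seen title or URL
        feeds = [(u'News', u'http://example.com/rss')]  # placeholder feed URL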


@@ -6,22 +6,41 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 nytimes.com
 '''
 import re, string, time
-from calibre import entity_to_unicode, strftime
+from calibre import strftime
 from datetime import timedelta, date
+from time import sleep
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

 class NYTimes(BasicNewsRecipe):

+    recursions=1 # set this to zero to omit Related articles lists
+
+    # set getTechBlogs to True to include the technology blogs
+    # set tech_oldest_article to control article age
+    # set tech_max_articles_per_feed to control article count
+    getTechBlogs = True
+    remove_empty_feeds = True
+    tech_oldest_article = 14
+    tech_max_articles_per_feed = 25
+
     # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = True

-    # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
-    # number of days old an article can be for inclusion. If oldest_article = 0 all articles
-    # will be included. Note: oldest_article is ignored if webEdition = False
+    # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
+    # number of days old an article can be for inclusion. If oldest_web_article = None all articles
+    # will be included. Note: oldest_web_article is ignored if webEdition = False
     webEdition = False
-    oldest_article = 7
+    oldest_web_article = 7
+
+    # download higher resolution images than the small thumbnails typically included in the article
+    # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
+    useHighResImages = True
+
+    # replace paid Kindle Version: the name will be changed to "The New York Times" to cause
+    # previous paid versions of the new york times to best sent to the back issues folder on the kindle
+    replaceKindleVersion = False

     # includeSections: List of sections to include. If empty, all sections found will be included.
     # Otherwise, only the sections named will be included. For example,
@@ -82,57 +101,68 @@ class NYTimes(BasicNewsRecipe):
                 ('Education',u'education'),
                 ('Multimedia',u'multimedia'),
                 (u'Obituaries',u'obituaries'),
-                (u'Sunday Magazine',u'magazine'),
-                (u'Week in Review',u'weekinreview')]
+                (u'Sunday Magazine',u'magazine')
+               ]

+    tech_feeds = [
+                  (u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'),
+                  (u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
+                  (u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
+                  (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
+                 ]

     if headlinesOnly:
         title='New York Times Headlines'
-        description = 'Headlines from the New York Times. Needs a subscription from http://www.nytimes.com'
-        needs_subscription = 'optional'
+        description = 'Headlines from the New York Times'
+        needs_subscription = False
     elif webEdition:
         title='New York Times (Web)'
         description = 'New York Times on the Web'
-        needs_subscription = True
+        needs_subscription = False
+    elif replaceKindleVersion:
+        title='The New York Times'
+        description = 'Today\'s New York Times'
+        needs_subscription = False
     else:
         title='New York Times'
         description = 'Today\'s New York Times'
-        needs_subscription = True
+        needs_subscription = False

-    month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
-
-    def decode_us_date(self,datestr):
-        udate = datestr.strip().lower().split()
-        try:
-            m = self.month_list.index(udate[0])+1
-        except:
-            return date.today()
-        d = int(udate[1])
-        y = int(udate[2])
-        try:
-            d = date(y,m,d)
-        except:
-            d = date.today
-        return d
-
-    earliest_date = date.today() - timedelta(days=oldest_article)
+    def decode_url_date(self,url):
+        urlitems = url.split('/')
+        try:
+            d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5]))
+        except:
+            try:
+                d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6]))
+            except:
+                return None
+        return d
+
+    if oldest_web_article is None:
+        earliest_date = date.today()
+    else:
+        earliest_date = date.today() - timedelta(days=oldest_web_article)
+    oldest_article = 365   # by default, a long time ago

     __author__  = 'GRiker/Kovid Goyal/Nick Redding'
     language = 'en'
     requires_version = (0, 7, 5)
+    encoding = 'utf-8'

     timefmt = ''

-    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+    simultaneous_downloads = 1

     cover_margins = (18,18,'grey99')

     remove_tags_before = dict(id='article')
     remove_tags_after  = dict(id='article')
-    remove_tags = [dict(attrs={'class':[
+    remove_tags = [
+                    dict(attrs={'class':[
                             'articleFooter',
                             'articleTools',
+                            'columnGroup doubleRule',
                             'columnGroup singleRule',
                             'columnGroup last',
                             'columnGroup last',
@@ -140,7 +170,6 @@ class NYTimes(BasicNewsRecipe):
                             'dottedLine',
                             'entry-meta',
                             'entry-response module',
-                            'icon enlargeThis',
                             'leftNavTabs',
                             'metaFootnote',
                             'module box nav',
@@ -150,10 +179,43 @@ class NYTimes(BasicNewsRecipe):
                             'relatedSearchesModule',
                             'side_tool',
                             'singleAd',
+                            'entry entry-utility', #added for DealBook
+                            'entry-tags', #added for DealBook
+                            'footer promos clearfix', #added for DealBook
+                            'footer links clearfix', #added for DealBook
+                            'tabsContainer', #added for other blog downloads
+                            'column lastColumn', #added for other blog downloads
+                            'pageHeaderWithLabel', #added for other gadgetwise downloads
+                            'column two', #added for other blog downloads
+                            'column two last', #added for other blog downloads
+                            'column three', #added for other blog downloads
+                            'column three last', #added for other blog downloads
+                            'column four',#added for other blog downloads
+                            'column four last',#added for other blog downloads
+                            'column last', #added for other blog downloads
+                            'entry entry-related',
+                            'subNavigation tabContent active', #caucus blog navigation
+                            'mediaOverlay slideshow',
+                            'wideThumb',
+                            'video', #added 02-11-2011
+                            'videoHeader',#added 02-11-2011
+                            'articleInlineVideoHolder', #added 02-11-2011
+                            'assetCompanionAd',
                             re.compile('^subNavigation'),
                             re.compile('^leaderboard'),
                             re.compile('^module'),
+                            re.compile('commentCount')
                             ]}),
+                   dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
+                   dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
+                   dict(name='div', attrs={'class':'tweet'}),
+                   dict(name='span', attrs={'class':'commentCount meta'}),
+                   dict(name='div', attrs={'id':'header'}),
+                   dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
+                   dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
+                   dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
+                   dict(name='div', attrs={'id':re.compile('respond')}), # open
+                   dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
                    dict(id=[
                        'adxLeaderboard',
                        'adxSponLink',
@@ -183,22 +245,29 @@ class NYTimes(BasicNewsRecipe):
                        'side_index',
                        'side_tool',
                        'toolsRight',
+                       'skybox', #added for DealBook
+                       'TopAd', #added for DealBook
+                       'related-content', #added for DealBook
                        ]),
                    dict(name=['script', 'noscript', 'style','form','hr'])]
     no_stylesheets = True
     extra_css = '''
                 .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
-                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-                .timestamp { text-align: left; font-size: small; }
-                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .timestamp { font-weight: normal; text-align: left; font-size: 50%; }
+                .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                 a:link {text-decoration: none; }
+                .date{font-size: 50%; }
+                .update{font-size: 50%; }
                 .articleBody { }
-                .authorId {text-align: left; }
+                .authorId {text-align: left; font-size: 50%; }
                 .image {text-align: center;}
-                .source {text-align: left; }'''
+                .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
+                .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
+                .source {text-align: left; font-size: x-small; }'''

     articles = {}
@@ -237,7 +306,7 @@ class NYTimes(BasicNewsRecipe):
     def exclude_url(self,url):
         if not url.startswith("http"):
             return True
-        if not url.endswith(".html"):
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url:  #added for DealBook
             return True
         if 'nytimes.com' not in url:
             return True
@@ -280,88 +349,91 @@ class NYTimes(BasicNewsRecipe):
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
-        if self.username is not None and self.password is not None:
-            br.open('http://www.nytimes.com/auth/login')
-            br.form = br.forms().next()
-            br['userid']   = self.username
-            br['password'] = self.password
-            raw = br.submit().read()
-            if 'Please try again' in raw:
-                raise Exception('Your username and password are incorrect')
         return br

-    def skip_ad_pages(self, soup):
-        # Skip ad pages served before actual article
-        skip_tag = soup.find(True, {'name':'skip'})
-        if skip_tag is not None:
-            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
-            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-            url += '?pagewanted=all'
-            self.log.warn("Skipping ad to article at '%s'" % url)
-            return self.index_to_soup(url, raw=True)
+## This doesn't work (and probably never did). It either gets another serve of the advertisement,
+## or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
+##
+##    def skip_ad_pages(self, soup):
+##        # Skip ad pages served before actual article
+##        skip_tag = soup.find(True, {'name':'skip'})
+##        if skip_tag is not None:
+##            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
+##            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+##            url += '?pagewanted=all'
+##            self.log.warn("Skipping ad to article at '%s'" % url)
+##            return self.index_to_soup(url, raw=True)

+    cover_tag = 'NY_NYT'
     def get_cover_url(self):
-        cover = None
-        st = time.localtime()
-        year = str(st.tm_year)
-        month = "%.2d" % st.tm_mon
-        day = "%.2d" % st.tm_mday
-        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
-        br = BasicNewsRecipe.get_browser()
-        try:
-            br.open(cover)
-        except:
-            self.log("\nCover unavailable")
-            cover = None
+        from datetime import timedelta, date
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
         return cover

+    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+
     def short_title(self):
         return self.title

-    def index_to_soup(self, url_or_raw, raw=False):
-        '''
-        OVERRIDE of class method
-        deals with various page encodings between index and articles
-        '''
-        def get_the_soup(docEncoding, url_or_raw, raw=False) :
-            if re.match(r'\w+://', url_or_raw):
-                br = self.clone_browser(self.browser)
-                f = br.open_novisit(url_or_raw)
-                _raw = f.read()
-                f.close()
-                if not _raw:
-                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
-            else:
-                _raw = url_or_raw
-            if raw:
-                return _raw
-            if not isinstance(_raw, unicode) and self.encoding:
-                _raw = _raw.decode(docEncoding, 'replace')
-            massage = list(BeautifulSoup.MARKUP_MASSAGE)
-            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
-            return BeautifulSoup(_raw, markupMassage=massage)
-
-        # Entry point
-        soup = get_the_soup( self.encoding, url_or_raw )
-        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
-        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
-        if docEncoding == '' :
-            docEncoding = self.encoding
-
-        if self.verbose > 2:
-            self.log( " document encoding: '%s'" % docEncoding)
-        if docEncoding != self.encoding :
-            soup = get_the_soup(docEncoding, url_or_raw)
-
-        return soup
+    def article_to_soup(self, url_or_raw, raw=False):
+        from contextlib import closing
+        import copy
+        from calibre.ebooks.chardet import xml_to_unicode
+        if re.match(r'\w+://', url_or_raw):
+            br = self.clone_browser(self.browser)
+            open_func = getattr(br, 'open_novisit', br.open)
+            with closing(open_func(url_or_raw)) as f:
+                _raw = f.read()
+            if not _raw:
+                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+        else:
+            _raw = url_or_raw
+        if raw:
+            return _raw
+
+        if not isinstance(_raw, unicode) and self.encoding:
+            if callable(self.encoding):
+                _raw = self.encoding(_raw)
+            else:
+                _raw = _raw.decode(self.encoding, 'replace')
+
+        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+        nmassage.extend(self.preprocess_regexps)
+        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
+        # Some websites have buggy doctype declarations that mess up beautifulsoup
+        # Remove comments as they can leave detritus when extracting tags leaves
+        # multiple nested comments
+        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
+        usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
+        usrc = self.preprocess_raw_html(usrc, url_or_raw)
+        return BeautifulSoup(usrc, markupMassage=nmassage)

     def massageNCXText(self, description):
         # Kindle TOC descriptions won't render certain characters
         if description:
             massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
             # Replace '&' with '&'
-            massaged = re.sub("&","&", massaged)
+            massaged = re.sub("&#038;","&", massaged)
+            massaged = re.sub("&amp;","&", massaged)
             return self.fixChars(massaged)
         else:
             return description
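
The rewritten get_cover_url above falls back one day at a time when the Newseum front-page image for today is not yet posted. The same fallback idea in isolation, assuming nothing from calibre (the probe_cover helper and the use of urllib2 are my own illustrative choices; the recipe itself uses calibre's browser object):

    import urllib2
    from datetime import date, timedelta

    def probe_cover(cover_tag='NY_NYT', max_days_back=7):
        # Try today's Newseum scan first, then step back a day at a time.
        for daysback in range(max_days_back):
            day = (date.today() - timedelta(days=daysback)).day
            url = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                   + str(day) + '/lg/' + cover_tag + '.jpg')
            try:
                urllib2.urlopen(url)   # raises on HTTP errors such as 404
                return url             # first day that responds wins
            except Exception:
                continue
        return None                    # give up after a week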
@@ -383,6 +455,16 @@ class NYTimes(BasicNewsRecipe):
         if self.filterDuplicates:
             if url in self.url_list:
                 return
+        if self.webEdition:
+            date_tag = self.decode_url_date(url)
+            if date_tag is not None:
+                if self.oldest_web_article is not None:
+                    if date_tag < self.earliest_date:
+                        self.log("Skipping article %s" % url)
+                        return
+            else:
+                self.log("Skipping article %s" % url)
+                return
         self.url_list.append(url)
         title = self.tag_to_string(a, use_alt=True).strip()
         description = ''
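
The new webEdition check above depends on decode_url_date recovering a publication date from the /YYYY/MM/DD/ segment of a nytimes.com URL. A small standalone sketch of that idea (the sample URL and the free-standing function are illustrative; the recipe does this as a method):

    from datetime import date, timedelta

    def url_date(url):
        # Look for /YYYY/MM/DD/ starting at the third or fourth path segment.
        parts = url.split('/')
        for start in (3, 4):
            try:
                return date(int(parts[start]), int(parts[start + 1]), int(parts[start + 2]))
            except (IndexError, ValueError):
                continue
        return None   # no recognizable date in the URL

    d = url_date('http://www.nytimes.com/2012/12/06/technology/example.html')
    too_old = d is not None and d < (date.today() - timedelta(days=7))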
@@ -407,6 +489,31 @@ class NYTimes(BasicNewsRecipe):
                             description=description, author=author,
                             content=''))

+    def get_tech_feeds(self,ans):
+        if self.getTechBlogs:
+            tech_articles = {}
+            key_list = []
+            save_oldest_article = self.oldest_article
+            save_max_articles_per_feed = self.max_articles_per_feed
+            self.oldest_article = self.tech_oldest_article
+            self.max_articles_per_feed = self.tech_max_articles_per_feed
+            self.feeds = self.tech_feeds
+            tech = self.parse_feeds()
+            self.oldest_article = save_oldest_article
+            self.max_articles_per_feed = save_max_articles_per_feed
+            self.feeds = None
+            for f in tech:
+                key_list.append(f.title)
+                tech_articles[f.title] = []
+                for a in f.articles:
+                    tech_articles[f.title].append(
+                        dict(title=a.title, url=a.url, date=a.date,
+                             description=a.summary, author=a.author,
+                             content=a.content))
+            tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
+            for x in tech_ans:
+                ans.append(x)
+        return ans

     def parse_web_edition(self):
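
get_tech_feeds above works by saving oldest_article, max_articles_per_feed and feeds, overriding them for one parse_feeds() call, and then restoring them. A generic sketch of that save/override/restore pattern (the override_attrs helper is mine, not calibre API):

    from contextlib import contextmanager

    @contextmanager
    def override_attrs(obj, **overrides):
        # Stash current values, apply the overrides, and always restore afterwards.
        saved = dict((name, getattr(obj, name)) for name in overrides)
        for name, value in overrides.items():
            setattr(obj, name, value)
        try:
            yield obj
        finally:
            for name, value in saved.items():
                setattr(obj, name, value)

    # Rough usage against a recipe-like object:
    #   with override_attrs(recipe, oldest_article=14, max_articles_per_feed=25,
    #                       feeds=recipe.tech_feeds):
    #       tech = recipe.parse_feeds()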
@@ -418,31 +525,41 @@ class NYTimes(BasicNewsRecipe):
             if sec_title in self.excludeSections:
                 print "SECTION EXCLUDED: ",sec_title
                 continue
-            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
-            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+            try:
+                soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+            except:
+                continue
+            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
             self.key = sec_title
             # Find each article
             for div in soup.findAll(True,
-                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
-                if div['class'] in ['story', 'story headline'] :
+                attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+                if div['class'] in ['story', 'story headline', 'storyHeader'] :
                     self.handle_article(div)
+                elif div['class'] == 'ledeStory':
+                    divsub = div.find('div','storyHeader')
+                    if divsub is not None:
+                        self.handle_article(divsub)
+                    ulrefer = div.find('ul','refer')
+                    if ulrefer is not None:
+                        for lidiv in ulrefer.findAll('li'):
+                            self.handle_article(lidiv)
                 elif div['class'] == 'headlinesOnly multiline flush':
                     for lidiv in div.findAll('li'):
                         self.handle_article(lidiv)

         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.ans)
+        return self.filter_ans(self.get_tech_feeds(self.ans))

     def parse_todays_index(self):

         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

         skipping = False
         # Find each article
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

             if div['class'] in ['section-headline','sectionHeader']:
                 self.key = string.capwords(self.feed_title(div))
                 self.key = self.key.replace('Op-ed','Op-Ed')
@@ -466,7 +583,7 @@ class NYTimes(BasicNewsRecipe):
                         self.handle_article(lidiv)

         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.ans)
+        return self.filter_ans(self.get_tech_feeds(self.ans))

     def parse_headline_index(self):
@@ -514,7 +631,7 @@ class NYTimes(BasicNewsRecipe):
             for h3_item in search_div.findAll('h3'):
                 byline = h3_item.h6
                 if byline is not None:
-                    author = self.tag_to_string(byline,usa_alt=False)
+                    author = self.tag_to_string(byline,use_alt=False)
                 else:
                     author = ''
                 a = h3_item.find('a', href=True)
@@ -540,7 +657,7 @@ class NYTimes(BasicNewsRecipe):
                     self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.ans)
+        return self.filter_ans(self.get_tech_feeds(self.ans))

     def parse_index(self):
         if self.headlinesOnly:
@ -550,32 +667,190 @@ class NYTimes(BasicNewsRecipe):
else: else:
return self.parse_todays_index() return self.parse_todays_index()
def strip_anchors(self,soup): def strip_anchors(self,soup,kill_all=False):
paras = soup.findAll(True) paras = soup.findAll(True)
for para in paras: for para in paras:
aTags = para.findAll('a') aTags = para.findAll('a')
for a in aTags: for a in aTags:
if a.img is None: if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace')) if kill_all or (self.recursions==0):
a.replaceWith(self.tag_to_string(a,False))
else:
if a.has_key('href'):
if a['href'].startswith('http://www.nytimes'):
if not a['href'].endswith('pagewanted=all'):
url = re.sub(r'\?.*', '', a['href'])
if self.exclude_url(url):
a.replaceWith(self.tag_to_string(a,False))
else:
a['href'] = url+'?pagewanted=all'
elif not (a['href'].startswith('http://pogue') or \
a['href'].startswith('http://bits') or \
a['href'].startswith('http://travel') or \
a['href'].startswith('http://business') or \
a['href'].startswith('http://tech') or \
a['href'].startswith('http://health') or \
a['href'].startswith('http://dealbook') or \
a['href'].startswith('http://open')):
a.replaceWith(self.tag_to_string(a,False))
return soup
def handle_tags(self,soup):
try:
print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title))
except:
print("HANDLE TAGS: NO TITLE")
if soup is None:
print("ERROR: handle_tags received NoneType")
return None
## print("HANDLING AD FORWARD:")
## print(soup)
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
if isinstance(self.keep_only_tags, dict):
self.keep_only_tags = [self.keep_only_tags]
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass
def remove_beyond(tag, next):
while tag is not None and getattr(tag, 'name', None) != 'body':
after = getattr(tag, next)
while after is not None:
ns = getattr(tag, next)
after.extract()
after = ns
tag = tag.parent
if self.remove_tags_after is not None:
rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
for spec in rt:
tag = soup.find(**spec)
remove_beyond(tag, 'nextSibling')
if self.remove_tags_before is not None:
tag = soup.find(**self.remove_tags_before)
remove_beyond(tag, 'previousSibling')
for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()
return soup return soup
def preprocess_html(self, soup): def preprocess_html(self, soup):
print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url)
sleep(5)
soup = self.handle_tags(self.article_to_soup(url))
if self.webEdition & (self.oldest_article>0): # check if the article is from one of the tech blogs
date_tag = soup.find(True,attrs={'class': ['dateline','date']}) blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']})
if date_tag:
date_str = self.tag_to_string(date_tag,use_alt=False) if blog is not None:
date_str = date_str.replace('Published:','') old_body = soup.find('body')
date_items = date_str.split(',') new_body=Tag(soup,'body')
try: new_body.append(soup.find('div',attrs={'id':'content'}))
datestring = date_items[0]+' '+date_items[1] new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
article_date = self.decode_us_date(datestring) old_body.replaceWith(new_body)
except: for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
article_date = date.today() if divr.find(text=re.compile('Sign up')):
if article_date < self.earliest_date: divr.extract()
self.log("Skipping article dated %s" % date_str) divr = soup.find('div',attrs={'id':re.compile('related-content')})
return None if divr is not None:
# handle related articles
rlist = []
ul = divr.find('ul')
if ul is not None:
for li in ul.findAll('li'):
atag = li.find('a')
if atag is not None:
if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
atag['href'].startswith('http://open'):
atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False)))
rlist.append(atag)
divr.extract()
if rlist != []:
asidediv = Tag(soup,'div',[('class','aside')])
if soup.find('hr') is None:
asidediv.append(Tag(soup,'hr'))
h4 = Tag(soup,'h4',[('class','asidenote')])
h4.insert(0,"Related Posts")
asidediv.append(h4)
ul = Tag(soup,'ul')
for r in rlist:
li = Tag(soup,'li',[('class','aside')])
r['class'] = 'aside'
li.append(r)
ul.append(li)
asidediv.append(ul)
asidediv.append(Tag(soup,'hr'))
smain = soup.find('body')
smain.append(asidediv)
for atag in soup.findAll('a'):
img = atag.find('img')
if img is not None:
atag.replaceWith(img)
elif not atag.has_key('href'):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
hdr = soup.find('address')
if hdr is not None:
hdr.name='span'
for span_credit in soup.findAll('span','credit'):
sp = Tag(soup,'span')
span_credit.replaceWith(sp)
sp.append(Tag(soup,'br'))
sp.append(span_credit)
sp.append(Tag(soup,'br'))
else: # nytimes article
related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
for rdiv in soup.findAll('div','columnGroup doubleRule'):
if rdiv.find('h3') is not None:
if self.tag_to_string(rdiv.h3,False).startswith('Related'):
rdiv.h3.find(text=True).replaceWith("Related articles")
rdiv.h3['class'] = 'asidenote'
for litag in rdiv.findAll('li'):
if litag.find('a') is not None:
if litag.find('a')['href'].startswith('http://www.nytimes.com'):
url = re.sub(r'\?.*', '', litag.find('a')['href'])
litag.find('a')['href'] = url+'?pagewanted=all'
litag.extract()
related.append(litag)
if first_related is None:
first_related = rdiv
first_outer = outerdiv
else:
litag.extract()
if related != []:
for r in related:
if r.h6: # don't want the anchor inside a h6 tag
r.h6.replaceWith(r.h6.a)
first_related.ul.append(r)
first_related.insert(0,Tag(soup,'hr'))
first_related.append(Tag(soup,'hr'))
first_related['class'] = 'aside'
first_outer.replaceWith(first_related) # replace the outer tag with the related tag
for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
rdiv.extract()
kicker_tag = soup.find(attrs={'class':'kicker'}) kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots if kicker_tag: # remove Op_Ed author head shots
@ -584,9 +859,77 @@ class NYTimes(BasicNewsRecipe):
img_div = soup.find('div','inlineImage module') img_div = soup.find('div','inlineImage module')
if img_div: if img_div:
img_div.extract() img_div.extract()
return self.strip_anchors(soup)
def postprocess_html(self,soup, True): if self.useHighResImages:
try:
#open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
popupreflink = popupref.find('a')
if popupreflink:
reflinkstring = str(popupreflink['href'])
refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
refend = reflinkstring.find(".html", refstart) + len(".html")
reflinkstring = reflinkstring[refstart:refend]
popuppage = self.browser.open(reflinkstring)
popuphtml = popuppage.read()
popuppage.close()
if popuphtml:
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
popupSoup = BeautifulSoup(popuphtml)
highResTag = popupSoup.find('img', {'src':highResImageLink})
if highResTag:
try:
newWidth = highResTag['width']
newHeight = highResTag['height']
imageTag = popupref.parent.find("img")
except:
self.log("Error: finding width and height of img")
popupref.extract()
if imageTag:
try:
imageTag['src'] = highResImageLink
imageTag['width'] = newWidth
imageTag['height'] = newHeight
except:
self.log("Error setting the src width and height parameters")
except Exception:
self.log("Error pulling high resolution images")
try:
#in case pulling images failed, delete the enlarge this text
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
popupref.extract()
except:
self.log("Error removing Enlarge this text")
return self.strip_anchors(soup,False)
def postprocess_html(self,soup,first_fetch):
if not first_fetch: # remove Related links
for aside in soup.findAll('div','aside'):
aside.extract()
soup = self.strip_anchors(soup,True)
if soup.find('div',attrs={'id':'blogcontent'}) is None:
if first_fetch:
aside = soup.find('div','aside')
if aside is not None: # move the related list to the end of the article
art = soup.find('div',attrs={'id':'article'})
if art is None:
art = soup.find('div',attrs={'class':'article'})
if art is not None:
art.append(aside)
try: try:
if self.one_picture_per_article: if self.one_picture_per_article:
# Remove all images after first # Remove all images after first
@ -642,6 +985,7 @@ class NYTimes(BasicNewsRecipe):
try: try:
# Change <nyt_headline> to <h2> # Change <nyt_headline> to <h2>
h1 = soup.find('h1') h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook
if h1: if h1:
headline = h1.find("nyt_headline") headline = h1.find("nyt_headline")
if headline: if headline:
@ -649,13 +993,19 @@ class NYTimes(BasicNewsRecipe):
tag['class'] = "headline" tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0])) tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag) h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook
else: else:
# Blog entry - replace headline, remove <hr> tags # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
headline = soup.find('title') headline = soup.find('title')
if headline: if headline:
tag = Tag(soup, "h2") tag = Tag(soup, "h2")
tag['class'] = "headline" tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0])) tag.insert(0, self.fixChars(headline.renderContents()))
soup.insert(0, tag) soup.insert(0, tag)
hrs = soup.findAll('hr') hrs = soup.findAll('hr')
for hr in hrs: for hr in hrs:
@ -663,6 +1013,29 @@ class NYTimes(BasicNewsRecipe):
except: except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>") self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
try:
#if this is from a blog (dealbook, fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor:
tag = Tag(soup, "h6")
tag['class'] = "byline"
tag.insert(0, self.fixChars(bylineauthor.renderContents()))
bylineauthor.replaceWith(tag)
except:
self.log("ERROR: fixing byline author format")
try:
#if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit:
tag = Tag(soup, "h6")
tag['class'] = "credit"
tag.insert(0, self.fixChars(blogcredit.renderContents()))
blogcredit.replaceWith(tag)
except:
self.log("ERROR: fixing credit format")
try: try:
# Change <h1> to <h3> - used in editorial blogs # Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1") masthead = soup.find("h1")
@ -685,6 +1058,13 @@ class NYTimes(BasicNewsRecipe):
subhead.replaceWith(bTag) subhead.replaceWith(bTag)
except: except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
#remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'})
if blogupdated:
blogupdated.replaceWith("")
except:
self.log("ERROR: Removing strong tag")
try: try:
divTag = soup.find('div',attrs={'id':'articleBody'}) divTag = soup.find('div',attrs={'id':'articleBody'})
@ -708,16 +1088,16 @@ class NYTimes(BasicNewsRecipe):
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'): if not first:
return
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'}) idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
if idxdiv is not None: if idxdiv is not None:
if idxdiv.img: if idxdiv.img:
self.add_toc_thumbnail(article, idxdiv.img['src']) self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src']))
else: else:
img = soup.find('img') img = soup.find('body').find('img')
if img is not None: if img is not None:
self.add_toc_thumbnail(article, img['src']) self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src']))
shortparagraph = "" shortparagraph = ""
try: try:
if len(article.text_summary.strip()) == 0: if len(article.text_summary.strip()) == 0:
@ -731,13 +1111,22 @@ class NYTimes(BasicNewsRecipe):
#account for blank paragraphs and short paragraphs by appending them to longer ones #account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0: if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph newpara = shortparagraph + refparagraph
newparaDateline,newparaEm,newparaDesc = newpara.partition('&mdash;')
if newparaEm == '':
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
if newparaEm == '':
newparaDesc = newparaDateline
article.summary = article.text_summary = newparaDesc.strip()
return return
else: else:
shortparagraph = refparagraph + " " shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- " shortparagraph = shortparagraph + "- "
else:
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
except: except:
self.log("Error creating article descriptions") self.log("Error creating article descriptions")
return return


@ -6,31 +6,42 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
nytimes.com nytimes.com
''' '''
import re, string, time import re, string, time
from calibre import entity_to_unicode, strftime from calibre import strftime
from datetime import timedelta, date from datetime import timedelta, date
from time import sleep
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class NYTimes(BasicNewsRecipe): class NYTimes(BasicNewsRecipe):
recursions=1 # set this to zero to omit Related articles lists
# set getTechBlogs to True to include the technology blogs
# set tech_oldest_article to control article age
# set tech_max_articles_per_feed to control article count
getTechBlogs = True
remove_empty_feeds = True
tech_oldest_article = 14
tech_max_articles_per_feed = 25
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored. # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
headlinesOnly = False headlinesOnly = False
# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
# number of days old an article can be for inclusion. If oldest_article = 0 all articles # number of days old an article can be for inclusion. If oldest_web_article = None all articles
# will be included. Note: oldest_article is ignored if webEdition = False # will be included. Note: oldest_web_article is ignored if webEdition = False
webEdition = False webEdition = False
oldest_article = 7 oldest_web_article = 7
# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
replaceKindleVersion = False
# download higher resolution images than the small thumbnails typically included in the article # download higher resolution images than the small thumbnails typically included in the article
# the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
useHighResImages = True useHighResImages = True
# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
replaceKindleVersion = False
# includeSections: List of sections to include. If empty, all sections found will be included. # includeSections: List of sections to include. If empty, all sections found will be included.
# Otherwise, only the sections named will be included. For example, # Otherwise, only the sections named will be included. For example,
# #
@ -90,60 +101,68 @@ class NYTimes(BasicNewsRecipe):
('Education',u'education'), ('Education',u'education'),
('Multimedia',u'multimedia'), ('Multimedia',u'multimedia'),
(u'Obituaries',u'obituaries'), (u'Obituaries',u'obituaries'),
(u'Sunday Magazine',u'magazine'), (u'Sunday Magazine',u'magazine')
(u'Week in Review',u'weekinreview')] ]
tech_feeds = [
(u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'),
(u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
]
if headlinesOnly: if headlinesOnly:
title='New York Times Headlines' title='New York Times Headlines'
description = 'Headlines from the New York Times' description = 'Headlines from the New York Times'
needs_subscription = True needs_subscription = False
elif webEdition: elif webEdition:
title='New York Times (Web)' title='New York Times (Web)'
description = 'New York Times on the Web' description = 'New York Times on the Web'
needs_subscription = True needs_subscription = False
elif replaceKindleVersion: elif replaceKindleVersion:
title='The New York Times' title='The New York Times'
description = 'Today\'s New York Times' description = 'Today\'s New York Times'
needs_subscription = True needs_subscription = False
else: else:
title='New York Times' title='New York Times'
description = 'Today\'s New York Times. Needs subscription from http://www.nytimes.com' description = 'Today\'s New York Times'
needs_subscription = True needs_subscription = False
def decode_url_date(self,url):
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december'] urlitems = url.split('/')
def decode_us_date(self,datestr):
udate = datestr.strip().lower().split()
try: try:
m = self.month_list.index(udate[0])+1 d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5]))
except: except:
return date.today()
d = int(udate[1])
y = int(udate[2])
try: try:
d = date(y,m,d) d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6]))
except: except:
d = date.today return None
return d return d
earliest_date = date.today() - timedelta(days=oldest_article) if oldest_web_article is None:
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
oldest_article = 365 # by default, a long time ago
__author__ = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier' __author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en' language = 'en'
requires_version = (0, 7, 5) requires_version = (0, 7, 5)
encoding = 'utf-8'
timefmt = '' timefmt = ''
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
simultaneous_downloads = 1
cover_margins = (18,18,'grey99') cover_margins = (18,18,'grey99')
remove_tags_before = dict(id='article') remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article') remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':[ remove_tags = [
dict(attrs={'class':[
'articleFooter', 'articleFooter',
'articleTools', 'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule', 'columnGroup singleRule',
'columnGroup last', 'columnGroup last',
'columnGroup last', 'columnGroup last',
@ -151,7 +170,6 @@ class NYTimes(BasicNewsRecipe):
'dottedLine', 'dottedLine',
'entry-meta', 'entry-meta',
'entry-response module', 'entry-response module',
#'icon enlargeThis', #removed to provide option for high res images
'leftNavTabs', 'leftNavTabs',
'metaFootnote', 'metaFootnote',
'module box nav', 'module box nav',
@ -175,12 +193,9 @@ class NYTimes(BasicNewsRecipe):
'column four',#added for other blog downloads 'column four',#added for other blog downloads
'column four last',#added for other blog downloads 'column four last',#added for other blog downloads
'column last', #added for other blog downloads 'column last', #added for other blog downloads
'timestamp published', #added for other blog downloads
'entry entry-related', 'entry entry-related',
'subNavigation tabContent active', #caucus blog navigation 'subNavigation tabContent active', #caucus blog navigation
'columnGroup doubleRule',
'mediaOverlay slideshow', 'mediaOverlay slideshow',
'headlinesOnly multiline flush',
'wideThumb', 'wideThumb',
'video', #added 02-11-2011 'video', #added 02-11-2011
'videoHeader',#added 02-11-2011 'videoHeader',#added 02-11-2011
@ -189,7 +204,18 @@ class NYTimes(BasicNewsRecipe):
re.compile('^subNavigation'), re.compile('^subNavigation'),
re.compile('^leaderboard'), re.compile('^leaderboard'),
re.compile('^module'), re.compile('^module'),
re.compile('commentCount')
]}), ]}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
dict(name='div', attrs={'class':'tweet'}),
dict(name='span', attrs={'class':'commentCount meta'}),
dict(name='div', attrs={'id':'header'}),
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(id=[ dict(id=[
'adxLeaderboard', 'adxLeaderboard',
'adxSponLink', 'adxSponLink',
@ -227,17 +253,21 @@ class NYTimes(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
extra_css = ''' extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { text-align: left; font-size: small; } .timestamp { font-weight: normal; text-align: left; font-size: 50%; }
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; } a:link {text-decoration: none; }
.date{font-size: 50%; }
.update{font-size: 50%; }
.articleBody { } .articleBody { }
.authorId {text-align: left; } .authorId {text-align: left; font-size: 50%; }
.image {text-align: center;} .image {text-align: center;}
.source {text-align: left; }''' .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
.source {text-align: left; font-size: x-small; }'''
articles = {} articles = {}
@ -276,7 +306,7 @@ class NYTimes(BasicNewsRecipe):
def exclude_url(self,url): def exclude_url(self,url):
if not url.startswith("http"): if not url.startswith("http"):
return True return True
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
return True return True
if 'nytimes.com' not in url: if 'nytimes.com' not in url:
return True return True
@ -319,88 +349,91 @@ class NYTimes(BasicNewsRecipe):
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.form = br.forms().next()
br['userid'] = self.username
br['password'] = self.password
raw = br.submit().read()
if 'Please try again' in raw:
raise Exception('Your username and password are incorrect')
return br return br
def skip_ad_pages(self, soup): ## This doesn't work (and probably never did). It either gets another serve of the advertisement,
# Skip ad pages served before actual article ## or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
skip_tag = soup.find(True, {'name':'skip'}) ##
if skip_tag is not None: ## def skip_ad_pages(self, soup):
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href']) ## # Skip ad pages served before actual article
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) ## skip_tag = soup.find(True, {'name':'skip'})
url += '?pagewanted=all' ## if skip_tag is not None:
self.log.warn("Skipping ad to article at '%s'" % url) ## self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
return self.index_to_soup(url, raw=True) ## url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
## url += '?pagewanted=all'
## self.log.warn("Skipping ad to article at '%s'" % url)
## return self.index_to_soup(url, raw=True)
cover_tag = 'NY_NYT'
def get_cover_url(self): def get_cover_url(self):
cover = None from datetime import timedelta, date
st = time.localtime() cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
year = str(st.tm_year) br = BasicNewsRecipe.get_browser()
month = "%.2d" % st.tm_mon daysback=1
day = "%.2d" % st.tm_mday try:
cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg' br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
try: try:
br.open(cover) br.open(cover)
except: except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable") self.log("\nCover unavailable")
cover = None cover = None
return cover return cover
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
def short_title(self): def short_title(self):
return self.title return self.title
def index_to_soup(self, url_or_raw, raw=False):
''' def article_to_soup(self, url_or_raw, raw=False):
OVERRIDE of class method from contextlib import closing
deals with various page encodings between index and articles import copy
''' from calibre.ebooks.chardet import xml_to_unicode
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw): if re.match(r'\w+://', url_or_raw):
br = self.clone_browser(self.browser) br = self.clone_browser(self.browser)
f = br.open_novisit(url_or_raw) open_func = getattr(br, 'open_novisit', br.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read() _raw = f.read()
f.close()
if not _raw: if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw) raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else: else:
_raw = url_or_raw _raw = url_or_raw
if raw: if raw:
return _raw return _raw
if not isinstance(_raw, unicode) and self.encoding: if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(docEncoding, 'replace') if callable(self.encoding):
massage = list(BeautifulSoup.MARKUP_MASSAGE) _raw = self.encoding(_raw)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding))) else:
return BeautifulSoup(_raw, markupMassage=massage) _raw = _raw.decode(self.encoding, 'replace')
# Entry point nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
soup = get_the_soup( self.encoding, url_or_raw ) nmassage.extend(self.preprocess_regexps)
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] # Some websites have buggy doctype declarations that mess up beautifulsoup
if docEncoding == '' : # Remove comments as they can leave detritus when extracting tags leaves
docEncoding = self.encoding # multiple nested comments
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
usrc = self.preprocess_raw_html(usrc, url_or_raw)
return BeautifulSoup(usrc, markupMassage=nmassage)
if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
return soup
def massageNCXText(self, description): def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters # Kindle TOC descriptions won't render certain characters
if description: if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&' # Replace '&' with '&'
massaged = re.sub("&","&", massaged) massaged = re.sub("&#038;","&", massaged)
massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged) return self.fixChars(massaged)
else: else:
return description return description
@ -422,6 +455,16 @@ class NYTimes(BasicNewsRecipe):
if self.filterDuplicates: if self.filterDuplicates:
if url in self.url_list: if url in self.url_list:
return return
if self.webEdition:
date_tag = self.decode_url_date(url)
if date_tag is not None:
if self.oldest_web_article is not None:
if date_tag < self.earliest_date:
self.log("Skipping article %s" % url)
return
else:
self.log("Skipping article %s" % url)
return
self.url_list.append(url) self.url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip() title = self.tag_to_string(a, use_alt=True).strip()
description = '' description = ''
@ -446,6 +489,31 @@ class NYTimes(BasicNewsRecipe):
description=description, author=author, description=description, author=author,
content='')) content=''))
def get_tech_feeds(self,ans):
if self.getTechBlogs:
tech_articles = {}
key_list = []
save_oldest_article = self.oldest_article
save_max_articles_per_feed = self.max_articles_per_feed
self.oldest_article = self.tech_oldest_article
self.max_articles_per_feed = self.tech_max_articles_per_feed
self.feeds = self.tech_feeds
tech = self.parse_feeds()
self.oldest_article = save_oldest_article
self.max_articles_per_feed = save_max_articles_per_feed
self.feeds = None
for f in tech:
key_list.append(f.title)
tech_articles[f.title] = []
for a in f.articles:
tech_articles[f.title].append(
dict(title=a.title, url=a.url, date=a.date,
description=a.summary, author=a.author,
content=a.content))
tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
for x in tech_ans:
ans.append(x)
return ans
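The get_tech_feeds() addition above temporarily swaps the recipe's oldest_article, max_articles_per_feed and feeds attributes so that the stock parse_feeds() machinery fetches the blog feeds with their own limits, then restores the saved values. A rough standalone sketch of that override-and-restore pattern; with_overrides is an illustrative name, not part of the recipe API, and it uses try/finally where the recipe restores the values by hand:

    # Sketch of the save/override/restore pattern used by get_tech_feeds().
    # 'obj' stands in for the recipe instance, 'overrides' for the temporary values.
    def with_overrides(obj, overrides, action):
        saved = {name: getattr(obj, name) for name in overrides}
        try:
            for name, value in overrides.items():
                setattr(obj, name, value)
            return action()
        finally:
            for name, value in saved.items():
                setattr(obj, name, value)

    # e.g. feeds = with_overrides(recipe,
    #         {'oldest_article': 14, 'max_articles_per_feed': 25, 'feeds': tech_feeds},
    #         recipe.parse_feeds)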
def parse_web_edition(self): def parse_web_edition(self):
@ -457,31 +525,41 @@ class NYTimes(BasicNewsRecipe):
if sec_title in self.excludeSections: if sec_title in self.excludeSections:
print "SECTION EXCLUDED: ",sec_title print "SECTION EXCLUDED: ",sec_title
continue continue
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html' try:
soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html') soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
except:
continue
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
self.key = sec_title self.key = sec_title
# Find each article # Find each article
for div in soup.findAll(True, for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['story', 'story headline'] : if div['class'] in ['story', 'story headline', 'storyHeader'] :
self.handle_article(div) self.handle_article(div)
elif div['class'] == 'ledeStory':
divsub = div.find('div','storyHeader')
if divsub is not None:
self.handle_article(divsub)
ulrefer = div.find('ul','refer')
if ulrefer is not None:
for lidiv in ulrefer.findAll('li'):
self.handle_article(lidiv)
elif div['class'] == 'headlinesOnly multiline flush': elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'): for lidiv in div.findAll('li'):
self.handle_article(lidiv) self.handle_article(lidiv)
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans) return self.filter_ans(self.get_tech_feeds(self.ans))
def parse_todays_index(self): def parse_todays_index(self):
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
skipping = False skipping = False
# Find each article # Find each article
for div in soup.findAll(True, for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['section-headline','sectionHeader']: if div['class'] in ['section-headline','sectionHeader']:
self.key = string.capwords(self.feed_title(div)) self.key = string.capwords(self.feed_title(div))
self.key = self.key.replace('Op-ed','Op-Ed') self.key = self.key.replace('Op-ed','Op-Ed')
@ -505,7 +583,7 @@ class NYTimes(BasicNewsRecipe):
self.handle_article(lidiv) self.handle_article(lidiv)
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans) return self.filter_ans(self.get_tech_feeds(self.ans))
def parse_headline_index(self): def parse_headline_index(self):
@ -553,7 +631,7 @@ class NYTimes(BasicNewsRecipe):
for h3_item in search_div.findAll('h3'): for h3_item in search_div.findAll('h3'):
byline = h3_item.h6 byline = h3_item.h6
if byline is not None: if byline is not None:
author = self.tag_to_string(byline,usa_alt=False) author = self.tag_to_string(byline,use_alt=False)
else: else:
author = '' author = ''
a = h3_item.find('a', href=True) a = h3_item.find('a', href=True)
@ -579,7 +657,7 @@ class NYTimes(BasicNewsRecipe):
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans) return self.filter_ans(self.get_tech_feeds(self.ans))
def parse_index(self): def parse_index(self):
if self.headlinesOnly: if self.headlinesOnly:
@ -589,40 +667,198 @@ class NYTimes(BasicNewsRecipe):
else: else:
return self.parse_todays_index() return self.parse_todays_index()
def strip_anchors(self,soup): def strip_anchors(self,soup,kill_all=False):
paras = soup.findAll(True) paras = soup.findAll(True)
for para in paras: for para in paras:
aTags = para.findAll('a') aTags = para.findAll('a')
for a in aTags: for a in aTags:
if a.img is None: if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace')) if kill_all or (self.recursions==0):
a.replaceWith(self.tag_to_string(a,False))
else:
if a.has_key('href'):
if a['href'].startswith('http://www.nytimes'):
if not a['href'].endswith('pagewanted=all'):
url = re.sub(r'\?.*', '', a['href'])
if self.exclude_url(url):
a.replaceWith(self.tag_to_string(a,False))
else:
a['href'] = url+'?pagewanted=all'
elif not (a['href'].startswith('http://pogue') or \
a['href'].startswith('http://bits') or \
a['href'].startswith('http://travel') or \
a['href'].startswith('http://business') or \
a['href'].startswith('http://tech') or \
a['href'].startswith('http://health') or \
a['href'].startswith('http://dealbook') or \
a['href'].startswith('http://open')):
a.replaceWith(self.tag_to_string(a,False))
return soup
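The rewritten strip_anchors() above keeps nytimes.com links but normalizes them: any query string is dropped and ?pagewanted=all is appended so the single-page version of the article is fetched. A minimal illustration of that normalization step; the sample URL is invented:

    import re

    def normalize_nyt_url(href):
        # Drop any existing query string, then request the single-page view.
        url = re.sub(r'\?.*', '', href)
        return url + '?pagewanted=all'

    print(normalize_nyt_url('http://www.nytimes.com/2012/12/06/us/example.html?hp&_r=0'))
    # http://www.nytimes.com/2012/12/06/us/example.html?pagewanted=all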
def handle_tags(self,soup):
try:
print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title))
except:
print("HANDLE TAGS: NO TITLE")
if soup is None:
print("ERROR: handle_tags received NoneType")
return None
## print("HANDLING AD FORWARD:")
## print(soup)
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
if isinstance(self.keep_only_tags, dict):
self.keep_only_tags = [self.keep_only_tags]
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass
def remove_beyond(tag, next):
while tag is not None and getattr(tag, 'name', None) != 'body':
after = getattr(tag, next)
while after is not None:
ns = getattr(tag, next)
after.extract()
after = ns
tag = tag.parent
if self.remove_tags_after is not None:
rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
for spec in rt:
tag = soup.find(**spec)
remove_beyond(tag, 'nextSibling')
if self.remove_tags_before is not None:
tag = soup.find(**self.remove_tags_before)
remove_beyond(tag, 'previousSibling')
for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()
return soup return soup
def preprocess_html(self, soup): def preprocess_html(self, soup):
if self.webEdition & (self.oldest_article>0): print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
date_tag = soup.find(True,attrs={'class': ['dateline','date']}) skip_tag = soup.find(True, {'name':'skip'})
if date_tag: if skip_tag is not None:
date_str = self.tag_to_string(date_tag,use_alt=False) url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
date_str = date_str.replace('Published:','') url += '?pagewanted=all'
date_items = date_str.split(',') self.log.warn("Skipping ad to article at '%s'" % url)
try: sleep(5)
datestring = date_items[0]+' '+date_items[1] soup = self.handle_tags(self.article_to_soup(url))
article_date = self.decode_us_date(datestring)
except:
article_date = date.today()
if article_date < self.earliest_date:
self.log("Skipping article dated %s" % date_str)
return None
#all articles are from today, no need to print the date on every page # check if the article is from one of the tech blogs
try: blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']})
if not self.webEdition:
date_tag = soup.find(True,attrs={'class': ['dateline','date']}) if blog is not None:
if date_tag: old_body = soup.find('body')
date_tag.extract() new_body=Tag(soup,'body')
except: new_body.append(soup.find('div',attrs={'id':'content'}))
self.log("Error removing the published date") new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
old_body.replaceWith(new_body)
for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
if divr.find(text=re.compile('Sign up')):
divr.extract()
divr = soup.find('div',attrs={'id':re.compile('related-content')})
if divr is not None:
# handle related articles
rlist = []
ul = divr.find('ul')
if ul is not None:
for li in ul.findAll('li'):
atag = li.find('a')
if atag is not None:
if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
atag['href'].startswith('http://open'):
atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False)))
rlist.append(atag)
divr.extract()
if rlist != []:
asidediv = Tag(soup,'div',[('class','aside')])
if soup.find('hr') is None:
asidediv.append(Tag(soup,'hr'))
h4 = Tag(soup,'h4',[('class','asidenote')])
h4.insert(0,"Related Posts")
asidediv.append(h4)
ul = Tag(soup,'ul')
for r in rlist:
li = Tag(soup,'li',[('class','aside')])
r['class'] = 'aside'
li.append(r)
ul.append(li)
asidediv.append(ul)
asidediv.append(Tag(soup,'hr'))
smain = soup.find('body')
smain.append(asidediv)
for atag in soup.findAll('a'):
img = atag.find('img')
if img is not None:
atag.replaceWith(img)
elif not atag.has_key('href'):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
hdr = soup.find('address')
if hdr is not None:
hdr.name='span'
for span_credit in soup.findAll('span','credit'):
sp = Tag(soup,'span')
span_credit.replaceWith(sp)
sp.append(Tag(soup,'br'))
sp.append(span_credit)
sp.append(Tag(soup,'br'))
else: # nytimes article
related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
for rdiv in soup.findAll('div','columnGroup doubleRule'):
if rdiv.find('h3') is not None:
if self.tag_to_string(rdiv.h3,False).startswith('Related'):
rdiv.h3.find(text=True).replaceWith("Related articles")
rdiv.h3['class'] = 'asidenote'
for litag in rdiv.findAll('li'):
if litag.find('a') is not None:
if litag.find('a')['href'].startswith('http://www.nytimes.com'):
url = re.sub(r'\?.*', '', litag.find('a')['href'])
litag.find('a')['href'] = url+'?pagewanted=all'
litag.extract()
related.append(litag)
if first_related is None:
first_related = rdiv
first_outer = outerdiv
else:
litag.extract()
if related != []:
for r in related:
if r.h6: # don't want the anchor inside a h6 tag
r.h6.replaceWith(r.h6.a)
first_related.ul.append(r)
first_related.insert(0,Tag(soup,'hr'))
first_related.append(Tag(soup,'hr'))
first_related['class'] = 'aside'
first_outer.replaceWith(first_related) # replace the outer tag with the related tag
for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
rdiv.extract()
kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots
tagline = self.tag_to_string(kicker_tag)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
if img_div:
img_div.extract()
if self.useHighResImages: if self.useHighResImages:
try: try:
@ -667,26 +903,6 @@ class NYTimes(BasicNewsRecipe):
except Exception: except Exception:
self.log("Error pulling high resolution images") self.log("Error pulling high resolution images")
try:
#remove "Related content" bar
runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline','articleInline runaroundLeft ','articleInline runaroundLeft lastArticleInline']})
if runAroundsFound:
for runAround in runAroundsFound:
#find all section headers
hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']})
if hlines:
for hline in hlines:
hline.extract()
#find all section headers
hlines = runAround.findAll('h6')
if hlines:
for hline in hlines:
hline.extract()
except:
self.log("Error removing related content bar")
try: try:
#in case pulling images failed, delete the enlarge this text #in case pulling images failed, delete the enlarge this text
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
@ -696,9 +912,24 @@ class NYTimes(BasicNewsRecipe):
except: except:
self.log("Error removing Enlarge this text") self.log("Error removing Enlarge this text")
return self.strip_anchors(soup)
def postprocess_html(self,soup, True): return self.strip_anchors(soup,False)
def postprocess_html(self,soup,first_fetch):
if not first_fetch: # remove Related links
for aside in soup.findAll('div','aside'):
aside.extract()
soup = self.strip_anchors(soup,True)
if soup.find('div',attrs={'id':'blogcontent'}) is None:
if first_fetch:
aside = soup.find('div','aside')
if aside is not None: # move the related list to the end of the article
art = soup.find('div',attrs={'id':'article'})
if art is None:
art = soup.find('div',attrs={'class':'article'})
if art is not None:
art.append(aside)
try: try:
if self.one_picture_per_article: if self.one_picture_per_article:
# Remove all images after first # Remove all images after first
@ -855,23 +1086,22 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS") self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'): if not first:
return
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'}) idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
if idxdiv is not None: if idxdiv is not None:
if idxdiv.img: if idxdiv.img:
self.add_toc_thumbnail(article, idxdiv.img['src']) self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src']))
else: else:
img = soup.find('img') img = soup.find('body').find('img')
if img is not None: if img is not None:
self.add_toc_thumbnail(article, img['src']) self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src']))
shortparagraph = "" shortparagraph = ""
try: try:
if len(article.text_summary.strip()) == 0: if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'articleBody'}) articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
if not articlebodies: #added to account for blog formats
articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats
if articlebodies: if articlebodies:
for articlebody in articlebodies: for articlebody in articlebodies:
if articlebody: if articlebody:
@ -880,15 +1110,23 @@ class NYTimes(BasicNewsRecipe):
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip() refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones #account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0: if len(refparagraph) > 0:
if len(refparagraph) > 140: #approximately two lines of text if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph newpara = shortparagraph + refparagraph
newparaDateline,newparaEm,newparaDesc = newpara.partition('&mdash;')
if newparaEm == '':
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
if newparaEm == '':
newparaDesc = newparaDateline
article.summary = article.text_summary = newparaDesc.strip()
return return
else: else:
shortparagraph = refparagraph + " " shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- " shortparagraph = shortparagraph + "- "
else:
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
except: except:
self.log("Error creating article descriptions") self.log("Error creating article descriptions")
return return
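The summary code above now strips a leading dateline by partitioning the paragraph on an em-dash, first as the &mdash; entity and then as the literal character. A small self-contained illustration; the sample sentence is invented:

    # -*- coding: utf-8 -*-
    def strip_dateline(text):
        # Drop a leading dateline such as "WASHINGTON — " from a summary paragraph.
        dateline, sep, rest = text.partition(u'&mdash;')
        if sep == '':
            dateline, sep, rest = text.partition(u'—')
        return rest.strip() if sep else text.strip()

    print(strip_dateline(u'WASHINGTON — Lawmakers returned to the capital on Tuesday ...'))
    # Lawmakers returned to the capital on Tuesday ...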

View File

@ -8,19 +8,19 @@ Fetch sueddeutsche.de
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Sueddeutsche(BasicNewsRecipe): class Sueddeutsche(BasicNewsRecipe):
title = u'Süddeutsche.de' # 2012-01-26 AGe Correct Title title = u'Süddeutsche.de'
description = 'News from Germany, Access to online content' # 2012-01-26 AGe description = 'News from Germany, Access to online content'
__author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-01-26 __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-12-05
publisher = u'Süddeutsche Zeitung' # 2012-01-26 AGe add publisher = u'Süddeutsche Zeitung'
category = 'news, politics, Germany' # 2012-01-26 AGe add category = 'news, politics, Germany'
timefmt = ' [%a, %d %b %Y]' # 2012-01-26 AGe add %a timefmt = ' [%a, %d %b %Y]'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
language = 'de' language = 'de'
encoding = 'utf-8' encoding = 'utf-8'
publication_type = 'newspaper' # 2012-01-26 add publication_type = 'newspaper'
cover_source = 'http://www.sueddeutsche.de/verlag' # 2012-01-26 AGe add from Darko Miletic paid content source cover_source = 'http://www.sueddeutsche.de/verlag' # 2012-01-26 AGe add from Darko Miletic paid content source
masthead_url = 'http://www.sueddeutsche.de/static_assets/build/img/sdesiteheader/logo_homepage.441d531c.png' # 2012-01-26 AGe add masthead_url = 'http://www.sueddeutsche.de/static_assets/img/sdesiteheader/logo_standard.a152b0df.png' # 2012-12-05 AGe add
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
@ -40,9 +40,9 @@ class Sueddeutsche(BasicNewsRecipe):
(u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'), (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
(u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'), (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
(u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'), (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
(u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'), #2012-01-26 AGe New (u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'),
(u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'), #2012-01-26 AGe New (u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'),
(u'Stil', u'http://rss.sueddeutsche.de/rss/stil'), #2012-01-26 AGe New (u'Stil', u'http://rss.sueddeutsche.de/rss/stil'),
(u'München & Region', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMünchen&Region%24?output=rss'), (u'München & Region', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMünchen&Region%24?output=rss'),
(u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'), (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
(u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'), (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),

View File

@ -2,8 +2,8 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '4 February 2011, desUBIKado' __copyright__ = '4 February 2011, desUBIKado'
__author__ = 'desUBIKado' __author__ = 'desUBIKado'
__version__ = 'v0.08' __version__ = 'v0.09'
__date__ = '30, June 2012' __date__ = '02, December 2012'
''' '''
http://www.weblogssl.com/ http://www.weblogssl.com/
''' '''
@ -37,6 +37,7 @@ class weblogssl(BasicNewsRecipe):
,(u'Xataka Mexico', u'http://feeds.weblogssl.com/xatakamx') ,(u'Xataka Mexico', u'http://feeds.weblogssl.com/xatakamx')
,(u'Xataka M\xf3vil', u'http://feeds.weblogssl.com/xatakamovil') ,(u'Xataka M\xf3vil', u'http://feeds.weblogssl.com/xatakamovil')
,(u'Xataka Android', u'http://feeds.weblogssl.com/xatakandroid') ,(u'Xataka Android', u'http://feeds.weblogssl.com/xatakandroid')
,(u'Xataka Windows', u'http://feeds.weblogssl.com/xatakawindows')
,(u'Xataka Foto', u'http://feeds.weblogssl.com/xatakafoto') ,(u'Xataka Foto', u'http://feeds.weblogssl.com/xatakafoto')
,(u'Xataka ON', u'http://feeds.weblogssl.com/xatakaon') ,(u'Xataka ON', u'http://feeds.weblogssl.com/xatakaon')
,(u'Xataka Ciencia', u'http://feeds.weblogssl.com/xatakaciencia') ,(u'Xataka Ciencia', u'http://feeds.weblogssl.com/xatakaciencia')
@ -80,19 +81,31 @@ class weblogssl(BasicNewsRecipe):
keep_only_tags = [dict(name='div', attrs={'id':'infoblock'}), keep_only_tags = [dict(name='div', attrs={'id':'infoblock'}),
dict(name='div', attrs={'class':'post'}), dict(name='div', attrs={'class':'post'}),
dict(name='div', attrs={'id':'blog-comments'}) dict(name='div', attrs={'id':'blog-comments'}),
dict(name='div', attrs={'class':'container'}) #m.xataka.com
] ]
remove_tags = [dict(name='div', attrs={'id':'comment-nav'})] remove_tags = [dict(name='div', attrs={'id':'comment-nav'}),
dict(name='menu', attrs={'class':'social-sharing'}), #m.xataka.com
dict(name='section' , attrs={'class':'comments'}), #m.xataka.com
dict(name='div' , attrs={'class':'article-comments'}), #m.xataka.com
dict(name='nav' , attrs={'class':'article-taxonomy'}) #m.xataka.com
]
remove_tags_after = dict(name='section' , attrs={'class':'comments'})
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.', 'http://m.') return url.replace('http://www.', 'http://m.')
preprocess_regexps = [ preprocess_regexps = [
# Para poner una linea en blanco entre un comentario y el siguiente # Para poner una linea en blanco entre un comentario y el siguiente
(re.compile(r'<li id="c', re.DOTALL|re.IGNORECASE), lambda match: '<br><br><li id="c') (re.compile(r'<li id="c', re.DOTALL|re.IGNORECASE), lambda match: '<br><br><li id="c'),
# Para ver las imágenes en las noticias de m.xataka.com
(re.compile(r'<noscript>', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'</noscript>', re.DOTALL|re.IGNORECASE), lambda m: '')
] ]
# Para sustituir el video incrustado de YouTube por una imagen # Para sustituir el video incrustado de YouTube por una imagen
def preprocess_html(self, soup): def preprocess_html(self, soup):
@ -108,14 +121,16 @@ class weblogssl(BasicNewsRecipe):
# Para obtener la url original del articulo a partir de la de "feedsportal" # Para obtener la url original del articulo a partir de la de "feedsportal"
# El siguiente código es gracias al usuario "bosplans" de www.mobileread.com # El siguiente código es gracias al usuario "bosplans" de www.mobileread.com
# http://www.mobileread.com/forums/sho...d.php?t=130297 # http://www.mobileread.com/forums/showthread.php?t=130297
def get_article_url(self, article): def get_article_url(self, article):
link = article.get('link', None) link = article.get('link', None)
if link is None: if link is None:
return article return article
# if link.split('/')[-4]=="xataka2":
# return article.get('feedburner_origlink', article.get('link', article.get('guid')))
if link.split('/')[-4]=="xataka2": if link.split('/')[-4]=="xataka2":
return article.get('feedburner_origlink', article.get('link', article.get('guid'))) return article.get('guid', None)
if link.split('/')[-1]=="story01.htm": if link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2] link=link.split('/')[-2]
a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A'] a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']

View File

@ -9,15 +9,15 @@ class Zaman (BasicNewsRecipe):
__author__ = u'thomass' __author__ = u'thomass'
oldest_article = 2 oldest_article = 2
max_articles_per_feed =50 max_articles_per_feed =50
# no_stylesheets = True no_stylesheets = True
#delay = 1 #delay = 1
#use_embedded_content = False use_embedded_content = False
encoding = 'ISO 8859-9' encoding = 'utf-8'
publisher = 'Zaman' publisher = 'Feza Gazetecilik'
category = 'news, haberler,TR,gazete' category = 'news, haberler,TR,gazete'
language = 'tr' language = 'tr'
publication_type = 'newspaper ' publication_type = 'newspaper '
extra_css = '.buyukbaslik{font-weight: bold; font-size: 18px;color:#0000FF}'#body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' extra_css = 'h1{text-transform: capitalize; font-weight: bold; font-size: 22px;color:#0000FF} p{text-align:justify} ' #.introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
conversion_options = { conversion_options = {
'tags' : category 'tags' : category
,'language' : language ,'language' : language
@ -26,25 +26,26 @@ class Zaman (BasicNewsRecipe):
} }
cover_img_url = 'https://fbcdn-profile-a.akamaihd.net/hprofile-ak-snc4/188140_81722291869_2111820_n.jpg' cover_img_url = 'https://fbcdn-profile-a.akamaihd.net/hprofile-ak-snc4/188140_81722291869_2111820_n.jpg'
masthead_url = 'http://medya.zaman.com.tr/extentions/zaman.com.tr/img/section/logo-section.png' masthead_url = 'http://medya.zaman.com.tr/extentions/zaman.com.tr/img/section/logo-section.png'
ignore_duplicate_articles = { 'title', 'url' }
auto_cleanup = False
remove_empty_feeds= True
#keep_only_tags = [dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}) ] #keep_only_tags = [dict(name='div', attrs={'id':[ 'contentposition19']})]#,dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}), ]
remove_tags = [ dict(name='img', attrs={'src':['http://medya.zaman.com.tr/zamantryeni/pics/zamanonline.gif']})]#,dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']}) remove_tags = [ dict(name='img', attrs={'src':['http://cmsmedya.zaman.com.tr/images/logo/logo.bmp']}),dict(name='hr', attrs={'class':['interactive-hr']})]# remove_tags = [ dict(name='div', attrs={'class':[ 'detayUyari']}),dict(name='div', attrs={'class':[ 'detayYorum']}),dict(name='div', attrs={'class':[ 'addthis_toolbox addthis_default_style ']}),dict(name='div', attrs={'id':[ 'tumYazi']})]#,dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/zamantryeni/pics/zamanonline.gif']}),dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']}),dict(name='div', attrs={'id':[ 'news-detail-gallery']}),dict(name='div', attrs={'id':[ 'news-detail-title-bottom-part']}),dict(name='div', attrs={'id':[ 'news-detail-news-paging-main']})]#
#remove_attributes = ['width','height'] #remove_attributes = ['width','height']
remove_empty_feeds= True remove_empty_feeds= True
feeds = [ feeds = [
( u'Anasayfa', u'http://www.zaman.com.tr/anasayfa.rss'), ( u'Manşet', u'http://www.zaman.com.tr/manset.rss'),
( u'Son Dakika', u'http://www.zaman.com.tr/sondakika.rss'),
#( u'En çok Okunanlar', u'http://www.zaman.com.tr/max_all.rss'),
#( u'Manşet', u'http://www.zaman.com.tr/manset.rss'),
( u'Gündem', u'http://www.zaman.com.tr/gundem.rss'),
( u'Yazarlar', u'http://www.zaman.com.tr/yazarlar.rss'), ( u'Yazarlar', u'http://www.zaman.com.tr/yazarlar.rss'),
( u'Politika', u'http://www.zaman.com.tr/politika.rss'), ( u'Politika', u'http://www.zaman.com.tr/politika.rss'),
( u'Ekonomi', u'http://www.zaman.com.tr/ekonomi.rss'), ( u'Ekonomi', u'http://www.zaman.com.tr/ekonomi.rss'),
( u'Dış Haberler', u'http://www.zaman.com.tr/dishaberler.rss'), ( u'Dış Haberler', u'http://www.zaman.com.tr/dishaberler.rss'),
( u'Son Dakika', u'http://www.zaman.com.tr/sondakika.rss'),
( u'Gündem', u'http://www.zaman.com.tr/gundem.rss'),
( u'Yorumlar', u'http://www.zaman.com.tr/yorumlar.rss'), ( u'Yorumlar', u'http://www.zaman.com.tr/yorumlar.rss'),
( u'Röportaj', u'http://www.zaman.com.tr/roportaj.rss'), ( u'Röportaj', u'http://www.zaman.com.tr/roportaj.rss'),
( u'Dizi Yazı', u'http://www.zaman.com.tr/dizi.rss'), ( u'Dizi Yazı', u'http://www.zaman.com.tr/dizi.rss'),
@ -59,8 +60,9 @@ class Zaman (BasicNewsRecipe):
( u'Cuma Eki', u'http://www.zaman.com.tr/cuma.rss'), ( u'Cuma Eki', u'http://www.zaman.com.tr/cuma.rss'),
( u'Cumaertesi Eki', u'http://www.zaman.com.tr/cumaertesi.rss'), ( u'Cumaertesi Eki', u'http://www.zaman.com.tr/cumaertesi.rss'),
( u'Pazar Eki', u'http://www.zaman.com.tr/pazar.rss'), ( u'Pazar Eki', u'http://www.zaman.com.tr/pazar.rss'),
( u'En çok Okunanlar', u'http://www.zaman.com.tr/max_all.rss'),
( u'Anasayfa', u'http://www.zaman.com.tr/anasayfa.rss'),
] ]
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.zaman.com.tr/haber.do?haberno=', 'http://www.zaman.com.tr/yazdir.do?haberno=') return url.replace('http://www.zaman.com.tr/newsDetail_getNewsById.action?newsId=', 'http://www.zaman.com.tr/newsDetail_openPrintPage.action?newsId=')

Binary file not shown (image replaced: 17 KiB before, 62 KiB after).

View File

@ -39,18 +39,6 @@ class Win32(WinBase):
def msi64(self): def msi64(self):
return installer_name('msi', is64bit=True) return installer_name('msi', is64bit=True)
def sign_msi(self):
import xattr
print ('Signing installers ...')
sign64 = False
msi64 = self.msi64
if os.path.exists(msi64) and 'user.signed' not in xattr.list(msi64):
subprocess.check_call(['scp', msi64, self.VM_NAME +
':build/%s/%s'%(__appname__, msi64)])
sign64 = True
subprocess.check_call(['ssh', self.VM_NAME, '~/sign.sh'], shell=False)
return sign64
def do_dl(self, installer, errmsg): def do_dl(self, installer, errmsg):
subprocess.check_call(('scp', subprocess.check_call(('scp',
'%s:build/%s/%s'%(self.VM_NAME, __appname__, installer), 'dist')) '%s:build/%s/%s'%(self.VM_NAME, __appname__, installer), 'dist'))
@ -62,14 +50,8 @@ class Win32(WinBase):
installer = self.installer() installer = self.installer()
if os.path.exists('build/winfrozen'): if os.path.exists('build/winfrozen'):
shutil.rmtree('build/winfrozen') shutil.rmtree('build/winfrozen')
sign64 = self.sign_msi()
if sign64:
self.do_dl(self.msi64, 'Failed to d/l signed 64 bit installer')
import xattr
xattr.set(self.msi64, 'user.signed', 'true')
self.do_dl(installer, 'Failed to freeze') self.do_dl(installer, 'Failed to freeze')
installer = 'dist/%s-portable-installer-%s.exe'%(__appname__, __version__) installer = 'dist/%s-portable-installer-%s.exe'%(__appname__, __version__)
self.do_dl(installer, 'Failed to get portable installer') self.do_dl(installer, 'Failed to get portable installer')

View File

@ -91,6 +91,7 @@ class Win32Freeze(Command, WixMixIn):
if not is64bit: if not is64bit:
self.build_portable() self.build_portable()
self.build_portable_installer() self.build_portable_installer()
self.sign_installers()
def remove_CRT_from_manifests(self): def remove_CRT_from_manifests(self):
''' '''
@ -488,6 +489,17 @@ class Win32Freeze(Command, WixMixIn):
subprocess.check_call([LZMA + r'\bin\elzma.exe', '-9', '--lzip', name]) subprocess.check_call([LZMA + r'\bin\elzma.exe', '-9', '--lzip', name])
def sign_installers(self):
self.info('Signing installers...')
files = glob.glob(self.j('dist', '*.msi')) + glob.glob(self.j('dist',
'*.exe'))
if not files:
raise ValueError('No installers found')
subprocess.check_call(['signtool.exe', 'sign', '/a', '/d',
'calibre - E-book management', '/du',
'http://calibre-ebook.com', '/t',
'http://timestamp.verisign.com/scripts/timstamp.dll'] + files)
def add_dir_to_zip(self, zf, path, prefix=''): def add_dir_to_zip(self, zf, path, prefix=''):
''' '''
Add a directory recursively to the zip file with an optional prefix. Add a directory recursively to the zip file with an optional prefix.

File diff suppressed because it is too large.

View File

@ -148,10 +148,10 @@ def print_basic_debug_info(out=None):
out = functools.partial(prints, file=out) out = functools.partial(prints, file=out)
import platform import platform
from calibre.constants import (__appname__, get_version, isportable, isosx, from calibre.constants import (__appname__, get_version, isportable, isosx,
isfrozen) isfrozen, is64bit)
out(__appname__, get_version(), 'Portable' if isportable else '', out(__appname__, get_version(), 'Portable' if isportable else '',
'isfrozen:', isfrozen) 'isfrozen:', isfrozen, 'is64bit:', is64bit)
out(platform.platform(), platform.system()) out(platform.platform(), platform.system(), platform.architecture())
out(platform.system_alias(platform.system(), platform.release(), out(platform.system_alias(platform.system(), platform.release(),
platform.version())) platform.version()))
out('Python', platform.python_version()) out('Python', platform.python_version())

View File

@ -232,7 +232,7 @@ class ANDROID(USBMS):
'THINKPAD_TABLET', 'SGH-T989', 'YP-G70', 'STORAGE_DEVICE', 'THINKPAD_TABLET', 'SGH-T989', 'YP-G70', 'STORAGE_DEVICE',
'ADVANCED', 'SGH-I727', 'USB_FLASH_DRIVER', 'ANDROID', 'ADVANCED', 'SGH-I727', 'USB_FLASH_DRIVER', 'ANDROID',
'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E', 'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E',
'NOVO7', 'MB526', '_USB#WYK7MSF8KE'] 'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', 'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD', 'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
@ -243,7 +243,7 @@ class ANDROID(USBMS):
'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0', 'XT875', 'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0', 'XT875',
'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727', 'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727',
'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E', 'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E',
'NOVO7', 'ADVANCED'] 'NOVO7', 'ADVANCED', 'TABLET_PC']
OSX_MAIN_MEM = 'Android Device Main Memory' OSX_MAIN_MEM = 'Android Device Main Memory'

File diff suppressed because it is too large.

View File

@ -2357,6 +2357,8 @@ class KOBOTOUCH(KOBO):
update_query = 'UPDATE content SET Series=?, SeriesNumber==? where BookID is Null and ContentID = ?' update_query = 'UPDATE content SET Series=?, SeriesNumber==? where BookID is Null and ContentID = ?'
if book.series is None: if book.series is None:
update_values = (None, None, book.contentID, ) update_values = (None, None, book.contentID, )
elif book.series_index is None: # This should never happen, but...
update_values = (book.series, None, book.contentID, )
else: else:
update_values = (book.series, "%g"%book.series_index, book.contentID, ) update_values = (book.series, "%g"%book.series_index, book.contentID, )

View File

@ -54,6 +54,8 @@ def synchronous(tlockname):
class ConnectionListener (Thread): class ConnectionListener (Thread):
NOT_SERVICED_COUNT = 6
def __init__(self, driver): def __init__(self, driver):
Thread.__init__(self) Thread.__init__(self)
self.daemon = True self.daemon = True
@ -78,8 +80,8 @@ class ConnectionListener (Thread):
if not self.driver.connection_queue.empty(): if not self.driver.connection_queue.empty():
queue_not_serviced_count += 1 queue_not_serviced_count += 1
if queue_not_serviced_count >= 3: if queue_not_serviced_count >= self.NOT_SERVICED_COUNT:
self.driver._debug('queue not serviced') self.driver._debug('queue not serviced', queue_not_serviced_count)
try: try:
sock = self.driver.connection_queue.get_nowait() sock = self.driver.connection_queue.get_nowait()
s = self.driver._json_encode( s = self.driver._json_encode(
@ -1281,10 +1283,10 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin):
self._close_listen_socket() self._close_listen_socket()
return message return message
else: else:
while i < 100: # try up to 100 random port numbers while i < 100: # try 9090 then up to 99 random port numbers
i += 1 i += 1
port = self._attach_to_port(self.listen_socket, port = self._attach_to_port(self.listen_socket,
random.randint(8192, 32000)) 9090 if i == 1 else random.randint(8192, 32000))
if port != 0: if port != 0:
break break
if port == 0: if port == 0:
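The loop above now tries the well-known port 9090 on the first pass and only then falls back to random high ports. A rough standalone sketch of that bind-with-fallback idea; try_bind stands in for the driver's _attach_to_port and follows the same return-0-on-failure contract:

    import random
    import socket

    def try_bind(sock, port):
        # Return the port on success, 0 on failure.
        try:
            sock.bind(('', port))
            return port
        except socket.error:
            return 0

    def pick_port(sock, preferred=9090, attempts=100):
        port = 0
        for i in range(1, attempts + 1):
            port = try_bind(sock, preferred if i == 1 else random.randint(8192, 32000))
            if port:
                break
        return port

    listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    print(pick_port(listen_socket))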

View File

@ -74,11 +74,12 @@ def remove_kindlegen_markup(parts):
part = "".join(srcpieces) part = "".join(srcpieces)
parts[i] = part parts[i] = part
# we can safely remove all of the Kindlegen generated data-AmznPageBreak tags # we can safely remove all of the Kindlegen generated data-AmznPageBreak
# attributes
find_tag_with_AmznPageBreak_pattern = re.compile( find_tag_with_AmznPageBreak_pattern = re.compile(
r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE) r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
within_tag_AmznPageBreak_position_pattern = re.compile( within_tag_AmznPageBreak_position_pattern = re.compile(
r'''\sdata-AmznPageBreak=['"][^'"]*['"]''') r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
for i in xrange(len(parts)): for i in xrange(len(parts)):
part = parts[i] part = parts[i]
@ -86,10 +87,8 @@ def remove_kindlegen_markup(parts):
for j in range(len(srcpieces)): for j in range(len(srcpieces)):
tag = srcpieces[j] tag = srcpieces[j]
if tag.startswith('<'): if tag.startswith('<'):
for m in within_tag_AmznPageBreak_position_pattern.finditer(tag): srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
replacement = '' lambda m:' style="page-break-after:%s"'%m.group(1), tag)
tag = within_tag_AmznPageBreak_position_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = "".join(srcpieces) part = "".join(srcpieces)
parts[i] = part parts[i] = part
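Instead of deleting the data-AmznPageBreak attributes, the rewritten loop above now converts each one into an equivalent inline page-break-after style. A minimal illustration using the same regex on an invented tag:

    import re

    within_tag_AmznPageBreak_position_pattern = re.compile(
        r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

    tag = '<div data-AmznPageBreak="avoid" class="chapter">'
    print(within_tag_AmznPageBreak_position_pattern.sub(
        lambda m: ' style="page-break-after:%s"' % m.group(1), tag))
    # <div style="page-break-after:avoid" class="chapter">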

View File

@ -44,6 +44,18 @@ def locate_beg_end_of_tag(ml, aid):
return plt, pgt return plt, pgt
return 0, 0 return 0, 0
def reverse_tag_iter(block):
''' Iterate over all tags in block in reverse order, i.e. last tag
to first tag. '''
end = len(block)
while True:
pgt = block.rfind(b'>', 0, end)
if pgt == -1: break
plt = block.rfind(b'<', 0, pgt)
if plt == -1: break
yield block[plt:pgt+1]
end = plt
class Mobi8Reader(object): class Mobi8Reader(object):
def __init__(self, mobi6_reader, log): def __init__(self, mobi6_reader, log):
@ -275,13 +287,12 @@ class Mobi8Reader(object):
return '%s/%s'%(fi.type, fi.filename), idtext return '%s/%s'%(fi.type, fi.filename), idtext
def get_id_tag(self, pos): def get_id_tag(self, pos):
# find the correct tag by actually searching in the destination # Find the first tag with a named anchor (name or id attribute) before
# textblock at position # pos
fi = self.get_file_info(pos) fi = self.get_file_info(pos)
if fi.num is None and fi.start is None: if fi.num is None and fi.start is None:
raise ValueError('No file contains pos: %d'%pos) raise ValueError('No file contains pos: %d'%pos)
textblock = self.parts[fi.num] textblock = self.parts[fi.num]
id_map = []
npos = pos - fi.start npos = pos - fi.start
pgt = textblock.find(b'>', npos) pgt = textblock.find(b'>', npos)
plt = textblock.find(b'<', npos) plt = textblock.find(b'<', npos)
@ -290,28 +301,15 @@ class Mobi8Reader(object):
if plt == npos or pgt < plt: if plt == npos or pgt < plt:
npos = pgt + 1 npos = pgt + 1
textblock = textblock[0:npos] textblock = textblock[0:npos]
# find id links only inside of tags id_re = re.compile(br'''<[^>]+\sid\s*=\s*['"]([^'"]+)['"]''')
# inside any < > pair find all "id=' and return whatever is inside name_re = re.compile(br'''<\s*a\s*\sname\s*=\s*['"]([^'"]+)['"]''')
# the quotes for tag in reverse_tag_iter(textblock):
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''', m = id_re.match(tag) or name_re.match(tag)
re.IGNORECASE) if m is not None:
for m in re.finditer(id_pattern, textblock): return m.group(1)
id_map.append((m.start(), m.group(1)))
if not id_map: # No tag found, link to start of file
# Found no id in the textblock, link must be to top of file
return b'' return b''
# if npos is before first id= inside a tag, return the first
if npos < id_map[0][0]:
return id_map[0][1]
# if npos is after the last id= inside a tag, return the last
if npos > id_map[-1][0]:
return id_map[-1][1]
# otherwise find last id before npos
for i, item in enumerate(id_map):
if npos < item[0]:
return id_map[i-1][1]
return id_map[0][1]
def create_guide(self): def create_guide(self):
guide = Guide() guide = Guide()
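reverse_tag_iter() and the two anchor regexes above replace the old position map: the nearest preceding tag that carries an id or a name attribute wins. A small self-contained demonstration that reuses the helper and regexes on an invented byte string:

    import re

    def reverse_tag_iter(block):
        # Yield raw tags from last to first, as in the reader above.
        end = len(block)
        while True:
            pgt = block.rfind(b'>', 0, end)
            if pgt == -1:
                break
            plt = block.rfind(b'<', 0, pgt)
            if plt == -1:
                break
            yield block[plt:pgt+1]
            end = plt

    id_re = re.compile(br'''<[^>]+\sid\s*=\s*['"]([^'"]+)['"]''')
    name_re = re.compile(br'''<\s*a\s*\sname\s*=\s*['"]([^'"]+)['"]''')

    sample = b'<p id="a1">one</p><p>two</p><a name="a2"></a><p>three'
    for tag in reverse_tag_iter(sample):
        m = id_re.match(tag) or name_re.match(tag)
        if m is not None:
            print(m.group(1))  # the nearest named anchor before the end: a2
            break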

View File

@ -320,13 +320,11 @@ class OEBReader(object):
self.logger.warn(u'Spine item %r not found' % idref) self.logger.warn(u'Spine item %r not found' % idref)
continue continue
item = manifest.ids[idref] item = manifest.ids[idref]
if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath'):
spine.add(item, elem.get('linear')) spine.add(item, elem.get('linear'))
for item in spine: else:
if item.media_type.lower() not in OEB_DOCS:
if not hasattr(item.data, 'xpath'):
self.oeb.log.warn('The item %s is not a XML document.' self.oeb.log.warn('The item %s is not a XML document.'
' Removing it from spine.'%item.href) ' Removing it from spine.'%item.href)
spine.remove(item)
if len(spine) == 0: if len(spine) == 0:
raise OEBError("Spine is empty") raise OEBError("Spine is empty")
self._spine_add_extra() self._spine_add_extra()

View File

@ -114,7 +114,9 @@ class DetectStructure(object):
def find_matches(expr, doc): def find_matches(expr, doc):
try: try:
return XPath(expr)(doc) ans = XPath(expr)(doc)
len(ans)
return ans
except: except:
self.log.warn('Invalid chapter expression, ignoring: %s'%expr) self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
return [] return []
@ -203,7 +205,9 @@ class DetectStructure(object):
def find_matches(expr, doc): def find_matches(expr, doc):
try: try:
return XPath(expr)(doc) ans = XPath(expr)(doc)
len(ans)
return ans
except: except:
self.log.warn('Invalid ToC expression, ignoring: %s'%expr) self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
return [] return []
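The only change in both find_matches() helpers above is the extra len(ans) call inside the try block; presumably it forces expressions that evaluate to something other than a node list (a count(), a string, a boolean) to fail here, so the warning path is taken instead of an error surfacing later. A small illustration with lxml's XPath standing in for calibre's wrapper:

    from lxml import etree
    from lxml.etree import XPath

    doc = etree.fromstring('<root><h1>One</h1><h1>Two</h1></root>')

    def find_matches(expr, doc):
        try:
            ans = XPath(expr)(doc)
            len(ans)   # non-list results (e.g. a float) raise TypeError here
            return ans
        except Exception:
            print('Invalid chapter expression, ignoring: %s' % expr)
            return []

    print(len(find_matches('//h1', doc)))    # 2
    print(find_matches('count(//h1)', doc))  # warning, then []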

View File

@ -27,10 +27,10 @@ def get_custom_size(opts):
custom_size = None custom_size = None
if opts.custom_size != None: if opts.custom_size != None:
width, sep, height = opts.custom_size.partition('x') width, sep, height = opts.custom_size.partition('x')
if height != '': if height:
try: try:
width = int(width) width = float(width)
height = int(height) height = float(height)
custom_size = (width, height) custom_size = (width, height)
except: except:
custom_size = None custom_size = None
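The parser above now accepts fractional page sizes by switching from int() to float() and by testing the height for truthiness rather than against the empty string. A standalone restatement of the new logic; the function here takes the size string directly instead of an options object:

    def get_custom_size(custom_size_spec):
        # Parse 'WIDTHxHEIGHT' (e.g. '4.5x6.8') into a (float, float) tuple, or None.
        width, sep, height = custom_size_spec.partition('x')
        if height:
            try:
                return (float(width), float(height))
            except ValueError:
                return None
        return None

    print(get_custom_size('4.5x6.8'))  # (4.5, 6.8)
    print(get_custom_size('600x800'))  # (600.0, 800.0)
    print(get_custom_size('600'))      # None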

View File

@ -72,8 +72,8 @@ class LibreDEStore(BasicStoreConfig, StorePlugin):
mobi = details.xpath( mobi = details.xpath(
'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())') 'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())')
cover_url = ''.join(data.xpath('.//div[@class="coverImg"]/a/img/@src')) cover_url = ''.join(data.xpath('.//div[@class="coverimg"]/a/img/@src'))
price = ''.join(data.xpath('.//span[@class="preis"]/text()')).replace('*', '').strip() price = ''.join(data.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip()
counter -= 1 counter -= 1

View File

@ -8,7 +8,7 @@ from PyQt4.Qt import (QThread, pyqtSignal, Qt, QUrl, QDialog, QGridLayout,
import mechanize import mechanize
from calibre.constants import (__appname__, __version__, iswindows, isosx, from calibre.constants import (__appname__, __version__, iswindows, isosx,
isportable) isportable, is64bit)
from calibre import browser, prints, as_unicode from calibre import browser, prints, as_unicode
from calibre.utils.config import prefs from calibre.utils.config import prefs
from calibre.gui2 import config, dynamic, open_url from calibre.gui2 import config, dynamic, open_url
@ -19,6 +19,13 @@ URL = 'http://status.calibre-ebook.com/latest'
NO_CALIBRE_UPDATE = '-0.0.0' NO_CALIBRE_UPDATE = '-0.0.0'
VSEP = '|' VSEP = '|'
def get_download_url():
which = ('portable' if isportable else 'windows' if iswindows
else 'osx' if isosx else 'linux')
if which == 'windows' and is64bit:
which += '64'
return 'http://calibre-ebook.com/download_' + which
def get_newest_version(): def get_newest_version():
br = browser() br = browser()
req = mechanize.Request(URL) req = mechanize.Request(URL)
@ -116,10 +123,7 @@ class UpdateNotification(QDialog):
config.set('new_version_notification', bool(self.cb.isChecked())) config.set('new_version_notification', bool(self.cb.isChecked()))
def accept(self): def accept(self):
url = ('http://calibre-ebook.com/download_' + open_url(QUrl(get_download_url()))
('portable' if isportable else 'windows' if iswindows
else 'osx' if isosx else 'linux'))
open_url(QUrl(url))
QDialog.accept(self) QDialog.accept(self)
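The new get_download_url() above builds the platform suffix once and appends '64' for 64-bit Windows builds. A standalone rendition with the calibre.constants flags passed in as parameters instead of imported:

    def get_download_url(isportable, iswindows, isosx, is64bit):
        which = ('portable' if isportable else 'windows' if iswindows
                 else 'osx' if isosx else 'linux')
        if which == 'windows' and is64bit:
            which += '64'
        return 'http://calibre-ebook.com/download_' + which

    print(get_download_url(False, True, False, True))
    # http://calibre-ebook.com/download_windows64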

View File

@ -12,6 +12,7 @@ from calibre.customize import CatalogPlugin
from calibre.library.catalogs import FIELDS from calibre.library.catalogs import FIELDS
from calibre.customize.conversion import DummyReporter from calibre.customize.conversion import DummyReporter
class CSV_XML(CatalogPlugin): class CSV_XML(CatalogPlugin):
'CSV/XML catalog generator' 'CSV/XML catalog generator'
@ -22,27 +23,27 @@ class CSV_XML(CatalogPlugin):
supported_platforms = ['windows', 'osx', 'linux'] supported_platforms = ['windows', 'osx', 'linux']
author = 'Greg Riker' author = 'Greg Riker'
version = (1, 0, 0) version = (1, 0, 0)
file_types = set(['csv','xml']) file_types = set(['csv', 'xml'])
cli_options = [ cli_options = [
Option('--fields', Option('--fields',
default = 'all', default='all',
dest = 'fields', dest='fields',
action = None, action=None,
help = _('The fields to output when cataloging books in the ' help=_('The fields to output when cataloging books in the '
'database. Should be a comma-separated list of fields.\n' 'database. Should be a comma-separated list of fields.\n'
'Available fields: %(fields)s,\n' 'Available fields: %(fields)s,\n'
'plus user-created custom fields.\n' 'plus user-created custom fields.\n'
'Example: %(opt)s=title,authors,tags\n' 'Example: %(opt)s=title,authors,tags\n'
"Default: '%%default'\n" "Default: '%%default'\n"
"Applies to: CSV, XML output formats")%dict( "Applies to: CSV, XML output formats") % dict(
fields=', '.join(FIELDS), opt='--fields')), fields=', '.join(FIELDS), opt='--fields')),
Option('--sort-by', Option('--sort-by',
default = 'id', default='id',
dest = 'sort_by', dest='sort_by',
action = None, action=None,
help = _('Output field to sort on.\n' help=_('Output field to sort on.\n'
'Available fields: author_sort, id, rating, size, timestamp, title_sort\n' 'Available fields: author_sort, id, rating, size, timestamp, title_sort\n'
"Default: '%default'\n" "Default: '%default'\n"
"Applies to: CSV, XML output formats"))] "Applies to: CSV, XML output formats"))]
@ -97,7 +98,7 @@ class CSV_XML(CatalogPlugin):
for entry in data: for entry in data:
entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[entry['id']]['ondevice'] entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[entry['id']]['ondevice']
fm = {x:db.field_metadata.get(x, {}) for x in fields} fm = {x: db.field_metadata.get(x, {}) for x in fields}
if self.fmt == 'csv': if self.fmt == 'csv':
outfile = codecs.open(path_to_output, 'w', 'utf8') outfile = codecs.open(path_to_output, 'w', 'utf8')
@ -113,7 +114,7 @@ class CSV_XML(CatalogPlugin):
outstr = [] outstr = []
for field in fields: for field in fields:
if field.startswith('#'): if field.startswith('#'):
item = db.get_field(entry['id'],field,index_is_id=True) item = db.get_field(entry['id'], field, index_is_id=True)
elif field == 'library_name': elif field == 'library_name':
item = current_library item = current_library
elif field == 'title_sort': elif field == 'title_sort':
@ -129,7 +130,7 @@ class CSV_XML(CatalogPlugin):
for format in item: for format in item:
fmt_list.append(format.rpartition('.')[2].lower()) fmt_list.append(format.rpartition('.')[2].lower())
item = ', '.join(fmt_list) item = ', '.join(fmt_list)
elif field in ['authors','tags']: elif field in ['authors', 'tags']:
item = ', '.join(item) item = ', '.join(item)
elif field == 'isbn': elif field == 'isbn':
# Could be 9, 10 or 13 digits # Could be 9, 10 or 13 digits
@ -137,20 +138,20 @@ class CSV_XML(CatalogPlugin):
elif field in ['pubdate', 'timestamp']: elif field in ['pubdate', 'timestamp']:
item = isoformat(item) item = isoformat(item)
elif field == 'comments': elif field == 'comments':
item = item.replace(u'\r\n',u' ') item = item.replace(u'\r\n', u' ')
item = item.replace(u'\n',u' ') item = item.replace(u'\n', u' ')
elif fm.get(field, {}).get('datatype', None) == 'rating' and item: elif fm.get(field, {}).get('datatype', None) == 'rating' and item:
item = u'%.2g'%(item/2.0) item = u'%.2g' % (item / 2.0)
# Convert HTML to markdown text # Convert HTML to markdown text
if type(item) is unicode: if type(item) is unicode:
opening_tag = re.search('<(\w+)(\x20|>)',item) opening_tag = re.search('<(\w+)(\x20|>)', item)
if opening_tag: if opening_tag:
closing_tag = re.search('<\/%s>$' % opening_tag.group(1), item) closing_tag = re.search('<\/%s>$' % opening_tag.group(1), item)
if closing_tag: if closing_tag:
item = html2text(item) item = html2text(item)
outstr.append(u'"%s"' % unicode(item).replace('"','""')) outstr.append(u'"%s"' % unicode(item).replace('"', '""'))
outfile.write(u','.join(outstr) + u'\n') outfile.write(u','.join(outstr) + u'\n')
outfile.close() outfile.close()
@ -165,14 +166,14 @@ class CSV_XML(CatalogPlugin):
for field in fields: for field in fields:
if field.startswith('#'): if field.startswith('#'):
val = db.get_field(r['id'],field,index_is_id=True) val = db.get_field(r['id'], field, index_is_id=True)
if not isinstance(val, (str, unicode)): if not isinstance(val, (str, unicode)):
val = unicode(val) val = unicode(val)
item = getattr(E, field.replace('#','_'))(val) item = getattr(E, field.replace('#', '_'))(val)
record.append(item) record.append(item)
for field in ('id', 'uuid', 'publisher', 'rating', 'size', for field in ('id', 'uuid', 'publisher', 'rating', 'size',
'isbn','ondevice', 'identifiers'): 'isbn', 'ondevice', 'identifiers'):
if field in fields: if field in fields:
val = r[field] val = r[field]
if not val: if not val:
@ -180,7 +181,7 @@ class CSV_XML(CatalogPlugin):
if not isinstance(val, (str, unicode)): if not isinstance(val, (str, unicode)):
if (fm.get(field, {}).get('datatype', None) == if (fm.get(field, {}).get('datatype', None) ==
'rating' and val): 'rating' and val):
val = u'%.2g'%(val/2.0) val = u'%.2g' % (val / 2.0)
val = unicode(val) val = unicode(val)
item = getattr(E, field)(val) item = getattr(E, field)(val)
record.append(item) record.append(item)
@ -227,4 +228,3 @@ class CSV_XML(CatalogPlugin):
with open(path_to_output, 'w') as f: with open(path_to_output, 'w') as f:
f.write(etree.tostring(root, encoding='utf-8', f.write(etree.tostring(root, encoding='utf-8',
xml_declaration=True, pretty_print=True)) xml_declaration=True, pretty_print=True))

View File

@ -21,6 +21,7 @@ from calibre.utils.localization import get_lang
Option = namedtuple('Option', 'option, default, dest, action, help') Option = namedtuple('Option', 'option, default, dest, action, help')
class EPUB_MOBI(CatalogPlugin): class EPUB_MOBI(CatalogPlugin):
'ePub catalog generator' 'ePub catalog generator'
@ -30,29 +31,29 @@ class EPUB_MOBI(CatalogPlugin):
minimum_calibre_version = (0, 7, 40) minimum_calibre_version = (0, 7, 40)
author = 'Greg Riker' author = 'Greg Riker'
version = (1, 0, 0) version = (1, 0, 0)
file_types = set(['azw3','epub','mobi']) file_types = set(['azw3', 'epub', 'mobi'])
THUMB_SMALLEST = "1.0" THUMB_SMALLEST = "1.0"
THUMB_LARGEST = "2.0" THUMB_LARGEST = "2.0"
cli_options = [Option('--catalog-title', # {{{ cli_options = [Option('--catalog-title', # {{{
default = 'My Books', default='My Books',
dest = 'catalog_title', dest='catalog_title',
action = None, action=None,
help = _('Title of generated catalog used as title in metadata.\n' help=_('Title of generated catalog used as title in metadata.\n'
"Default: '%default'\n" "Default: '%default'\n"
"Applies to: AZW3, ePub, MOBI output formats")), "Applies to: AZW3, ePub, MOBI output formats")),
Option('--cross-reference-authors', Option('--cross-reference-authors',
default=False, default=False,
dest='cross_reference_authors', dest='cross_reference_authors',
action = 'store_true', action='store_true',
help=_("Create cross-references in Authors section for books with multiple authors.\n" help=_("Create cross-references in Authors section for books with multiple authors.\n"
"Default: '%default'\n" "Default: '%default'\n"
"Applies to: AZW3, ePub, MOBI output formats")), "Applies to: AZW3, ePub, MOBI output formats")),
Option('--debug-pipeline', Option('--debug-pipeline',
default=None, default=None,
dest='debug_pipeline', dest='debug_pipeline',
action = None, action=None,
help=_("Save the output from different stages of the conversion " help=_("Save the output from different stages of the conversion "
"pipeline to the specified " "pipeline to the specified "
"directory. Useful if you are unsure at which stage " "directory. Useful if you are unsure at which stage "
@ -62,7 +63,7 @@ class EPUB_MOBI(CatalogPlugin):
Option('--exclude-genre', Option('--exclude-genre',
default='\[.+\]|^\+$', default='\[.+\]|^\+$',
dest='exclude_genre', dest='exclude_genre',
action = None, action=None,
help=_("Regex describing tags to exclude as genres.\n" help=_("Regex describing tags to exclude as genres.\n"
"Default: '%default' excludes bracketed tags, e.g. '[Project Gutenberg]', and '+', the default tag for read books.\n" "Default: '%default' excludes bracketed tags, e.g. '[Project Gutenberg]', and '+', the default tag for read books.\n"
"Applies to: AZW3, ePub, MOBI output formats")), "Applies to: AZW3, ePub, MOBI output formats")),
@@ -82,63 +83,63 @@ class EPUB_MOBI(CatalogPlugin):
                    Option('--generate-authors',
                           default=False,
                           dest='generate_authors',
-                          action = 'store_true',
+                          action='store_true',
                           help=_("Include 'Authors' section in catalog.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--generate-descriptions',
                           default=False,
                           dest='generate_descriptions',
-                          action = 'store_true',
+                          action='store_true',
                           help=_("Include 'Descriptions' section in catalog.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--generate-genres',
                           default=False,
                           dest='generate_genres',
-                          action = 'store_true',
+                          action='store_true',
                           help=_("Include 'Genres' section in catalog.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--generate-titles',
                           default=False,
                           dest='generate_titles',
-                          action = 'store_true',
+                          action='store_true',
                           help=_("Include 'Titles' section in catalog.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--generate-series',
                           default=False,
                           dest='generate_series',
-                          action = 'store_true',
+                          action='store_true',
                           help=_("Include 'Series' section in catalog.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--generate-recently-added',
                           default=False,
                           dest='generate_recently_added',
-                          action = 'store_true',
+                          action='store_true',
                           help=_("Include 'Recently Added' section in catalog.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--genre-source-field',
                           default='Tags',
                           dest='genre_source_field',
-                          action = None,
+                          action=None,
                           help=_("Source field for Genres section.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--header-note-source-field',
                           default='',
                           dest='header_note_source_field',
-                          action = None,
+                          action=None,
                           help=_("Custom field containing note text to insert in Description header.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--merge-comments-rule',
                           default='::',
                           dest='merge_comments_rule',
-                          action = None,
+                          action=None,
                           help=_("#<custom field>:[before|after]:[True|False] specifying:\n"
                           " <custom field> Custom field containing notes to merge with Comments\n"
                           " [before|after] Placement of notes with respect to Comments\n"
@@ -148,7 +149,7 @@ class EPUB_MOBI(CatalogPlugin):
                    Option('--output-profile',
                           default=None,
                           dest='output_profile',
-                          action = None,
+                          action=None,
                           help=_("Specifies the output profile. In some cases, an output profile is required to optimize the catalog for the device. For example, 'kindle' or 'kindle_dx' creates a structured Table of Contents with Sections and Articles.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
@@ -164,14 +165,14 @@ class EPUB_MOBI(CatalogPlugin):
                    Option('--use-existing-cover',
                           default=False,
                           dest='use_existing_cover',
-                          action = 'store_true',
+                          action='store_true',
                           help=_("Replace existing cover when generating the catalog.\n"
                           "Default: '%default'\n"
                           "Applies to: AZW3, ePub, MOBI output formats")),
                    Option('--thumb-width',
                           default='1.0',
                           dest='thumb_width',
-                          action = None,
+                          action=None,
                           help=_("Size hint (in inches) for book covers in catalog.\n"
                           "Range: 1.0 - 2.0\n"
                           "Default: '%default'\n"
@@ -199,7 +200,7 @@ class EPUB_MOBI(CatalogPlugin):
         if opts.connected_device['name'] and 'kindle' in opts.connected_device['name'].lower():
             opts.connected_kindle = True
             if opts.connected_device['serial'] and \
-               opts.connected_device['serial'][:4] in ['B004','B005']:
+               opts.connected_device['serial'][:4] in ['B004', 'B005']:
                 op = "kindle_dx"
             else:
                 op = "kindle"
@@ -209,7 +210,7 @@ class EPUB_MOBI(CatalogPlugin):
         opts.output_profile = op
         opts.basename = "Catalog"
-        opts.cli_environment = not hasattr(opts,'sync')
+        opts.cli_environment = not hasattr(opts, 'sync')
         # Hard-wired to always sort descriptions by author, with series after non-series
         opts.sort_descriptions_by_author = True
@@ -278,14 +279,14 @@ class EPUB_MOBI(CatalogPlugin):
                 opts.generate_genres = True
                 opts.generate_recently_added = True
                 opts.generate_descriptions = True
-                sections_list = ['Authors','Titles','Series','Genres','Recently Added','Descriptions']
+                sections_list = ['Authors', 'Titles', 'Series', 'Genres', 'Recently Added', 'Descriptions']
             else:
                 opts.log.warn('\n*** No enabled Sections, terminating catalog generation ***')
-                return ["No Included Sections","No enabled Sections.\nCheck E-book options tab\n'Included sections'\n"]
+                return ["No Included Sections", "No enabled Sections.\nCheck E-book options tab\n'Included sections'\n"]
         if opts.fmt == 'mobi' and sections_list == ['Descriptions']:
             warning = _("\n*** Adding 'By Authors' Section required for MOBI output ***")
             opts.log.warn(warning)
-            sections_list.insert(0,'Authors')
+            sections_list.insert(0, 'Authors')
             opts.generate_authors = True
         opts.log(u" Sections: %s" % ', '.join(sections_list))
@@ -294,14 +295,14 @@ class EPUB_MOBI(CatalogPlugin):
         # Limit thumb_width to 1.0" - 2.0"
         try:
             if float(opts.thumb_width) < float(self.THUMB_SMALLEST):
-                log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width,self.THUMB_SMALLEST))
+                log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_SMALLEST))
                 opts.thumb_width = self.THUMB_SMALLEST
             if float(opts.thumb_width) > float(self.THUMB_LARGEST):
-                log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width,self.THUMB_LARGEST))
+                log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_LARGEST))
                 opts.thumb_width = self.THUMB_LARGEST
             opts.thumb_width = "%.2f" % float(opts.thumb_width)
         except:
-            log.error("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width,self.THUMB_SMALLEST))
+            log.error("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_SMALLEST))
             opts.thumb_width = "1.0"
         # eval prefix_rules if passed from command line
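
The try/except block above clamps `--thumb-width` to the 1.0 to 2.0 inch range, normalises it to two decimals, and falls back to "1.0" when the value cannot be parsed. A self-contained sketch of the same coercion (the function name and the bare `print` logger are assumptions; the real code logs through calibre's logger and mutates `opts` in place):

```python
THUMB_SMALLEST = "1.0"
THUMB_LARGEST = "2.0"

def coerce_thumb_width(value, log=print):
    # Clamp to [1.0, 2.0] and normalise to two decimals; fall back to "1.0"
    # if the value cannot be parsed as a float at all.
    try:
        width = float(value)
        if width < float(THUMB_SMALLEST):
            log("coercing thumb_width from '%s' to '%s'" % (value, THUMB_SMALLEST))
            width = float(THUMB_SMALLEST)
        if width > float(THUMB_LARGEST):
            log("coercing thumb_width from '%s' to '%s'" % (value, THUMB_LARGEST))
            width = float(THUMB_LARGEST)
        return "%.2f" % width
    except (TypeError, ValueError):
        log("coercing thumb_width from '%s' to '%s'" % (value, THUMB_SMALLEST))
        return "1.0"

# coerce_thumb_width('0.5') -> '1.00', coerce_thumb_width('3') -> '2.00',
# coerce_thumb_width('wide') -> '1.0'
```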
@@ -331,13 +332,13 @@ class EPUB_MOBI(CatalogPlugin):
             keys.sort()
             build_log.append(" opts:")
             for key in keys:
-                if key in ['catalog_title','author_clip','connected_kindle','creator',
-                           'cross_reference_authors','description_clip','exclude_book_marker',
-                           'exclude_genre','exclude_tags','exclusion_rules', 'fmt',
-                           'genre_source_field', 'header_note_source_field','merge_comments_rule',
-                           'output_profile','prefix_rules','read_book_marker',
-                           'search_text','sort_by','sort_descriptions_by_author','sync',
-                           'thumb_width','use_existing_cover','wishlist_tag']:
+                if key in ['catalog_title', 'author_clip', 'connected_kindle', 'creator',
+                           'cross_reference_authors', 'description_clip', 'exclude_book_marker',
+                           'exclude_genre', 'exclude_tags', 'exclusion_rules', 'fmt',
+                           'genre_source_field', 'header_note_source_field', 'merge_comments_rule',
+                           'output_profile', 'prefix_rules', 'read_book_marker',
+                           'search_text', 'sort_by', 'sort_descriptions_by_author', 'sync',
+                           'thumb_width', 'use_existing_cover', 'wishlist_tag']:
                     build_log.append(" %s: %s" % (key, repr(opts_dict[key])))
         if opts.verbose:
             log('\n'.join(line for line in build_log))
@@ -370,8 +371,8 @@ class EPUB_MOBI(CatalogPlugin):
         """
         GENERATE_DEBUG_EPUB = False
         if GENERATE_DEBUG_EPUB:
-            catalog_debug_path = os.path.join(os.path.expanduser('~'),'Desktop','Catalog debug')
-            setattr(opts,'debug_pipeline',os.path.expanduser(catalog_debug_path))
+            catalog_debug_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'Catalog debug')
+            setattr(opts, 'debug_pipeline', os.path.expanduser(catalog_debug_path))
         dp = getattr(opts, 'debug_pipeline', None)
         if dp is not None:
@@ -381,11 +382,13 @@ class EPUB_MOBI(CatalogPlugin):
         if opts.output_profile and opts.output_profile.startswith("kindle"):
             recommendations.append(('output_profile', opts.output_profile,
                 OptionRecommendation.HIGH))
-            recommendations.append(('book_producer',opts.output_profile,
+            recommendations.append(('book_producer', opts.output_profile,
                 OptionRecommendation.HIGH))
             if opts.fmt == 'mobi':
                 recommendations.append(('no_inline_toc', True,
                     OptionRecommendation.HIGH))
+                recommendations.append(('verbose', 2,
+                    OptionRecommendation.HIGH))
         # Use existing cover or generate new cover
         cpath = None
@@ -432,14 +435,13 @@ class EPUB_MOBI(CatalogPlugin):
             from calibre.ebooks.epub import initialize_container
             from calibre.ebooks.tweak import zip_rebuilder
             from calibre.utils.zipfile import ZipFile
-            input_path = os.path.join(catalog_debug_path,'input')
-            epub_shell = os.path.join(catalog_debug_path,'epub_shell.zip')
+            input_path = os.path.join(catalog_debug_path, 'input')
+            epub_shell = os.path.join(catalog_debug_path, 'epub_shell.zip')
             initialize_container(epub_shell, opf_name='content.opf')
             with ZipFile(epub_shell, 'r') as zf:
                 zf.extractall(path=input_path)
             os.remove(epub_shell)
-            zip_rebuilder(input_path, os.path.join(catalog_debug_path,'input.epub'))
+            zip_rebuilder(input_path, os.path.join(catalog_debug_path, 'input.epub'))
         # returns to gui2.actions.catalog:catalog_generated()
         return catalog.error
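
The debug branch above unpacks a freshly initialised ePub shell into an `input` directory and then rebuilds `input.epub` from it with calibre's `zip_rebuilder`. A rough standard-library-only equivalent of that round trip, to show what the rebuild involves (the paths and the manual mimetype handling are assumptions; calibre's own helpers take care of those details):

```python
import os
import zipfile

def unpack_epub(epub_path, dest_dir):
    # Counterpart of the ZipFile(...).extractall() call in the hunk above.
    with zipfile.ZipFile(epub_path, 'r') as zf:
        zf.extractall(path=dest_dir)

def repack_epub(src_dir, epub_path):
    # Crude stand-in for zip_rebuilder(): the ePub container wants the
    # 'mimetype' entry first and stored uncompressed, then everything else.
    with zipfile.ZipFile(epub_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        mimetype = os.path.join(src_dir, 'mimetype')
        if os.path.exists(mimetype):
            zf.write(mimetype, 'mimetype', compress_type=zipfile.ZIP_STORED)
        for root, _dirs, files in os.walk(src_dir):
            for name in files:
                full = os.path.join(root, name)
                arcname = os.path.relpath(full, src_dir).replace(os.sep, '/')
                if arcname != 'mimetype':
                    zf.write(full, arcname)
```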

File diff suppressed because it is too large.

View File

@@ -22,6 +22,7 @@ from calibre.library.comments import comments_to_html
 from calibre.library.server import custom_fields_to_display
 from calibre.library.field_metadata import category_icon_map
 from calibre.library.server.utils import quote, unquote
+from calibre.ebooks.metadata.sources.identify import urls_from_identifiers

 def xml(*args, **kwargs):
     ans = prepare_string_for_xml(*args, **kwargs)
@@ -823,6 +824,16 @@ class BrowseServer(object):
             if field in ('title', 'formats') or not args.get(field, False) \
                     or not m['name']:
                 continue
+            if field == 'identifiers':
+                urls = urls_from_identifiers(mi.get(field, {}))
+                links = [u'<a class="details_category_link" target="_new" href="%s" title="%s:%s">%s</a>' % (url, id_typ, id_val, name)
+                         for name, id_typ, id_val, url in urls]
+                links = u', '.join(links)
+                if links:
+                    fields.append((m['name'], u'<strong>%s: </strong>%s'%(
+                        _('Ids'), links)))
+                continue
             if m['datatype'] == 'rating':
                 r = u'<strong>%s: </strong>'%xml(m['name']) + \
                     render_rating(mi.get(field)/2.0, self.opts.url_prefix,
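
The added `identifiers` branch above unpacks `(name, id_typ, id_val, url)` tuples from `urls_from_identifiers()` and joins them into a comma-separated list of anchors. A small sketch of just that rendering step, driven by made-up tuples instead of a real metadata object (the sample data is an assumption; the markup string is the one used in the hunk):

```python
# Stand-in for what urls_from_identifiers(mi.get('identifiers', {})) yields,
# per the tuple unpacking in the hunk above: (name, id_typ, id_val, url).
sample_urls = [
    ('Amazon', 'amazon', 'B000EXAMPLE', 'https://www.amazon.com/dp/B000EXAMPLE'),
    ('ISBN', 'isbn', '9780000000000', 'https://www.worldcat.org/isbn/9780000000000'),
]

links = [u'<a class="details_category_link" target="_new" href="%s" title="%s:%s">%s</a>'
         % (url, id_typ, id_val, name)
         for name, id_typ, id_val, url in sample_urls]
print(u', '.join(links))
# -> two comma-separated <a> elements, one per identifier
```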

Some files were not shown because too many files have changed in this diff.