mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00

sync with Kovid's branch

commit 9e6c33961d
@@ -20,6 +20,7 @@ class Aksiyon (BasicNewsRecipe):
auto_cleanup = True
cover_img_url = 'http://www.aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
masthead_url = 'http://aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
ignore_duplicate_articles = { 'title', 'url' }
remove_empty_feeds= True
feeds = [
( u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'),
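The Aksiyon hunk above toggles stock BasicNewsRecipe options. As a rough sketch of how these attributes fit together in a complete recipe (the class name and feed URL come from the hunk; the rest is standard recipe boilerplate, not part of this commit):

    from calibre.web.feeds.recipes import BasicNewsRecipe

    class Aksiyon(BasicNewsRecipe):
        title = u'Aksiyon'
        # let calibre's heuristics extract the article body
        auto_cleanup = True
        # drop articles whose title or url was already seen
        ignore_duplicate_articles = {'title', 'url'}
        # omit feeds that yielded no articles from the final ebook
        remove_empty_feeds = True
        feeds = [
            (u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'),
        ]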
@@ -6,22 +6,41 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
nytimes.com
'''
import re, string, time
from calibre import entity_to_unicode, strftime
from calibre import strftime
from datetime import timedelta, date
from time import sleep
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


class NYTimes(BasicNewsRecipe):

recursions=1 # set this to zero to omit Related articles lists

# set getTechBlogs to True to include the technology blogs
# set tech_oldest_article to control article age
# set tech_max_articles_per_feed to control article count
getTechBlogs = True
remove_empty_feeds = True
tech_oldest_article = 14
tech_max_articles_per_feed = 25


# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
headlinesOnly = True

# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
# number of days old an article can be for inclusion. If oldest_article = 0 all articles
# will be included. Note: oldest_article is ignored if webEdition = False
# set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
# number of days old an article can be for inclusion. If oldest_web_article = None all articles
# will be included. Note: oldest_web_article is ignored if webEdition = False
webEdition = False
oldest_article = 7
oldest_web_article = 7

# download higher resolution images than the small thumbnails typically included in the article
# the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
useHighResImages = True

# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
replaceKindleVersion = False

# includeSections: List of sections to include. If empty, all sections found will be included.
# Otherwise, only the sections named will be included. For example,
@@ -82,57 +101,68 @@ class NYTimes(BasicNewsRecipe):
('Education',u'education'),
('Multimedia',u'multimedia'),
(u'Obituaries',u'obituaries'),
(u'Sunday Magazine',u'magazine'),
(u'Week in Review',u'weekinreview')]
(u'Sunday Magazine',u'magazine')
]

tech_feeds = [
(u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'),
(u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
]


if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times. Needs a subscription from http://www.nytimes.com'
needs_subscription = 'optional'
description = 'Headlines from the New York Times'
needs_subscription = False
elif webEdition:
title='New York Times (Web)'
description = 'New York Times on the Web'
needs_subscription = True
needs_subscription = False
elif replaceKindleVersion:
title='The New York Times'
description = 'Today\'s New York Times'
needs_subscription = False
else:
title='New York Times'
description = 'Today\'s New York Times'
needs_subscription = True
needs_subscription = False


month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']

def decode_us_date(self,datestr):
udate = datestr.strip().lower().split()
def decode_url_date(self,url):
urlitems = url.split('/')
try:
m = self.month_list.index(udate[0])+1
d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5]))
except:
return date.today()
d = int(udate[1])
y = int(udate[2])
try:
d = date(y,m,d)
d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6]))
except:
d = date.today
return None
return d

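Reading the interleaved old/new lines above: decode_us_date (removed) parsed a textual date such as "January 21, 2012", while the new decode_url_date pulls the date straight from the article URL's path segments and returns None when no date is present. A hedged sketch of the new behavior (the sample URLs are invented for illustration):

    from datetime import date

    def decode_url_date(url):
        # NYT article URLs carry the date as /YYYY/MM/DD/ path segments,
        # e.g. http://www.nytimes.com/2012/01/21/business/example.html
        urlitems = url.split('/')
        try:
            # ['http:', '', 'www.nytimes.com', '2012', '01', '21', ...]
            return date(int(urlitems[3]), int(urlitems[4]), int(urlitems[5]))
        except (IndexError, ValueError):
            try:
                # blog URLs have one extra leading segment before the date
                return date(int(urlitems[4]), int(urlitems[5]), int(urlitems[6]))
            except (IndexError, ValueError):
                return None  # undated URL: caller treats the article as current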
earliest_date = date.today() - timedelta(days=oldest_article)
if oldest_web_article is None:
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
oldest_article = 365 # by default, a long time ago

__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 7, 5)

encoding = 'utf-8'

timefmt = ''
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

simultaneous_downloads = 1

cover_margins = (18,18,'grey99')

remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':[
remove_tags = [
dict(attrs={'class':[
'articleFooter',
'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'columnGroup last',
@@ -140,7 +170,6 @@ class NYTimes(BasicNewsRecipe):
'dottedLine',
'entry-meta',
'entry-response module',
'icon enlargeThis',
'leftNavTabs',
'metaFootnote',
'module box nav',
@@ -150,10 +179,43 @@ class NYTimes(BasicNewsRecipe):
'relatedSearchesModule',
'side_tool',
'singleAd',
'entry entry-utility', #added for DealBook
'entry-tags', #added for DealBook
'footer promos clearfix', #added for DealBook
'footer links clearfix', #added for DealBook
'tabsContainer', #added for other blog downloads
'column lastColumn', #added for other blog downloads
'pageHeaderWithLabel', #added for other gadgetwise downloads
'column two', #added for other blog downloads
'column two last', #added for other blog downloads
'column three', #added for other blog downloads
'column three last', #added for other blog downloads
'column four',#added for other blog downloads
'column four last',#added for other blog downloads
'column last', #added for other blog downloads
'entry entry-related',
'subNavigation tabContent active', #caucus blog navigation
'mediaOverlay slideshow',
'wideThumb',
'video', #added 02-11-2011
'videoHeader',#added 02-11-2011
'articleInlineVideoHolder', #added 02-11-2011
'assetCompanionAd',
re.compile('^subNavigation'),
re.compile('^leaderboard'),
re.compile('^module'),
re.compile('commentCount')
]}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
dict(name='div', attrs={'class':'tweet'}),
dict(name='span', attrs={'class':'commentCount meta'}),
dict(name='div', attrs={'id':'header'}),
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(id=[
'adxLeaderboard',
'adxSponLink',
@@ -183,22 +245,29 @@ class NYTimes(BasicNewsRecipe):
'side_index',
'side_tool',
'toolsRight',
'skybox', #added for DealBook
'TopAd', #added for DealBook
'related-content', #added for DealBook
]),
dict(name=['script', 'noscript', 'style','form','hr'])]
no_stylesheets = True
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { text-align: left; font-size: small; }
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { font-weight: normal; text-align: left; font-size: 50%; }
.caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; }
.date{font-size: 50%; }
.update{font-size: 50%; }
.articleBody { }
.authorId {text-align: left; }
.authorId {text-align: left; font-size: 50%; }
.image {text-align: center;}
.source {text-align: left; }'''
.aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
.source {text-align: left; font-size: x-small; }'''


articles = {}
@@ -237,7 +306,7 @@ class NYTimes(BasicNewsRecipe):
def exclude_url(self,url):
if not url.startswith("http"):
return True
if not url.endswith(".html"):
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
return True
if 'nytimes.com' not in url:
return True
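The old/new pair above widens exclude_url so DealBook links, which do not end in .html, survive the filter. A hedged standalone illustration of the new predicate (sample URLs invented; the recipe's full method has further checks not shown in this hunk):

    def exclude_url(url):
        # minimal version of the new predicate from the hunk above
        if not url.startswith("http"):
            return True
        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url:
            return True
        if 'nytimes.com' not in url:
            return True
        return False

    # http://www.nytimes.com/2012/01/21/world/story.html -> kept (False)
    # http://dealbook.nytimes.com/2012/01/21/some-post/  -> kept (False)
    # http://example.com/story.html                      -> excluded (True)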
@@ -280,88 +349,91 @@ class NYTimes(BasicNewsRecipe):

def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.form = br.forms().next()
br['userid'] = self.username
br['password'] = self.password
raw = br.submit().read()
if 'Please try again' in raw:
raise Exception('Your username and password are incorrect')
return br

def skip_ad_pages(self, soup):
# Skip ad pages served before actual article
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url)
return self.index_to_soup(url, raw=True)
## This doesn't work (and probably never did). It either gets another serve of the advertisement,
## or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
##
## def skip_ad_pages(self, soup):
## # Skip ad pages served before actual article
## skip_tag = soup.find(True, {'name':'skip'})
## if skip_tag is not None:
## self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
## url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
## url += '?pagewanted=all'
## self.log.warn("Skipping ad to article at '%s'" % url)
## return self.index_to_soup(url, raw=True)


cover_tag = 'NY_NYT'
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
from datetime import timedelta, date
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover

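The new get_cover_url above swaps the graphics8 front-page scan for Newseum's daily front-page image, walking back up to six days when today's image fails to load. The retry loop distilled into a hedged sketch (URL scheme copied from the hunk; a sketch, not the recipe's exact code):

    from datetime import date, timedelta

    def newseum_cover_url(cover_tag, d):
        # Newseum keys the image by day-of-month, per the hunk above
        return ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                + str(d.day) + '/lg/' + cover_tag + '.jpg')

    def find_cover(br, cover_tag='NY_NYT', max_days_back=7):
        for daysback in range(max_days_back):
            url = newseum_cover_url(cover_tag, date.today() - timedelta(days=daysback))
            try:
                br.open(url)   # any mechanize-style browser with .open()
                return url     # first day whose image exists wins
            except Exception:
                continue
        return None            # give up: calibre falls back to no cover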
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

def short_title(self):
return self.title

def index_to_soup(self, url_or_raw, raw=False):
'''
OVERRIDE of class method
deals with various page encodings between index and articles
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :

def article_to_soup(self, url_or_raw, raw=False):
from contextlib import closing
import copy
from calibre.ebooks.chardet import xml_to_unicode
if re.match(r'\w+://', url_or_raw):
br = self.clone_browser(self.browser)
f = br.open_novisit(url_or_raw)
open_func = getattr(br, 'open_novisit', br.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read()
f.close()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw

if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(docEncoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
return BeautifulSoup(_raw, markupMassage=massage)
if callable(self.encoding):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')

# Entry point
soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
if docEncoding == '' :
docEncoding = self.encoding
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
# Some websites have buggy doctype declarations that mess up beautifulsoup
# Remove comments as they can leave detritus when extracting tags leaves
# multiple nested comments
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
usrc = self.preprocess_raw_html(usrc, url_or_raw)
return BeautifulSoup(usrc, markupMassage=nmassage)

if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)

return soup

def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&amp;' with '&'
massaged = re.sub("&#038;","&", massaged)
massaged = re.sub("&#038;","&", massaged)
massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged)
else:
return description
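The three substitution lines above were garbled by entity decoding when this page was captured; they are reconstructed here on the assumption that the code collapses leftover ampersand entities ('&#038;' and '&amp;') to a plain '&' after BeautifulStoneSoup has converted everything else, since Kindle NCX table-of-contents entries choke on raw entities. The general pattern, hedged:

    import re

    def normalize_ampersands(text):
        # collapse numeric and named ampersand entities to a plain '&'
        # (assumed equivalents of the reconstructed re.sub calls above)
        text = re.sub(r"&#0*38;", "&", text)
        text = re.sub(r"&amp;", "&", text)
        return text

    print(normalize_ampersands("Arts &amp; Leisure &#038; more"))  # Arts & Leisure & more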
@@ -383,6 +455,16 @@ class NYTimes(BasicNewsRecipe):
if self.filterDuplicates:
if url in self.url_list:
return
if self.webEdition:
date_tag = self.decode_url_date(url)
if date_tag is not None:
if self.oldest_web_article is not None:
if date_tag < self.earliest_date:
self.log("Skipping article %s" % url)
return
else:
self.log("Skipping article %s" % url)
return
self.url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
@@ -407,6 +489,31 @@ class NYTimes(BasicNewsRecipe):
description=description, author=author,
content=''))

def get_tech_feeds(self,ans):
if self.getTechBlogs:
tech_articles = {}
key_list = []
save_oldest_article = self.oldest_article
save_max_articles_per_feed = self.max_articles_per_feed
self.oldest_article = self.tech_oldest_article
self.max_articles_per_feed = self.tech_max_articles_per_feed
self.feeds = self.tech_feeds
tech = self.parse_feeds()
self.oldest_article = save_oldest_article
self.max_articles_per_feed = save_max_articles_per_feed
self.feeds = None
for f in tech:
key_list.append(f.title)
tech_articles[f.title] = []
for a in f.articles:
tech_articles[f.title].append(
dict(title=a.title, url=a.url, date=a.date,
description=a.summary, author=a.author,
content=a.content))
tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
for x in tech_ans:
ans.append(x)
return ans

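get_tech_feeds temporarily swaps the recipe's feed parameters, runs parse_feeds() against tech_feeds, then restores the originals. The same save/override/restore idea written as a hedged context-manager sketch (not how the recipe itself does it):

    from contextlib import contextmanager

    @contextmanager
    def feed_overrides(recipe, feeds, oldest, per_feed):
        # stash the current settings, impose the tech-blog ones
        saved = (recipe.feeds, recipe.oldest_article, recipe.max_articles_per_feed)
        recipe.feeds, recipe.oldest_article, recipe.max_articles_per_feed = feeds, oldest, per_feed
        try:
            yield recipe
        finally:
            # always restore, even if parse_feeds() raises
            recipe.feeds, recipe.oldest_article, recipe.max_articles_per_feed = saved

    # usage sketch:
    # with feed_overrides(self, self.tech_feeds, self.tech_oldest_article,
    #                     self.tech_max_articles_per_feed):
    #     tech = self.parse_feeds()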
def parse_web_edition(self):

@@ -418,31 +525,41 @@ class NYTimes(BasicNewsRecipe):
if sec_title in self.excludeSections:
print "SECTION EXCLUDED: ",sec_title
continue
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
try:
soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
except:
continue
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'

self.key = sec_title
# Find each article
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['story', 'story headline'] :
attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['story', 'story headline', 'storyHeader'] :
self.handle_article(div)
elif div['class'] == 'ledeStory':
divsub = div.find('div','storyHeader')
if divsub is not None:
self.handle_article(divsub)
ulrefer = div.find('ul','refer')
if ulrefer is not None:
for lidiv in ulrefer.findAll('li'):
self.handle_article(lidiv)
elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'):
self.handle_article(lidiv)

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans)
return self.filter_ans(self.get_tech_feeds(self.ans))


def parse_todays_index(self):

soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

skipping = False
# Find each article
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

if div['class'] in ['section-headline','sectionHeader']:
self.key = string.capwords(self.feed_title(div))
self.key = self.key.replace('Op-ed','Op-Ed')
@@ -466,7 +583,7 @@ class NYTimes(BasicNewsRecipe):
self.handle_article(lidiv)

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans)
return self.filter_ans(self.get_tech_feeds(self.ans))

def parse_headline_index(self):

@@ -514,7 +631,7 @@ class NYTimes(BasicNewsRecipe):
for h3_item in search_div.findAll('h3'):
byline = h3_item.h6
if byline is not None:
author = self.tag_to_string(byline,usa_alt=False)
author = self.tag_to_string(byline,use_alt=False)
else:
author = ''
a = h3_item.find('a', href=True)
@@ -540,7 +657,7 @@ class NYTimes(BasicNewsRecipe):
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans)
return self.filter_ans(self.get_tech_feeds(self.ans))

def parse_index(self):
if self.headlinesOnly:
@@ -550,32 +667,190 @@ class NYTimes(BasicNewsRecipe):
else:
return self.parse_todays_index()

def strip_anchors(self,soup):
def strip_anchors(self,soup,kill_all=False):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
if kill_all or (self.recursions==0):
a.replaceWith(self.tag_to_string(a,False))
else:
if a.has_key('href'):
if a['href'].startswith('http://www.nytimes'):
if not a['href'].endswith('pagewanted=all'):
url = re.sub(r'\?.*', '', a['href'])
if self.exclude_url(url):
a.replaceWith(self.tag_to_string(a,False))
else:
a['href'] = url+'?pagewanted=all'
elif not (a['href'].startswith('http://pogue') or \
a['href'].startswith('http://bits') or \
a['href'].startswith('http://travel') or \
a['href'].startswith('http://business') or \
a['href'].startswith('http://tech') or \
a['href'].startswith('http://health') or \
a['href'].startswith('http://dealbook') or \
a['href'].startswith('http://open')):
a.replaceWith(self.tag_to_string(a,False))
return soup

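The reworked strip_anchors keeps NYT-family links alive, normalized to the single-page view, so that recursions=1 can fetch the Related articles lists, and flattens every other anchor to plain text; kill_all=True degrades all of them. A hedged sketch of just the URL normalization step it performs:

    import re

    def normalize_nyt_href(href):
        # strip the query string, then request the single-page view
        # (mirrors the url + '?pagewanted=all' rewrite in the hunk above)
        base = re.sub(r'\?.*', '', href)
        return base + '?pagewanted=all'

    # normalize_nyt_href('http://www.nytimes.com/2012/01/21/a.html?hp')
    #   -> 'http://www.nytimes.com/2012/01/21/a.html?pagewanted=all'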
def handle_tags(self,soup):
try:
print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title))
except:
print("HANDLE TAGS: NO TITLE")
if soup is None:
print("ERROR: handle_tags received NoneType")
return None

## print("HANDLING AD FORWARD:")
## print(soup)
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
if isinstance(self.keep_only_tags, dict):
self.keep_only_tags = [self.keep_only_tags]
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass

def remove_beyond(tag, next):
while tag is not None and getattr(tag, 'name', None) != 'body':
after = getattr(tag, next)
while after is not None:
ns = getattr(tag, next)
after.extract()
after = ns
tag = tag.parent

if self.remove_tags_after is not None:
rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
for spec in rt:
tag = soup.find(**spec)
remove_beyond(tag, 'nextSibling')

if self.remove_tags_before is not None:
tag = soup.find(**self.remove_tags_before)
remove_beyond(tag, 'previousSibling')

for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()

return soup


def preprocess_html(self, soup):
print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url)
sleep(5)
soup = self.handle_tags(self.article_to_soup(url))

if self.webEdition & (self.oldest_article>0):
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
if date_tag:
date_str = self.tag_to_string(date_tag,use_alt=False)
date_str = date_str.replace('Published:','')
date_items = date_str.split(',')
try:
datestring = date_items[0]+' '+date_items[1]
article_date = self.decode_us_date(datestring)
except:
article_date = date.today()
if article_date < self.earliest_date:
self.log("Skipping article dated %s" % date_str)
return None
# check if the article is from one of the tech blogs
blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']})

if blog is not None:
old_body = soup.find('body')
new_body=Tag(soup,'body')
new_body.append(soup.find('div',attrs={'id':'content'}))
new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
old_body.replaceWith(new_body)
for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
if divr.find(text=re.compile('Sign up')):
divr.extract()
divr = soup.find('div',attrs={'id':re.compile('related-content')})
if divr is not None:
# handle related articles
rlist = []
ul = divr.find('ul')
if ul is not None:
for li in ul.findAll('li'):
atag = li.find('a')
if atag is not None:
if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
atag['href'].startswith('http://open'):
atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False)))
rlist.append(atag)
divr.extract()
if rlist != []:
asidediv = Tag(soup,'div',[('class','aside')])
if soup.find('hr') is None:
asidediv.append(Tag(soup,'hr'))
h4 = Tag(soup,'h4',[('class','asidenote')])
h4.insert(0,"Related Posts")
asidediv.append(h4)
ul = Tag(soup,'ul')
for r in rlist:
li = Tag(soup,'li',[('class','aside')])
r['class'] = 'aside'
li.append(r)
ul.append(li)
asidediv.append(ul)
asidediv.append(Tag(soup,'hr'))
smain = soup.find('body')
smain.append(asidediv)
for atag in soup.findAll('a'):
img = atag.find('img')
if img is not None:
atag.replaceWith(img)
elif not atag.has_key('href'):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
hdr = soup.find('address')
if hdr is not None:
hdr.name='span'
for span_credit in soup.findAll('span','credit'):
sp = Tag(soup,'span')
span_credit.replaceWith(sp)
sp.append(Tag(soup,'br'))
sp.append(span_credit)
sp.append(Tag(soup,'br'))

else: # nytimes article

related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
for rdiv in soup.findAll('div','columnGroup doubleRule'):
if rdiv.find('h3') is not None:
if self.tag_to_string(rdiv.h3,False).startswith('Related'):
rdiv.h3.find(text=True).replaceWith("Related articles")
rdiv.h3['class'] = 'asidenote'
for litag in rdiv.findAll('li'):
if litag.find('a') is not None:
if litag.find('a')['href'].startswith('http://www.nytimes.com'):
url = re.sub(r'\?.*', '', litag.find('a')['href'])
litag.find('a')['href'] = url+'?pagewanted=all'
litag.extract()
related.append(litag)
if first_related is None:
first_related = rdiv
first_outer = outerdiv
else:
litag.extract()
if related != []:
for r in related:
if r.h6: # don't want the anchor inside a h6 tag
r.h6.replaceWith(r.h6.a)
first_related.ul.append(r)
first_related.insert(0,Tag(soup,'hr'))
first_related.append(Tag(soup,'hr'))
first_related['class'] = 'aside'
first_outer.replaceWith(first_related) # replace the outer tag with the related tag

for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
rdiv.extract()

kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots
@@ -584,9 +859,77 @@ class NYTimes(BasicNewsRecipe):
img_div = soup.find('div','inlineImage module')
if img_div:
img_div.extract()
return self.strip_anchors(soup)

def postprocess_html(self,soup, True):
if self.useHighResImages:
try:
#open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
popupreflink = popupref.find('a')
if popupreflink:
reflinkstring = str(popupreflink['href'])
refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
refend = reflinkstring.find(".html", refstart) + len(".html")
reflinkstring = reflinkstring[refstart:refend]

popuppage = self.browser.open(reflinkstring)
popuphtml = popuppage.read()
popuppage.close()
if popuphtml:
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
popupSoup = BeautifulSoup(popuphtml)
highResTag = popupSoup.find('img', {'src':highResImageLink})
if highResTag:
try:
newWidth = highResTag['width']
newHeight = highResTag['height']
imageTag = popupref.parent.find("img")
except:
self.log("Error: finding width and height of img")
popupref.extract()
if imageTag:
try:
imageTag['src'] = highResImageLink
imageTag['width'] = newWidth
imageTag['height'] = newHeight
except:
self.log("Error setting the src width and height parameters")
except Exception:
self.log("Error pulling high resolution images")

try:
#in case pulling images failed, delete the enlarge this text
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
popupref.extract()
except:
self.log("Error removing Enlarge this text")


return self.strip_anchors(soup,False)

def postprocess_html(self,soup,first_fetch):
if not first_fetch: # remove Related links
for aside in soup.findAll('div','aside'):
aside.extract()
soup = self.strip_anchors(soup,True)

if soup.find('div',attrs={'id':'blogcontent'}) is None:
if first_fetch:
aside = soup.find('div','aside')
if aside is not None: # move the related list to the end of the article
art = soup.find('div',attrs={'id':'article'})
if art is None:
art = soup.find('div',attrs={'class':'article'})
if art is not None:
art.append(aside)
try:
if self.one_picture_per_article:
# Remove all images after first
@@ -642,6 +985,7 @@ class NYTimes(BasicNewsRecipe):
try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook
if h1:
headline = h1.find("nyt_headline")
if headline:
@@ -649,13 +993,19 @@ class NYTimes(BasicNewsRecipe):
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook

else:
# Blog entry - replace headline, remove <hr> tags
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
tag.insert(0, self.fixChars(headline.renderContents()))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
@@ -663,6 +1013,29 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

try:
#if this is from a blog (dealbook, fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor:
tag = Tag(soup, "h6")
tag['class'] = "byline"
tag.insert(0, self.fixChars(bylineauthor.renderContents()))
bylineauthor.replaceWith(tag)
except:
self.log("ERROR: fixing byline author format")

try:
#if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit:
tag = Tag(soup, "h6")
tag['class'] = "credit"
tag.insert(0, self.fixChars(blogcredit.renderContents()))
blogcredit.replaceWith(tag)
except:
self.log("ERROR: fixing credit format")


try:
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
@@ -685,6 +1058,13 @@ class NYTimes(BasicNewsRecipe):
subhead.replaceWith(bTag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
#remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'})
if blogupdated:
blogupdated.replaceWith("")
except:
self.log("ERROR: Removing strong tag")

try:
divTag = soup.find('div',attrs={'id':'articleBody'})
@@ -708,16 +1088,16 @@ class NYTimes(BasicNewsRecipe):
return soup

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
if not first:
return
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
if idxdiv is not None:
if idxdiv.img:
self.add_toc_thumbnail(article, idxdiv.img['src'])
self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src']))
else:
img = soup.find('img')
img = soup.find('body').find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src']))
shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:
@@ -731,13 +1111,22 @@ class NYTimes(BasicNewsRecipe):
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph
newpara = shortparagraph + refparagraph
newparaDateline,newparaEm,newparaDesc = newpara.partition('&mdash;')
if newparaEm == '':
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
if newparaEm == '':
newparaDesc = newparaDateline
article.summary = article.text_summary = newparaDesc.strip()
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
else:
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
except:
self.log("Error creating article descriptions")
return

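The new summary logic above strips a leading dateline ("WASHINGTON — ...") from the first long paragraph before using it as the article description. The first partition call displayed identically to the second in this capture; it is reconstructed as targeting the '&mdash;' entity, with the literal character as fallback. A hedged illustration of the idea:

    newpara = "WASHINGTON &mdash; Lawmakers reached a deal on Friday."
    dateline, sep, desc = newpara.partition('&mdash;')
    if sep == '':
        dateline, sep, desc = newpara.partition(u'\u2014')  # literal em dash fallback
    if sep == '':
        desc = dateline  # no dateline found: keep the whole paragraph
    print(desc.strip())  # "Lawmakers reached a deal on Friday."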
@@ -6,31 +6,42 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
nytimes.com
'''
import re, string, time
from calibre import entity_to_unicode, strftime
from calibre import strftime
from datetime import timedelta, date
from time import sleep
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


class NYTimes(BasicNewsRecipe):

recursions=1 # set this to zero to omit Related articles lists

# set getTechBlogs to True to include the technology blogs
# set tech_oldest_article to control article age
# set tech_max_articles_per_feed to control article count
getTechBlogs = True
remove_empty_feeds = True
tech_oldest_article = 14
tech_max_articles_per_feed = 25


# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
headlinesOnly = False

# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
# number of days old an article can be for inclusion. If oldest_article = 0 all articles
# will be included. Note: oldest_article is ignored if webEdition = False
# set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
# number of days old an article can be for inclusion. If oldest_web_article = None all articles
# will be included. Note: oldest_web_article is ignored if webEdition = False
webEdition = False
oldest_article = 7

# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
replaceKindleVersion = False
oldest_web_article = 7

# download higher resolution images than the small thumbnails typically included in the article
# the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
useHighResImages = True

# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
replaceKindleVersion = False

# includeSections: List of sections to include. If empty, all sections found will be included.
# Otherwise, only the sections named will be included. For example,
#
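The includeSections/excludeSections knobs take feed-title strings as they appear in the section list earlier in the file (the diff also shows parse_web_edition consulting self.excludeSections). A hedged example of how a user might restrict a download (the section names here are illustrative, not from the commit):

    # download only these sections; leave empty ([]) to take everything
    includeSections = ['Business Day', 'Technology']

    # or keep everything except these
    excludeSections = ['Sports', 'Automobiles']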
@@ -90,60 +101,68 @@ class NYTimes(BasicNewsRecipe):
('Education',u'education'),
('Multimedia',u'multimedia'),
(u'Obituaries',u'obituaries'),
(u'Sunday Magazine',u'magazine'),
(u'Week in Review',u'weekinreview')]
(u'Sunday Magazine',u'magazine')
]

tech_feeds = [
(u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'),
(u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
]


if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
needs_subscription = True
needs_subscription = False
elif webEdition:
title='New York Times (Web)'
description = 'New York Times on the Web'
needs_subscription = True
needs_subscription = False
elif replaceKindleVersion:
title='The New York Times'
description = 'Today\'s New York Times'
needs_subscription = True
needs_subscription = False
else:
title='New York Times'
description = 'Today\'s New York Times. Needs subscription from http://www.nytimes.com'
needs_subscription = True
description = 'Today\'s New York Times'
needs_subscription = False


month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']

def decode_us_date(self,datestr):
udate = datestr.strip().lower().split()
def decode_url_date(self,url):
urlitems = url.split('/')
try:
m = self.month_list.index(udate[0])+1
d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5]))
except:
return date.today()
d = int(udate[1])
y = int(udate[2])
try:
d = date(y,m,d)
d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6]))
except:
d = date.today
return None
return d

earliest_date = date.today() - timedelta(days=oldest_article)
if oldest_web_article is None:
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
oldest_article = 365 # by default, a long time ago

__author__ = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier'
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 7, 5)

encoding = 'utf-8'

timefmt = ''
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

simultaneous_downloads = 1

cover_margins = (18,18,'grey99')

remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':[
remove_tags = [
dict(attrs={'class':[
'articleFooter',
'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'columnGroup last',
@@ -151,7 +170,6 @@ class NYTimes(BasicNewsRecipe):
'dottedLine',
'entry-meta',
'entry-response module',
#'icon enlargeThis', #removed to provide option for high res images
'leftNavTabs',
'metaFootnote',
'module box nav',
@@ -175,12 +193,9 @@ class NYTimes(BasicNewsRecipe):
'column four',#added for other blog downloads
'column four last',#added for other blog downloads
'column last', #added for other blog downloads
'timestamp published', #added for other blog downloads
'entry entry-related',
'subNavigation tabContent active', #caucus blog navigation
'columnGroup doubleRule',
'mediaOverlay slideshow',
'headlinesOnly multiline flush',
'wideThumb',
'video', #added 02-11-2011
'videoHeader',#added 02-11-2011
@@ -189,7 +204,18 @@ class NYTimes(BasicNewsRecipe):
re.compile('^subNavigation'),
re.compile('^leaderboard'),
re.compile('^module'),
re.compile('commentCount')
]}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
dict(name='div', attrs={'class':'tweet'}),
dict(name='span', attrs={'class':'commentCount meta'}),
dict(name='div', attrs={'id':'header'}),
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(id=[
'adxLeaderboard',
'adxSponLink',
@@ -227,17 +253,21 @@ class NYTimes(BasicNewsRecipe):
no_stylesheets = True
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { text-align: left; font-size: small; }
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { font-weight: normal; text-align: left; font-size: 50%; }
.caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; }
.date{font-size: 50%; }
.update{font-size: 50%; }
.articleBody { }
.authorId {text-align: left; }
.authorId {text-align: left; font-size: 50%; }
.image {text-align: center;}
.source {text-align: left; }'''
.aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
.source {text-align: left; font-size: x-small; }'''


articles = {}
@@ -276,7 +306,7 @@ class NYTimes(BasicNewsRecipe):
def exclude_url(self,url):
if not url.startswith("http"):
return True
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
return True
if 'nytimes.com' not in url:
return True
@@ -319,88 +349,91 @@ class NYTimes(BasicNewsRecipe):

def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.form = br.forms().next()
br['userid'] = self.username
br['password'] = self.password
raw = br.submit().read()
if 'Please try again' in raw:
raise Exception('Your username and password are incorrect')
return br

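get_browser drives calibre's mechanize-based browser through the NYT login form; br.forms().next() is Python 2 iterator syntax. If you were porting this pattern to Python 3-era mechanize, a hedged sketch (form field names copied from the hunk, not re-verified):

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)  # modern calibre passes self here
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.form = next(br.forms())          # Python 3: next() builtin, not .next()
            br['userid'] = self.username
            br['password'] = self.password
            raw = br.submit().read()
            if b'Please try again' in raw:      # bytes literal: .read() returns bytes
                raise Exception('Your username and password are incorrect')
        return br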
def skip_ad_pages(self, soup):
# Skip ad pages served before actual article
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url)
return self.index_to_soup(url, raw=True)
## This doesn't work (and probably never did). It either gets another serve of the advertisement,
## or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
##
## def skip_ad_pages(self, soup):
## # Skip ad pages served before actual article
## skip_tag = soup.find(True, {'name':'skip'})
## if skip_tag is not None:
## self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
## url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
## url += '?pagewanted=all'
## self.log.warn("Skipping ad to article at '%s'" % url)
## return self.index_to_soup(url, raw=True)


cover_tag = 'NY_NYT'
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
from datetime import timedelta, date
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover

masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

def short_title(self):
return self.title

def index_to_soup(self, url_or_raw, raw=False):
'''
OVERRIDE of class method
deals with various page encodings between index and articles
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :

def article_to_soup(self, url_or_raw, raw=False):
from contextlib import closing
import copy
from calibre.ebooks.chardet import xml_to_unicode
if re.match(r'\w+://', url_or_raw):
br = self.clone_browser(self.browser)
f = br.open_novisit(url_or_raw)
open_func = getattr(br, 'open_novisit', br.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read()
f.close()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw

if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(docEncoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
return BeautifulSoup(_raw, markupMassage=massage)
if callable(self.encoding):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')

# Entry point
soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
if docEncoding == '' :
docEncoding = self.encoding
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
# Some websites have buggy doctype declarations that mess up beautifulsoup
# Remove comments as they can leave detritus when extracting tags leaves
# multiple nested comments
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
usrc = self.preprocess_raw_html(usrc, url_or_raw)
return BeautifulSoup(usrc, markupMassage=nmassage)

if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)

return soup

def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&amp;' with '&'
massaged = re.sub("&#038;","&", massaged)
massaged = re.sub("&#038;","&", massaged)
massaged = re.sub("&amp;","&", massaged)
return self.fixChars(massaged)
else:
return description
@ -422,6 +455,16 @@ class NYTimes(BasicNewsRecipe):
|
||||
if self.filterDuplicates:
|
||||
if url in self.url_list:
|
||||
return
|
||||
if self.webEdition:
|
||||
date_tag = self.decode_url_date(url)
|
||||
if date_tag is not None:
|
||||
if self.oldest_web_article is not None:
|
||||
if date_tag < self.earliest_date:
|
||||
self.log("Skipping article %s" % url)
|
||||
return
|
||||
else:
|
||||
self.log("Skipping article %s" % url)
|
||||
return
|
||||
self.url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
@ -446,6 +489,31 @@ class NYTimes(BasicNewsRecipe):
|
||||
description=description, author=author,
|
||||
content=''))
|
||||
|
||||
def get_tech_feeds(self,ans):
|
||||
if self.getTechBlogs:
|
||||
tech_articles = {}
|
||||
key_list = []
|
||||
save_oldest_article = self.oldest_article
|
||||
save_max_articles_per_feed = self.max_articles_per_feed
|
||||
self.oldest_article = self.tech_oldest_article
|
||||
self.max_articles_per_feed = self.tech_max_articles_per_feed
|
||||
self.feeds = self.tech_feeds
|
||||
tech = self.parse_feeds()
|
||||
self.oldest_article = save_oldest_article
|
||||
self.max_articles_per_feed = save_max_articles_per_feed
|
||||
self.feeds = None
|
||||
for f in tech:
|
||||
key_list.append(f.title)
|
||||
tech_articles[f.title] = []
|
||||
for a in f.articles:
|
||||
tech_articles[f.title].append(
|
||||
dict(title=a.title, url=a.url, date=a.date,
|
||||
description=a.summary, author=a.author,
|
||||
content=a.content))
|
||||
tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
|
||||
for x in tech_ans:
|
||||
ans.append(x)
|
||||
return ans

    def parse_web_edition(self):

@ -457,31 +525,41 @@ class NYTimes(BasicNewsRecipe):
                if sec_title in self.excludeSections:
                    print "SECTION EXCLUDED: ",sec_title
                    continue
                print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
                try:
                    soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
                except:
                    continue
                print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'

                self.key = sec_title
                # Find each article
                for div in soup.findAll(True,
                    attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
                    if div['class'] in ['story', 'story headline'] :
                    attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
                    if div['class'] in ['story', 'story headline', 'storyHeader'] :
                        self.handle_article(div)
                    elif div['class'] == 'ledeStory':
                        divsub = div.find('div','storyHeader')
                        if divsub is not None:
                            self.handle_article(divsub)
                        ulrefer = div.find('ul','refer')
                        if ulrefer is not None:
                            for lidiv in ulrefer.findAll('li'):
                                self.handle_article(lidiv)
                    elif div['class'] == 'headlinesOnly multiline flush':
                        for lidiv in div.findAll('li'):
                            self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)
        return self.filter_ans(self.get_tech_feeds(self.ans))


    def parse_todays_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

        skipping = False
        # Find each article
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                self.key = string.capwords(self.feed_title(div))
                self.key = self.key.replace('Op-ed','Op-Ed')
@ -505,7 +583,7 @@ class NYTimes(BasicNewsRecipe):
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)
        return self.filter_ans(self.get_tech_feeds(self.ans))

    def parse_headline_index(self):

@ -553,7 +631,7 @@ class NYTimes(BasicNewsRecipe):
                for h3_item in search_div.findAll('h3'):
                    byline = h3_item.h6
                    if byline is not None:
                        author = self.tag_to_string(byline,usa_alt=False)
                        author = self.tag_to_string(byline,use_alt=False)
                    else:
                        author = ''
                    a = h3_item.find('a', href=True)
@ -579,7 +657,7 @@ class NYTimes(BasicNewsRecipe):
                    self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)
        return self.filter_ans(self.get_tech_feeds(self.ans))

    def parse_index(self):
        if self.headlinesOnly:
@ -589,40 +667,198 @@ class NYTimes(BasicNewsRecipe):
        else:
            return self.parse_todays_index()

    def strip_anchors(self,soup):
    def strip_anchors(self,soup,kill_all=False):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
                    if kill_all or (self.recursions==0):
                        a.replaceWith(self.tag_to_string(a,False))
                    else:
                        if a.has_key('href'):
                            if a['href'].startswith('http://www.nytimes'):
                                if not a['href'].endswith('pagewanted=all'):
                                    url = re.sub(r'\?.*', '', a['href'])
                                    if self.exclude_url(url):
                                        a.replaceWith(self.tag_to_string(a,False))
                                    else:
                                        a['href'] = url+'?pagewanted=all'
                            elif not (a['href'].startswith('http://pogue') or \
                                      a['href'].startswith('http://bits') or \
                                      a['href'].startswith('http://travel') or \
                                      a['href'].startswith('http://business') or \
                                      a['href'].startswith('http://tech') or \
                                      a['href'].startswith('http://health') or \
                                      a['href'].startswith('http://dealbook') or \
                                      a['href'].startswith('http://open')):
                                a.replaceWith(self.tag_to_string(a,False))
        return soup
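
In short: with kill_all (or recursion disabled) every anchor collapses to its text, otherwise NYT links are canonicalized to their single-page form. A standalone sketch of the second branch (the markup is invented):

    # Hypothetical demo of the link normalization step.
    import re
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup('<p><a href="http://www.nytimes.com/2012/12/05/a-story.html?hp">story</a></p>')
    a = soup.find('a')
    url = re.sub(r'\?.*', '', a['href'])      # drop the tracking query
    a['href'] = url + '?pagewanted=all'       # fetch the whole article at once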

    def handle_tags(self,soup):
        try:
            print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title))
        except:
            print("HANDLE TAGS: NO TITLE")
        if soup is None:
            print("ERROR: handle_tags received NoneType")
            return None

##        print("HANDLING AD FORWARD:")
##        print(soup)
        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError: # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    # read the sibling pointer from the node being extracted
                    ns = getattr(after, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()

        return soup
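
remove_beyond prunes, at every ancestor level up to <body>, all siblings on one side of the matched tag; the same walk reduced to a standalone form (markup invented):

    # Hypothetical demo: drop everything after the tag matched by
    # remove_tags_after, mirroring remove_beyond(tag, 'nextSibling').
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup('<body><div><p id="end">keep</p><p>cut</p></div><div>cut too</div></body>')
    tag = soup.find('p', attrs={'id':'end'})
    while tag is not None and getattr(tag, 'name', None) != 'body':
        after = tag.nextSibling
        while after is not None:
            ns = after.nextSibling
            after.extract()
            after = ns
        tag = tag.parent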


    def preprocess_html(self, soup):
        if self.webEdition & (self.oldest_article>0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
                date_str = self.tag_to_string(date_tag,use_alt=False)
                date_str = date_str.replace('Published:','')
                date_items = date_str.split(',')
                try:
                    datestring = date_items[0]+' '+date_items[1]
                    article_date = self.decode_us_date(datestring)
                except:
                    article_date = date.today()
                if article_date < self.earliest_date:
                    self.log("Skipping article dated %s" % date_str)
                    return None
        print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            sleep(5)
            soup = self.handle_tags(self.article_to_soup(url))

        #all articles are from today, no need to print the date on every page
        try:
            if not self.webEdition:
                date_tag = soup.find(True,attrs={'class': ['dateline','date']})
                if date_tag:
                    date_tag.extract()
        except:
            self.log("Error removing the published date")
        # check if the article is from one of the tech blogs
        blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']})

        if blog is not None:
            old_body = soup.find('body')
            new_body=Tag(soup,'body')
            new_body.append(soup.find('div',attrs={'id':'content'}))
            new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
            old_body.replaceWith(new_body)
            for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
                if divr.find(text=re.compile('Sign up')):
                    divr.extract()
            divr = soup.find('div',attrs={'id':re.compile('related-content')})
            if divr is not None:
                # handle related articles
                rlist = []
                ul = divr.find('ul')
                if ul is not None:
                    for li in ul.findAll('li'):
                        atag = li.find('a')
                        if atag is not None:
                            if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
                               atag['href'].startswith('http://open'):
                                atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False)))
                                rlist.append(atag)
                divr.extract()
                if rlist != []:
                    asidediv = Tag(soup,'div',[('class','aside')])
                    if soup.find('hr') is None:
                        asidediv.append(Tag(soup,'hr'))
                    h4 = Tag(soup,'h4',[('class','asidenote')])
                    h4.insert(0,"Related Posts")
                    asidediv.append(h4)
                    ul = Tag(soup,'ul')
                    for r in rlist:
                        li = Tag(soup,'li',[('class','aside')])
                        r['class'] = 'aside'
                        li.append(r)
                        ul.append(li)
                    asidediv.append(ul)
                    asidediv.append(Tag(soup,'hr'))
                    smain = soup.find('body')
                    smain.append(asidediv)
            for atag in soup.findAll('a'):
                img = atag.find('img')
                if img is not None:
                    atag.replaceWith(img)
                elif not atag.has_key('href'):
                    atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
                elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
                          atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
                    atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
            hdr = soup.find('address')
            if hdr is not None:
                hdr.name='span'
            for span_credit in soup.findAll('span','credit'):
                sp = Tag(soup,'span')
                span_credit.replaceWith(sp)
                sp.append(Tag(soup,'br'))
                sp.append(span_credit)
                sp.append(Tag(soup,'br'))

        else: # nytimes article

            related = [] # these will be the related articles
            first_outer = None # first related outer tag
            first_related = None # first related tag
            for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                for rdiv in soup.findAll('div','columnGroup doubleRule'):
                    if rdiv.find('h3') is not None:
                        if self.tag_to_string(rdiv.h3,False).startswith('Related'):
                            rdiv.h3.find(text=True).replaceWith("Related articles")
                            rdiv.h3['class'] = 'asidenote'
                            for litag in rdiv.findAll('li'):
                                if litag.find('a') is not None:
                                    if litag.find('a')['href'].startswith('http://www.nytimes.com'):
                                        url = re.sub(r'\?.*', '', litag.find('a')['href'])
                                        litag.find('a')['href'] = url+'?pagewanted=all'
                                        litag.extract()
                                        related.append(litag)
                                        if first_related is None:
                                            first_related = rdiv
                                            first_outer = outerdiv
                                    else:
                                        litag.extract()
            if related != []:
                for r in related:
                    if r.h6: # don't want the anchor inside a h6 tag
                        r.h6.replaceWith(r.h6.a)
                    first_related.ul.append(r)
                first_related.insert(0,Tag(soup,'hr'))
                first_related.append(Tag(soup,'hr'))
                first_related['class'] = 'aside'
                first_outer.replaceWith(first_related) # replace the outer tag with the related tag

                for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                    rdiv.extract()

        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag: # remove Op_Ed author head shots
            tagline = self.tag_to_string(kicker_tag)
            if tagline=='Op-Ed Columnist':
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()

        if self.useHighResImages:
            try:
@ -667,26 +903,6 @@ class NYTimes(BasicNewsRecipe):
            except Exception:
                self.log("Error pulling high resolution images")

            try:
                #remove "Related content" bar
                runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline','articleInline runaroundLeft ','articleInline runaroundLeft lastArticleInline']})
                if runAroundsFound:
                    for runAround in runAroundsFound:
                        #find all section headers
                        hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']})
                        if hlines:
                            for hline in hlines:
                                hline.extract()

                        #find all section headers
                        hlines = runAround.findAll('h6')
                        if hlines:
                            for hline in hlines:
                                hline.extract()
            except:
                self.log("Error removing related content bar")


            try:
                #in case pulling images failed, delete the enlarge this text
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
@ -696,9 +912,24 @@ class NYTimes(BasicNewsRecipe):
            except:
                self.log("Error removing Enlarge this text")

        return self.strip_anchors(soup)
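
BeautifulSoup 3's Tag constructor takes the attribute list inline, which is how the related-posts aside above is assembled; the same construction reduced to its essentials (markup invented):

    # Hypothetical demo of building and attaching an aside with calibre's
    # BeautifulSoup 3 Tag API.
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
    soup = BeautifulSoup('<body><p>article text</p></body>')
    asidediv = Tag(soup, 'div', [('class', 'aside')])
    h4 = Tag(soup, 'h4', [('class', 'asidenote')])
    h4.insert(0, 'Related Posts')
    asidediv.append(h4)
    soup.find('body').append(asidediv)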

    def postprocess_html(self,soup, True):
        return self.strip_anchors(soup,False)

    def postprocess_html(self,soup,first_fetch):
        if not first_fetch: # remove Related links
            for aside in soup.findAll('div','aside'):
                aside.extract()
            soup = self.strip_anchors(soup,True)

        if soup.find('div',attrs={'id':'blogcontent'}) is None:
            if first_fetch:
                aside = soup.find('div','aside')
                if aside is not None: # move the related list to the end of the article
                    art = soup.find('div',attrs={'id':'article'})
                    if art is None:
                        art = soup.find('div',attrs={'class':'article'})
                    if art is not None:
                        art.append(aside)
        try:
            if self.one_picture_per_article:
                # Remove all images after first
@ -855,23 +1086,22 @@ class NYTimes(BasicNewsRecipe):
            self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")

        return soup

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
        if not first:
            return
        idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
        if idxdiv is not None:
            if idxdiv.img:
                self.add_toc_thumbnail(article, idxdiv.img['src'])
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src']))
        else:
            img = soup.find('img')
            img = soup.find('body').find('img')
            if img is not None:
                self.add_toc_thumbnail(article, img['src'])
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src']))
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
                if not articlebodies: #added to account for blog formats
                    articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
@ -880,15 +1110,23 @@ class NYTimes(BasicNewsRecipe):
                            refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                            #account for blank paragraphs and short paragraphs by appending them to longer ones
                            if len(refparagraph) > 0:
                                if len(refparagraph) > 140: #approximately two lines of text
                                    article.summary = article.text_summary = shortparagraph + refparagraph
                                if len(refparagraph) > 70: #approximately one line of text
                                    newpara = shortparagraph + refparagraph
                                    newparaDateline,newparaEm,newparaDesc = newpara.partition('&mdash;')
                                    if newparaEm == '':
                                        newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
                                        if newparaEm == '':
                                            newparaDesc = newparaDateline
                                    article.summary = article.text_summary = newparaDesc.strip()
                                    return
                                else:
                                    shortparagraph = refparagraph + " "
                                    if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                        shortparagraph = shortparagraph + "- "

            else:
                article.summary = article.text_summary = self.massageNCXText(article.text_summary)
        except:
            self.log("Error creating article descriptions")
            return
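
The partition() dance strips a leading wire-service dateline from the summary; the same logic on a standalone string (sample text invented):

    # Hypothetical demo of the dateline strip used above.
    newpara = u'WASHINGTON \u2014 Lawmakers reached a deal on Tuesday.'
    dateline, em, desc = newpara.partition(u'\u2014')
    if em == u'':
        desc = newpara            # no dash at all: keep the whole paragraph
    print desc.strip()            # -> Lawmakers reached a deal on Tuesday.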


@ -8,19 +8,19 @@ Fetch sueddeutsche.de
from calibre.web.feeds.news import BasicNewsRecipe
class Sueddeutsche(BasicNewsRecipe):

    title = u'Süddeutsche.de' # 2012-01-26 AGe Correct Title
    description = 'News from Germany, Access to online content' # 2012-01-26 AGe
    __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-01-26
    publisher = u'Süddeutsche Zeitung' # 2012-01-26 AGe add
    category = 'news, politics, Germany' # 2012-01-26 AGe add
    timefmt = ' [%a, %d %b %Y]' # 2012-01-26 AGe add %a
    title = u'Süddeutsche.de'
    description = 'News from Germany, Access to online content'
    __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-12-05
    publisher = u'Süddeutsche Zeitung'
    category = 'news, politics, Germany'
    timefmt = ' [%a, %d %b %Y]'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'de'
    encoding = 'utf-8'
    publication_type = 'newspaper' # 2012-01-26 add
    publication_type = 'newspaper'
    cover_source = 'http://www.sueddeutsche.de/verlag' # 2012-01-26 AGe add from Darko Miletic paid content source
    masthead_url = 'http://www.sueddeutsche.de/static_assets/build/img/sdesiteheader/logo_homepage.441d531c.png' # 2012-01-26 AGe add
    masthead_url = 'http://www.sueddeutsche.de/static_assets/img/sdesiteheader/logo_standard.a152b0df.png' # 2012-12-05 AGe add

    use_embedded_content = False
    no_stylesheets = True
@ -40,9 +40,9 @@ class Sueddeutsche(BasicNewsRecipe):
        (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
        (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
        (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
        (u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'), #2012-01-26 AGe New
        (u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'), #2012-01-26 AGe New
        (u'Stil', u'http://rss.sueddeutsche.de/rss/stil'), #2012-01-26 AGe New
        (u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'),
        (u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'),
        (u'Stil', u'http://rss.sueddeutsche.de/rss/stil'),
        (u'München & Region', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMünchen&Region%24?output=rss'),
        (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
        (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),

@ -2,8 +2,8 @@
__license__ = 'GPL v3'
__copyright__ = '4 February 2011, desUBIKado'
__author__ = 'desUBIKado'
__version__ = 'v0.08'
__date__ = '30, June 2012'
__version__ = 'v0.09'
__date__ = '02, December 2012'
'''
http://www.weblogssl.com/
'''
@ -37,6 +37,7 @@ class weblogssl(BasicNewsRecipe):
        ,(u'Xataka Mexico', u'http://feeds.weblogssl.com/xatakamx')
        ,(u'Xataka M\xf3vil', u'http://feeds.weblogssl.com/xatakamovil')
        ,(u'Xataka Android', u'http://feeds.weblogssl.com/xatakandroid')
        ,(u'Xataka Windows', u'http://feeds.weblogssl.com/xatakawindows')
        ,(u'Xataka Foto', u'http://feeds.weblogssl.com/xatakafoto')
        ,(u'Xataka ON', u'http://feeds.weblogssl.com/xatakaon')
        ,(u'Xataka Ciencia', u'http://feeds.weblogssl.com/xatakaciencia')
@ -80,19 +81,31 @@ class weblogssl(BasicNewsRecipe):

    keep_only_tags = [dict(name='div', attrs={'id':'infoblock'}),
                      dict(name='div', attrs={'class':'post'}),
                      dict(name='div', attrs={'id':'blog-comments'})
                      dict(name='div', attrs={'id':'blog-comments'}),
                      dict(name='div', attrs={'class':'container'}) #m.xataka.com
                     ]

    remove_tags = [dict(name='div', attrs={'id':'comment-nav'})]
    remove_tags = [dict(name='div', attrs={'id':'comment-nav'}),
                   dict(name='menu', attrs={'class':'social-sharing'}), #m.xataka.com
                   dict(name='section' , attrs={'class':'comments'}), #m.xataka.com
                   dict(name='div' , attrs={'class':'article-comments'}), #m.xataka.com
                   dict(name='nav' , attrs={'class':'article-taxonomy'}) #m.xataka.com
                  ]

    remove_tags_after = dict(name='section' , attrs={'class':'comments'})

    def print_version(self, url):
        return url.replace('http://www.', 'http://m.')

    preprocess_regexps = [
        # Insert a blank line between one comment and the next
        (re.compile(r'<li id="c', re.DOTALL|re.IGNORECASE), lambda match: '<br><br><li id="c')
        (re.compile(r'<li id="c', re.DOTALL|re.IGNORECASE), lambda match: '<br><br><li id="c'),
        # Show the images in m.xataka.com articles
        (re.compile(r'<noscript>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'</noscript>', re.DOTALL|re.IGNORECASE), lambda m: '')
    ]


    # Replace the embedded YouTube video with an image

    def preprocess_html(self, soup):
@ -108,14 +121,16 @@ class weblogssl(BasicNewsRecipe):

    # Recover the article's original URL from the "feedsportal" one
    # The following code is courtesy of user "bosplans" at www.mobileread.com
    # http://www.mobileread.com/forums/sho...d.php?t=130297
    # http://www.mobileread.com/forums/showthread.php?t=130297

    def get_article_url(self, article):
        link = article.get('link', None)
        if link is None:
            return article
#        if link.split('/')[-4]=="xataka2":
#            return article.get('feedburner_origlink', article.get('link', article.get('guid')))
        if link.split('/')[-4]=="xataka2":
            return article.get('feedburner_origlink', article.get('link', article.get('guid')))
            return article.get('guid', None)
        if link.split('/')[-1]=="story01.htm":
            link=link.split('/')[-2]
            a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
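
The hunk stops at the marker table, but the decoder from the mobileread thread linked above continues by mapping each marker back to URL punctuation; roughly, and only as an assumed illustration of the technique:

    # Assumed continuation (not shown in this hunk): each feedsportal marker
    # decodes to a URL character, and the decoded remainder becomes the link.
    b = ['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.0' ,'//' ,'0' ]
    for i in range(0, len(a)):
        link = link.replace(a[i], b[i])
    link = 'http://' + link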

@ -9,15 +9,15 @@ class Zaman (BasicNewsRecipe):
    __author__ = u'thomass'
    oldest_article = 2
    max_articles_per_feed =50
    # no_stylesheets = True
    no_stylesheets = True
    #delay = 1
    #use_embedded_content = False
    encoding = 'ISO 8859-9'
    publisher = 'Zaman'
    use_embedded_content = False
    encoding = 'utf-8'
    publisher = 'Feza Gazetecilik'
    category = 'news, haberler,TR,gazete'
    language = 'tr'
    publication_type = 'newspaper '
    extra_css = '.buyukbaslik{font-weight: bold; font-size: 18px;color:#0000FF}'#body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    extra_css = 'h1{text-transform: capitalize; font-weight: bold; font-size: 22px;color:#0000FF} p{text-align:justify} ' #.introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    conversion_options = {
        'tags' : category
        ,'language' : language
@ -26,25 +26,26 @@ class Zaman (BasicNewsRecipe):
    }
    cover_img_url = 'https://fbcdn-profile-a.akamaihd.net/hprofile-ak-snc4/188140_81722291869_2111820_n.jpg'
    masthead_url = 'http://medya.zaman.com.tr/extentions/zaman.com.tr/img/section/logo-section.png'
    ignore_duplicate_articles = { 'title', 'url' }
    auto_cleanup = False
    remove_empty_feeds= True


    #keep_only_tags = [dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}) ]
    remove_tags = [ dict(name='img', attrs={'src':['http://medya.zaman.com.tr/zamantryeni/pics/zamanonline.gif']})]#,dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']})
    #keep_only_tags = [dict(name='div', attrs={'id':[ 'contentposition19']})]#,dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}), ]
    remove_tags = [ dict(name='img', attrs={'src':['http://cmsmedya.zaman.com.tr/images/logo/logo.bmp']}),dict(name='hr', attrs={'class':['interactive-hr']})]# remove_tags = [ dict(name='div', attrs={'class':[ 'detayUyari']}),dict(name='div', attrs={'class':[ 'detayYorum']}),dict(name='div', attrs={'class':[ 'addthis_toolbox addthis_default_style ']}),dict(name='div', attrs={'id':[ 'tumYazi']})]#,dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='div', attrs={'id':[ 'xxx']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/zamantryeni/pics/zamanonline.gif']}),dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']}),dict(name='div', attrs={'id':[ 'news-detail-gallery']}),dict(name='div', attrs={'id':[ 'news-detail-title-bottom-part']}),dict(name='div', attrs={'id':[ 'news-detail-news-paging-main']})]#


    #remove_attributes = ['width','height']
    remove_empty_feeds= True

    feeds = [
        ( u'Anasayfa', u'http://www.zaman.com.tr/anasayfa.rss'),
        ( u'Son Dakika', u'http://www.zaman.com.tr/sondakika.rss'),
        #( u'En çok Okunanlar', u'http://www.zaman.com.tr/max_all.rss'),
        #( u'Manşet', u'http://www.zaman.com.tr/manset.rss'),
        ( u'Gündem', u'http://www.zaman.com.tr/gundem.rss'),
        ( u'Manşet', u'http://www.zaman.com.tr/manset.rss'),
        ( u'Yazarlar', u'http://www.zaman.com.tr/yazarlar.rss'),
        ( u'Politika', u'http://www.zaman.com.tr/politika.rss'),
        ( u'Ekonomi', u'http://www.zaman.com.tr/ekonomi.rss'),
        ( u'Dış Haberler', u'http://www.zaman.com.tr/dishaberler.rss'),
        ( u'Son Dakika', u'http://www.zaman.com.tr/sondakika.rss'),
        ( u'Gündem', u'http://www.zaman.com.tr/gundem.rss'),
        ( u'Yorumlar', u'http://www.zaman.com.tr/yorumlar.rss'),
        ( u'Röportaj', u'http://www.zaman.com.tr/roportaj.rss'),
        ( u'Dizi Yazı', u'http://www.zaman.com.tr/dizi.rss'),
@ -59,8 +60,9 @@ class Zaman (BasicNewsRecipe):
        ( u'Cuma Eki', u'http://www.zaman.com.tr/cuma.rss'),
        ( u'Cumaertesi Eki', u'http://www.zaman.com.tr/cumaertesi.rss'),
        ( u'Pazar Eki', u'http://www.zaman.com.tr/pazar.rss'),
        ( u'En çok Okunanlar', u'http://www.zaman.com.tr/max_all.rss'),
        ( u'Anasayfa', u'http://www.zaman.com.tr/anasayfa.rss'),

    ]
    def print_version(self, url):
        return url.replace('http://www.zaman.com.tr/haber.do?haberno=', 'http://www.zaman.com.tr/yazdir.do?haberno=')

        return url.replace('http://www.zaman.com.tr/newsDetail_getNewsById.action?newsId=', 'http://www.zaman.com.tr/newsDetail_openPrintPage.action?newsId=')
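
For reference, what the new mapping produces (the article id is invented for illustration):

    # Hypothetical example URL, illustration only.
    url = 'http://www.zaman.com.tr/newsDetail_getNewsById.action?newsId=12345'
    print url.replace('http://www.zaman.com.tr/newsDetail_getNewsById.action?newsId=',
                      'http://www.zaman.com.tr/newsDetail_openPrintPage.action?newsId=')
    # -> http://www.zaman.com.tr/newsDetail_openPrintPage.action?newsId=12345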

Binary file not shown.
(image changed; before: 17 KiB, after: 62 KiB)
@ -39,18 +39,6 @@ class Win32(WinBase):
    def msi64(self):
        return installer_name('msi', is64bit=True)

    def sign_msi(self):
        import xattr
        print ('Signing installers ...')
        sign64 = False
        msi64 = self.msi64
        if os.path.exists(msi64) and 'user.signed' not in xattr.list(msi64):
            subprocess.check_call(['scp', msi64, self.VM_NAME +
                ':build/%s/%s'%(__appname__, msi64)])
            sign64 = True
        subprocess.check_call(['ssh', self.VM_NAME, '~/sign.sh'], shell=False)
        return sign64

    def do_dl(self, installer, errmsg):
        subprocess.check_call(('scp',
            '%s:build/%s/%s'%(self.VM_NAME, __appname__, installer), 'dist'))
@ -62,14 +50,8 @@ class Win32(WinBase):
        installer = self.installer()
        if os.path.exists('build/winfrozen'):
            shutil.rmtree('build/winfrozen')
        sign64 = self.sign_msi()
        if sign64:
            self.do_dl(self.msi64, 'Failed to d/l signed 64 bit installer')
            import xattr
            xattr.set(self.msi64, 'user.signed', 'true')

        self.do_dl(installer, 'Failed to freeze')

        installer = 'dist/%s-portable-installer-%s.exe'%(__appname__, __version__)
        self.do_dl(installer, 'Failed to get portable installer')


@ -91,6 +91,7 @@ class Win32Freeze(Command, WixMixIn):
        if not is64bit:
            self.build_portable()
            self.build_portable_installer()
        self.sign_installers()

    def remove_CRT_from_manifests(self):
        '''
@ -488,6 +489,17 @@ class Win32Freeze(Command, WixMixIn):

        subprocess.check_call([LZMA + r'\bin\elzma.exe', '-9', '--lzip', name])

    def sign_installers(self):
        self.info('Signing installers...')
        files = glob.glob(self.j('dist', '*.msi')) + glob.glob(self.j('dist',
            '*.exe'))
        if not files:
            raise ValueError('No installers found')
        subprocess.check_call(['signtool.exe', 'sign', '/a', '/d',
            'calibre - E-book management', '/du',
            'http://calibre-ebook.com', '/t',
            'http://timestamp.verisign.com/scripts/timstamp.dll'] + files)

    def add_dir_to_zip(self, zf, path, prefix=''):
        '''
        Add a directory recursively to the zip file with an optional prefix.
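
The hunk ends inside the docstring; a sketch of what a helper with this contract typically looks like, assuming the usual os.walk arrangement rather than the actual calibre body:

    # Hypothetical implementation sketch, not the code from this commit.
    # zf is an already-open zipfile.ZipFile.
    import os

    def add_dir_to_zip(zf, path, prefix=''):
        # walk the tree and store each file under prefix/relative-path
        for dirpath, dirnames, filenames in os.walk(path):
            for fname in filenames:
                fpath = os.path.join(dirpath, fname)
                arcname = os.path.join(prefix, os.path.relpath(fpath, path))
                zf.write(fpath, arcname)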

setup/iso_639/ms.po
File diff suppressed because it is too large
@ -148,10 +148,10 @@ def print_basic_debug_info(out=None):
    out = functools.partial(prints, file=out)
    import platform
    from calibre.constants import (__appname__, get_version, isportable, isosx,
            isfrozen)
            isfrozen, is64bit)
    out(__appname__, get_version(), 'Portable' if isportable else '',
            'isfrozen:', isfrozen)
    out(platform.platform(), platform.system())
            'isfrozen:', isfrozen, 'is64bit:', is64bit)
    out(platform.platform(), platform.system(), platform.architecture())
    out(platform.system_alias(platform.system(), platform.release(),
        platform.version()))
    out('Python', platform.python_version())

@ -232,7 +232,7 @@ class ANDROID(USBMS):
            'THINKPAD_TABLET', 'SGH-T989', 'YP-G70', 'STORAGE_DEVICE',
            'ADVANCED', 'SGH-I727', 'USB_FLASH_DRIVER', 'ANDROID',
            'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E',
            'NOVO7', 'MB526', '_USB#WYK7MSF8KE']
            'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC']
    WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
            'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
            'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
@ -243,7 +243,7 @@ class ANDROID(USBMS):
            'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0', 'XT875',
            'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727',
            'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E',
            'NOVO7', 'ADVANCED']
            'NOVO7', 'ADVANCED', 'TABLET_PC']

    OSX_MAIN_MEM = 'Android Device Main Memory'


@ -20,6 +20,7 @@ from calibre.utils.config import config_dir, dynamic, prefs
from calibre.utils.date import now, parse_date
from calibre.utils.zipfile import ZipFile


def strftime(fmt='%Y/%m/%d %H:%M:%S', dt=None):

    if not hasattr(dt, 'timetuple'):
@ -38,6 +39,7 @@ def logger():
        _log = ThreadSafeLog()
    return _log


class AppleOpenFeedback(OpenFeedback):

    def __init__(self, plugin):
@ -102,6 +104,7 @@ class AppleOpenFeedback(OpenFeedback):

        return Dialog(parent, self)


class DriverBase(DeviceConfig, DevicePlugin):
    # Needed for config_widget to work
    FORMATS = ['epub', 'pdf']
@ -133,11 +136,11 @@ class DriverBase(DeviceConfig, DevicePlugin):
        False,
    ]


    @classmethod
    def _config_base_name(cls):
        return 'iTunes'


class ITUNES(DriverBase):
    '''
    Calling sequences:
@ -148,6 +151,8 @@ class ITUNES(DriverBase):
        open()
        card_prefix()
        can_handle()
        _launch_iTunes()
        _discover_manual_sync_mode()
        set_progress_reporter()
        get_device_information()
        card_prefix()
@ -156,6 +161,7 @@ class ITUNES(DriverBase):
        can_handle()
        set_progress_reporter()
        books() (once for each storage point)
        (create self.cached_books)
        settings()
        settings()
        can_handle() (~1x per second OSX while idle)
@ -186,14 +192,14 @@ class ITUNES(DriverBase):
        free_space()
    '''

    name = 'Apple device interface'
    name = 'Apple iTunes interface'
    gui_name = _('Apple device')
    icon = I('devices/ipad.png')
    description = _('Communicate with iTunes/iBooks.')
    supported_platforms = ['osx', 'windows']
    author = 'GRiker'
    #: The version of this plugin as a 3-tuple (major, minor, revision)
    version = (1,1,0)
    version = (1, 1, 1)

    DISPLAY_DISABLE_DIALOG = "display_disable_apple_driver_dialog"

@ -203,14 +209,14 @@ class ITUNES(DriverBase):
    USE_ITUNES_STORAGE = 2

    OPEN_FEEDBACK_MESSAGE = _(
        'Apple device detected, launching iTunes, please wait ...')
        'Apple iDevice detected, launching iTunes, please wait ...')
    BACKLOADING_ERROR_MESSAGE = _(
        "Cannot copy books directly from iDevice. "
        "Drag from iTunes Library to desktop, then add to calibre's Library window.")
    UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE = _(
        "Unsupported direct connect mode. "
        "*** Unsupported direct connect mode. "
        "See http://www.mobileread.com/forums/showthread.php?t=118559 "
        "for instructions on using 'Connect to iTunes'")
        "for instructions on using 'Connect to iTunes' ***")
    ITUNES_SANDBOX_LOCKOUT_MESSAGE = _(
        '<p>Unable to communicate with iTunes.</p>'
        '<p>Refer to this '
@ -218,22 +224,9 @@ class ITUNES(DriverBase):
        'for more information.</p>'
        '<p></p>')

    # Product IDs:
    #  0x1291  iPod Touch
    #  0x1293  iPod Touch 2G
    #  0x1299  iPod Touch 3G
    #  0x1292  iPhone 3G
    #  0x1294  iPhone 3GS
    #  0x1297  iPhone 4
    #  0x129a  iPad
    #  0x129f  iPad2 (WiFi)
    #  0x12a0  iPhone 4S (GSM)
    #  0x12a2  iPad2 (GSM)
    #  0x12a3  iPad2 (CDMA)
    #  0x12a6  iPad3 (GSM)
    VENDOR_ID = [0x05ac]
    PRODUCT_ID = [0x1292,0x1293,0x1294,0x1297,0x1299,0x129a,0x129f,0x12a2,0x12a3,0x12a6]
    BCD = [0x01]
    VENDOR_ID = []
    PRODUCT_ID = []
    BCD = []

    # Plugboard ID
    DEVICE_PLUGBOARD_NAME = 'APPLE'
@ -329,7 +322,7 @@ class ITUNES(DriverBase):
        L{books}(oncard='cardb')).
        '''
        if DEBUG:
            logger().info("ITUNES.add_books_to_metadata()")
            logger().info("%s.add_books_to_metadata()" % self.__class__.__name__)

        task_count = float(len(self.update_list))
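
This rename repeats through the rest of the file: hard-coded "ITUNES." prefixes in log lines become self.__class__.__name__, so a subclass reports itself under its own name. The idiom in miniature (the class names here are invented):

    # Hypothetical illustration of the logging rename pattern.
    class Base(object):
        def who(self):
            # resolves at runtime, so subclasses log their own name
            return "%s.add_books_to_metadata()" % self.__class__.__name__

    class AsyncDriver(Base):
        pass

    print AsyncDriver().who()   # -> AsyncDriver.add_books_to_metadata()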
|
||||
|
||||
@ -414,13 +407,13 @@ class ITUNES(DriverBase):
|
||||
"""
|
||||
if not oncard:
|
||||
if DEBUG:
|
||||
logger().info("ITUNES:books():")
|
||||
logger().info("%s.books():" % self.__class__.__name__)
|
||||
if self.settings().extra_customization[self.CACHE_COVERS]:
|
||||
logger().info(" Cover fetching/caching enabled")
|
||||
else:
|
||||
logger().info(" Cover fetching/caching disabled")
|
||||
|
||||
# Fetch a list of books from iPod device connected to iTunes
|
||||
# Fetch a list of books from iDevice connected to iTunes
|
||||
if 'iPod' in self.sources:
|
||||
booklist = BookList(logger())
|
||||
cached_books = {}
|
||||
@ -451,7 +444,8 @@ class ITUNES(DriverBase):
|
||||
|
||||
cached_books[this_book.path] = {
|
||||
'title': book.name(),
|
||||
'author':book.artist().split(' & '),
|
||||
'author': book.artist(),
|
||||
'authors': book.artist().split(' & '),
|
||||
'lib_book': library_books[this_book.path] if this_book.path in library_books else None,
|
||||
'dev_book': book,
|
||||
'uuid': book.composer()
|
||||
@ -491,7 +485,8 @@ class ITUNES(DriverBase):
|
||||
|
||||
cached_books[this_book.path] = {
|
||||
'title': book.Name,
|
||||
'author':book.Artist.split(' & '),
|
||||
'author': book.Artist,
|
||||
'authors': book.Artist.split(' & '),
|
||||
'lib_book': library_books[this_book.path] if this_book.path in library_books else None,
|
||||
'uuid': book.Composer,
|
||||
'format': 'pdf' if book.KindAsString.startswith('PDF') else 'epub'
|
||||
@ -556,7 +551,7 @@ class ITUNES(DriverBase):
|
||||
# We need to know if iTunes sees the iPad
|
||||
# It may have been ejected
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.can_handle()")
|
||||
logger().info("%s.can_handle()" % self.__class__.__name__)
|
||||
|
||||
self._launch_iTunes()
|
||||
self.sources = self._get_sources()
|
||||
@ -567,12 +562,12 @@ class ITUNES(DriverBase):
|
||||
self.sources = self._get_sources()
|
||||
if (not 'iPod' in self.sources) or (self.sources['iPod'] == ''):
|
||||
attempts -= 1
|
||||
time.sleep(0.5)
|
||||
time.sleep(1.0)
|
||||
if DEBUG:
|
||||
logger().warning(" waiting for connected iPad, attempt #%d" % (10 - attempts))
|
||||
logger().warning(" waiting for connected iDevice, attempt #%d" % (10 - attempts))
|
||||
else:
|
||||
if DEBUG:
|
||||
logger().info(' found connected iPad')
|
||||
logger().info(' found connected iDevice')
|
||||
break
|
||||
else:
|
||||
# iTunes running, but not connected iPad
|
||||
@ -613,26 +608,26 @@ class ITUNES(DriverBase):
|
||||
sys.stdout.write('.')
|
||||
sys.stdout.flush()
|
||||
if DEBUG:
|
||||
logger().info('ITUNES.can_handle_windows:\n confirming connected iPad')
|
||||
logger().info("%s.can_handle_windows:\n confirming connected iPad" % self.__class__.__name__)
|
||||
self.ejected = False
|
||||
self._discover_manual_sync_mode()
|
||||
return True
|
||||
else:
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.can_handle_windows():\n device ejected")
|
||||
logger().info("%s.can_handle_windows():\n device ejected" % self.__class__.__name__)
|
||||
self.ejected = True
|
||||
return False
|
||||
except:
|
||||
# iTunes connection failed, probably not running anymore
|
||||
|
||||
logger().error("ITUNES.can_handle_windows():\n lost connection to iTunes")
|
||||
logger().error("%s.can_handle_windows():\n lost connection to iTunes" % self.__class__.__name__)
|
||||
return False
|
||||
finally:
|
||||
pythoncom.CoUninitialize()
|
||||
|
||||
else:
|
||||
if DEBUG:
|
||||
logger().info("ITUNES:can_handle_windows():\n Launching iTunes")
|
||||
logger().info("%s.can_handle_windows():\n Launching iTunes" % self.__class__.__name__)
|
||||
|
||||
try:
|
||||
pythoncom.CoInitialize()
|
||||
@ -645,9 +640,9 @@ class ITUNES(DriverBase):
|
||||
self.sources = self._get_sources()
|
||||
if (not 'iPod' in self.sources) or (self.sources['iPod'] == ''):
|
||||
attempts -= 1
|
||||
time.sleep(0.5)
|
||||
time.sleep(1.0)
|
||||
if DEBUG:
|
||||
logger().warning(" waiting for connected iPad, attempt #%d" % (10 - attempts))
|
||||
logger().warning(" waiting for connected iDevice, attempt #%d" % (10 - attempts))
|
||||
else:
|
||||
if DEBUG:
|
||||
logger().info(' found connected iPad in iTunes')
|
||||
@ -702,7 +697,7 @@ class ITUNES(DriverBase):
|
||||
self.problem_msg = _("Some books not found in iTunes database.\n"
|
||||
"Delete using the iBooks app.\n"
|
||||
"Click 'Show Details' for a list.")
|
||||
logger().info("ITUNES:delete_books()")
|
||||
logger().info("%s.delete_books()" % self.__class__.__name__)
|
||||
for path in paths:
|
||||
if self.cached_books[path]['lib_book']:
|
||||
if DEBUG:
|
||||
@ -731,8 +726,11 @@ class ITUNES(DriverBase):
|
||||
else:
|
||||
if self.manual_sync_mode:
|
||||
metadata = MetaInformation(self.cached_books[path]['title'],
|
||||
[self.cached_books[path]['author']])
|
||||
self.cached_books[path]['authors'])
|
||||
metadata.author = self.cached_books[path]['author']
|
||||
metadata.uuid = self.cached_books[path]['uuid']
|
||||
if not metadata.uuid:
|
||||
metadata.uuid = "unknown"
|
||||
|
||||
if isosx:
|
||||
self._remove_existing_copy(self.cached_books[path], metadata)
|
||||
@ -754,7 +752,7 @@ class ITUNES(DriverBase):
|
||||
are pending GUI jobs that need to communicate with the device.
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info("ITUNES:eject(): ejecting '%s'" % self.sources['iPod'])
|
||||
logger().info("%s:eject(): ejecting '%s'" % (self.__class__.__name__, self.sources['iPod']))
|
||||
if isosx:
|
||||
self.iTunes.eject(self.sources['iPod'])
|
||||
elif iswindows:
|
||||
@ -785,7 +783,7 @@ class ITUNES(DriverBase):
|
||||
In Windows, a sync-in-progress blocks this call until sync is complete
|
||||
"""
|
||||
if DEBUG:
|
||||
logger().info("ITUNES:free_space()")
|
||||
logger().info("%s.free_space()" % self.__class__.__name__)
|
||||
|
||||
free_space = 0
|
||||
if isosx:
|
||||
@ -818,9 +816,9 @@ class ITUNES(DriverBase):
|
||||
@return: (device name, device version, software version on device, mime type)
|
||||
"""
|
||||
if DEBUG:
|
||||
logger().info("ITUNES:get_device_information()")
|
||||
logger().info("%s.get_device_information()" % self.__class__.__name__)
|
||||
|
||||
return (self.sources['iPod'],'hw v1.0','sw v1.0', 'mime type normally goes here')
|
||||
return (self.sources['iPod'], 'hw v1.0', 'sw v1.0', 'unknown mime type')
|
||||
|
||||
def get_file(self, path, outfile, end_session=True):
|
||||
'''
|
||||
@ -828,7 +826,7 @@ class ITUNES(DriverBase):
|
||||
@param outfile: file object like C{sys.stdout} or the result of an C{open} call
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.get_file(): exporting '%s'" % path)
|
||||
logger().info("%s.get_file(): exporting '%s'" % (self.__class__.__name__, path))
|
||||
|
||||
try:
|
||||
outfile.write(open(self.cached_books[path]['lib_book'].location().path).read())
|
||||
@ -859,15 +857,32 @@ class ITUNES(DriverBase):
|
||||
raise OpenFeedback(self.ITUNES_SANDBOX_LOCKOUT_MESSAGE)
|
||||
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.open(connected_device: %s)" % repr(connected_device))
|
||||
vendor_id = "0x%x" % connected_device[0]
|
||||
product_id = "0x%x" % connected_device[1]
|
||||
bcd = "0x%x" % connected_device[2]
|
||||
mfg = connected_device[3]
|
||||
model = connected_device[4]
|
||||
logger().info("%s.open(MFG: %s, VENDOR_ID: %s, MODEL: %s, BCD: %s, PRODUCT_ID: %s)" %
|
||||
(self.__class__.__name__,
|
||||
mfg,
|
||||
vendor_id,
|
||||
model,
|
||||
bcd,
|
||||
product_id
|
||||
))
|
||||
|
||||
if False:
|
||||
# Display a dialog recommending using 'Connect to iTunes' if user hasn't
|
||||
# previously disabled the dialog
|
||||
if dynamic.get(confirm_config_name(self.DISPLAY_DISABLE_DIALOG), True):
|
||||
raise AppleOpenFeedback(self)
|
||||
else:
|
||||
if DEBUG:
|
||||
logger().warning(" %s" % self.UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE)
|
||||
logger().info(" %s" % self.UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE)
|
||||
|
||||
# Log supported DEVICE_IDs and BCDs
|
||||
logger().info(" BCD: %s" % ['0x%x' % x for x in sorted(self.BCD)])
|
||||
logger().info(" PRODUCT_ID: %s" % ['0x%x' % x for x in sorted(self.PRODUCT_ID)])
|
||||
|
||||
# Confirm/create thumbs archive
|
||||
if not os.path.exists(self.cache_dir):
|
||||
@ -908,14 +923,14 @@ class ITUNES(DriverBase):
|
||||
as uuids are different
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.remove_books_from_metadata()")
|
||||
logger().info("%s.remove_books_from_metadata()" % self.__class__.__name__)
|
||||
for path in paths:
|
||||
if DEBUG:
|
||||
self._dump_cached_book(self.cached_books[path], indent=2)
|
||||
logger().info(" looking for '%s' by '%s' uuid:%s" %
|
||||
(self.cached_books[path]['title'],
|
||||
self.cached_books[path]['author'],
|
||||
self.cached_books[path]['uuid']))
|
||||
repr(self.cached_books[path]['uuid'])))
|
||||
|
||||
# Purge the booklist, self.cached_books, thumb cache
|
||||
for i, bl_book in enumerate(booklists[0]):
|
||||
@ -924,24 +939,28 @@ class ITUNES(DriverBase):
|
||||
(bl_book.title, bl_book.author, bl_book.uuid))
|
||||
|
||||
found = False
|
||||
if bl_book.uuid == self.cached_books[path]['uuid']:
|
||||
if False:
|
||||
logger().info(" matched with uuid")
|
||||
if bl_book.uuid and bl_book.uuid == self.cached_books[path]['uuid']:
|
||||
if True:
|
||||
logger().info(" --matched uuid")
|
||||
booklists[0].pop(i)
|
||||
found = True
|
||||
elif bl_book.title == self.cached_books[path]['title'] and \
|
||||
bl_book.author[0] == self.cached_books[path]['author']:
|
||||
if False:
|
||||
logger().info(" matched with title + author")
|
||||
bl_book.author == self.cached_books[path]['author']:
|
||||
if True:
|
||||
logger().info(" --matched title + author")
|
||||
booklists[0].pop(i)
|
||||
found = True
|
||||
|
||||
if found:
|
||||
# Remove from self.cached_books
|
||||
for cb in self.cached_books:
|
||||
if self.cached_books[cb]['uuid'] == self.cached_books[path]['uuid']:
|
||||
if (self.cached_books[cb]['uuid'] == self.cached_books[path]['uuid'] and
|
||||
self.cached_books[cb]['author'] == self.cached_books[path]['author'] and
|
||||
self.cached_books[cb]['title'] == self.cached_books[path]['title']):
|
||||
self.cached_books.pop(cb)
|
||||
break
|
||||
else:
|
||||
logger().error(" '%s' not found in self.cached_books" % self.cached_books[path]['title'])
|
||||
|
||||
# Remove from thumb from thumb cache
|
||||
thumb_path = path.rpartition('.')[0] + '.jpg'
|
||||
@ -964,7 +983,9 @@ class ITUNES(DriverBase):
|
||||
else:
|
||||
if DEBUG:
|
||||
logger().error(" unable to find '%s' by '%s' (%s)" %
|
||||
(bl_book.title, bl_book.author,bl_book.uuid))
|
||||
(self.cached_books[path]['title'],
|
||||
self.cached_books[path]['author'],
|
||||
self.cached_books[path]['uuid']))
|
||||
|
||||
if False:
|
||||
self._dump_booklist(booklists[0], indent=2)
|
||||
@ -982,7 +1003,7 @@ class ITUNES(DriverBase):
|
||||
:detected_device: Device information from the device scanner
|
||||
"""
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.reset()")
|
||||
logger().info("%s.reset()" % self.__class__.__name__)
|
||||
if report_progress:
|
||||
self.set_progress_reporter(report_progress)
|
||||
|
||||
@ -994,7 +1015,7 @@ class ITUNES(DriverBase):
|
||||
task does not have any progress information
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.set_progress_reporter()")
|
||||
logger().info("%s.set_progress_reporter()" % self.__class__.__name__)
|
||||
|
||||
self.report_progress = report_progress
|
||||
|
||||
@ -1002,11 +1023,15 @@ class ITUNES(DriverBase):
|
||||
# This method is called with the plugboard that matches the format
|
||||
# declared in use_plugboard_ext and a device name of ITUNES
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.set_plugboard()")
|
||||
logger().info("%s.set_plugboard()" % self.__class__.__name__)
|
||||
#logger().info(' plugboard: %s' % plugboards)
|
||||
self.plugboards = plugboards
|
||||
self.plugboard_func = pb_func
|
||||
|
||||
def shutdown(self):
|
||||
if DEBUG:
|
||||
logger().info("%s.shutdown()\n" % self.__class__.__name__)
|
||||
|
||||
def sync_booklists(self, booklists, end_session=True):
|
||||
'''
|
||||
Update metadata on device.
|
||||
@ -1016,7 +1041,7 @@ class ITUNES(DriverBase):
|
||||
'''
|
||||
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.sync_booklists()")
|
||||
logger().info("%s.sync_booklists()" % self.__class__.__name__)
|
||||
|
||||
if self.update_needed:
|
||||
if DEBUG:
|
||||
@ -1043,7 +1068,7 @@ class ITUNES(DriverBase):
|
||||
particular device doesn't have any of these locations it should return 0.
|
||||
"""
|
||||
if DEBUG:
|
||||
logger().info("ITUNES:total_space()")
|
||||
logger().info("%s.total_space()" % self.__class__.__name__)
|
||||
capacity = 0
|
||||
if isosx:
|
||||
if 'iPod' in self.sources:
|
||||
@ -1081,7 +1106,7 @@ class ITUNES(DriverBase):
|
||||
"Click 'Show Details' for a list.")
|
||||
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.upload_books()")
|
||||
logger().info("%s.upload_books()" % self.__class__.__name__)
|
||||
|
||||
if isosx:
|
||||
for (i, fpath) in enumerate(files):
|
||||
@ -1098,7 +1123,7 @@ class ITUNES(DriverBase):
|
||||
|
||||
# Add new_book to self.cached_books
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.upload_books()")
|
||||
logger().info("%s.upload_books()" % self.__class__.__name__)
|
||||
logger().info(" adding '%s' by '%s' uuid:%s to self.cached_books" %
|
||||
(metadata[i].title,
|
||||
authors_to_string(metadata[i].authors),
|
||||
@ -1144,7 +1169,7 @@ class ITUNES(DriverBase):
|
||||
|
||||
# Add new_book to self.cached_books
|
||||
if DEBUG:
|
||||
logger().info("ITUNES.upload_books()")
|
||||
logger().info("%s.upload_books()" % self.__class__.__name__)
|
||||
logger().info(" adding '%s' by '%s' uuid:%s to self.cached_books" %
|
||||
(metadata[i].title,
|
||||
authors_to_string(metadata[i].authors),
|
||||
@ -1182,7 +1207,7 @@ class ITUNES(DriverBase):
|
||||
'''
|
||||
assumes pythoncom wrapper for windows
|
||||
'''
|
||||
logger().info(" ITUNES._add_device_book()")
|
||||
logger().info(" %s._add_device_book()" % self.__class__.__name__)
|
||||
if isosx:
|
||||
import appscript
|
||||
if 'iPod' in self.sources:
|
||||
@ -1292,7 +1317,7 @@ class ITUNES(DriverBase):
|
||||
windows assumes pythoncom wrapper
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info(" ITUNES._add_library_book()")
|
||||
logger().info(" %s._add_library_book()" % self.__class__.__name__)
|
||||
if isosx:
|
||||
import appscript
|
||||
added = self.iTunes.add(appscript.mactypes.File(file))
|
||||
@ -1360,7 +1385,7 @@ class ITUNES(DriverBase):
|
||||
fp = cached_book['lib_book'].Location
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info(" ITUNES._add_new_copy()")
|
||||
logger().info(" %s._add_new_copy()" % self.__class__.__name__)
|
||||
|
||||
if fpath.rpartition('.')[2].lower() == 'epub':
|
||||
self._update_epub_metadata(fpath, metadata)
|
||||
@ -1399,7 +1424,7 @@ class ITUNES(DriverBase):
|
||||
from PIL import Image as PILImage
|
||||
|
||||
if DEBUG:
|
||||
logger().info(" ITUNES._cover_to_thumb()")
|
||||
logger().info(" %s._cover_to_thumb()" % self.__class__.__name__)
|
||||
|
||||
thumb = None
|
||||
if metadata.cover:
|
||||
@ -1526,7 +1551,7 @@ class ITUNES(DriverBase):
|
||||
'''
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info(" ITUNES._create_new_book()")
|
||||
logger().info(" %s._create_new_book()" % self.__class__.__name__)
|
||||
|
||||
this_book = Book(metadata.title, authors_to_string(metadata.authors))
|
||||
this_book.datetime = time.gmtime()
|
||||
@ -1575,7 +1600,7 @@ class ITUNES(DriverBase):
|
||||
wait is passed when launching iTunes, as it seems to need a moment to come to its senses
|
||||
'''
|
||||
if DEBUG:
|
||||
logger().info(" ITUNES._discover_manual_sync_mode()")
|
||||
logger().info(" %s._discover_manual_sync_mode()" % self.__class__.__name__)
|
||||
if wait:
|
||||
time.sleep(wait)
|
||||
if isosx:
|
||||
@ -1593,7 +1618,7 @@ class ITUNES(DriverBase):
|
||||
if dev_books is not None and len(dev_books):
|
||||
first_book = dev_books[0]
|
||||
if False:
|
||||
logger().info(" determing manual mode by modifying '%s' by %s" % (first_book.name(), first_book.artist()))
|
||||
logger().info(" determining manual mode by modifying '%s' by %s" % (first_book.name(), first_book.artist()))
|
||||
try:
|
||||
first_book.bpm.set(0)
|
||||
self.manual_sync_mode = True
|
||||
@ -1655,8 +1680,8 @@ class ITUNES(DriverBase):
|
||||
|
||||
for book in booklist:
|
||||
if isosx:
|
||||
logger().info("%s%-40.40s %-30.30s %-10.10s %s" %
|
||||
(' '*indent,book.title, book.author, str(book.library_id)[-9:], book.uuid))
|
||||
logger().info("%s%-40.40s %-30.30s %-40.40s %-10.10s" %
|
||||
(' ' * indent, book.title, book.author, book.uuid, str(book.library_id)[-9:]))
|
||||
elif iswindows:
|
||||
logger().info("%s%-40.40s %-30.30s" %
|
||||
(' ' * indent, book.title, book.author))
|
||||
@ -1705,13 +1730,14 @@ class ITUNES(DriverBase):
|
||||
logger().info("%s%s" % (' ' * indent, '-' * len(msg)))
|
||||
if isosx:
|
||||
for cb in self.cached_books.keys():
|
||||
logger().info("%s%-40.40s %-30.30s %-10.10s %-10.10s %s" %
|
||||
logger().info("%s%-40.40s %-30.30s %-40.40s %-10.10s %-10.10s" %
|
||||
(' ' * indent,
|
||||
self.cached_books[cb]['title'],
|
||||
self.cached_books[cb]['author'],
|
||||
self.cached_books[cb]['uuid'],
|
||||
str(self.cached_books[cb]['lib_book'])[-9:],
|
||||
str(self.cached_books[cb]['dev_book'])[-9:],
|
||||
self.cached_books[cb]['uuid']))
|
||||
))
|
||||
elif iswindows:
|
||||
for cb in self.cached_books.keys():
|
||||
logger().info("%s%-40.40s %-30.30s %-4.4s %s" %
|
||||
@ -1728,7 +1754,7 @@ class ITUNES(DriverBase):
|
||||
'''
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
logger().info(" ITUNES.__get_epub_metadata()")
|
||||
logger().info(" %s.__get_epub_metadata()" % self.__class__.__name__)
|
||||
title = None
|
||||
author = None
|
||||
timestamp = None
|
||||
@ -1760,7 +1786,8 @@ class ITUNES(DriverBase):
|
||||
'''
|
||||
'''
|
||||
FILTER = ''.join([(len(repr(chr(x))) == 3) and chr(x) or '.' for x in range(256)])
|
||||
N=0; result=''
|
||||
N = 0
|
||||
result = ''
|
||||
while src:
|
||||
s, src = src[:length], src[length:]
|
||||
hexa = ' '.join(["%02X" % ord(x) for x in s])
|
||||
@@ -1806,7 +1833,7 @@ class ITUNES(DriverBase):
         if iswindows:
             dev_books = self._get_device_books_playlist()
         if DEBUG:
-            logger().info(" ITUNES._find_device_book()")
+            logger().info(" %s._find_device_book()" % self.__class__.__name__)
             logger().info(" searching for '%s' by '%s' (%s)" %
                           (search['title'], search['author'], search['uuid']))
         attempts = 9
@@ -1876,7 +1903,7 @@ class ITUNES(DriverBase):
         '''
         if iswindows:
             if DEBUG:
-                logger().info(" ITUNES._find_library_book()")
+                logger().info(" %s._find_library_book()" % self.__class__.__name__)
             '''
             if 'uuid' in search:
                 logger().info(" looking for '%s' by %s (%s)" %
@@ -1909,7 +1936,6 @@ class ITUNES(DriverBase):
                 if DEBUG:
                     logger().error(" no Books playlist found")

-
             attempts = 9
             while attempts:
                 # Find book whose Album field = search['uuid']
@@ -1996,7 +2022,8 @@ class ITUNES(DriverBase):
                     thumb_data = zfr.read(thumb_path)
                     if thumb_data == 'None':
                         if False:
-                            logger().info(" ITUNES._generate_thumbnail()\n returning None from cover cache for '%s'" % title)
+                            logger().info(" %s._generate_thumbnail()\n returning None from cover cache for '%s'" %
+                                          (self.__class__.__name__, title))
                         zfr.close()
                         return None
                 except:
@@ -2007,7 +2034,7 @@ class ITUNES(DriverBase):
                 return thumb_data

         if DEBUG:
-            logger().info(" ITUNES._generate_thumbnail('%s'):" % title)
+            logger().info(" %s._generate_thumbnail('%s'):" % (self.__class__.__name__, title))
         if isosx:

             # Fetch the artwork from iTunes
@@ -2049,7 +2076,6 @@ class ITUNES(DriverBase):

             return thumb_data

-
         elif iswindows:
             if not book.Artwork.Count:
                 if DEBUG:
@@ -2101,7 +2127,7 @@ class ITUNES(DriverBase):
             for file in myZipList:
                 exploded_file_size += file.file_size
             if False:
-                logger().info(" ITUNES._get_device_book_size()")
+                logger().info(" %s._get_device_book_size()" % self.__class__.__name__)
                 logger().info(" %d items in archive" % len(myZipList))
                 logger().info(" compressed: %d exploded: %d" % (compressed_size, exploded_file_size))
             myZip.close()
@@ -2112,7 +2138,7 @@ class ITUNES(DriverBase):
         Assumes pythoncom wrapper for Windows
         '''
         if DEBUG:
-            logger().info("\n ITUNES._get_device_books()")
+            logger().info("\n %s._get_device_books()" % self.__class__.__name__)

         device_books = []
         if isosx:
@@ -2131,14 +2157,13 @@ class ITUNES(DriverBase):
                     logger().error(" book_playlist not found")

             for book in dev_books:
                 # This may need additional entries for international iTunes users
                 if book.kind() in self.Audiobooks:
                     if DEBUG:
                         logger().info(" ignoring '%s' of type '%s'" % (book.name(), book.kind()))
                 else:
                     if DEBUG:
-                        logger().info(" %-30.30s %-30.30s %-40.40s [%s]" %
-                                      (book.name(), book.artist(), book.album(), book.kind()))
+                        logger().info(" %-40.40s %-30.30s %-40.40s [%s]" %
+                                      (book.name(), book.artist(), book.composer(), book.kind()))
                     device_books.append(book)
             if DEBUG:
                 logger().info()
@@ -2165,13 +2190,12 @@ class ITUNES(DriverBase):
                     logger().info(" no Books playlist found")

             for book in dev_books:
                 # This may need additional entries for international iTunes users
                 if book.KindAsString in self.Audiobooks:
                     if DEBUG:
                         logger().info(" ignoring '%s' of type '%s'" % (book.Name, book.KindAsString))
                 else:
                     if DEBUG:
-                        logger().info(" %-30.30s %-30.30s %-40.40s [%s]" % (book.Name, book.Artist, book.Album, book.KindAsString))
+                        logger().info(" %-40.40s %-30.30s %-40.40s [%s]" % (book.Name, book.Artist, book.Composer, book.KindAsString))
                     device_books.append(book)
             if DEBUG:
                 logger().info()
@@ -2206,7 +2230,7 @@ class ITUNES(DriverBase):
         Windows assumes pythoncom wrapper
         '''
         if DEBUG:
-            logger().info("\n ITUNES._get_library_books()")
+            logger().info("\n %s._get_library_books()" % self.__class__.__name__)

         library_books = {}
         library_orphans = {}
@@ -2317,6 +2341,7 @@ class ITUNES(DriverBase):
             except:
                 if DEBUG:
                     logger().info(" no books in library")
+
         self.library_orphans = library_orphans
         return library_books

@@ -2381,7 +2406,7 @@ class ITUNES(DriverBase):
         '''
         '''
         if DEBUG:
-            logger().info(" ITUNES:_launch_iTunes():\n Instantiating iTunes")
+            logger().info(" %s._launch_iTunes():\n Instantiating iTunes" % self.__class__.__name__)

         if isosx:
             import appscript
@@ -2394,12 +2419,13 @@ class ITUNES(DriverBase):
             running_apps = appscript.app('System Events')
             if not 'iTunes' in running_apps.processes.name():
                 if DEBUG:
-                    logger().info("ITUNES:_launch_iTunes(): Launching iTunes")
+                    logger().info("%s:_launch_iTunes(): Launching iTunes" % self.__class__.__name__)
                 try:
                     self.iTunes = iTunes = appscript.app('iTunes', hide=True)
                 except:
                     self.iTunes = None
-                    raise UserFeedback(' ITUNES._launch_iTunes(): unable to find installed iTunes', details=None, level=UserFeedback.WARN)
+                    raise UserFeedback(' %s._launch_iTunes(): unable to find installed iTunes' %
+                                       self.__class__.__name__, details=None, level=UserFeedback.WARN)

                 iTunes.run()
                 self.initial_status = 'launched'
@@ -2444,10 +2470,10 @@ class ITUNES(DriverBase):

             if DEBUG:
                 logger().info(" %s %s" % (__appname__, __version__))
-                logger().info(" [OSX %s, %s %s (%s), driver version %d.%d.%d]" %
+                logger().info(" [OSX %s, %s %s (%s), %s driver version %d.%d.%d]" %
                               (platform.mac_ver()[0],
                                self.iTunes.name(), self.iTunes.version(), self.initial_status,
-                               self.version[0],self.version[1],self.version[2]))
+                               self.__class__.__name__, self.version[0], self.version[1], self.version[2]))
                 logger().info(" communicating with iTunes via %s %s using %s binding" % (as_name, as_version, as_binding))
                 logger().info(" calibre_library_path: %s" % self.calibre_library_path)

@@ -2474,7 +2500,8 @@ class ITUNES(DriverBase):
                 self.iTunes = win32com.client.Dispatch("iTunes.Application")
             except:
                 self.iTunes = None
-                raise UserFeedback(' ITUNES._launch_iTunes(): unable to find installed iTunes', details=None, level=UserFeedback.WARN)
+                raise UserFeedback(' %s._launch_iTunes(): unable to find installed iTunes'
+                                   % self.__class__.__name__, details=None, level=UserFeedback.WARN)

             if not DEBUG:
                 self.iTunes.Windows[0].Minimized = True
@@ -2524,8 +2551,10 @@ class ITUNES(DriverBase):
         Remove any iTunes orphans originally added by calibre
         This occurs when the user deletes a book in iBooks while disconnected
         '''
+        PURGE_ORPHANS = False
+
         if DEBUG:
-            logger().info(" ITUNES._purge_orphans()")
+            logger().info(" %s._purge_orphans()" % self.__class__.__name__)
             #self._dump_library_books(library_books)
             #logger().info(" cached_books:\n %s" % "\n ".join(cached_books.keys()))

@@ -2533,45 +2562,48 @@ class ITUNES(DriverBase):
             if isosx:
                 if book not in cached_books and \
                    str(library_books[book].description()).startswith(self.description_prefix):
+                    if PURGE_ORPHANS:
                         if DEBUG:
                             logger().info(" '%s' not found on iDevice, removing from iTunes" % book)
-                        btr = { 'title':library_books[book].name(),
+                        btr = {
+                            'title': library_books[book].name(),
                             'author': library_books[book].artist(),
                             'lib_book': library_books[book]}
                         self._remove_from_iTunes(btr)
+                    else:
+                        if DEBUG:
+                            logger().info(" '%s' found in iTunes, but not on iDevice" % (book))

             elif iswindows:
                 if book not in cached_books and \
                    library_books[book].Description.startswith(self.description_prefix):
+                    if PURGE_ORPHANS:
                         if DEBUG:
                             logger().info(" '%s' not found on iDevice, removing from iTunes" % book)
-                        btr = { 'title':library_books[book].Name,
+                        btr = {
+                            'title': library_books[book].Name,
                             'author': library_books[book].Artist,
                             'lib_book': library_books[book]}
                         self._remove_from_iTunes(btr)
+                    else:
+                        if DEBUG:
+                            logger().info()
+                            logger().info(" '%s' found in iTunes, but not on iDevice" % (book))

     def _remove_existing_copy(self, path, metadata):
         '''
         '''
         if DEBUG:
-            logger().info(" ITUNES._remove_existing_copy()")
+            logger().info(" %s._remove_existing_copy()" % self.__class__.__name__)

         if self.manual_sync_mode:
             # Delete existing from Device|Books, add to self.update_list
             # for deletion from booklist[0] during add_books_to_metadata
             for book in self.cached_books:
-                if self.cached_books[book]['uuid'] == metadata.uuid or \
-                   (self.cached_books[book]['title'] == metadata.title and \
-                   self.cached_books[book]['author'] == authors_to_string(metadata.authors)):
+                if (self.cached_books[book]['uuid'] == metadata.uuid or
+                    (self.cached_books[book]['title'] == metadata.title and
+                     self.cached_books[book]['author'] == metadata.author)):
                     self.update_list.append(self.cached_books[book])

                     if DEBUG:
                         logger().info(" deleting device book '%s'" % (metadata.title))
                     self._remove_from_device(self.cached_books[book])

                     if DEBUG:
                         logger().info(" deleting library book '%s'" % metadata.title)
                     self._remove_from_iTunes(self.cached_books[book])
                     break
             else:
@@ -2581,9 +2613,9 @@ class ITUNES(DriverBase):
             # Delete existing from Library|Books, add to self.update_list
             # for deletion from booklist[0] during add_books_to_metadata
             for book in self.cached_books:
-                if self.cached_books[book]['uuid'] == metadata.uuid or \
+                if (self.cached_books[book]['uuid'] == metadata.uuid or
                    (self.cached_books[book]['title'] == metadata.title and \
-                   self.cached_books[book]['author'] == authors_to_string(metadata.authors)):
+                     self.cached_books[book]['author'] == metadata.author)):
                     self.update_list.append(self.cached_books[book])
                     if DEBUG:
                         logger().info(" deleting library book '%s'" % metadata.title)
@@ -2598,7 +2630,7 @@ class ITUNES(DriverBase):
         Windows assumes pythoncom wrapper
         '''
         if DEBUG:
-            logger().info(" ITUNES._remove_from_device()")
+            logger().info(" %s._remove_from_device()" % self.__class__.__name__)
         if isosx:
             if DEBUG:
                 logger().info(" deleting '%s' from iDevice" % cached_book['title'])
@@ -2622,7 +2654,7 @@ class ITUNES(DriverBase):
         iTunes does not delete books from storage when removing from database via automation
         '''
         if DEBUG:
-            logger().info(" ITUNES._remove_from_iTunes():")
+            logger().info(" %s._remove_from_iTunes():" % self.__class__.__name__)

         if isosx:
             ''' Manually remove the book from iTunes storage '''
@@ -2664,7 +2696,8 @@ class ITUNES(DriverBase):
             except:
                 # We get here if there was an error with .location().path
                 if DEBUG:
-                    logger().info(" '%s' not found in iTunes storage" % cached_book['title'])
+                    logger().info(" '%s' by %s not found in iTunes storage" %
+                                  (cached_book['title'], cached_book['author']))

             # Delete the book from the iTunes database
             try:
@@ -2739,7 +2772,7 @@ class ITUNES(DriverBase):
         from lxml import etree

         if DEBUG:
-            logger().info(" ITUNES._update_epub_metadata()")
+            logger().info(" %s._update_epub_metadata()" % self.__class__.__name__)

         # Fetch plugboard updates
         metadata_x = self._xform_metadata_via_plugboard(metadata, 'epub')
@@ -2807,7 +2840,7 @@ class ITUNES(DriverBase):
         Trigger a sync, wait for completion
         '''
         if DEBUG:
-            logger().info(" ITUNES:_update_device():\n %s" % msg)
+            logger().info(" %s:_update_device():\n %s" % (self.__class__.__name__, msg))

         if isosx:
             self.iTunes.update()
@@ -2855,7 +2888,7 @@ class ITUNES(DriverBase):
         '''
         '''
         if DEBUG:
-            logger().info(" ITUNES._update_iTunes_metadata()")
+            logger().info(" %s._update_iTunes_metadata()" % self.__class__.__name__)

         STRIP_TAGS = re.compile(r'<[^<]*?/?>')

@@ -2907,7 +2940,7 @@ class ITUNES(DriverBase):
         # If title_sort applied in plugboard, that overrides using series/index as title_sort
         if metadata_x.series and self.settings().extra_customization[self.USE_SERIES_AS_CATEGORY]:
             if DEBUG:
-                logger().info(" ITUNES._update_iTunes_metadata()")
+                logger().info(" %s._update_iTunes_metadata()" % self.__class__.__name__)
                 logger().info(" using Series name '%s' as Genre" % metadata_x.series)

         # Format the index as a sort key
@@ -2949,7 +2982,6 @@ class ITUNES(DriverBase):
                         db_added.genre.set(tag)
                         break

-
             elif metadata_x.tags is not None:
                 if DEBUG:
                     logger().info(" %susing Tag as Genre" %
@@ -3089,8 +3121,7 @@ class ITUNES(DriverBase):
         Ensure iDevice metadata is writable. Direct connect mode only
         '''
         if DEBUG:
-            logger().info(" ITUNES._wait_for_writable_metadata()")
-            logger().warning(" %s" % self.UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE)
+            logger().info(" %s._wait_for_writable_metadata()" % self.__class__.__name__)

         attempts = 9
         while attempts:
@@ -3113,7 +3144,7 @@ class ITUNES(DriverBase):
     def _xform_metadata_via_plugboard(self, book, format):
         ''' Transform book metadata from plugboard templates '''
         if DEBUG:
-            logger().info(" ITUNES._xform_metadata_via_plugboard()")
+            logger().info(" %s._xform_metadata_via_plugboard()" % self.__class__.__name__)

         if self.plugboard_func:
             pb = self.plugboard_func(self.DEVICE_PLUGBOARD_NAME, format, self.plugboards)
@@ -3143,6 +3174,7 @@ class ITUNES(DriverBase):
             newmi = book
         return newmi

+
 class ITUNES_ASYNC(ITUNES):
     '''
     This subclass allows the user to interact directly with iTunes via a menu option
@@ -3160,7 +3192,7 @@ class ITUNES_ASYNC(ITUNES):

     def __init__(self, path):
         if DEBUG:
-            logger().info("ITUNES_ASYNC:__init__()")
+            logger().info("%s.__init__()" % self.__class__.__name__)

         try:
             import appscript
@@ -3210,7 +3242,7 @@ class ITUNES_ASYNC(ITUNES):
         """
         if not oncard:
             if DEBUG:
-                logger().info("ITUNES_ASYNC:books()")
+                logger().info("%s.books()" % self.__class__.__name__)
                 if self.settings().extra_customization[self.CACHE_COVERS]:
                     logger().info(" Cover fetching/caching enabled")
                 else:
@@ -3324,7 +3356,7 @@ class ITUNES_ASYNC(ITUNES):
         are pending GUI jobs that need to communicate with the device.
         '''
         if DEBUG:
-            logger().info("ITUNES_ASYNC:eject()")
+            logger().info("%s.eject()" % self.__class__.__name__)
         self.iTunes = None
         self.connected = False

@@ -3339,7 +3371,7 @@ class ITUNES_ASYNC(ITUNES):
         particular device doesn't have any of these locations it should return -1.
         """
         if DEBUG:
-            logger().info("ITUNES_ASYNC:free_space()")
+            logger().info("%s.free_space()" % self.__class__.__name__)
         free_space = 0
         if isosx:
             s = os.statvfs(os.sep)
@@ -3356,7 +3388,7 @@ class ITUNES_ASYNC(ITUNES):
         @return: (device name, device version, software version on device, mime type)
         """
         if DEBUG:
-            logger().info("ITUNES_ASYNC:get_device_information()")
+            logger().info("%s.get_device_information()" % self.__class__.__name__)

         return ('iTunes', 'hw v1.0', 'sw v1.0', 'mime type normally goes here')

@@ -3382,7 +3414,8 @@ class ITUNES_ASYNC(ITUNES):
             raise OpenFeedback(self.ITUNES_SANDBOX_LOCKOUT_MESSAGE)

         if DEBUG:
-            logger().info("ITUNES_ASYNC.open(connected_device: %s)" % repr(connected_device))
+            logger().info("%s.open(connected_device: %s)" %
+                          (self.__class__.__name__, repr(connected_device)))

         # Confirm/create thumbs archive
         if not os.path.exists(self.cache_dir):
@@ -3419,7 +3452,7 @@ class ITUNES_ASYNC(ITUNES):
         '''

         if DEBUG:
-            logger().info("ITUNES_ASYNC.sync_booklists()")
+            logger().info("%s.sync_booklists()" % self.__class__.__name__)

         # Inform user of any problem books
         if self.problem_titles:
@@ -3433,9 +3466,10 @@ class ITUNES_ASYNC(ITUNES):
         '''
         '''
         if DEBUG:
-            logger().info("ITUNES_ASYNC:unmount_device()")
+            logger().info("%s.unmount_device()" % self.__class__.__name__)
         self.connected = False

+
 class BookList(list):
     '''
     A list of books. Each Book object must have the fields:
@@ -3488,6 +3522,7 @@ class BookList(list):
         '''
         return {}

+
 class Book(Metadata):
     '''
     A simple class describing a book in the iTunes Books Library.
@@ -3495,9 +3530,9 @@ class Book(Metadata):
     '''
     def __init__(self, title, author):
         Metadata.__init__(self, title, authors=author.split(' & '))
         self.author = author
         self.author_sort = author_to_author_sort(author)

     @property
     def title_sorter(self):
         return title_sort(self.title)
@@ -2357,6 +2357,8 @@ class KOBOTOUCH(KOBO):
                     update_query = 'UPDATE content SET Series=?, SeriesNumber==? where BookID is Null and ContentID = ?'
                     if book.series is None:
                         update_values = (None, None, book.contentID, )
+                    elif book.series_index is None: # This should never happen, but...
+                        update_values = (book.series, None, book.contentID, )
                     else:
                         update_values = (book.series, "%g"%book.series_index, book.contentID, )
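A hedged sketch of what the new `elif` branch guards against, using an in-memory SQLite table standing in for Kobo's much larger `content` table (the toy schema is illustrative; also note the commit's query string reads `SeriesNumber==?`, while the sketch below uses a standard single `=`):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE content (ContentID TEXT, Series TEXT, SeriesNumber TEXT)')
    conn.execute("INSERT INTO content VALUES ('book-1', NULL, NULL)")

    series, series_index = 'Foundation', 1.0
    # "%g" stores the index compactly as text: 1.0 becomes '1', 1.5 stays '1.5'
    update_values = (series, "%g" % series_index, 'book-1')
    conn.execute('UPDATE content SET Series=?, SeriesNumber=? WHERE ContentID = ?',
                 update_values)
    print(conn.execute('SELECT * FROM content').fetchone())
    # ('book-1', 'Foundation', '1')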
@@ -54,6 +54,8 @@ def synchronous(tlockname):

 class ConnectionListener (Thread):

+    NOT_SERVICED_COUNT = 6
+
     def __init__(self, driver):
         Thread.__init__(self)
         self.daemon = True
@@ -78,8 +80,8 @@ class ConnectionListener (Thread):

                 if not self.driver.connection_queue.empty():
                     queue_not_serviced_count += 1
-                    if queue_not_serviced_count >= 3:
-                        self.driver._debug('queue not serviced')
+                    if queue_not_serviced_count >= self.NOT_SERVICED_COUNT:
+                        self.driver._debug('queue not serviced', queue_not_serviced_count)
                     try:
                         sock = self.driver.connection_queue.get_nowait()
                         s = self.driver._json_encode(
@@ -1281,10 +1283,10 @@ class SMART_DEVICE_APP(DeviceConfig, DevicePlugin):
                 self._close_listen_socket()
                 return message
             else:
-                while i < 100: # try up to 100 random port numbers
+                while i < 100: # try 9090 then up to 99 random port numbers
                     i += 1
                     port = self._attach_to_port(self.listen_socket,
-                            random.randint(8192, 32000))
+                            9090 if i == 1 else random.randint(8192, 32000))
                     if port != 0:
                         break
                 if port == 0:
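The retry loop in the `@@ -1281` hunk now tries the fixed port 9090 before falling back to random ports. A minimal standalone sketch of that pattern (the function name and socket handling here are illustrative, not calibre's actual driver API):

    import random
    import socket

    def attach_to_first_free_port(preferred=9090, attempts=100):
        # Try the preferred port on the first pass, then fall back to
        # random ports in the 8192-32000 range, as in the hunk above.
        for i in range(1, attempts + 1):
            port = preferred if i == 1 else random.randint(8192, 32000)
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                sock.bind(('0.0.0.0', port))
            except socket.error:
                sock.close()  # port already in use, try another
                continue
            return sock, port
        return None, 0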
@@ -74,11 +74,12 @@ def remove_kindlegen_markup(parts):
         part = "".join(srcpieces)
         parts[i] = part

-    # we can safely remove all of the Kindlegen generated data-AmznPageBreak tags
+    # we can safely remove all of the Kindlegen generated data-AmznPageBreak
+    # attributes
     find_tag_with_AmznPageBreak_pattern = re.compile(
         r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
     within_tag_AmznPageBreak_position_pattern = re.compile(
-        r'''\sdata-AmznPageBreak=['"][^'"]*['"]''')
+        r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

     for i in xrange(len(parts)):
         part = parts[i]
@@ -86,10 +87,8 @@ def remove_kindlegen_markup(parts):
         for j in range(len(srcpieces)):
             tag = srcpieces[j]
             if tag.startswith('<'):
-                for m in within_tag_AmznPageBreak_position_pattern.finditer(tag):
-                    replacement = ''
-                    tag = within_tag_AmznPageBreak_position_pattern.sub(replacement, tag, 1)
-                srcpieces[j] = tag
+                srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
+                    lambda m:' style="page-break-after:%s"'%m.group(1), tag)
         part = "".join(srcpieces)
         parts[i] = part
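The change above stops discarding `data-AmznPageBreak` attributes and instead rewrites them as CSS. A small self-contained illustration of the capturing-group-plus-lambda substitution (the sample tag is made up):

    import re

    pattern = re.compile(r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

    tag = '<div data-AmznPageBreak="always">'
    # The captured attribute value is re-emitted as a page-break-after style.
    print(pattern.sub(lambda m: ' style="page-break-after:%s"' % m.group(1), tag))
    # <div style="page-break-after:always">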
@@ -44,6 +44,18 @@ def locate_beg_end_of_tag(ml, aid):
             return plt, pgt
     return 0, 0

+
+def reverse_tag_iter(block):
+    ''' Iterate over all tags in block in reverse order, i.e. last tag
+    to first tag. '''
+    end = len(block)
+    while True:
+        pgt = block.rfind(b'>', 0, end)
+        if pgt == -1: break
+        plt = block.rfind(b'<', 0, pgt)
+        if plt == -1: break
+        yield block[plt:pgt+1]
+        end = plt
+
 class Mobi8Reader(object):

     def __init__(self, mobi6_reader, log):
@@ -275,13 +287,12 @@ class Mobi8Reader(object):
         return '%s/%s'%(fi.type, fi.filename), idtext

     def get_id_tag(self, pos):
-        # find the correct tag by actually searching in the destination
-        # textblock at position
+        # Find the first tag with a named anchor (name or id attribute) before
+        # pos
         fi = self.get_file_info(pos)
         if fi.num is None and fi.start is None:
             raise ValueError('No file contains pos: %d'%pos)
         textblock = self.parts[fi.num]
-        id_map = []
         npos = pos - fi.start
         pgt = textblock.find(b'>', npos)
         plt = textblock.find(b'<', npos)
@@ -290,28 +301,15 @@ class Mobi8Reader(object):
         if plt == npos or pgt < plt:
             npos = pgt + 1
         textblock = textblock[0:npos]
-        # find id links only inside of tags
-        # inside any < > pair find all "id=' and return whatever is inside
-        # the quotes
-        id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"][^>]*>''',
-                                re.IGNORECASE)
-        for m in re.finditer(id_pattern, textblock):
-            id_map.append((m.start(), m.group(1)))
+        id_re = re.compile(br'''<[^>]+\sid\s*=\s*['"]([^'"]+)['"]''')
+        name_re = re.compile(br'''<\s*a\s*\sname\s*=\s*['"]([^'"]+)['"]''')
+        for tag in reverse_tag_iter(textblock):
+            m = id_re.match(tag) or name_re.match(tag)
+            if m is not None:
+                return m.group(1)

-        if not id_map:
-            # Found no id in the textblock, link must be to top of file
+        # No tag found, link to start of file
         return b''
-        # if npos is before first id= inside a tag, return the first
-        if npos < id_map[0][0]:
-            return id_map[0][1]
-        # if npos is after the last id= inside a tag, return the last
-        if npos > id_map[-1][0]:
-            return id_map[-1][1]
-        # otherwise find last id before npos
-        for i, item in enumerate(id_map):
-            if npos < item[0]:
-                return id_map[i-1][1]
-        return id_map[0][1]

     def create_guide(self):
         guide = Guide()
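For clarity, here is a tiny round trip through the new generator (redefined verbatim from the hunk above so the sketch is self-contained; the sample HTML is made up):

    def reverse_tag_iter(block):
        # Same generator as in the hunk above: walk tags from the end
        # of the block toward the beginning, skipping text between tags.
        end = len(block)
        while True:
            pgt = block.rfind(b'>', 0, end)
            if pgt == -1:
                break
            plt = block.rfind(b'<', 0, pgt)
            if plt == -1:
                break
            yield block[plt:pgt + 1]
            end = plt

    html = b'<p id="a">one</p><p id="b">two</p>'
    print(list(reverse_tag_iter(html)))
    # [b'</p>', b'<p id="b">', b'</p>', b'<p id="a">']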
@@ -320,13 +320,11 @@ class OEBReader(object):
                 self.logger.warn(u'Spine item %r not found' % idref)
                 continue
             item = manifest.ids[idref]
+            if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath'):
                 spine.add(item, elem.get('linear'))
-        for item in spine:
-            if item.media_type.lower() not in OEB_DOCS:
-                if not hasattr(item.data, 'xpath'):
+            else:
                 self.oeb.log.warn('The item %s is not a XML document.'
                         ' Removing it from spine.'%item.href)
-                spine.remove(item)
         if len(spine) == 0:
             raise OEBError("Spine is empty")
         self._spine_add_extra()
@@ -114,7 +114,9 @@ class DetectStructure(object):

         def find_matches(expr, doc):
             try:
-                return XPath(expr)(doc)
+                ans = XPath(expr)(doc)
+                len(ans)
+                return ans
             except:
                 self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
                 return []
@@ -203,7 +205,9 @@ class DetectStructure(object):

         def find_matches(expr, doc):
             try:
-                return XPath(expr)(doc)
+                ans = XPath(expr)(doc)
+                len(ans)
+                return ans
             except:
                 self.log.warn('Invalid ToC expression, ignoring: %s'%expr)
                 return []
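The `len(ans)` call looks redundant, but plausibly its point is to force a failure inside the `try` block: an XPath expression that evaluates to a scalar (for example `count(//h1)`, which returns a float) has no `len()`, so it is rejected as invalid up front instead of crashing later code that expects a node list. A self-contained sketch of that behaviour:

    from lxml import etree
    from lxml.etree import XPath

    doc = etree.fromstring('<root><h1>A</h1><h1>B</h1></root>')

    def find_matches(expr, doc):
        try:
            ans = XPath(expr)(doc)
            len(ans)   # scalar results (floats, booleans) raise TypeError here
            return ans
        except Exception:
            return []

    print(len(find_matches('//h1', doc)))    # 2
    print(find_matches('count(//h1)', doc))  # [] -- float result rejected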
@@ -27,10 +27,10 @@ def get_custom_size(opts):
     custom_size = None
     if opts.custom_size != None:
         width, sep, height = opts.custom_size.partition('x')
-        if height != '':
+        if height:
             try:
-                width = int(width)
-                height = int(height)
+                width = float(width)
+                height = float(height)
                 custom_size = (width, height)
             except:
                 custom_size = None
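The switch from `int()` to `float()` lets fractional page sizes such as '8.5x11' through, and `str.partition('x')` never raises, so a malformed value simply yields an empty `height`. A minimal sketch of the parsing logic (the function name is illustrative):

    def parse_custom_size(value):
        width, sep, height = value.partition('x')
        if height:
            try:
                return (float(width), float(height))
            except ValueError:
                return None
        return None

    print(parse_custom_size('8.5x11'))  # (8.5, 11.0)
    print(parse_custom_size('a4'))      # None -- no 'x', height is empty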
@@ -72,8 +72,8 @@ class LibreDEStore(BasicStoreConfig, StorePlugin):
                 mobi = details.xpath(
                     'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())')

-                cover_url = ''.join(data.xpath('.//div[@class="coverImg"]/a/img/@src'))
-                price = ''.join(data.xpath('.//span[@class="preis"]/text()')).replace('*', '').strip()
+                cover_url = ''.join(data.xpath('.//div[@class="coverimg"]/a/img/@src'))
+                price = ''.join(data.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip()

                 counter -= 1
@@ -8,7 +8,7 @@ from PyQt4.Qt import (QThread, pyqtSignal, Qt, QUrl, QDialog, QGridLayout,
 import mechanize

 from calibre.constants import (__appname__, __version__, iswindows, isosx,
-        isportable)
+        isportable, is64bit)
 from calibre import browser, prints, as_unicode
 from calibre.utils.config import prefs
 from calibre.gui2 import config, dynamic, open_url
@@ -19,6 +19,13 @@ URL = 'http://status.calibre-ebook.com/latest'
 NO_CALIBRE_UPDATE = '-0.0.0'
 VSEP = '|'

+def get_download_url():
+    which = ('portable' if isportable else 'windows' if iswindows
+            else 'osx' if isosx else 'linux')
+    if which == 'windows' and is64bit:
+        which += '64'
+    return 'http://calibre-ebook.com/download_' + which
+
 def get_newest_version():
     br = browser()
     req = mechanize.Request(URL)
@@ -116,10 +123,7 @@ class UpdateNotification(QDialog):
         config.set('new_version_notification', bool(self.cb.isChecked()))

     def accept(self):
-        url = ('http://calibre-ebook.com/download_' +
-                ('portable' if isportable else 'windows' if iswindows
-                    else 'osx' if isosx else 'linux'))
-        open_url(QUrl(url))
+        open_url(QUrl(get_download_url()))

         QDialog.accept(self)
@@ -12,6 +12,7 @@ from calibre.customize import CatalogPlugin
 from calibre.library.catalogs import FIELDS
 from calibre.customize.conversion import DummyReporter

+
 class CSV_XML(CatalogPlugin):
     'CSV/XML catalog generator'

@@ -227,4 +228,3 @@ class CSV_XML(CatalogPlugin):
         with open(path_to_output, 'w') as f:
             f.write(etree.tostring(root, encoding='utf-8',
                 xml_declaration=True, pretty_print=True))
-
@@ -21,6 +21,7 @@ from calibre.utils.localization import get_lang

 Option = namedtuple('Option', 'option, default, dest, action, help')

+
 class EPUB_MOBI(CatalogPlugin):
     'ePub catalog generator'

@@ -386,6 +387,8 @@ class EPUB_MOBI(CatalogPlugin):
             if opts.fmt == 'mobi':
                 recommendations.append(('no_inline_toc', True,
                     OptionRecommendation.HIGH))
+                recommendations.append(('verbose', 2,
+                    OptionRecommendation.HIGH))

             # Use existing cover or generate new cover
             cpath = None
@@ -442,4 +445,3 @@ class EPUB_MOBI(CatalogPlugin):

         # returns to gui2.actions.catalog:catalog_generated()
         return catalog.error
-
@@ -25,6 +25,7 @@ from calibre.utils.icu import capitalize, collation_order, sort_key
 from calibre.utils.magick.draw import thumbnail
 from calibre.utils.zipfile import ZipFile

+
 class CatalogBuilder(object):
     '''
     Generates catalog source files from calibre database
@@ -98,7 +99,6 @@ class CatalogBuilder(object):
         else:
             return ' '

-
     def __init__(self, db, _opts, plugin,
                  report_progress=DummyReporter(),
                  stylesheet="content/stylesheet.css",
@@ -120,11 +120,13 @@ class CatalogBuilder(object):
             _opts.output_profile and
             _opts.output_profile.startswith("kindle")) else False

+        self.all_series = set()
         self.authors = None
         self.bookmarked_books = None
         self.bookmarked_books_by_date_read = None
         self.books_by_author = None
         self.books_by_date_range = None
+        self.books_by_description = []
         self.books_by_month = None
         self.books_by_series = None
         self.books_by_title = None
@@ -139,6 +141,7 @@ class CatalogBuilder(object):
             if self.opts.generate_genres else None
         self.html_filelist_1 = []
         self.html_filelist_2 = []
+        self.individual_authors = None
         self.merge_comments_rule = dict(zip(['field', 'position', 'hr'],
             _opts.merge_comments_rule.split(':')))
         self.ncx_soup = None
@@ -154,6 +157,7 @@ class CatalogBuilder(object):
         self.total_steps = 6.0
         self.use_series_prefix_in_titles_section = False

+        self.dump_custom_fields()
         self.books_to_catalog = self.fetch_books_to_catalog()
         self.compute_total_steps()
         self.calculate_thumbnail_dimensions()
@@ -447,7 +451,7 @@ class CatalogBuilder(object):
             hits.remove(amp)
             for hit in hits:
                 name = hit[1:-1]
-                if htmlentitydefs.name2codepoint.has_key(name):
+                if htmlentitydefs.name2codepoint in name:
                     s = s.replace(hit, unichr(htmlentitydefs.name2codepoint[name]))
             s = s.replace(amp, "&")
             return s
@@ -586,7 +590,7 @@ class CatalogBuilder(object):
         # Literal comparison for Tags field
         if rule['field'].lower() == 'tags':
             if rule['pattern'].lower() in map(unicode.lower, record['tags']):
-                if self.opts.verbose:
+                if self.DEBUG and self.opts.verbose:
                     self.opts.log.info(" %s '%s' by %s (%s: Tags includes '%s')" %
                                        (rule['prefix'], record['title'],
                                         record['authors'][0], rule['name'],
@@ -616,7 +620,7 @@ class CatalogBuilder(object):
                 try:
                     if re.search(rule['pattern'], unicode(field_contents),
                                  re.IGNORECASE) is not None:
-                        if self.opts.verbose:
+                        if self.DEBUG:
                             _log_prefix_rule_match_info(rule, record, field_contents)
                         return rule['prefix']
                 except:
@@ -624,12 +628,24 @@ class CatalogBuilder(object):
                     self.opts.log.error("pattern failed to compile: %s" % rule['pattern'])
                     pass
             elif field_contents is None and rule['pattern'] == 'None':
-                if self.opts.verbose:
+                if self.DEBUG:
                     _log_prefix_rule_match_info(rule, record, field_contents)
                 return rule['prefix']

         return None

+    def dump_custom_fields(self):
+        """
+        Dump custom field mappings for debugging
+        """
+        if self.opts.verbose:
+            self.opts.log.info(" Custom fields:")
+            all_custom_fields = self.db.custom_field_keys()
+            for cf in all_custom_fields:
+                self.opts.log.info(" %-20s %-20s %s" %
+                    (cf, "'%s'" % self.db.metadata_for_field(cf)['name'],
+                     self.db.metadata_for_field(cf)['datatype']))
+
     def establish_equivalencies(self, item_list, key=None):
         """ Return icu equivalent sort letter.

@@ -716,7 +732,8 @@ class CatalogBuilder(object):

         Outputs:
          books_by_author: database, sorted by author
-         authors: list of unique authors
+         authors: list of book authors. Two credited authors are considered an
+          individual entity
          error: author_sort mismatches

         Return:
@@ -728,6 +745,13 @@ class CatalogBuilder(object):

         books_by_author = list(self.books_to_catalog)
         self.detect_author_sort_mismatches(books_by_author)
+
+        # Assumes books_by_title already populated
+        # init books_by_description before relisting multiple authors
+        if self.opts.generate_descriptions:
+            books_by_description = list(books_by_author) if self.opts.sort_descriptions_by_author \
+                else list(self.books_by_title)
+
         if self.opts.cross_reference_authors:
             books_by_author = self.relist_multiple_authors(books_by_author)

@@ -737,6 +761,10 @@ class CatalogBuilder(object):
         asl = [i['author_sort'] for i in books_by_author]
         las = max(asl, key=len)

+        if self.opts.generate_descriptions:
+            self.books_by_description = sorted(books_by_description,
+                key=lambda x: sort_key(self._kf_books_by_author_sorter_author_sort(x, len(las))))
+
         books_by_author = sorted(books_by_author,
             key=lambda x: sort_key(self._kf_books_by_author_sorter_author_sort(x, len(las))))

@@ -758,6 +786,7 @@ class CatalogBuilder(object):
         current_author = authors[0]
         multiple_authors = False
         unique_authors = []
+        individual_authors = set()
         for (i, author) in enumerate(authors):
             if author != current_author:
                 # Note that current_author and author are tuples: (friendly, sort)
@@ -780,14 +809,23 @@ class CatalogBuilder(object):
             unique_authors.append((current_author[0], icu_title(current_author[1]),
                                    books_by_current_author))

-        self.authors = list(unique_authors)
-        self.books_by_author = books_by_author
+        for ua in unique_authors:
+            for ia in ua[0].replace(' & ', ' & ').split(' & '):
+                individual_authors.add(ia)
+        self.individual_authors = list(individual_authors)

         if self.DEBUG and self.opts.verbose:
             self.opts.log.info("\nfetch_books_by_author(): %d unique authors" % len(unique_authors))
             for author in unique_authors:
                 self.opts.log.info((u" %-50s %-25s %2d" % (author[0][0:45], author[1][0:20],
                     author[2])).encode('utf-8'))
+            self.opts.log.info("\nfetch_books_by_author(): %d individual authors" % len(individual_authors))
+            for author in sorted(individual_authors):
+                self.opts.log.info("%s" % author)

+        self.authors = unique_authors
+        self.books_by_author = books_by_author
         return True

     def fetch_books_by_title(self):
@@ -869,6 +907,7 @@ class CatalogBuilder(object):
             this_title['title'] = self.convert_html_entities(record['title'])
             if record['series']:
                 this_title['series'] = record['series']
+                self.all_series.add(this_title['series'])
                 this_title['series_index'] = record['series_index']
             else:
                 this_title['series'] = None
@@ -1000,7 +1039,7 @@ class CatalogBuilder(object):
         data = self.plugin.search_sort_db(self.db, self.opts)
         data = self.process_exclusions(data)

-        if self.opts.verbose and self.prefix_rules:
+        if self.prefix_rules and self.DEBUG:
             self.opts.log.info(" Added prefixes:")

         # Populate this_title{} from data[{},{}]
@@ -1042,6 +1081,7 @@ class CatalogBuilder(object):
     def initialize(self, save_template):
         self._save_template = save_template
         self.SUPPORTS_SUB_DIRS = True
+
     def save_template(self):
         return self._save_template

@@ -2070,7 +2110,6 @@ class CatalogBuilder(object):
                     len(genre[key]),
                     'titles' if len(genre[key]) > 1 else 'title'))

-
         # Write the results
         # genre_list = [ {friendly_tag:[{book},{book}]}, {friendly_tag:[{book},{book}]}, ...]
         master_genre_list = []
@@ -2107,7 +2146,8 @@ class CatalogBuilder(object):
                 outfile)

             tag_file = "content/Genre_%s.html" % genre
-            master_genre_list.append({'tag':genre,
+            master_genre_list.append({
+                'tag': genre,
                 'file': tag_file,
                 'authors': unique_authors,
                 'books': genre_tag_set[genre],
@@ -2935,12 +2975,10 @@ class CatalogBuilder(object):
             contentTag = Tag(soup, 'content')
             contentTag['src'] = "content/ByDateAdded.html"
             navPointTag.insert(1, contentTag)
-        else:
+        elif self.opts.generate_descriptions:
             # Descriptions only
-            sort_descriptions_by = self.books_by_author if self.opts.sort_descriptions_by_author \
-                else self.books_by_title
             contentTag = Tag(soup, 'content')
-            contentTag['src'] = "content/book_%d.html" % int(sort_descriptions_by[0]['id'])
+            contentTag['src'] = "content/book_%d.html" % int(self.books_by_description[0]['id'])
             navPointTag.insert(1, contentTag)

         if self.generate_for_kindle_mobi:
@@ -2970,9 +3008,6 @@ class CatalogBuilder(object):

         self.update_progress_full_step(_("NCX for Descriptions"))

-        sort_descriptions_by = self.books_by_author if self.opts.sort_descriptions_by_author \
-            else self.books_by_title
-
         # --- Construct the 'Descriptions' section ---
         ncx_soup = self.ncx_soup
         if self.generate_for_kindle_mobi:
@@ -2990,19 +3025,22 @@ class CatalogBuilder(object):
         self.play_order += 1
         navLabelTag = Tag(ncx_soup, 'navLabel')
         textTag = Tag(ncx_soup, 'text')
-        textTag.insert(0, NavigableString(tocTitle))
+        section_header = '%s [%d]' % (tocTitle, len(self.books_by_description))
+        if self.generate_for_kindle_mobi:
+            section_header = tocTitle
+        textTag.insert(0, NavigableString(section_header))
         navLabelTag.insert(0, textTag)
         nptc = 0
         navPointTag.insert(nptc, navLabelTag)
         nptc += 1
         contentTag = Tag(ncx_soup, "content")
-        contentTag['src'] = "content/book_%d.html" % int(sort_descriptions_by[0]['id'])
+        contentTag['src'] = "content/book_%d.html" % int(self.books_by_description[0]['id'])
         navPointTag.insert(nptc, contentTag)
         nptc += 1

         # Loop over the titles

-        for book in sort_descriptions_by:
+        for book in self.books_by_description:
             navPointVolumeTag = Tag(ncx_soup, 'navPoint')
             if self.generate_for_kindle_mobi:
                 navPointVolumeTag['class'] = "article"
@@ -3119,7 +3157,10 @@ class CatalogBuilder(object):
         self.play_order += 1
         navLabelTag = Tag(ncx_soup, 'navLabel')
         textTag = Tag(ncx_soup, 'text')
-        textTag.insert(0, NavigableString(tocTitle))
+        section_header = '%s [%d]' % (tocTitle, len(self.all_series))
+        if self.generate_for_kindle_mobi:
+            section_header = tocTitle
+        textTag.insert(0, NavigableString(section_header))
         navLabelTag.insert(0, textTag)
         nptc = 0
         navPointTag.insert(nptc, navLabelTag)
@@ -3247,7 +3288,10 @@ class CatalogBuilder(object):
         self.play_order += 1
         navLabelTag = Tag(ncx_soup, 'navLabel')
         textTag = Tag(ncx_soup, 'text')
-        textTag.insert(0, NavigableString(tocTitle))
+        section_header = '%s [%d]' % (tocTitle, len(self.books_by_title))
+        if self.generate_for_kindle_mobi:
+            section_header = tocTitle
+        textTag.insert(0, NavigableString(section_header))
         navLabelTag.insert(0, textTag)
         nptc = 0
         navPointTag.insert(nptc, navLabelTag)
@@ -3377,7 +3421,10 @@ class CatalogBuilder(object):
         self.play_order += 1
         navLabelTag = Tag(ncx_soup, 'navLabel')
         textTag = Tag(ncx_soup, 'text')
-        textTag.insert(0, NavigableString('%s' % tocTitle))
+        section_header = '%s [%d]' % (tocTitle, len(self.individual_authors))
+        if self.generate_for_kindle_mobi:
+            section_header = tocTitle
+        textTag.insert(0, NavigableString(section_header))
         navLabelTag.insert(0, textTag)
         nptc = 0
         navPointTag.insert(nptc, navLabelTag)
@@ -3430,7 +3477,7 @@ class CatalogBuilder(object):
                 fmt_string = _(u"Authors beginning with %s")
             else:
                 fmt_string = _(u"Authors beginning with '%s'")
-            textTag.insert(0, NavigableString(fmt_string % (authors_by_letter[1])))
+            textTag.insert(0, NavigableString(fmt_string % authors_by_letter[1]))
             navLabelTag.insert(0, textTag)
             navPointByLetterTag.insert(0, navLabelTag)
             contentTag = Tag(ncx_soup, 'content')
@@ -3808,7 +3855,7 @@ class CatalogBuilder(object):
         self.update_progress_full_step(_("NCX for Genres"))

         if not len(self.genres):
-            self.opts.log.warn(" No genres found in tags.\n"
+            self.opts.log.warn(" No genres found\n"
                                " No Genre section added to Catalog")
             return

@@ -3830,8 +3877,10 @@ class CatalogBuilder(object):
         self.play_order += 1
         navLabelTag = Tag(ncx_soup, 'navLabel')
         textTag = Tag(ncx_soup, 'text')
-        # textTag.insert(0, NavigableString('%s (%d)' % (section_title, len(genre_list))))
-        textTag.insert(0, NavigableString('%s' % tocTitle))
+        section_header = '%s [%d]' % (tocTitle, len(self.genres))
+        if self.generate_for_kindle_mobi:
+            section_header = tocTitle
+        textTag.insert(0, NavigableString(section_header))
         navLabelTag.insert(0, textTag)
         nptc = 0
         navPointTag.insert(nptc, navLabelTag)
@@ -3993,7 +4042,6 @@ class CatalogBuilder(object):
             mtc += 1

         # Write the thumbnail images, descriptions to the manifest
-        sort_descriptions_by = []
         if self.opts.generate_descriptions:
             for thumb in self.thumbs:
                 itemTag = Tag(soup, "item")
@@ -4004,9 +4052,6 @@ class CatalogBuilder(object):
                 manifest.insert(mtc, itemTag)
                 mtc += 1

-            # HTML files - add descriptions to manifest and spine
-            sort_descriptions_by = self.books_by_author if self.opts.sort_descriptions_by_author \
-                else self.books_by_title
         # Add html_files to manifest and spine

         for file in self.html_filelist_1:
@@ -4060,7 +4105,7 @@ class CatalogBuilder(object):
             spine.insert(stc, itemrefTag)
             stc += 1

-        for book in sort_descriptions_by:
+        for book in self.books_by_description:
             # manifest
             itemTag = Tag(soup, "item")
             itemTag['href'] = "content/book_%d.html" % int(book['id'])
@@ -4286,7 +4331,8 @@ class CatalogBuilder(object):
                         f.write(thumb_data)

         # Save thumb to archive
-        if zf is not None: # Ensure that the read succeeded
+        if zf is not None:
+            # Ensure that the read succeeded
             # If we failed to open the zip file for reading,
             # we dont know if it contained the thumb or not
             zf = _open_archive('a')
@@ -4363,7 +4409,6 @@ class CatalogBuilder(object):
                 # Clear the book's cover property
                 title['cover'] = None

-
         # Write thumb_width to the file, validating cache contents
         # Allows detection of aborted catalog builds
         with ZipFile(self.thumbs_path, mode='a') as zfw:
@@ -4853,5 +4898,3 @@ class CatalogBuilder(object):

         outfile = open("%s/%s.ncx" % (self.catalog_path, self.opts.basename), 'w')
         outfile.write(self.ncx_soup.prettify())
-
-
@@ -22,6 +22,7 @@ from calibre.library.comments import comments_to_html
 from calibre.library.server import custom_fields_to_display
 from calibre.library.field_metadata import category_icon_map
 from calibre.library.server.utils import quote, unquote
+from calibre.ebooks.metadata.sources.identify import urls_from_identifiers

 def xml(*args, **kwargs):
     ans = prepare_string_for_xml(*args, **kwargs)
@@ -823,6 +824,16 @@ class BrowseServer(object):
             if field in ('title', 'formats') or not args.get(field, False) \
                     or not m['name']:
                 continue
+            if field == 'identifiers':
+                urls = urls_from_identifiers(mi.get(field, {}))
+                links = [u'<a class="details_category_link" target="_new" href="%s" title="%s:%s">%s</a>' % (url, id_typ, id_val, name)
+                         for name, id_typ, id_val, url in urls]
+                links = u', '.join(links)
+                if links:
+                    fields.append((m['name'], u'<strong>%s: </strong>%s'%(
+                        _('Ids'), links)))
+                continue
+
             if m['datatype'] == 'rating':
                 r = u'<strong>%s: </strong>'%xml(m['name']) + \
                     render_rating(mi.get(field)/2.0, self.opts.url_prefix,