Gregory Riker 2014-01-13 05:14:55 -07:00
commit 3e81bee473
8 changed files with 269 additions and 249 deletions

View File

@@ -33,7 +33,6 @@ class NYTimes(BasicNewsRecipe):
     # and 30 will get the most popular measured over 30 days.
     # you still only get up to 20 articles in each category

     # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = True
@@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe):
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 100
+    use_embedded_content = False

     # Whether to omit duplicates of articles (typically arising when articles are indexed in
     # more than one section). If True, only the first occurrence will be downloaded.
@@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe):
         (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
     ]

     if headlinesOnly:
         title='New York Times Headlines'
         description = 'Headlines from the New York Times'
@@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe):
     timefmt = ''

-    #simultaneous_downloads = 1 # no longer required to deal with ads
+    # simultaneous_downloads = 1 # no longer required to deal with ads
     cover_margins = (18,18,'grey99')

-    remove_tags_before = dict(id='article')
-    remove_tags_after = dict(id='article')
+    keep_only_tags = dict(id=['article', 'story', 'content'])
     remove_tags = [
         dict(attrs={'class':[
             'articleFooter',
@@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe):
             'entry-response module',
             'leftNavTabs',
             'metaFootnote',
+            'inside-story',
             'module box nav',
             'nextArticleLink',
             'nextArticleLink clearfix',
@@ -193,27 +192,27 @@ class NYTimes(BasicNewsRecipe):
             'singleAd',
             'postCategory column',
             'refer tagRefer', # added for bits blog post
-            'entry entry-utility', #added for DealBook
-            'entry-tags', #added for DealBook
-            'footer promos clearfix', #added for DealBook
-            'footer links clearfix', #added for DealBook
-            'tabsContainer', #added for other blog downloads
-            'column lastColumn', #added for other blog downloads
-            'pageHeaderWithLabel', #added for other gadgetwise downloads
-            'column two', #added for other blog downloads
-            'column two last', #added for other blog downloads
-            'column three', #added for other blog downloads
-            'column three last', #added for other blog downloads
-            'column four',#added for other blog downloads
-            'column four last',#added for other blog downloads
-            'column last', #added for other blog downloads
+            'entry entry-utility', # added for DealBook
+            'entry-tags', # added for DealBook
+            'footer promos clearfix', # added for DealBook
+            'footer links clearfix', # added for DealBook
+            'tabsContainer', # added for other blog downloads
+            'column lastColumn', # added for other blog downloads
+            'pageHeaderWithLabel', # added for other gadgetwise downloads
+            'column two', # added for other blog downloads
+            'column two last', # added for other blog downloads
+            'column three', # added for other blog downloads
+            'column three last', # added for other blog downloads
+            'column four', # added for other blog downloads
+            'column four last', # added for other blog downloads
+            'column last', # added for other blog downloads
             'entry entry-related',
-            'subNavigation tabContent active', #caucus blog navigation
+            'subNavigation tabContent active', # caucus blog navigation
             'mediaOverlay slideshow',
             'wideThumb',
-            'video', #added 02-11-2011
-            'videoHeader',#added 02-11-2011
-            'articleInlineVideoHolder', #added 02-11-2011
+            'video', # added 02-11-2011
+            'videoHeader', # added 02-11-2011
+            'articleInlineVideoHolder', # added 02-11-2011
             'assetCompanionAd',
             'nytint-sectionHeader',
             re.compile('^subNavigation'),
@@ -222,6 +221,8 @@ class NYTimes(BasicNewsRecipe):
             re.compile('commentCount'),
             'credit'
             ]}),
+        dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
+        dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
         dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
         dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
         dict(name='div', attrs={'class':'tweet'}),
@@ -235,6 +236,8 @@ class NYTimes(BasicNewsRecipe):
         dict(id=[
             'adxLeaderboard',
             'adxSponLink',
+            'anchoredAd_module',
+            'anchoredAd_spot',
             'archive',
             'articleExtras',
             'articleInline',
@@ -263,16 +266,18 @@ class NYTimes(BasicNewsRecipe):
             'side_index',
             'side_tool',
             'toolsRight',
-            'skybox', #added for DealBook
-            'TopAd', #added for DealBook
-            'related-content', #added for DealBook
+            'skybox', # added for DealBook
+            'TopAd', # added for DealBook
+            'related-content', # added for DealBook
             'whats-next',
         ]),
-        dict(name=['script', 'noscript', 'style','form','hr', 'button'])]
+        dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]

     no_stylesheets = True
     extra_css = '''
     .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
-    .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+    .credit { font-weight: normal; text-align: right; font-size:
+              50%; line-height:1em; margin-top:5px; margin-left:0;
+              margin-right:0; margin-bottom: 0; }
     .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
     .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
     .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
@@ -288,7 +293,6 @@ class NYTimes(BasicNewsRecipe):
     .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
     .source {text-align: left; font-size: x-small; }'''

     articles = {}
     key = None
     ans = []
@@ -310,22 +314,22 @@ class NYTimes(BasicNewsRecipe):
                 del ans[idx]
                 idx_max = idx_max-1
                 continue
-            if True: #self.verbose
-                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+            if True: # self.verbose
+                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
             for article in ans[idx][1]:
                 total_article_count += 1
-                if True: #self.verbose
+                if True: # self.verbose
                     self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                         article['url'].encode('cp1252','replace')))
             idx = idx+1
-        self.log( "Queued %d articles" % total_article_count )
+        self.log("Queued %d articles" % total_article_count)
         return ans

     def exclude_url(self,url):
         if not url.startswith("http"):
             return True
-        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook
             return True
         if 'nytimes.com' not in url:
             return True
@@ -409,7 +413,6 @@ class NYTimes(BasicNewsRecipe):
     def short_title(self):
         return self.title

     def article_to_soup(self, url_or_raw, raw=False):
         from contextlib import closing
         import copy
@@ -443,7 +446,6 @@ class NYTimes(BasicNewsRecipe):
             usrc = self.preprocess_raw_html(usrc, url_or_raw)
             return BeautifulSoup(usrc, markupMassage=nmassage)

     def massageNCXText(self, description):
         # Kindle TOC descriptions won't render certain characters
         if description:
@@ -498,7 +500,7 @@ class NYTimes(BasicNewsRecipe):
         if authorAttribution:
             author = self.tag_to_string(authorAttribution, use_alt=False)
         feed = self.key if self.key is not None else 'Uncategorized'
-        if not self.articles.has_key(feed):
+        if feed not in self.articles:
             self.ans.append(feed)
             self.articles[feed] = []
         self.articles[feed].append(
@@ -533,7 +535,6 @@ class NYTimes(BasicNewsRecipe):
                 desc = ''
             return(title,url,author,desc)

         have_emailed = False
         emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
         for h3tag in emailed_soup.findAll('h3'):
@@ -562,7 +563,7 @@ class NYTimes(BasicNewsRecipe):
                     dict(title=title, url=url, date=strftime('%a, %d %b'),
                          description=desc, author=author,
                          content=''))
-        viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+        viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
         for x in viewed_ans:
             ans.append(x)
         return ans
@@ -585,10 +586,10 @@ class NYTimes(BasicNewsRecipe):
                 tech_articles[f.title] = []
                 for a in f.articles:
                     tech_articles[f.title].append(
-                        dict(title=a.title, url=a.url, date=a.date,
+                        dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
                              description=a.summary, author=a.author,
                              content=a.content))
-        tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
+        tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
         for x in tech_ans:
             ans.append(x)
         return ans
@@ -627,10 +628,9 @@ class NYTimes(BasicNewsRecipe):
                 for lidiv in div.findAll('li'):
                     self.handle_article(lidiv)

-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

     def parse_todays_index(self):
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
@@ -660,7 +660,7 @@ class NYTimes(BasicNewsRecipe):
                 if not skipping:
                     self.handle_article(lidiv)

-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

     def parse_headline_index(self):
@@ -706,13 +706,13 @@ class NYTimes(BasicNewsRecipe):
                     description = self.tag_to_string(desc,use_alt=False)
                 else:
                     description = ''
-                if not self.articles.has_key(section_name):
+                if section_name not in self.articles:
                     self.ans.append(section_name)
                     self.articles[section_name] = []
                 print('Title '+title+' author '+author)
                 self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.ans)

     def parse_index(self):
@@ -732,7 +732,7 @@ class NYTimes(BasicNewsRecipe):
             if kill_all or (self.recursions==0):
                 a.replaceWith(self.tag_to_string(a,False))
             else:
-                if a.has_key('href'):
+                if 'href' in a:
                     if a['href'].startswith('http://www.nytimes'):
                         if not a['href'].endswith('pagewanted=all'):
                             url = re.sub(r'\?.*', '', a['href'])
@@ -740,13 +740,13 @@ class NYTimes(BasicNewsRecipe):
                             a.replaceWith(self.tag_to_string(a,False))
                         else:
                             a['href'] = url+'?pagewanted=all'
-                    elif not (a['href'].startswith('http://pogue') or \
-                        a['href'].startswith('http://bits') or \
-                        a['href'].startswith('http://travel') or \
-                        a['href'].startswith('http://business') or \
-                        a['href'].startswith('http://tech') or \
-                        a['href'].startswith('http://health') or \
-                        a['href'].startswith('http://dealbook') or \
+                    elif not (a['href'].startswith('http://pogue') or
+                        a['href'].startswith('http://bits') or
+                        a['href'].startswith('http://travel') or
+                        a['href'].startswith('http://business') or
+                        a['href'].startswith('http://tech') or
+                        a['href'].startswith('http://health') or
+                        a['href'].startswith('http://dealbook') or
                         a['href'].startswith('http://open')):
                         a.replaceWith(self.tag_to_string(a,False))
         return soup
@@ -761,7 +761,7 @@ class NYTimes(BasicNewsRecipe):
             return None
 ##        print("HANDLING AD FORWARD:")
-##        print(soup)
+#        print(soup)
         if self.keep_only_tags:
             body = Tag(soup, 'body')
             try:
@@ -799,7 +799,6 @@ class NYTimes(BasicNewsRecipe):
         return soup

     def preprocess_html(self, soup):
         #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
@@ -861,9 +860,9 @@ class NYTimes(BasicNewsRecipe):
             img = atag.find('img')
             if img is not None:
                 atag.replaceWith(img)
-            elif not atag.has_key('href'):
+            elif 'href' not in atag:
                 atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
-            elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
+            elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
                     atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
                 atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
         hdr = soup.find('address')
@@ -934,7 +933,7 @@ class NYTimes(BasicNewsRecipe):
         if self.useHighResImages:
             try:
-                #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+                # open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                 enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                 if enlargeThisList:
                     for popupref in enlargeThisList:
@@ -953,8 +952,10 @@ class NYTimes(BasicNewsRecipe):
                         year = str(st.tm_year)
                         month = "%.2d" % st.tm_mon
                         day = "%.2d" % st.tm_mday
-                        imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
-                        highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+                        imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
+                            len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
+                        highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
+                            month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                         popupSoup = BeautifulSoup(popuphtml)
                         highResTag = popupSoup.find('img', {'src':highResImageLink})
                         if highResTag:
@@ -976,7 +977,7 @@ class NYTimes(BasicNewsRecipe):
                 self.log("Error pulling high resolution images")

             try:
-                #in case pulling images failed, delete the enlarge this text
+                # in case pulling images failed, delete the enlarge this text
                 enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                 if enlargeThisList:
                     for popupref in enlargeThisList:
@@ -984,7 +985,6 @@ class NYTimes(BasicNewsRecipe):
             except:
                 self.log("Error removing Enlarge this text")

         return self.strip_anchors(soup,False)

     def postprocess_html(self,soup,first_fetch):
@@ -1058,7 +1058,7 @@ class NYTimes(BasicNewsRecipe):
         try:
             # Change <nyt_headline> to <h2>
             h1 = soup.find('h1')
-            blogheadline = str(h1) #added for dealbook
+            blogheadline = str(h1) # added for dealbook
             if h1:
                 headline = h1.find("nyt_headline")
                 if headline:
@@ -1066,11 +1066,11 @@ class NYTimes(BasicNewsRecipe):
                     tag['class'] = "headline"
                     tag.insert(0, self.fixChars(headline.contents[0]))
                     h1.replaceWith(tag)
-                elif blogheadline.find('entry-title'):#added for dealbook
-                    tag = Tag(soup, "h2")#added for dealbook
-                    tag['class'] = "headline"#added for dealbook
-                    tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
-                    h1.replaceWith(tag)#added for dealbook
+                elif blogheadline.find('entry-title'): # added for dealbook
+                    tag = Tag(soup, "h2") # added for dealbook
+                    tag['class'] = "headline" # added for dealbook
+                    tag.insert(0, self.fixChars(h1.contents[0])) # added for dealbook
+                    h1.replaceWith(tag) # added for dealbook
             else:
                 # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
@@ -1087,7 +1087,7 @@ class NYTimes(BasicNewsRecipe):
             self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

         try:
-            #if this is from a blog (dealbook, fix the byline format
+            # if this is from a blog (dealbook, fix the byline format
             bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
             if bylineauthor:
                 tag = Tag(soup, "h6")
@@ -1098,7 +1098,7 @@ class NYTimes(BasicNewsRecipe):
             self.log("ERROR: fixing byline author format")

         try:
-            #if this is a blog (dealbook) fix the credit style for the pictures
+            # if this is a blog (dealbook) fix the credit style for the pictures
             blogcredit = soup.find('div',attrs={'class':'credit'})
             if blogcredit:
                 tag = Tag(soup, "h6")
@@ -1108,7 +1108,6 @@ class NYTimes(BasicNewsRecipe):
         except:
             self.log("ERROR: fixing credit format")

         try:
             # Change <h1> to <h3> - used in editorial blogs
             masthead = soup.find("h1")
@@ -1132,7 +1131,7 @@ class NYTimes(BasicNewsRecipe):
         except:
             self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
         try:
-            #remove the <strong> update tag
+            # remove the <strong> update tag
             blogupdated = soup.find('span', {'class':'update'})
             if blogupdated:
                 blogupdated.replaceWith("")
@@ -1181,9 +1180,9 @@ class NYTimes(BasicNewsRecipe):
                     paras = articlebody.findAll('p')
                     for p in paras:
                         refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
-                        #account for blank paragraphs and short paragraphs by appending them to longer ones
+                        # account for blank paragraphs and short paragraphs by appending them to longer ones
                         if len(refparagraph) > 0:
-                            if len(refparagraph) > 70: #approximately one line of text
+                            if len(refparagraph) > 70: # approximately one line of text
                                 newpara = shortparagraph + refparagraph
                                 newparaDateline,newparaEm,newparaDesc = newpara.partition('&mdash;')
                                 if newparaEm == '':
@@ -1202,4 +1201,3 @@ class NYTimes(BasicNewsRecipe):
             self.log("Error creating article descriptions")
             return
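
The two lambda-based remove_tags entries added above rely on BeautifulSoup's callable attribute matchers. In the BeautifulSoup 3 that calibre bundled at the time, a tag's class attribute arrives as one space-separated string (or None when the tag has no class), which is why the lambdas guard with `x and` before splitting. A minimal sketch of the predicate itself, with hypothetical class strings:

    # The lambda from the recipe, called the way BeautifulSoup 3 would call
    # it: x is the raw class attribute string, or None when absent.
    matcher = lambda x: x and 'interactive' in x.split()

    print(matcher('interactive map'))      # True  -> tag gets removed
    print(matcher('interactive-graphic'))  # False -> whole-word match only
    print(matcher(None))                   # False -> unclassed tags survive

The `x.split()` whole-word test is what distinguishes this from the re.compile matchers used elsewhere in the list, which would also hit substrings such as 'interactive-graphic'.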

View File

@@ -33,7 +33,6 @@ class NYTimes(BasicNewsRecipe):
     # and 30 will get the most popular measured over 30 days.
     # you still only get up to 20 articles in each category

     # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = False
@@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe):
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 100
+    use_embedded_content = False

     # Whether to omit duplicates of articles (typically arising when articles are indexed in
     # more than one section). If True, only the first occurrence will be downloaded.
@@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe):
         (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
     ]

     if headlinesOnly:
         title='New York Times Headlines'
         description = 'Headlines from the New York Times'
@@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe):
     timefmt = ''

-    #simultaneous_downloads = 1 # no longer required to deal with ads
+    # simultaneous_downloads = 1 # no longer required to deal with ads
     cover_margins = (18,18,'grey99')

-    remove_tags_before = dict(id='article')
-    remove_tags_after = dict(id='article')
+    keep_only_tags = dict(id=['article', 'story', 'content'])
     remove_tags = [
         dict(attrs={'class':[
             'articleFooter',
@@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe):
             'entry-response module',
             'leftNavTabs',
             'metaFootnote',
+            'inside-story',
             'module box nav',
             'nextArticleLink',
             'nextArticleLink clearfix',
@@ -193,27 +192,27 @@ class NYTimes(BasicNewsRecipe):
             'singleAd',
             'postCategory column',
             'refer tagRefer', # added for bits blog post
-            'entry entry-utility', #added for DealBook
-            'entry-tags', #added for DealBook
-            'footer promos clearfix', #added for DealBook
-            'footer links clearfix', #added for DealBook
-            'tabsContainer', #added for other blog downloads
-            'column lastColumn', #added for other blog downloads
-            'pageHeaderWithLabel', #added for other gadgetwise downloads
-            'column two', #added for other blog downloads
-            'column two last', #added for other blog downloads
-            'column three', #added for other blog downloads
-            'column three last', #added for other blog downloads
-            'column four',#added for other blog downloads
-            'column four last',#added for other blog downloads
-            'column last', #added for other blog downloads
+            'entry entry-utility', # added for DealBook
+            'entry-tags', # added for DealBook
+            'footer promos clearfix', # added for DealBook
+            'footer links clearfix', # added for DealBook
+            'tabsContainer', # added for other blog downloads
+            'column lastColumn', # added for other blog downloads
+            'pageHeaderWithLabel', # added for other gadgetwise downloads
+            'column two', # added for other blog downloads
+            'column two last', # added for other blog downloads
+            'column three', # added for other blog downloads
+            'column three last', # added for other blog downloads
+            'column four', # added for other blog downloads
+            'column four last', # added for other blog downloads
+            'column last', # added for other blog downloads
             'entry entry-related',
-            'subNavigation tabContent active', #caucus blog navigation
+            'subNavigation tabContent active', # caucus blog navigation
             'mediaOverlay slideshow',
             'wideThumb',
-            'video', #added 02-11-2011
-            'videoHeader',#added 02-11-2011
-            'articleInlineVideoHolder', #added 02-11-2011
+            'video', # added 02-11-2011
+            'videoHeader', # added 02-11-2011
+            'articleInlineVideoHolder', # added 02-11-2011
             'assetCompanionAd',
             'nytint-sectionHeader',
             re.compile('^subNavigation'),
@@ -222,6 +221,8 @@ class NYTimes(BasicNewsRecipe):
             re.compile('commentCount'),
             'credit'
             ]}),
+        dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
+        dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
         dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
         dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
         dict(name='div', attrs={'class':'tweet'}),
@@ -235,6 +236,8 @@ class NYTimes(BasicNewsRecipe):
         dict(id=[
             'adxLeaderboard',
             'adxSponLink',
+            'anchoredAd_module',
+            'anchoredAd_spot',
             'archive',
             'articleExtras',
             'articleInline',
@@ -251,6 +254,7 @@ class NYTimes(BasicNewsRecipe):
             'masthead-nav',
             'memberTools',
             'navigation', 'navigation-ghost', 'navigation-modal', 'navigation-edge',
+            'page-footer',
             'portfolioInline',
             'readerReviews',
             'readerReviewsCount',
@@ -262,16 +266,18 @@ class NYTimes(BasicNewsRecipe):
             'side_index',
             'side_tool',
             'toolsRight',
-            'skybox', #added for DealBook
-            'TopAd', #added for DealBook
-            'related-content', #added for DealBook
+            'skybox', # added for DealBook
+            'TopAd', # added for DealBook
+            'related-content', # added for DealBook
             'whats-next',
         ]),
-        dict(name=['script', 'noscript', 'style','form','hr', 'button'])]
+        dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]

     no_stylesheets = True
     extra_css = '''
     .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
-    .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+    .credit { font-weight: normal; text-align: right; font-size:
+              50%; line-height:1em; margin-top:5px; margin-left:0;
+              margin-right:0; margin-bottom: 0; }
     .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
     .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
     .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
@@ -287,7 +293,6 @@ class NYTimes(BasicNewsRecipe):
     .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
     .source {text-align: left; font-size: x-small; }'''

     articles = {}
     key = None
     ans = []
@@ -309,22 +314,22 @@ class NYTimes(BasicNewsRecipe):
                 del ans[idx]
                 idx_max = idx_max-1
                 continue
-            if True: #self.verbose
-                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+            if True: # self.verbose
+                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
             for article in ans[idx][1]:
                 total_article_count += 1
-                if True: #self.verbose
+                if True: # self.verbose
                     self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                         article['url'].encode('cp1252','replace')))
             idx = idx+1
-        self.log( "Queued %d articles" % total_article_count )
+        self.log("Queued %d articles" % total_article_count)
         return ans

     def exclude_url(self,url):
         if not url.startswith("http"):
             return True
-        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook
             return True
         if 'nytimes.com' not in url:
             return True
@@ -416,7 +421,6 @@ class NYTimes(BasicNewsRecipe):
     def short_title(self):
         return self.title

     def article_to_soup(self, url_or_raw, raw=False):
         from contextlib import closing
         import copy
@@ -450,7 +454,6 @@ class NYTimes(BasicNewsRecipe):
             usrc = self.preprocess_raw_html(usrc, url_or_raw)
             return BeautifulSoup(usrc, markupMassage=nmassage)

     def massageNCXText(self, description):
         # Kindle TOC descriptions won't render certain characters
         if description:
@@ -505,7 +508,7 @@ class NYTimes(BasicNewsRecipe):
         if authorAttribution:
             author = self.tag_to_string(authorAttribution, use_alt=False)
         feed = self.key if self.key is not None else 'Uncategorized'
-        if not self.articles.has_key(feed):
+        if feed not in self.articles:
             self.ans.append(feed)
             self.articles[feed] = []
         self.articles[feed].append(
@@ -540,7 +543,6 @@ class NYTimes(BasicNewsRecipe):
                 desc = ''
             return(title,url,author,desc)

         have_emailed = False
         emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
         for h3tag in emailed_soup.findAll('h3'):
@@ -569,7 +571,7 @@ class NYTimes(BasicNewsRecipe):
                     dict(title=title, url=url, date=strftime('%a, %d %b'),
                          description=desc, author=author,
                          content=''))
-        viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+        viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
         for x in viewed_ans:
             ans.append(x)
         return ans
@@ -592,10 +594,10 @@ class NYTimes(BasicNewsRecipe):
                 tech_articles[f.title] = []
                 for a in f.articles:
                     tech_articles[f.title].append(
-                        dict(title=a.title, url=a.url, date=a.date,
+                        dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
                              description=a.summary, author=a.author,
                              content=a.content))
-        tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
+        tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
         for x in tech_ans:
             ans.append(x)
         return ans
@@ -634,10 +636,9 @@ class NYTimes(BasicNewsRecipe):
                 for lidiv in div.findAll('li'):
                     self.handle_article(lidiv)

-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

     def parse_todays_index(self):
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
@@ -667,7 +668,7 @@ class NYTimes(BasicNewsRecipe):
                 if not skipping:
                     self.handle_article(lidiv)

-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

     def parse_headline_index(self):
@@ -713,13 +714,13 @@ class NYTimes(BasicNewsRecipe):
                     description = self.tag_to_string(desc,use_alt=False)
                 else:
                     description = ''
-                if not self.articles.has_key(section_name):
+                if section_name not in self.articles:
                     self.ans.append(section_name)
                     self.articles[section_name] = []
                 print('Title '+title+' author '+author)
                 self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.ans)

     def parse_index(self):
@@ -739,7 +740,7 @@ class NYTimes(BasicNewsRecipe):
             if kill_all or (self.recursions==0):
                 a.replaceWith(self.tag_to_string(a,False))
             else:
-                if a.has_key('href'):
+                if 'href' in a:
                     if a['href'].startswith('http://www.nytimes'):
                         if not a['href'].endswith('pagewanted=all'):
                             url = re.sub(r'\?.*', '', a['href'])
@@ -747,13 +748,13 @@ class NYTimes(BasicNewsRecipe):
                             a.replaceWith(self.tag_to_string(a,False))
                         else:
                             a['href'] = url+'?pagewanted=all'
-                    elif not (a['href'].startswith('http://pogue') or \
-                        a['href'].startswith('http://bits') or \
-                        a['href'].startswith('http://travel') or \
-                        a['href'].startswith('http://business') or \
-                        a['href'].startswith('http://tech') or \
-                        a['href'].startswith('http://health') or \
-                        a['href'].startswith('http://dealbook') or \
+                    elif not (a['href'].startswith('http://pogue') or
+                        a['href'].startswith('http://bits') or
+                        a['href'].startswith('http://travel') or
+                        a['href'].startswith('http://business') or
+                        a['href'].startswith('http://tech') or
+                        a['href'].startswith('http://health') or
+                        a['href'].startswith('http://dealbook') or
                         a['href'].startswith('http://open')):
                         a.replaceWith(self.tag_to_string(a,False))
         return soup
@@ -768,7 +769,7 @@ class NYTimes(BasicNewsRecipe):
             return None
 ##        print("HANDLING AD FORWARD:")
-##        print(soup)
+#        print(soup)
         if self.keep_only_tags:
             body = Tag(soup, 'body')
             try:
@@ -806,7 +807,6 @@ class NYTimes(BasicNewsRecipe):
         return soup

     def preprocess_html(self, soup):
         #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
@@ -868,9 +868,9 @@ class NYTimes(BasicNewsRecipe):
             img = atag.find('img')
             if img is not None:
                 atag.replaceWith(img)
-            elif not atag.has_key('href'):
+            elif 'href' not in atag:
                 atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
-            elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
+            elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
                     atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
                 atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
         hdr = soup.find('address')
@@ -941,7 +941,7 @@ class NYTimes(BasicNewsRecipe):
         if self.useHighResImages:
             try:
-                #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+                # open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                 enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                 if enlargeThisList:
                     for popupref in enlargeThisList:
@@ -960,8 +960,10 @@ class NYTimes(BasicNewsRecipe):
                         year = str(st.tm_year)
                         month = "%.2d" % st.tm_mon
                         day = "%.2d" % st.tm_mday
-                        imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
-                        highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+                        imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
+                            len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
+                        highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
+                            month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                         popupSoup = BeautifulSoup(popuphtml)
                         highResTag = popupSoup.find('img', {'src':highResImageLink})
                         if highResTag:
@@ -983,7 +985,7 @@ class NYTimes(BasicNewsRecipe):
                 self.log("Error pulling high resolution images")

             try:
-                #in case pulling images failed, delete the enlarge this text
+                # in case pulling images failed, delete the enlarge this text
                 enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                 if enlargeThisList:
                     for popupref in enlargeThisList:
@@ -991,7 +993,6 @@ class NYTimes(BasicNewsRecipe):
             except:
                 self.log("Error removing Enlarge this text")

         return self.strip_anchors(soup,False)

     def postprocess_html(self,soup,first_fetch):
@@ -1065,7 +1066,7 @@ class NYTimes(BasicNewsRecipe):
         try:
             # Change <nyt_headline> to <h2>
             h1 = soup.find('h1')
-            blogheadline = str(h1) #added for dealbook
+            blogheadline = str(h1) # added for dealbook
             if h1:
                 headline = h1.find("nyt_headline")
                 if headline:
@@ -1073,11 +1074,11 @@ class NYTimes(BasicNewsRecipe):
                     tag['class'] = "headline"
                     tag.insert(0, self.fixChars(headline.contents[0]))
                     h1.replaceWith(tag)
-                elif blogheadline.find('entry-title'):#added for dealbook
-                    tag = Tag(soup, "h2")#added for dealbook
-                    tag['class'] = "headline"#added for dealbook
-                    tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
-                    h1.replaceWith(tag)#added for dealbook
+                elif blogheadline.find('entry-title'): # added for dealbook
+                    tag = Tag(soup, "h2") # added for dealbook
+                    tag['class'] = "headline" # added for dealbook
+                    tag.insert(0, self.fixChars(h1.contents[0])) # added for dealbook
+                    h1.replaceWith(tag) # added for dealbook
             else:
                 # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
@@ -1094,7 +1095,7 @@ class NYTimes(BasicNewsRecipe):
             self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

         try:
-            #if this is from a blog (dealbook, fix the byline format
+            # if this is from a blog (dealbook, fix the byline format
             bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
             if bylineauthor:
                 tag = Tag(soup, "h6")
@@ -1105,7 +1106,7 @@ class NYTimes(BasicNewsRecipe):
             self.log("ERROR: fixing byline author format")

         try:
-            #if this is a blog (dealbook) fix the credit style for the pictures
+            # if this is a blog (dealbook) fix the credit style for the pictures
             blogcredit = soup.find('div',attrs={'class':'credit'})
             if blogcredit:
                 tag = Tag(soup, "h6")
@@ -1115,7 +1116,6 @@ class NYTimes(BasicNewsRecipe):
         except:
             self.log("ERROR: fixing credit format")

         try:
             # Change <h1> to <h3> - used in editorial blogs
             masthead = soup.find("h1")
@@ -1139,7 +1139,7 @@ class NYTimes(BasicNewsRecipe):
         except:
             self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
         try:
-            #remove the <strong> update tag
+            # remove the <strong> update tag
             blogupdated = soup.find('span', {'class':'update'})
             if blogupdated:
                 blogupdated.replaceWith("")
@@ -1188,9 +1188,9 @@ class NYTimes(BasicNewsRecipe):
                     paras = articlebody.findAll('p')
                     for p in paras:
                         refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
-                        #account for blank paragraphs and short paragraphs by appending them to longer ones
+                        # account for blank paragraphs and short paragraphs by appending them to longer ones
                        if len(refparagraph) > 0:
-                            if len(refparagraph) > 70: #approximately one line of text
+                            if len(refparagraph) > 70: # approximately one line of text
                                 newpara = shortparagraph + refparagraph
                                 newparaDateline,newparaEm,newparaDesc = newpara.partition('&mdash;')
                                 if newparaEm == '':
@@ -1209,4 +1209,3 @@ class NYTimes(BasicNewsRecipe):
             self.log("Error creating article descriptions")
             return
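
Two recurring changes in this pair of recipes are small Python hygiene fixes: dict.has_key() was removed in Python 3, so membership tests become `key in dict`, and the new `a.url.partition('?')[0]` strips feed tracking parameters so the same article is not queued twice under different query strings. A quick sketch with a made-up URL:

    url = 'http://bits.blogs.nytimes.com/2014/01/13/some-post/?partner=rss&emc=rss'
    # partition splits on the first '?' only and never raises, so URLs
    # without a query string pass through unchanged.
    print(url.partition('?')[0])  # http://bits.blogs.nytimes.com/2014/01/13/some-post/

    articles = {'Tech - Pogue': []}
    # The Python 3-safe spelling of articles.has_key('Tech - Pogue'):
    print('Tech - Pogue' in articles)  # True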

View File

@@ -20,6 +20,7 @@ from calibre.constants import iswindows
 from calibre import unicode_path, as_unicode, replace_entities

 class Link(object):
     '''
     Represents a link in a HTML file.
     '''
@@ -73,6 +74,7 @@ class IgnoreFile(Exception):
         self.errno = errno

 class HTMLFile(object):
     '''
     Contains basic information about an HTML file. This
     includes a list of links to other files as well as
@@ -103,8 +105,14 @@ class HTMLFile(object):
         try:
             with open(self.path, 'rb') as f:
-                src = f.read(4096)
-                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
+                src = header = f.read(4096)
+                encoding = detect_xml_encoding(src)[1]
+                if encoding:
+                    try:
+                        header = header.decode(encoding)
+                    except ValueError:
+                        pass
+                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
                 if not self.is_binary:
                     src += f.read()
         except IOError as err:
@@ -139,7 +147,6 @@ class HTMLFile(object):
     def __repr__(self):
         return str(self)

     def find_links(self, src):
         for match in self.LINK_PAT.finditer(src):
             url = None
@@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
     log.info('Building file list...')
     filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                         verbose=opts.verbose,
-                        encoding=opts.input_encoding)\
-                        [0 if opts.breadth_first else 1]
+                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist:
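
The HTMLFile change above fixes binary detection for files whose first 4 KB are not ASCII-compatible: the HTML_PAT regex is now run against a decoded copy of the header, while src stays as raw bytes for the later full read. A standalone sketch of the same idea; detect_encoding() below is a hypothetical stand-in for calibre's detect_xml_encoding() and only recognises UTF-16 byte-order marks:

    import re

    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)

    def detect_encoding(raw):
        # Hypothetical stand-in: UTF-16 BOM sniffing only.
        if raw.startswith(b'\xff\xfe'):
            return 'utf-16-le'
        if raw.startswith(b'\xfe\xff'):
            return 'utf-16-be'
        return None

    def looks_like_html(raw):
        header = raw[:4096]
        encoding = detect_encoding(header)
        if encoding:
            try:
                # UnicodeDecodeError is a ValueError, as in the diff above
                header = header.decode(encoding)
            except ValueError:
                pass
        else:
            header = header.decode('ascii', 'replace')
        return bool(HTML_PAT.search(header))

    print(looks_like_html(b'<html><body>x</body></html>'))     # True
    print(looks_like_html('<html>x</html>'.encode('utf-16')))  # True, thanks to the decode step

Without the decode, the pattern never matches across the interleaved NUL bytes of UTF-16 text and such a file would be wrongly flagged as binary.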

View File

@@ -317,13 +317,11 @@ class FlowSplitter(object):
     def split_to_size(self, tree):
         self.log.debug('\t\tSplitting...')
         root = tree.getroot()
-        # Split large <pre> tags
-        for pre in list(XPath('//h:pre')(root)):
-            text = u''.join(pre.xpath('descendant::text()'))
-            pre.text = text
-            for child in list(pre.iterchildren()):
-                pre.remove(child)
-            if len(pre.text) > self.max_flow_size*0.5:
+        # Split large <pre> tags if they contain only text
+        for pre in XPath('//h:pre')(root):
+            if len(tuple(pre.iterchildren(etree.Element))) > 0:
+                continue
+            if pre.text and len(pre.text) > self.max_flow_size*0.5:
                 self.log.debug('\t\tSplitting large <pre> tag')
                 frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
                 new_pres = []
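
The rewritten loop no longer flattens a <pre> that contains markup; it only considers <pre> tags whose content is a bare text node. Passing etree.Element as the tag filter to iterchildren() is the lxml idiom for "element children only", ignoring comments and processing instructions. A small sketch with plain lxml (namespaces omitted, unlike the real XHTML tree; the size threshold is a stand-in for max_flow_size*0.5):

    from lxml import etree

    root = etree.fromstring(
        '<div>'
        '<pre>just a very long run of text</pre>'
        '<pre>text <b>with</b> child markup</pre>'
        '</div>')

    for pre in root.iter('pre'):
        # Element children mean inline markup we must not destroy.
        if len(tuple(pre.iterchildren(etree.Element))) > 0:
            print('skipped: has element children')
            continue
        if pre.text and len(pre.text) > 10:  # stand-in threshold
            print('candidate for splitting: ' + pre.text)

The old code joined descendant::text() and deleted the children, which destroyed any inline formatting inside the <pre>; the new guard simply leaves mixed-content <pre> tags alone.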

View File

@@ -1104,7 +1104,8 @@ class OnDeviceSearch(SearchQueryParser):  # {{{
         'format',
         'formats',
         'title',
-        'inlibrary'
+        'inlibrary',
+        'tags'
     ]

     def __init__(self, model):
@@ -1135,14 +1136,15 @@ class OnDeviceSearch(SearchQueryParser):  # {{{
         if location not in self.USABLE_LOCATIONS:
             return set([])
         matches = set([])
-        all_locs = set(self.USABLE_LOCATIONS) - set(['all'])
+        all_locs = set(self.USABLE_LOCATIONS) - set(['all', 'tags'])
         locations = all_locs if location == 'all' else [location]
         q = {
             'title' : lambda x : getattr(x, 'title').lower(),
             'author': lambda x: ' & '.join(getattr(x, 'authors')).lower(),
             'collections':lambda x: ','.join(getattr(x, 'device_collections')).lower(),
             'format':lambda x: os.path.splitext(x.path)[1].lower(),
-            'inlibrary':lambda x : getattr(x, 'in_library')
+            'inlibrary':lambda x : getattr(x, 'in_library'),
+            'tags':lambda x : getattr(x, 'tags', [])
         }
         for x in ('author', 'format'):
             q[x+'s'] = q[x]
@@ -1169,10 +1171,11 @@ class OnDeviceSearch(SearchQueryParser):  # {{{
                 else:
                     m = matchkind
-                if locvalue == 'collections':
-                    vals = accessor(row).split(',')
-                else:
-                    vals = [accessor(row)]
+                vals = accessor(row)
+                if vals is None:
+                    vals = ''
+                if isinstance(vals, basestring):
+                    vals = vals.split(',') if locvalue == 'collections' else [vals]
                 if _match(query, vals, m, use_primary_find_in_search=upf):
                     matches.add(index)
                     break
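
The accessor-table change teaches on-device search about tags, and the matching loop is generalised to cope with the three shapes an accessor can now return: a plain string (title, format), a comma-joined string (collections), or a real list (tags), with None guarded as well. A sketch of the normalisation, using str in place of the Python 2 basestring of the original:

    def normalise(vals, locvalue):
        if vals is None:
            vals = ''
        if isinstance(vals, str):  # 'basestring' in the Python 2 original
            vals = vals.split(',') if locvalue == 'collections' else [vals]
        return vals  # lists such as tags pass through untouched

    print(normalise('sf,unread', 'collections'))       # ['sf', 'unread']
    print(normalise('dune', 'title'))                  # ['dune']
    print(normalise(['fiction', 'classics'], 'tags'))  # unchanged list
    print(normalise(None, 'title'))                    # ['']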

View File

@@ -21,7 +21,7 @@ from calibre.constants import ispy3, plugins, cache_dir
 from calibre.gui2 import NONE
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
-from calibre.gui2.tweak_book.editor.insert_resource import Dialog
+from calibre.gui2.tweak_book.widgets import Dialog

 if not ispy3:
     if sys.maxunicode >= 0x10FFFF:

View File

@@ -10,11 +10,11 @@ import sys, os
 from functools import partial

 from PyQt4.Qt import (
-    QDialog, QGridLayout, QDialogButtonBox, QSize, QListView, QStyledItemDelegate,
-    QLabel, QPixmap, QApplication, QSizePolicy, QAbstractListModel, QVariant,
-    Qt, QRect, QPainter, QModelIndex, QSortFilterProxyModel, QLineEdit,
-    QToolButton, QIcon, QFormLayout, pyqtSignal, QTreeWidget, QTreeWidgetItem,
-    QVBoxLayout, QMenu, QInputDialog)
+    QGridLayout, QSize, QListView, QStyledItemDelegate, QLabel, QPixmap,
+    QApplication, QSizePolicy, QAbstractListModel, QVariant, Qt, QRect,
+    QPainter, QModelIndex, QSortFilterProxyModel, QLineEdit, QToolButton,
+    QIcon, QFormLayout, pyqtSignal, QTreeWidget, QTreeWidgetItem, QVBoxLayout,
+    QMenu, QInputDialog)

 from calibre import fit_image
 from calibre.constants import plugins
@@ -23,43 +23,11 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.gui2 import NONE, choose_files, error_dialog
 from calibre.gui2.languages import LanguagesEdit
 from calibre.gui2.tweak_book import current_container, tprefs
+from calibre.gui2.tweak_book.widgets import Dialog
 from calibre.gui2.tweak_book.file_list import name_is_ok
 from calibre.utils.localization import get_lang, canonicalize_lang
 from calibre.utils.icu import sort_key

-class Dialog(QDialog):
-
-    def __init__(self, title, name, parent=None):
-        QDialog.__init__(self, parent)
-        self.setWindowTitle(title)
-        self.name = name
-        self.bb = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
-        self.bb.accepted.connect(self.accept)
-        self.bb.rejected.connect(self.reject)
-        self.setup_ui()
-        self.resize(self.sizeHint())
-        geom = tprefs.get(name + '-geometry', None)
-        if geom is not None:
-            self.restoreGeometry(geom)
-        if hasattr(self, 'splitter'):
-            state = tprefs.get(name + '-splitter-state', None)
-            if state is not None:
-                self.splitter.restoreState(state)
-
-    def accept(self):
-        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
-        if hasattr(self, 'splitter'):
-            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
-        QDialog.accept(self)
-
-    def reject(self):
-        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
-        if hasattr(self, 'splitter'):
-            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
-        QDialog.reject(self)
-
 class ChooseName(Dialog):  # {{{
     ''' Chooses the filename for a newly imported file, with error checking '''

View File

@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from PyQt4.Qt import (QDialog, QDialogButtonBox)
+
+from calibre.gui2.tweak_book import tprefs
+
+class Dialog(QDialog):
+
+    def __init__(self, title, name, parent=None):
+        QDialog.__init__(self, parent)
+        self.setWindowTitle(title)
+        self.name = name
+        self.bb = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
+        self.bb.accepted.connect(self.accept)
+        self.bb.rejected.connect(self.reject)
+        self.setup_ui()
+        self.resize(self.sizeHint())
+        geom = tprefs.get(name + '-geometry', None)
+        if geom is not None:
+            self.restoreGeometry(geom)
+        if hasattr(self, 'splitter'):
+            state = tprefs.get(name + '-splitter-state', None)
+            if state is not None:
+                self.splitter.restoreState(state)
+
+    def accept(self):
+        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
+        if hasattr(self, 'splitter'):
+            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
+        QDialog.accept(self)
+
+    def reject(self):
+        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
+        if hasattr(self, 'splitter'):
+            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
+        QDialog.reject(self)
+
+    def setup_ui(self):
+        raise NotImplementedError('You must implement this method in Dialog subclasses')
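
The extracted Dialog base class gives every tweak-book dialog persistent geometry (and splitter state) keyed by its name in tprefs, at the cost of one abstract method. A hypothetical subclass showing the intended usage; the dialog name and labels are illustrative only:

    from PyQt4.Qt import QLabel, QVBoxLayout

    class ExampleDialog(Dialog):

        def __init__(self, parent=None):
            # The base __init__ calls setup_ui(), so build all widgets there.
            Dialog.__init__(self, 'Example dialog', 'example-dialog', parent)

        def setup_ui(self):
            self.l = QVBoxLayout(self)
            self.l.addWidget(QLabel('Dialog contents go here'))
            self.l.addWidget(self.bb)  # button box created by the base class

Because accept() and reject() both save geometry under 'example-dialog-geometry', the dialog reopens at the size and position the user last gave it.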