Update New York Times
This commit is contained in:
parent 0e23b98274
commit 28a126709d
@@ -153,7 +153,7 @@ class NYTimes(BasicNewsRecipe):
 
     timefmt = ''
 
-    simultaneous_downloads = 1
+    #simultaneous_downloads = 1 # no longer required to deal with ads
 
     cover_margins = (18,18,'grey99')
 
@@ -204,7 +204,8 @@ class NYTimes(BasicNewsRecipe):
             re.compile('^subNavigation'),
             re.compile('^leaderboard'),
             re.compile('^module'),
-            re.compile('commentCount')
+            re.compile('commentCount'),
+            'credit'
             ]}),
        dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
        dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
@@ -291,11 +292,11 @@ class NYTimes(BasicNewsRecipe):
                 del ans[idx]
                 idx_max = idx_max-1
                 continue
-            if self.verbose:
+            if True: #self.verbose
                 self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
             for article in ans[idx][1]:
                 total_article_count += 1
-                if self.verbose:
+                if True: #self.verbose
                     self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                         article['url'].encode('cp1252','replace')))
             idx = idx+1
@@ -351,23 +352,8 @@ class NYTimes(BasicNewsRecipe):
         br = BasicNewsRecipe.get_browser()
         return br
-
-##    This doesn't work (and probably never did). It either gets another serve of the advertisement,
-##    or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
-##
-##    def skip_ad_pages(self, soup):
-##        # Skip ad pages served before actual article
-##        skip_tag = soup.find(True, {'name':'skip'})
-##        if skip_tag is not None:
-##            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
-##            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-##            url += '?pagewanted=all'
-##            self.log.warn("Skipping ad to article at '%s'" % url)
-##            return self.index_to_soup(url, raw=True)
-
-
     cover_tag = 'NY_NYT'
     def get_cover_url(self):
         from datetime import timedelta, date
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
         br = BasicNewsRecipe.get_browser()
         daysback=1
@@ -745,11 +731,12 @@ class NYTimes(BasicNewsRecipe):
 
 
     def preprocess_html(self, soup):
-        print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
+        #print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
-            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-            url += '?pagewanted=all'
+            #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + skip_tag.parent['href']
+            #url += '?pagewanted=all'
             self.log.warn("Skipping ad to article at '%s'" % url)
             sleep(5)
             soup = self.handle_tags(self.article_to_soup(url))
@@ -1005,7 +992,7 @@ class NYTimes(BasicNewsRecipe):
             if headline:
                 tag = Tag(soup, "h2")
                 tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.renderContents()))
+                tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
                 soup.insert(0, tag)
                 hrs = soup.findAll('hr')
                 for hr in hrs:
@@ -1019,7 +1006,7 @@ class NYTimes(BasicNewsRecipe):
                 if bylineauthor:
                     tag = Tag(soup, "h6")
                     tag['class'] = "byline"
-                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+                    tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
                     bylineauthor.replaceWith(tag)
             except:
                 self.log("ERROR: fixing byline author format")
@@ -1030,7 +1017,7 @@ class NYTimes(BasicNewsRecipe):
                 if blogcredit:
                     tag = Tag(soup, "h6")
                     tag['class'] = "credit"
-                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
+                    tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
                     blogcredit.replaceWith(tag)
             except:
                 self.log("ERROR: fixing credit format")
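The get_cover_url hunk above builds the day's front-page image URL from the Newseum media service, and the daysback=1 context line suggests a fallback to the previous day's image when today's is not yet posted. Below is a minimal standalone sketch of that pattern; the function name, the urllib probe, and the retry loop are illustrative assumptions (the recipe itself uses BasicNewsRecipe.get_browser()), and only the URL shape and cover_tag come from the diff. The hunks then repeat below for what appears to be a second, near-identical NYTimes recipe touched by the same commit, which additionally changes oldest_web_article at @@ -32.

from datetime import date, timedelta
import urllib.request

def newseum_cover_url(cover_tag='NY_NYT', max_days_back=1):
    # Sketch only: try today's front-page image first, then fall back
    # one day at a time (mirrors the recipe's daysback counter).
    day = date.today()
    for _ in range(max_days_back + 1):
        url = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
               + str(day.day) + '/lg/' + cover_tag + '.jpg')
        try:
            urllib.request.urlopen(url)  # raises on HTTP errors such as 404
            return url
        except Exception:
            day -= timedelta(days=1)
    return None  # no cover found; caller can fall back to a default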
@@ -32,7 +32,7 @@ class NYTimes(BasicNewsRecipe):
     # number of days old an article can be for inclusion. If oldest_web_article = None all articles
     # will be included. Note: oldest_web_article is ignored if webEdition = False
     webEdition = False
-    oldest_web_article = 7
+    oldest_web_article = None
 
     # download higher resolution images than the small thumbnails typically included in the article
     # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
@@ -153,7 +153,7 @@ class NYTimes(BasicNewsRecipe):
 
     timefmt = ''
 
-    simultaneous_downloads = 1
+    #simultaneous_downloads = 1 # no longer required to deal with ads
 
     cover_margins = (18,18,'grey99')
 
@@ -204,7 +204,8 @@ class NYTimes(BasicNewsRecipe):
             re.compile('^subNavigation'),
             re.compile('^leaderboard'),
             re.compile('^module'),
-            re.compile('commentCount')
+            re.compile('commentCount'),
+            'credit'
             ]}),
        dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
        dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
@@ -291,11 +292,11 @@ class NYTimes(BasicNewsRecipe):
                 del ans[idx]
                 idx_max = idx_max-1
                 continue
-            if self.verbose:
+            if True: #self.verbose
                 self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
             for article in ans[idx][1]:
                 total_article_count += 1
-                if self.verbose:
+                if True: #self.verbose
                     self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                         article['url'].encode('cp1252','replace')))
             idx = idx+1
@@ -351,23 +352,8 @@ class NYTimes(BasicNewsRecipe):
         br = BasicNewsRecipe.get_browser()
         return br
-
-##    This doesn't work (and probably never did). It either gets another serve of the advertisement,
-##    or if it gets the article then get_soup (from which it is invoked) traps trying to do xml decoding.
-##
-##    def skip_ad_pages(self, soup):
-##        # Skip ad pages served before actual article
-##        skip_tag = soup.find(True, {'name':'skip'})
-##        if skip_tag is not None:
-##            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
-##            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-##            url += '?pagewanted=all'
-##            self.log.warn("Skipping ad to article at '%s'" % url)
-##            return self.index_to_soup(url, raw=True)
-
-
     cover_tag = 'NY_NYT'
     def get_cover_url(self):
         from datetime import timedelta, date
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
         br = BasicNewsRecipe.get_browser()
         daysback=1
@@ -745,11 +731,12 @@ class NYTimes(BasicNewsRecipe):
 
 
     def preprocess_html(self, soup):
-        print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
+        #print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
-            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-            url += '?pagewanted=all'
+            #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + skip_tag.parent['href']
+            #url += '?pagewanted=all'
             self.log.warn("Skipping ad to article at '%s'" % url)
             sleep(5)
             soup = self.handle_tags(self.article_to_soup(url))
@@ -1005,7 +992,7 @@ class NYTimes(BasicNewsRecipe):
             if headline:
                 tag = Tag(soup, "h2")
                 tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.renderContents()))
+                tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
                 soup.insert(0, tag)
                 hrs = soup.findAll('hr')
                 for hr in hrs:
@@ -1019,7 +1006,7 @@ class NYTimes(BasicNewsRecipe):
                 if bylineauthor:
                     tag = Tag(soup, "h6")
                     tag['class'] = "byline"
-                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+                    tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
                     bylineauthor.replaceWith(tag)
             except:
                 self.log("ERROR: fixing byline author format")
@@ -1030,7 +1017,7 @@ class NYTimes(BasicNewsRecipe):
                 if blogcredit:
                     tag = Tag(soup, "h6")
                     tag['class'] = "credit"
-                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
+                    tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
                     blogcredit.replaceWith(tag)
             except:
                 self.log("ERROR: fixing credit format")
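Taken together, the preprocess_html hunks replace the old approach of stripping the forwarding link's query string (and appending ?pagewanted=all) with following the interstitial's link verbatim. A self-contained sketch of just that URL extraction, using simplified sample markup as an assumption (in the recipe, the real ad page is fetched live and re-parsed via article_to_soup):

from bs4 import BeautifulSoup

def forwarding_url(soup):
    # The NYT ad interstitial carries a tag with name="skip" whose
    # parent anchor points at the real article.
    skip_tag = soup.find(True, {'name': 'skip'})
    if skip_tag is not None:
        return 'http://www.nytimes.com' + skip_tag.parent['href']
    return None  # not an ad page; use the soup as-is

# Illustrative markup only; the live interstitial is far larger.
html = '<a href="/2011/05/01/world/example.html"><meta name="skip"/></a>'
print(forwarding_url(BeautifulSoup(html, 'html.parser')))
# -> http://www.nytimes.com/2011/05/01/world/example.html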