mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Updated NY Times
This commit is contained in:
parent
54fb874621
commit
ebda738c81
@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
|
||||
'relatedSearchesModule',
|
||||
'side_tool',
|
||||
'singleAd',
|
||||
'entry entry-utility', #added for DealBook
|
||||
'entry-tags', #added for DealBook
|
||||
'footer promos clearfix', #added for DealBook
|
||||
'footer links clearfix', #added for DealBook
|
||||
'inlineImage module', #added for DealBook
|
||||
re.compile('^subNavigation'),
|
||||
re.compile('^leaderboard'),
|
||||
re.compile('^module'),
|
||||
@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
|
||||
'side_index',
|
||||
'side_tool',
|
||||
'toolsRight',
|
||||
'skybox', #added for DealBook
|
||||
'TopAd', #added for DealBook
|
||||
'related-content', #added for DealBook
|
||||
]),
|
||||
dict(name=['script', 'noscript', 'style','form','hr'])]
|
||||
no_stylesheets = True
|
||||
@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
|
||||
def exclude_url(self,url):
|
||||
if not url.startswith("http"):
|
||||
return True
|
||||
if not url.endswith(".html"):
|
||||
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
|
||||
return True
|
||||
if 'nytimes.com' not in url:
|
||||
return True
|
||||
@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
if self.webEdition & (self.oldest_article>0):
|
||||
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
|
||||
if date_tag:
|
||||
@ -592,9 +599,12 @@ class NYTimes(BasicNewsRecipe):
|
||||
img_div = soup.find('div','inlineImage module')
|
||||
if img_div:
|
||||
img_div.extract()
|
||||
|
||||
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def postprocess_html(self,soup, True):
|
||||
|
||||
try:
|
||||
if self.one_picture_per_article:
|
||||
# Remove all images after first
|
||||
@ -650,6 +660,7 @@ class NYTimes(BasicNewsRecipe):
|
||||
try:
|
||||
# Change <nyt_headline> to <h2>
|
||||
h1 = soup.find('h1')
|
||||
blogheadline = str(h1) #added for dealbook
|
||||
if h1:
|
||||
headline = h1.find("nyt_headline")
|
||||
if headline:
|
||||
@ -657,13 +668,19 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag['class'] = "headline"
|
||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||
h1.replaceWith(tag)
|
||||
elif blogheadline.find('entry-title'):#added for dealbook
|
||||
tag = Tag(soup, "h2")#added for dealbook
|
||||
tag['class'] = "headline"#added for dealbook
|
||||
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
|
||||
h1.replaceWith(tag)#added for dealbook
|
||||
|
||||
else:
|
||||
# Blog entry - replace headline, remove <hr> tags
|
||||
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
|
||||
headline = soup.find('title')
|
||||
if headline:
|
||||
tag = Tag(soup, "h2")
|
||||
tag['class'] = "headline"
|
||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||
tag.insert(0, self.fixChars(headline.renderContents()))
|
||||
soup.insert(0, tag)
|
||||
hrs = soup.findAll('hr')
|
||||
for hr in hrs:
|
||||
@ -671,6 +688,29 @@ class NYTimes(BasicNewsRecipe):
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||
|
||||
try:
|
||||
#if this is from a blog (dealbook, fix the byline format
|
||||
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
|
||||
if bylineauthor:
|
||||
tag = Tag(soup, "h6")
|
||||
tag['class'] = "byline"
|
||||
tag.insert(0, self.fixChars(bylineauthor.renderContents()))
|
||||
bylineauthor.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: fixing byline author format")
|
||||
|
||||
try:
|
||||
#if this is a blog (dealbook) fix the credit style for the pictures
|
||||
blogcredit = soup.find('div',attrs={'class':'credit'})
|
||||
if blogcredit:
|
||||
tag = Tag(soup, "h6")
|
||||
tag['class'] = "credit"
|
||||
tag.insert(0, self.fixChars(blogcredit.renderContents()))
|
||||
blogcredit.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: fixing credit format")
|
||||
|
||||
|
||||
try:
|
||||
# Change <h1> to <h3> - used in editorial blogs
|
||||
masthead = soup.find("h1")
|
||||
@ -693,6 +733,13 @@ class NYTimes(BasicNewsRecipe):
|
||||
subhead.replaceWith(bTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
try:
|
||||
#remove the <strong> update tag
|
||||
blogupdated = soup.find('span', {'class':'update'})
|
||||
if blogupdated:
|
||||
blogupdated.replaceWith("")
|
||||
except:
|
||||
self.log("ERROR: Removing strong tag")
|
||||
|
||||
try:
|
||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||
|
Loading…
x
Reference in New Issue
Block a user