mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make postprocess_html in the NY Times recipes more robust
This commit is contained in:
parent
aa28b37951
commit
84d1dd94d2
@ -586,7 +586,7 @@ class NYTimes(BasicNewsRecipe):
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def postprocess_html(self,soup, True):
|
||||
|
||||
try:
|
||||
if self.one_picture_per_article:
|
||||
# Remove all images after first
|
||||
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
||||
@ -621,10 +621,13 @@ class NYTimes(BasicNewsRecipe):
|
||||
cgFirst.insert(insertLoc,firstImg)
|
||||
else:
|
||||
self.log(">>> No class:'columnGroup first' found <<<")
|
||||
except:
|
||||
self.log("ERROR: One picture per article in postprocess_html")
|
||||
|
||||
try:
|
||||
# Change captions to italic
|
||||
for caption in soup.findAll(True, {'class':'caption'}) :
|
||||
if caption and caption.contents[0]:
|
||||
if caption and len(caption) > 0:
|
||||
cTag = Tag(soup, "p", [("class", "caption")])
|
||||
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
||||
mp_off = c.find("More Photos")
|
||||
@ -632,7 +635,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
c = c[:mp_off]
|
||||
cTag.insert(0, c)
|
||||
caption.replaceWith(cTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in change captions to italic")
|
||||
|
||||
try:
|
||||
# Change <nyt_headline> to <h2>
|
||||
h1 = soup.find('h1')
|
||||
if h1:
|
||||
@ -653,7 +659,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
hrs = soup.findAll('hr')
|
||||
for hr in hrs:
|
||||
hr.extract()
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||
|
||||
try:
|
||||
# Change <h1> to <h3> - used in editorial blogs
|
||||
masthead = soup.find("h1")
|
||||
if masthead:
|
||||
@ -663,18 +672,27 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag = Tag(soup, "h3")
|
||||
tag.insert(0, self.fixChars(masthead.contents[0]))
|
||||
masthead.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
|
||||
try:
|
||||
# Change <span class="bold"> to <b>
|
||||
for subhead in soup.findAll(True, {'class':'bold'}) :
|
||||
if subhead.contents:
|
||||
bTag = Tag(soup, "b")
|
||||
bTag.insert(0, subhead.contents[0])
|
||||
subhead.replaceWith(bTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
|
||||
try:
|
||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||
if divTag:
|
||||
divTag['class'] = divTag['id']
|
||||
except:
|
||||
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
|
||||
|
||||
try:
|
||||
# Add class="authorId" to <div> so we can format with CSS
|
||||
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||
if divTag and divTag.contents[0]:
|
||||
@ -683,6 +701,8 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
||||
use_alt=False)))
|
||||
divTag.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
||||
|
||||
return soup
|
||||
|
||||
|
@ -586,7 +586,7 @@ class NYTimes(BasicNewsRecipe):
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def postprocess_html(self,soup, True):
|
||||
|
||||
try:
|
||||
if self.one_picture_per_article:
|
||||
# Remove all images after first
|
||||
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
||||
@ -621,10 +621,13 @@ class NYTimes(BasicNewsRecipe):
|
||||
cgFirst.insert(insertLoc,firstImg)
|
||||
else:
|
||||
self.log(">>> No class:'columnGroup first' found <<<")
|
||||
except:
|
||||
self.log("ERROR: One picture per article in postprocess_html")
|
||||
|
||||
try:
|
||||
# Change captions to italic
|
||||
for caption in soup.findAll(True, {'class':'caption'}) :
|
||||
if caption and caption.contents[0]:
|
||||
if caption and len(caption) > 0:
|
||||
cTag = Tag(soup, "p", [("class", "caption")])
|
||||
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
||||
mp_off = c.find("More Photos")
|
||||
@ -632,7 +635,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
c = c[:mp_off]
|
||||
cTag.insert(0, c)
|
||||
caption.replaceWith(cTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in change captions to italic")
|
||||
|
||||
try:
|
||||
# Change <nyt_headline> to <h2>
|
||||
h1 = soup.find('h1')
|
||||
if h1:
|
||||
@ -653,7 +659,10 @@ class NYTimes(BasicNewsRecipe):
|
||||
hrs = soup.findAll('hr')
|
||||
for hr in hrs:
|
||||
hr.extract()
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||
|
||||
try:
|
||||
# Change <h1> to <h3> - used in editorial blogs
|
||||
masthead = soup.find("h1")
|
||||
if masthead:
|
||||
@ -663,18 +672,27 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag = Tag(soup, "h3")
|
||||
tag.insert(0, self.fixChars(masthead.contents[0]))
|
||||
masthead.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
|
||||
try:
|
||||
# Change <span class="bold"> to <b>
|
||||
for subhead in soup.findAll(True, {'class':'bold'}) :
|
||||
if subhead.contents:
|
||||
bTag = Tag(soup, "b")
|
||||
bTag.insert(0, subhead.contents[0])
|
||||
subhead.replaceWith(bTag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||
|
||||
try:
|
||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||
if divTag:
|
||||
divTag['class'] = divTag['id']
|
||||
except:
|
||||
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
|
||||
|
||||
try:
|
||||
# Add class="authorId" to <div> so we can format with CSS
|
||||
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||
if divTag and divTag.contents[0]:
|
||||
@ -683,6 +701,8 @@ class NYTimes(BasicNewsRecipe):
|
||||
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
||||
use_alt=False)))
|
||||
divTag.replaceWith(tag)
|
||||
except:
|
||||
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
||||
|
||||
return soup
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
|
Loading…
x
Reference in New Issue
Block a user