mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make postprocess_html in the NY Times recipes more robust
This commit is contained in:
parent
aa28b37951
commit
84d1dd94d2
@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
try:
|
||||||
|
if self.one_picture_per_article:
|
||||||
|
# Remove all images after first
|
||||||
|
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
||||||
|
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
|
||||||
|
if largeImg:
|
||||||
|
for inlineImg in inlineImgs:
|
||||||
|
inlineImg.extract()
|
||||||
|
else:
|
||||||
|
if inlineImgs:
|
||||||
|
firstImg = inlineImgs[0]
|
||||||
|
for inlineImg in inlineImgs[1:]:
|
||||||
|
inlineImg.extract()
|
||||||
|
# Move firstImg before article body
|
||||||
|
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
|
||||||
|
if cgFirst:
|
||||||
|
# Strip all sibling NavigableStrings: noise
|
||||||
|
navstrings = cgFirst.findAll(text=True, recursive=False)
|
||||||
|
[ns.extract() for ns in navstrings]
|
||||||
|
headline_found = False
|
||||||
|
tag = cgFirst.find(True)
|
||||||
|
insertLoc = 0
|
||||||
|
while True:
|
||||||
|
insertLoc += 1
|
||||||
|
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
|
||||||
|
headline_found = True
|
||||||
|
break
|
||||||
|
tag = tag.nextSibling
|
||||||
|
if not tag:
|
||||||
|
headline_found = False
|
||||||
|
break
|
||||||
|
if headline_found:
|
||||||
|
cgFirst.insert(insertLoc,firstImg)
|
||||||
|
else:
|
||||||
|
self.log(">>> No class:'columnGroup first' found <<<")
|
||||||
|
except:
|
||||||
|
self.log("ERROR: One picture per article in postprocess_html")
|
||||||
|
|
||||||
if self.one_picture_per_article:
|
try:
|
||||||
# Remove all images after first
|
# Change captions to italic
|
||||||
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
for caption in soup.findAll(True, {'class':'caption'}) :
|
||||||
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
|
if caption and len(caption) > 0:
|
||||||
if largeImg:
|
cTag = Tag(soup, "p", [("class", "caption")])
|
||||||
for inlineImg in inlineImgs:
|
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
||||||
inlineImg.extract()
|
mp_off = c.find("More Photos")
|
||||||
else:
|
if mp_off >= 0:
|
||||||
if inlineImgs:
|
c = c[:mp_off]
|
||||||
firstImg = inlineImgs[0]
|
cTag.insert(0, c)
|
||||||
for inlineImg in inlineImgs[1:]:
|
caption.replaceWith(cTag)
|
||||||
inlineImg.extract()
|
except:
|
||||||
# Move firstImg before article body
|
self.log("ERROR: Problem in change captions to italic")
|
||||||
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
|
|
||||||
if cgFirst:
|
|
||||||
# Strip all sibling NavigableStrings: noise
|
|
||||||
navstrings = cgFirst.findAll(text=True, recursive=False)
|
|
||||||
[ns.extract() for ns in navstrings]
|
|
||||||
headline_found = False
|
|
||||||
tag = cgFirst.find(True)
|
|
||||||
insertLoc = 0
|
|
||||||
while True:
|
|
||||||
insertLoc += 1
|
|
||||||
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
|
|
||||||
headline_found = True
|
|
||||||
break
|
|
||||||
tag = tag.nextSibling
|
|
||||||
if not tag:
|
|
||||||
headline_found = False
|
|
||||||
break
|
|
||||||
if headline_found:
|
|
||||||
cgFirst.insert(insertLoc,firstImg)
|
|
||||||
else:
|
|
||||||
self.log(">>> No class:'columnGroup first' found <<<")
|
|
||||||
|
|
||||||
# Change captions to italic
|
try:
|
||||||
for caption in soup.findAll(True, {'class':'caption'}) :
|
# Change <nyt_headline> to <h2>
|
||||||
if caption and caption.contents[0]:
|
h1 = soup.find('h1')
|
||||||
cTag = Tag(soup, "p", [("class", "caption")])
|
if h1:
|
||||||
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
headline = h1.find("nyt_headline")
|
||||||
mp_off = c.find("More Photos")
|
if headline:
|
||||||
if mp_off >= 0:
|
tag = Tag(soup, "h2")
|
||||||
c = c[:mp_off]
|
tag['class'] = "headline"
|
||||||
cTag.insert(0, c)
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
caption.replaceWith(cTag)
|
h1.replaceWith(tag)
|
||||||
|
else:
|
||||||
|
# Blog entry - replace headline, remove <hr> tags
|
||||||
|
headline = soup.find('title')
|
||||||
|
if headline:
|
||||||
|
tag = Tag(soup, "h2")
|
||||||
|
tag['class'] = "headline"
|
||||||
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
|
soup.insert(0, tag)
|
||||||
|
hrs = soup.findAll('hr')
|
||||||
|
for hr in hrs:
|
||||||
|
hr.extract()
|
||||||
|
except:
|
||||||
|
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||||
|
|
||||||
# Change <nyt_headline> to <h2>
|
try:
|
||||||
h1 = soup.find('h1')
|
# Change <h1> to <h3> - used in editorial blogs
|
||||||
if h1:
|
masthead = soup.find("h1")
|
||||||
headline = h1.find("nyt_headline")
|
if masthead:
|
||||||
if headline:
|
# Nuke the href
|
||||||
tag = Tag(soup, "h2")
|
if masthead.a:
|
||||||
tag['class'] = "headline"
|
del(masthead.a['href'])
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
tag = Tag(soup, "h3")
|
||||||
h1.replaceWith(tag)
|
tag.insert(0, self.fixChars(masthead.contents[0]))
|
||||||
else:
|
masthead.replaceWith(tag)
|
||||||
# Blog entry - replace headline, remove <hr> tags
|
except:
|
||||||
headline = soup.find('title')
|
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||||
if headline:
|
|
||||||
tag = Tag(soup, "h2")
|
|
||||||
tag['class'] = "headline"
|
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
|
||||||
soup.insert(0, tag)
|
|
||||||
hrs = soup.findAll('hr')
|
|
||||||
for hr in hrs:
|
|
||||||
hr.extract()
|
|
||||||
|
|
||||||
# Change <h1> to <h3> - used in editorial blogs
|
try:
|
||||||
masthead = soup.find("h1")
|
# Change <span class="bold"> to <b>
|
||||||
if masthead:
|
for subhead in soup.findAll(True, {'class':'bold'}) :
|
||||||
# Nuke the href
|
if subhead.contents:
|
||||||
if masthead.a:
|
bTag = Tag(soup, "b")
|
||||||
del(masthead.a['href'])
|
bTag.insert(0, subhead.contents[0])
|
||||||
tag = Tag(soup, "h3")
|
subhead.replaceWith(bTag)
|
||||||
tag.insert(0, self.fixChars(masthead.contents[0]))
|
except:
|
||||||
masthead.replaceWith(tag)
|
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||||
|
|
||||||
# Change <span class="bold"> to <b>
|
try:
|
||||||
for subhead in soup.findAll(True, {'class':'bold'}) :
|
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||||
if subhead.contents:
|
if divTag:
|
||||||
bTag = Tag(soup, "b")
|
divTag['class'] = divTag['id']
|
||||||
bTag.insert(0, subhead.contents[0])
|
except:
|
||||||
subhead.replaceWith(bTag)
|
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
|
||||||
|
|
||||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
try:
|
||||||
if divTag:
|
# Add class="authorId" to <div> so we can format with CSS
|
||||||
divTag['class'] = divTag['id']
|
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||||
|
if divTag and divTag.contents[0]:
|
||||||
|
tag = Tag(soup, "p")
|
||||||
|
tag['class'] = "authorId"
|
||||||
|
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
||||||
|
use_alt=False)))
|
||||||
|
divTag.replaceWith(tag)
|
||||||
|
except:
|
||||||
|
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
||||||
|
|
||||||
# Add class="authorId" to <div> so we can format with CSS
|
return soup
|
||||||
divTag = soup.find('div',attrs={'id':'authorId'})
|
|
||||||
if divTag and divTag.contents[0]:
|
|
||||||
tag = Tag(soup, "p")
|
|
||||||
tag['class'] = "authorId"
|
|
||||||
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
|
||||||
use_alt=False)))
|
|
||||||
divTag.replaceWith(tag)
|
|
||||||
|
|
||||||
return soup
|
|
||||||
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
def populate_article_metadata(self, article, soup, first):
|
||||||
shortparagraph = ""
|
shortparagraph = ""
|
||||||
|
@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
try:
|
||||||
|
if self.one_picture_per_article:
|
||||||
|
# Remove all images after first
|
||||||
|
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
||||||
|
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
|
||||||
|
if largeImg:
|
||||||
|
for inlineImg in inlineImgs:
|
||||||
|
inlineImg.extract()
|
||||||
|
else:
|
||||||
|
if inlineImgs:
|
||||||
|
firstImg = inlineImgs[0]
|
||||||
|
for inlineImg in inlineImgs[1:]:
|
||||||
|
inlineImg.extract()
|
||||||
|
# Move firstImg before article body
|
||||||
|
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
|
||||||
|
if cgFirst:
|
||||||
|
# Strip all sibling NavigableStrings: noise
|
||||||
|
navstrings = cgFirst.findAll(text=True, recursive=False)
|
||||||
|
[ns.extract() for ns in navstrings]
|
||||||
|
headline_found = False
|
||||||
|
tag = cgFirst.find(True)
|
||||||
|
insertLoc = 0
|
||||||
|
while True:
|
||||||
|
insertLoc += 1
|
||||||
|
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
|
||||||
|
headline_found = True
|
||||||
|
break
|
||||||
|
tag = tag.nextSibling
|
||||||
|
if not tag:
|
||||||
|
headline_found = False
|
||||||
|
break
|
||||||
|
if headline_found:
|
||||||
|
cgFirst.insert(insertLoc,firstImg)
|
||||||
|
else:
|
||||||
|
self.log(">>> No class:'columnGroup first' found <<<")
|
||||||
|
except:
|
||||||
|
self.log("ERROR: One picture per article in postprocess_html")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Change captions to italic
|
||||||
|
for caption in soup.findAll(True, {'class':'caption'}) :
|
||||||
|
if caption and len(caption) > 0:
|
||||||
|
cTag = Tag(soup, "p", [("class", "caption")])
|
||||||
|
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
||||||
|
mp_off = c.find("More Photos")
|
||||||
|
if mp_off >= 0:
|
||||||
|
c = c[:mp_off]
|
||||||
|
cTag.insert(0, c)
|
||||||
|
caption.replaceWith(cTag)
|
||||||
|
except:
|
||||||
|
self.log("ERROR: Problem in change captions to italic")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Change <nyt_headline> to <h2>
|
||||||
|
h1 = soup.find('h1')
|
||||||
|
if h1:
|
||||||
|
headline = h1.find("nyt_headline")
|
||||||
|
if headline:
|
||||||
|
tag = Tag(soup, "h2")
|
||||||
|
tag['class'] = "headline"
|
||||||
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
|
h1.replaceWith(tag)
|
||||||
|
else:
|
||||||
|
# Blog entry - replace headline, remove <hr> tags
|
||||||
|
headline = soup.find('title')
|
||||||
|
if headline:
|
||||||
|
tag = Tag(soup, "h2")
|
||||||
|
tag['class'] = "headline"
|
||||||
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
|
soup.insert(0, tag)
|
||||||
|
hrs = soup.findAll('hr')
|
||||||
|
for hr in hrs:
|
||||||
|
hr.extract()
|
||||||
|
except:
|
||||||
|
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||||
|
|
||||||
if self.one_picture_per_article:
|
try:
|
||||||
# Remove all images after first
|
# Change <h1> to <h3> - used in editorial blogs
|
||||||
largeImg = soup.find(True, {'class':'articleSpanImage'})
|
masthead = soup.find("h1")
|
||||||
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
|
if masthead:
|
||||||
if largeImg:
|
# Nuke the href
|
||||||
for inlineImg in inlineImgs:
|
if masthead.a:
|
||||||
inlineImg.extract()
|
del(masthead.a['href'])
|
||||||
else:
|
tag = Tag(soup, "h3")
|
||||||
if inlineImgs:
|
tag.insert(0, self.fixChars(masthead.contents[0]))
|
||||||
firstImg = inlineImgs[0]
|
masthead.replaceWith(tag)
|
||||||
for inlineImg in inlineImgs[1:]:
|
except:
|
||||||
inlineImg.extract()
|
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||||
# Move firstImg before article body
|
|
||||||
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
|
|
||||||
if cgFirst:
|
|
||||||
# Strip all sibling NavigableStrings: noise
|
|
||||||
navstrings = cgFirst.findAll(text=True, recursive=False)
|
|
||||||
[ns.extract() for ns in navstrings]
|
|
||||||
headline_found = False
|
|
||||||
tag = cgFirst.find(True)
|
|
||||||
insertLoc = 0
|
|
||||||
while True:
|
|
||||||
insertLoc += 1
|
|
||||||
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
|
|
||||||
headline_found = True
|
|
||||||
break
|
|
||||||
tag = tag.nextSibling
|
|
||||||
if not tag:
|
|
||||||
headline_found = False
|
|
||||||
break
|
|
||||||
if headline_found:
|
|
||||||
cgFirst.insert(insertLoc,firstImg)
|
|
||||||
else:
|
|
||||||
self.log(">>> No class:'columnGroup first' found <<<")
|
|
||||||
|
|
||||||
# Change captions to italic
|
try:
|
||||||
for caption in soup.findAll(True, {'class':'caption'}) :
|
# Change <span class="bold"> to <b>
|
||||||
if caption and caption.contents[0]:
|
for subhead in soup.findAll(True, {'class':'bold'}) :
|
||||||
cTag = Tag(soup, "p", [("class", "caption")])
|
if subhead.contents:
|
||||||
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
|
bTag = Tag(soup, "b")
|
||||||
mp_off = c.find("More Photos")
|
bTag.insert(0, subhead.contents[0])
|
||||||
if mp_off >= 0:
|
subhead.replaceWith(bTag)
|
||||||
c = c[:mp_off]
|
except:
|
||||||
cTag.insert(0, c)
|
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||||
caption.replaceWith(cTag)
|
|
||||||
|
try:
|
||||||
# Change <nyt_headline> to <h2>
|
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||||
h1 = soup.find('h1')
|
if divTag:
|
||||||
if h1:
|
divTag['class'] = divTag['id']
|
||||||
headline = h1.find("nyt_headline")
|
except:
|
||||||
if headline:
|
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
|
||||||
tag = Tag(soup, "h2")
|
|
||||||
tag['class'] = "headline"
|
try:
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
# Add class="authorId" to <div> so we can format with CSS
|
||||||
h1.replaceWith(tag)
|
divTag = soup.find('div',attrs={'id':'authorId'})
|
||||||
else:
|
if divTag and divTag.contents[0]:
|
||||||
# Blog entry - replace headline, remove <hr> tags
|
tag = Tag(soup, "p")
|
||||||
headline = soup.find('title')
|
tag['class'] = "authorId"
|
||||||
if headline:
|
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
||||||
tag = Tag(soup, "h2")
|
use_alt=False)))
|
||||||
tag['class'] = "headline"
|
divTag.replaceWith(tag)
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
except:
|
||||||
soup.insert(0, tag)
|
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
||||||
hrs = soup.findAll('hr')
|
|
||||||
for hr in hrs:
|
return soup
|
||||||
hr.extract()
|
|
||||||
|
|
||||||
# Change <h1> to <h3> - used in editorial blogs
|
|
||||||
masthead = soup.find("h1")
|
|
||||||
if masthead:
|
|
||||||
# Nuke the href
|
|
||||||
if masthead.a:
|
|
||||||
del(masthead.a['href'])
|
|
||||||
tag = Tag(soup, "h3")
|
|
||||||
tag.insert(0, self.fixChars(masthead.contents[0]))
|
|
||||||
masthead.replaceWith(tag)
|
|
||||||
|
|
||||||
# Change <span class="bold"> to <b>
|
|
||||||
for subhead in soup.findAll(True, {'class':'bold'}) :
|
|
||||||
if subhead.contents:
|
|
||||||
bTag = Tag(soup, "b")
|
|
||||||
bTag.insert(0, subhead.contents[0])
|
|
||||||
subhead.replaceWith(bTag)
|
|
||||||
|
|
||||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
|
||||||
if divTag:
|
|
||||||
divTag['class'] = divTag['id']
|
|
||||||
|
|
||||||
# Add class="authorId" to <div> so we can format with CSS
|
|
||||||
divTag = soup.find('div',attrs={'id':'authorId'})
|
|
||||||
if divTag and divTag.contents[0]:
|
|
||||||
tag = Tag(soup, "p")
|
|
||||||
tag['class'] = "authorId"
|
|
||||||
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
|
|
||||||
use_alt=False)))
|
|
||||||
divTag.replaceWith(tag)
|
|
||||||
|
|
||||||
return soup
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
def populate_article_metadata(self, article, soup, first):
|
||||||
shortparagraph = ""
|
shortparagraph = ""
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user