mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Handle direct querying of class attribute across BS versions
This commit is contained in:
parent
3045dc3c71
commit
8813a31a38
@ -198,12 +198,12 @@ A reasonably complex real life example that exposes more of the :term:`API` of `
|
|||||||
for div in soup.findAll(True,
|
for div in soup.findAll(True,
|
||||||
attrs={'class':['section-headline', 'story', 'story headline']}):
|
attrs={'class':['section-headline', 'story', 'story headline']}):
|
||||||
|
|
||||||
if div['class'] == 'section-headline':
|
if ''.join(div['class']) == 'section-headline':
|
||||||
key = string.capwords(feed_title(div))
|
key = string.capwords(feed_title(div))
|
||||||
articles[key] = []
|
articles[key] = []
|
||||||
ans.append(key)
|
ans.append(key)
|
||||||
|
|
||||||
elif div['class'] in ['story', 'story headline']:
|
elif ''.join(div['class']) in ['story', 'story headline']:
|
||||||
a = div.find('a', href=True)
|
a = div.find('a', href=True)
|
||||||
if not a:
|
if not a:
|
||||||
continue
|
continue
|
||||||
|
@ -44,7 +44,7 @@ class E1843(BasicNewsRecipe):
|
|||||||
current_section = articles = None
|
current_section = articles = None
|
||||||
|
|
||||||
for div in soup.findAll(**classes('field-name-field-header node-article')):
|
for div in soup.findAll(**classes('field-name-field-header node-article')):
|
||||||
if 'field-header' in div['class']:
|
if 'field-header' in ''.join(div['class']):
|
||||||
if current_section and articles:
|
if current_section and articles:
|
||||||
ans.append((current_section, articles))
|
ans.append((current_section, articles))
|
||||||
current_section = self.tag_to_string(div)
|
current_section = self.tag_to_string(div)
|
||||||
|
@ -41,7 +41,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
|
|||||||
'description': article_desc,
|
'description': article_desc,
|
||||||
'url': article_url})
|
'url': article_url})
|
||||||
# Avoid including the multimedia stuff.
|
# Avoid including the multimedia stuff.
|
||||||
if entry['class'].find('last') != -1:
|
if ''.join(entry['class']).find('last') != -1:
|
||||||
break
|
break
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
@ -86,7 +86,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
|
|||||||
'description': article_desc,
|
'description': article_desc,
|
||||||
'url': article_url})
|
'url': article_url})
|
||||||
# Avoid including the multimedia stuff.
|
# Avoid including the multimedia stuff.
|
||||||
if entry['class'].find('last') != -1:
|
if ''.join(entry['class']).find('last') != -1:
|
||||||
break
|
break
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
@ -128,13 +128,16 @@ class TheAtlantic(BasicNewsRecipe):
|
|||||||
feeds = []
|
feeds = []
|
||||||
for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
|
for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
|
||||||
for h2 in div.findAll('h2', attrs={'class': True}):
|
for h2 in div.findAll('h2', attrs={'class': True}):
|
||||||
if 'section-name' in h2['class'].split():
|
cls = h2['class']
|
||||||
|
if hasattr(cls, 'split'):
|
||||||
|
cls = cls.split()
|
||||||
|
if 'section-name' in cls:
|
||||||
if current_articles:
|
if current_articles:
|
||||||
feeds.append((current_section, current_articles))
|
feeds.append((current_section, current_articles))
|
||||||
current_articles = []
|
current_articles = []
|
||||||
current_section = self.tag_to_string(h2)
|
current_section = self.tag_to_string(h2)
|
||||||
self.log('\nFound section:', current_section)
|
self.log('\nFound section:', current_section)
|
||||||
elif 'hed' in h2['class'].split():
|
elif 'hed' in cls:
|
||||||
title = self.tag_to_string(h2)
|
title = self.tag_to_string(h2)
|
||||||
a = h2.findParent('a', href=True)
|
a = h2.findParent('a', href=True)
|
||||||
url = a['href']
|
url = a['href']
|
||||||
|
@ -128,13 +128,16 @@ class TheAtlantic(BasicNewsRecipe):
|
|||||||
feeds = []
|
feeds = []
|
||||||
for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
|
for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
|
||||||
for h2 in div.findAll('h2', attrs={'class': True}):
|
for h2 in div.findAll('h2', attrs={'class': True}):
|
||||||
if 'section-name' in h2['class'].split():
|
cls = h2['class']
|
||||||
|
if hasattr(cls, 'split'):
|
||||||
|
cls = cls.split()
|
||||||
|
if 'section-name' in cls:
|
||||||
if current_articles:
|
if current_articles:
|
||||||
feeds.append((current_section, current_articles))
|
feeds.append((current_section, current_articles))
|
||||||
current_articles = []
|
current_articles = []
|
||||||
current_section = self.tag_to_string(h2)
|
current_section = self.tag_to_string(h2)
|
||||||
self.log('\nFound section:', current_section)
|
self.log('\nFound section:', current_section)
|
||||||
elif 'hed' in h2['class'].split():
|
elif 'hed' in cls:
|
||||||
title = self.tag_to_string(h2)
|
title = self.tag_to_string(h2)
|
||||||
a = h2.findParent('a', href=True)
|
a = h2.findParent('a', href=True)
|
||||||
url = a['href']
|
url = a['href']
|
||||||
|
@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
stag = dtag.span
|
stag = dtag.span
|
||||||
if stag is not None:
|
if stag is not None:
|
||||||
if stag['class'] != 'timestamp':
|
if ''.join(stag['class']) != 'timestamp':
|
||||||
description = self.tag_to_string(stag, False)
|
description = self.tag_to_string(stag, False)
|
||||||
else:
|
else:
|
||||||
description = self.tag_to_string(dtag, False)
|
description = self.tag_to_string(dtag, False)
|
||||||
|
@ -102,13 +102,13 @@ class CIO_Magazine(BasicNewsRecipe):
|
|||||||
for div in soup.findAll(True,
|
for div in soup.findAll(True,
|
||||||
attrs={'class': ['heading', 'issue_item']}):
|
attrs={'class': ['heading', 'issue_item']}):
|
||||||
|
|
||||||
if div['class'] == 'heading':
|
if ''.join(div['class']) == 'heading':
|
||||||
key = string.capwords(self.tag_to_string(div.span))
|
key = string.capwords(self.tag_to_string(div.span))
|
||||||
print("Key: ", key) # Esto es para depurar
|
print("Key: ", key) # Esto es para depurar
|
||||||
articles[key] = []
|
articles[key] = []
|
||||||
feeds.append(key)
|
feeds.append(key)
|
||||||
|
|
||||||
elif div['class'] == 'issue_item':
|
elif ''.join(div['class']) == 'issue_item':
|
||||||
a = div.find('a', href=True)
|
a = div.find('a', href=True)
|
||||||
if not a:
|
if not a:
|
||||||
continue
|
continue
|
||||||
|
@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
stag = dtag.span
|
stag = dtag.span
|
||||||
if stag is not None:
|
if stag is not None:
|
||||||
if stag['class'] != 'timestamp':
|
if ''.join(stag['class']) != 'timestamp':
|
||||||
description = self.tag_to_string(stag, False)
|
description = self.tag_to_string(stag, False)
|
||||||
else:
|
else:
|
||||||
description = self.tag_to_string(dtag, False)
|
description = self.tag_to_string(dtag, False)
|
||||||
|
@ -79,7 +79,7 @@ class Esensja(BasicNewsRecipe):
|
|||||||
section += ' - ' + subchapter
|
section += ' - ' + subchapter
|
||||||
feeds.append((section, articles))
|
feeds.append((section, articles))
|
||||||
articles = []
|
articles = []
|
||||||
if tag['class'] == 'chapter':
|
if ''.join(tag['class']) == 'chapter':
|
||||||
chapter = self.tag_to_string(tag).capitalize()
|
chapter = self.tag_to_string(tag).capitalize()
|
||||||
subchapter = ''
|
subchapter = ''
|
||||||
else:
|
else:
|
||||||
|
@ -31,7 +31,7 @@ class KopalniaWiedzy(BasicNewsRecipe):
|
|||||||
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
|
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
|
||||||
lambda match: '<img class="topimage" ' + match.group(1) + '>'),
|
lambda match: '<img class="topimage" ' + match.group(1) + '>'),
|
||||||
(re.compile(u'<br /><br />'),
|
(re.compile(u'<br /><br />'),
|
||||||
lambda match: '<br\/>')
|
lambda match: '<br/>')
|
||||||
]
|
]
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
@ -44,7 +44,7 @@ class KopalniaWiedzy(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def is_link_wanted(self, url, tag):
|
def is_link_wanted(self, url, tag):
|
||||||
return tag['class'] == 'next'
|
return ''.join(tag['class']) == 'next'
|
||||||
|
|
||||||
def remove_beyond(self, tag, next):
|
def remove_beyond(self, tag, next):
|
||||||
while tag is not None and getattr(tag, 'name', None) != 'body':
|
while tag is not None and getattr(tag, 'name', None) != 'body':
|
||||||
|
@ -151,7 +151,7 @@ class LentaRURecipe(BasicNewsRecipe):
|
|||||||
for date in dates:
|
for date in dates:
|
||||||
for string in date:
|
for string in date:
|
||||||
parent = date.parent
|
parent = date.parent
|
||||||
if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == parent['class']):
|
if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == ''.join(parent['class'])):
|
||||||
# Date div found
|
# Date div found
|
||||||
parent.extract()
|
parent.extract()
|
||||||
parent[
|
parent[
|
||||||
|
@ -80,7 +80,7 @@ class LetsGetCritical(BasicNewsRecipe):
|
|||||||
p = post.previousSibling
|
p = post.previousSibling
|
||||||
# navigate up sibling to find date
|
# navigate up sibling to find date
|
||||||
while p:
|
while p:
|
||||||
if hasattr(p, 'class') and p['class'] == 'singledate':
|
if ''.join(p.get('class') or '') == 'singledate':
|
||||||
date = self.tag_to_string(p)
|
date = self.tag_to_string(p)
|
||||||
break
|
break
|
||||||
p = p.previousSibling
|
p = p.previousSibling
|
||||||
|
@ -95,15 +95,16 @@ class WeeklyLWN(BasicNewsRecipe):
|
|||||||
break
|
break
|
||||||
|
|
||||||
text = self.tag_to_string(curr.contents[0])
|
text = self.tag_to_string(curr.contents[0])
|
||||||
|
cclass = ''.join(curr['class'])
|
||||||
|
|
||||||
if 'Cat2HL' in curr['class']:
|
if 'Cat2HL' in cclass:
|
||||||
subsection = text
|
subsection = text
|
||||||
|
|
||||||
elif 'Cat1HL' in curr['class']:
|
elif 'Cat1HL' in cclass:
|
||||||
section = text
|
section = text
|
||||||
subsection = None
|
subsection = None
|
||||||
|
|
||||||
elif 'SummaryHL' in curr['class']:
|
elif 'SummaryHL' in cclass:
|
||||||
article_title = text
|
article_title = text
|
||||||
if not article_title:
|
if not article_title:
|
||||||
article_title = _('Undefined article title')
|
article_title = _('Undefined article title')
|
||||||
|
@ -60,7 +60,7 @@ class Mediapart(BasicNewsRecipe):
|
|||||||
try:
|
try:
|
||||||
title = article.find('h3', recursive=False)
|
title = article.find('h3', recursive=False)
|
||||||
|
|
||||||
if title is None or title['class'] == 'title-specific':
|
if title is None or ''.join(title['class']) == 'title-specific':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# print "found fil ",title
|
# print "found fil ",title
|
||||||
|
@ -51,7 +51,7 @@ class MoneyControlRecipe(BasicNewsRecipe):
|
|||||||
freshSoup.body.append(h1)
|
freshSoup.body.append(h1)
|
||||||
|
|
||||||
for p in soup.findAll('p', attrs={'class': true}):
|
for p in soup.findAll('p', attrs={'class': true}):
|
||||||
if p['class'] == 'MsoNormal':
|
if ''.join(p['class']) == 'MsoNormal':
|
||||||
# We have some weird pagebreak marker here; it will not find all of them however
|
# We have some weird pagebreak marker here; it will not find all of them however
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
stag = dtag.span
|
stag = dtag.span
|
||||||
if stag is not None:
|
if stag is not None:
|
||||||
if stag['class'] != 'timestamp':
|
if ''.join(stag['class']) != 'timestamp':
|
||||||
description = self.tag_to_string(stag, False)
|
description = self.tag_to_string(stag, False)
|
||||||
else:
|
else:
|
||||||
description = self.tag_to_string(dtag, False)
|
description = self.tag_to_string(dtag, False)
|
||||||
|
@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
|
|
||||||
|
|
||||||
def find_header(tag):
|
def find_header(tag):
|
||||||
return tag.name == 'header' and tag.parent['class'] == 'article'
|
return tag.name == 'header' and ''.join(tag.parent['class']) == 'article'
|
||||||
|
|
||||||
|
|
||||||
def absurl(url):
|
def absurl(url):
|
||||||
|
@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
|
|
||||||
|
|
||||||
def find_header(tag):
|
def find_header(tag):
|
||||||
return tag.name == 'header' and tag.parent['class'] == 'article'
|
return tag.name == 'header' and ''.join(tag.parent['class']) == 'article'
|
||||||
|
|
||||||
|
|
||||||
def absurl(url):
|
def absurl(url):
|
||||||
|
@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
stag = dtag.span
|
stag = dtag.span
|
||||||
if stag is not None:
|
if stag is not None:
|
||||||
if stag['class'] != 'timestamp':
|
if ''.join(stag['class']) != 'timestamp':
|
||||||
description = self.tag_to_string(stag, False)
|
description = self.tag_to_string(stag, False)
|
||||||
else:
|
else:
|
||||||
description = self.tag_to_string(dtag, False)
|
description = self.tag_to_string(dtag, False)
|
||||||
|
@ -47,12 +47,12 @@ class Polter(BasicNewsRecipe):
|
|||||||
for s in soup.findAll(style=True):
|
for s in soup.findAll(style=True):
|
||||||
if 'bold;' in s['style']:
|
if 'bold;' in s['style']:
|
||||||
if s.get('class', ''):
|
if s.get('class', ''):
|
||||||
s['class'] = s['class'] + ' p_title'
|
s['class'] = ''.join(s['class']) + ' p_title'
|
||||||
else:
|
else:
|
||||||
s['class'] = 'p_title'
|
s['class'] = 'p_title'
|
||||||
if 'italic;' in s['style']:
|
if 'italic;' in s['style']:
|
||||||
if s.get('class', ''):
|
if s.get('class', ''):
|
||||||
s['class'] = s['class'] + ' italic'
|
s['class'] = ''.join(s['class']) + ' italic'
|
||||||
else:
|
else:
|
||||||
s['class'] = 'italic'
|
s['class'] = 'italic'
|
||||||
del s['style']
|
del s['style']
|
||||||
|
@ -180,7 +180,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Find each instance of class="sectiontitle", class="featurecontent"
|
# Find each instance of class="sectiontitle", class="featurecontent"
|
||||||
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
||||||
if divtag['class'].startswith('section_title'):
|
if 'section_title' in ''.join(divtag['class']):
|
||||||
# div contains section title
|
# div contains section title
|
||||||
if not divtag.h3:
|
if not divtag.h3:
|
||||||
continue
|
continue
|
||||||
|
@ -180,7 +180,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Find each instance of class="sectiontitle", class="featurecontent"
|
# Find each instance of class="sectiontitle", class="featurecontent"
|
||||||
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
||||||
if divtag['class'].startswith('section_title'):
|
if ''.join(divtag['class']).startswith('section_title'):
|
||||||
# div contains section title
|
# div contains section title
|
||||||
if not divtag.h3:
|
if not divtag.h3:
|
||||||
continue
|
continue
|
||||||
|
@ -281,7 +281,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
stag = dtag.span
|
stag = dtag.span
|
||||||
if stag is not None:
|
if stag is not None:
|
||||||
if stag['class'] != 'timestamp':
|
if ''.join(stag['class']) != 'timestamp':
|
||||||
description = self.tag_to_string(stag, False)
|
description = self.tag_to_string(stag, False)
|
||||||
else:
|
else:
|
||||||
description = self.tag_to_string(dtag, False)
|
description = self.tag_to_string(dtag, False)
|
||||||
|
@ -70,7 +70,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
|
|
||||||
# Find each instance of class="sectiontitle", class="featurecontent"
|
# Find each instance of class="sectiontitle", class="featurecontent"
|
||||||
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
||||||
if divtag['class'].startswith('section_title'):
|
if 'section_title' in ''.join(divtag['class']):
|
||||||
# div contains section title
|
# div contains section title
|
||||||
if not divtag.h3:
|
if not divtag.h3:
|
||||||
continue
|
continue
|
||||||
|
@ -181,7 +181,7 @@ class CanWestPaper(BasicNewsRecipe):
|
|||||||
# Find each instance of class="sectiontitle", class="featurecontent"
|
# Find each instance of class="sectiontitle", class="featurecontent"
|
||||||
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
|
||||||
# self.log(" div class = %s" % divtag['class'])
|
# self.log(" div class = %s" % divtag['class'])
|
||||||
if divtag['class'].startswith('section_title'):
|
if ''.join(divtag['class']).startswith('section_title'):
|
||||||
# div contains section title
|
# div contains section title
|
||||||
if not divtag.h3:
|
if not divtag.h3:
|
||||||
continue
|
continue
|
||||||
|
Loading…
x
Reference in New Issue
Block a user