Handle direct querying of class attribute across BS versions

This commit is contained in:
Kovid Goyal 2019-03-23 21:47:58 +05:30
parent 3045dc3c71
commit 8813a31a38
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
25 changed files with 41 additions and 34 deletions

View File

@@ -198,12 +198,12 @@ A reasonably complex real life example that exposes more of the :term:`API` of `
for div in soup.findAll(True, for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}): attrs={'class':['section-headline', 'story', 'story headline']}):
if div['class'] == 'section-headline': if ''.join(div['class']) == 'section-headline':
key = string.capwords(feed_title(div)) key = string.capwords(feed_title(div))
articles[key] = [] articles[key] = []
ans.append(key) ans.append(key)
elif div['class'] in ['story', 'story headline']: elif ''.join(div['class']) in ['story', 'story headline']:
a = div.find('a', href=True) a = div.find('a', href=True)
if not a: if not a:
continue continue

View File

@@ -44,7 +44,7 @@ class E1843(BasicNewsRecipe):
current_section = articles = None current_section = articles = None
for div in soup.findAll(**classes('field-name-field-header node-article')): for div in soup.findAll(**classes('field-name-field-header node-article')):
if 'field-header' in div['class']: if 'field-header' in ''.join(div['class']):
if current_section and articles: if current_section and articles:
ans.append((current_section, articles)) ans.append((current_section, articles))
current_section = self.tag_to_string(div) current_section = self.tag_to_string(div)

View File

@@ -41,7 +41,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
'description': article_desc, 'description': article_desc,
'url': article_url}) 'url': article_url})
# Avoid including the multimedia stuff. # Avoid including the multimedia stuff.
if entry['class'].find('last') != -1: if ''.join(entry['class']).find('last') != -1:
break break
return articles return articles
@@ -86,7 +86,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
'description': article_desc, 'description': article_desc,
'url': article_url}) 'url': article_url})
# Avoid including the multimedia stuff. # Avoid including the multimedia stuff.
if entry['class'].find('last') != -1: if ''.join(entry['class']).find('last') != -1:
break break
return articles return articles

View File

@@ -128,13 +128,16 @@ class TheAtlantic(BasicNewsRecipe):
feeds = [] feeds = []
for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}): for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
for h2 in div.findAll('h2', attrs={'class': True}): for h2 in div.findAll('h2', attrs={'class': True}):
if 'section-name' in h2['class'].split(): cls = h2['class']
if hasattr(cls, 'split'):
cls = cls.split()
if 'section-name' in cls:
if current_articles: if current_articles:
feeds.append((current_section, current_articles)) feeds.append((current_section, current_articles))
current_articles = [] current_articles = []
current_section = self.tag_to_string(h2) current_section = self.tag_to_string(h2)
self.log('\nFound section:', current_section) self.log('\nFound section:', current_section)
elif 'hed' in h2['class'].split(): elif 'hed' in cls:
title = self.tag_to_string(h2) title = self.tag_to_string(h2)
a = h2.findParent('a', href=True) a = h2.findParent('a', href=True)
url = a['href'] url = a['href']

View File

@@ -128,13 +128,16 @@ class TheAtlantic(BasicNewsRecipe):
feeds = [] feeds = []
for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}): for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
for h2 in div.findAll('h2', attrs={'class': True}): for h2 in div.findAll('h2', attrs={'class': True}):
if 'section-name' in h2['class'].split(): cls = h2['class']
if hasattr(cls, 'split'):
cls = cls.split()
if 'section-name' in cls:
if current_articles: if current_articles:
feeds.append((current_section, current_articles)) feeds.append((current_section, current_articles))
current_articles = [] current_articles = []
current_section = self.tag_to_string(h2) current_section = self.tag_to_string(h2)
self.log('\nFound section:', current_section) self.log('\nFound section:', current_section)
elif 'hed' in h2['class'].split(): elif 'hed' in cls:
title = self.tag_to_string(h2) title = self.tag_to_string(h2)
a = h2.findParent('a', href=True) a = h2.findParent('a', href=True)
url = a['href'] url = a['href']

View File

@@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
if dtag is not None: if dtag is not None:
stag = dtag.span stag = dtag.span
if stag is not None: if stag is not None:
if stag['class'] != 'timestamp': if ''.join(stag['class']) != 'timestamp':
description = self.tag_to_string(stag, False) description = self.tag_to_string(stag, False)
else: else:
description = self.tag_to_string(dtag, False) description = self.tag_to_string(dtag, False)

View File

@@ -102,13 +102,13 @@ class CIO_Magazine(BasicNewsRecipe):
for div in soup.findAll(True, for div in soup.findAll(True,
attrs={'class': ['heading', 'issue_item']}): attrs={'class': ['heading', 'issue_item']}):
if div['class'] == 'heading': if ''.join(div['class']) == 'heading':
key = string.capwords(self.tag_to_string(div.span)) key = string.capwords(self.tag_to_string(div.span))
print("Key: ", key) # Esto es para depurar print("Key: ", key) # Esto es para depurar
articles[key] = [] articles[key] = []
feeds.append(key) feeds.append(key)
elif div['class'] == 'issue_item': elif ''.join(div['class']) == 'issue_item':
a = div.find('a', href=True) a = div.find('a', href=True)
if not a: if not a:
continue continue

View File

@@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
if dtag is not None: if dtag is not None:
stag = dtag.span stag = dtag.span
if stag is not None: if stag is not None:
if stag['class'] != 'timestamp': if ''.join(stag['class']) != 'timestamp':
description = self.tag_to_string(stag, False) description = self.tag_to_string(stag, False)
else: else:
description = self.tag_to_string(dtag, False) description = self.tag_to_string(dtag, False)

View File

@@ -79,7 +79,7 @@ class Esensja(BasicNewsRecipe):
section += ' - ' + subchapter section += ' - ' + subchapter
feeds.append((section, articles)) feeds.append((section, articles))
articles = [] articles = []
if tag['class'] == 'chapter': if ''.join(tag['class']) == 'chapter':
chapter = self.tag_to_string(tag).capitalize() chapter = self.tag_to_string(tag).capitalize()
subchapter = '' subchapter = ''
else: else:

View File

@@ -31,7 +31,7 @@ class KopalniaWiedzy(BasicNewsRecipe):
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'), (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
lambda match: '<img class="topimage" ' + match.group(1) + '>'), lambda match: '<img class="topimage" ' + match.group(1) + '>'),
(re.compile(u'<br /><br />'), (re.compile(u'<br /><br />'),
lambda match: '<br\/>') lambda match: '<br/>')
] ]
feeds = [ feeds = [
@@ -44,7 +44,7 @@ class KopalniaWiedzy(BasicNewsRecipe):
] ]
def is_link_wanted(self, url, tag): def is_link_wanted(self, url, tag):
return tag['class'] == 'next' return ''.join(tag['class']) == 'next'
def remove_beyond(self, tag, next): def remove_beyond(self, tag, next):
while tag is not None and getattr(tag, 'name', None) != 'body': while tag is not None and getattr(tag, 'name', None) != 'body':

View File

@@ -151,7 +151,7 @@ class LentaRURecipe(BasicNewsRecipe):
for date in dates: for date in dates:
for string in date: for string in date:
parent = date.parent parent = date.parent
if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == parent['class']): if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == ''.join(parent['class'])):
# Date div found # Date div found
parent.extract() parent.extract()
parent[ parent[

View File

@@ -80,7 +80,7 @@ class LetsGetCritical(BasicNewsRecipe):
p = post.previousSibling p = post.previousSibling
# navigate up sibling to find date # navigate up sibling to find date
while p: while p:
if hasattr(p, 'class') and p['class'] == 'singledate': if ''.join(p.get('class') or '') == 'singledate':
date = self.tag_to_string(p) date = self.tag_to_string(p)
break break
p = p.previousSibling p = p.previousSibling

View File

@@ -95,15 +95,16 @@ class WeeklyLWN(BasicNewsRecipe):
break break
text = self.tag_to_string(curr.contents[0]) text = self.tag_to_string(curr.contents[0])
cclass = ''.join(curr['class'])
if 'Cat2HL' in curr['class']: if 'Cat2HL' in cclass:
subsection = text subsection = text
elif 'Cat1HL' in curr['class']: elif 'Cat1HL' in cclass:
section = text section = text
subsection = None subsection = None
elif 'SummaryHL' in curr['class']: elif 'SummaryHL' in cclass:
article_title = text article_title = text
if not article_title: if not article_title:
article_title = _('Undefined article title') article_title = _('Undefined article title')

View File

@@ -60,7 +60,7 @@ class Mediapart(BasicNewsRecipe):
try: try:
title = article.find('h3', recursive=False) title = article.find('h3', recursive=False)
if title is None or title['class'] == 'title-specific': if title is None or ''.join(title['class']) == 'title-specific':
continue continue
# print "found fil ",title # print "found fil ",title

View File

@@ -51,7 +51,7 @@ class MoneyControlRecipe(BasicNewsRecipe):
freshSoup.body.append(h1) freshSoup.body.append(h1)
for p in soup.findAll('p', attrs={'class': true}): for p in soup.findAll('p', attrs={'class': true}):
if p['class'] == 'MsoNormal': if ''.join(p['class']) == 'MsoNormal':
# We have some weird pagebreak marker here; it will not find all of them however # We have some weird pagebreak marker here; it will not find all of them however
continue continue

View File

@@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
if dtag is not None: if dtag is not None:
stag = dtag.span stag = dtag.span
if stag is not None: if stag is not None:
if stag['class'] != 'timestamp': if ''.join(stag['class']) != 'timestamp':
description = self.tag_to_string(stag, False) description = self.tag_to_string(stag, False)
else: else:
description = self.tag_to_string(dtag, False) description = self.tag_to_string(dtag, False)

View File

@@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
def find_header(tag): def find_header(tag):
return tag.name == 'header' and tag.parent['class'] == 'article' return tag.name == 'header' and ''.join(tag.parent['class']) == 'article'
def absurl(url): def absurl(url):

View File

@@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
def find_header(tag): def find_header(tag):
return tag.name == 'header' and tag.parent['class'] == 'article' return tag.name == 'header' and ''.join(tag.parent['class']) == 'article'
def absurl(url): def absurl(url):

View File

@@ -280,7 +280,7 @@ class CanWestPaper(BasicNewsRecipe):
if dtag is not None: if dtag is not None:
stag = dtag.span stag = dtag.span
if stag is not None: if stag is not None:
if stag['class'] != 'timestamp': if ''.join(stag['class']) != 'timestamp':
description = self.tag_to_string(stag, False) description = self.tag_to_string(stag, False)
else: else:
description = self.tag_to_string(dtag, False) description = self.tag_to_string(dtag, False)

View File

@@ -47,12 +47,12 @@ class Polter(BasicNewsRecipe):
for s in soup.findAll(style=True): for s in soup.findAll(style=True):
if 'bold;' in s['style']: if 'bold;' in s['style']:
if s.get('class', ''): if s.get('class', ''):
s['class'] = s['class'] + ' p_title' s['class'] = ''.join(s['class']) + ' p_title'
else: else:
s['class'] = 'p_title' s['class'] = 'p_title'
if 'italic;' in s['style']: if 'italic;' in s['style']:
if s.get('class', ''): if s.get('class', ''):
s['class'] = s['class'] + ' italic' s['class'] = ''.join(s['class']) + ' italic'
else: else:
s['class'] = 'italic' s['class'] = 'italic'
del s['style'] del s['style']

View File

@@ -180,7 +180,7 @@ class CanWestPaper(BasicNewsRecipe):
# Find each instance of class="sectiontitle", class="featurecontent" # Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}): for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
if divtag['class'].startswith('section_title'): if 'section_title' in ''.join(divtag['class']):
# div contains section title # div contains section title
if not divtag.h3: if not divtag.h3:
continue continue

View File

@@ -180,7 +180,7 @@ class CanWestPaper(BasicNewsRecipe):
# Find each instance of class="sectiontitle", class="featurecontent" # Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}): for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
if divtag['class'].startswith('section_title'): if ''.join(divtag['class']).startswith('section_title'):
# div contains section title # div contains section title
if not divtag.h3: if not divtag.h3:
continue continue

View File

@@ -281,7 +281,7 @@ class CanWestPaper(BasicNewsRecipe):
if dtag is not None: if dtag is not None:
stag = dtag.span stag = dtag.span
if stag is not None: if stag is not None:
if stag['class'] != 'timestamp': if ''.join(stag['class']) != 'timestamp':
description = self.tag_to_string(stag, False) description = self.tag_to_string(stag, False)
else: else:
description = self.tag_to_string(dtag, False) description = self.tag_to_string(dtag, False)

View File

@@ -70,7 +70,7 @@ class CanWestPaper(BasicNewsRecipe):
# Find each instance of class="sectiontitle", class="featurecontent" # Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}): for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
if divtag['class'].startswith('section_title'): if 'section_title' in ''.join(divtag['class']):
# div contains section title # div contains section title
if not divtag.h3: if not divtag.h3:
continue continue

View File

@@ -181,7 +181,7 @@ class CanWestPaper(BasicNewsRecipe):
# Find each instance of class="sectiontitle", class="featurecontent" # Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}): for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
# self.log(" div class = %s" % divtag['class']) # self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'): if ''.join(divtag['class']).startswith('section_title'):
# div contains section title # div contains section title
if not divtag.h3: if not divtag.h3:
continue continue