(1) Revisions to existing Houston Chronicle recipe

(2) Added new recipe which scrapes Bloomberg for columnists' blog postings
(3) Revisions to existing economist recipe, added description metadata via populate_article_metadata.
This commit is contained in:
dale 2018-07-12 18:45:07 -05:00
parent 439da63e81
commit 6f93d75c06
4 changed files with 423 additions and 165 deletions
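
Item (3) of the commit message uses calibre's populate_article_metadata(article, soup, first) hook on BasicNewsRecipe, which is called after each article page is downloaded. As orientation before the diffs below, a minimal override follows this general shape; the class name and the CSS class are illustrative, not taken from this commit:

from calibre.web.feeds.news import BasicNewsRecipe


class SomeRecipe(BasicNewsRecipe):
    title = 'Example feed'

    def populate_article_metadata(self, article, soup, first):
        # 'soup' is the parsed article HTML; pull a short teaser from it and
        # store it on the Article object calibre builds the feed entry from.
        # 'rubric' is a made-up class name used only for illustration.
        el = soup.find('p', attrs={'class': 'rubric'})
        if el is not None:
            article.summary = self.tag_to_string(el)
            article.text_summary = article.summary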

View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
www.bloomberg.com/view/contributors
'''
import re
import time
from datetime import datetime, timedelta, date
from lxml import html, etree
from StringIO import StringIO
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
import traceback
from collections import OrderedDict
import calendar
import sys
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
contributors_url = "https://www.bloomberg.com/view/contributors"
output_date_format = "%d %b, %H:%M"
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}


def get_article_parsed(this_url):
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(StringIO(content), parser)
    return parsed
class BloombergContributor:
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # article is title, link, date, description
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

    def populate_article_list(self):
        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            link = headline.attrib['href']
            title = headline.text.strip()
            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
            if len(article_date_eles) > 0:
                article_date_str = article_date_eles[0].text.strip()
                article_date = self.parse_date_str(article_date_str)
            else:
                article_date = None
            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
            if len(summary_eles) > 0:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."
            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                       article_date, self.get_article_timestamp(article_date)))

    @staticmethod
    def get_article_timestamp(article_date):
        # assume all times Eastern...
        # DST runs from the 2nd Sunday of March to the 1st Sunday of November
        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
        march_cal = c.monthdatescalendar(article_date.year, 3)
        dst_start = [day for week in march_cal for day in week if
                     day.weekday() == calendar.SUNDAY and day.month == 3][1]
        nov_cal = c.monthdatescalendar(article_date.year, 11)
        dst_end = [day for week in nov_cal for day in week if
                   day.weekday() == calendar.SUNDAY and day.month == 11][0]
        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
        if dst_start < article_date < dst_end:  # EDT (UTC-4) during DST, EST (UTC-5) otherwise
            shift = timedelta(hours=4)
        else:
            shift = timedelta(hours=5)
        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())

    def parse_date_str(self, date_str):
        parsed = None
        for date_format in self.date_formats:
            try:
                parsed = datetime.strptime(date_str[0:-4], date_format)
                break
            except Exception:
                pass
        return parsed

    def get_article_list(self):
        return self._article_list

    def get_ordered_article_feed(self):
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            article_dict = {'title': article[0], 'url': article[1],
                            'description': "{0}: {1}".format(self.get_name(), article[2]),
                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'timestamp': article[4]}
            output[article_date] = article_dict
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        return self._name
class BloombergContributors(BasicNewsRecipe):
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    remove_tags = [dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]  # note trailing space in class name
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'

    def get_contributors_list(self):
        page_doc = get_article_parsed(contributors_url)
        els = page_doc.xpath(self.xpath_contributor_list)
        contributor_list = []
        for el in els:
            name = el.find("span").text.strip()  # contributor name
            contributor_items = el.attrib['href'].split('/')
            contributor = BloombergContributor(name, contributor_items[4], contributor_items[3])
            contributor_list.append(contributor)
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list

    def postprocess_html(self, soup, first_fetch):
        '''
        :param soup: A `BeautifulSoup
          <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
          instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        Remember: this is BeautifulSoup 3, whose interface differs from bs4.
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if len(time_eles) > 0:
            time_stamp = time_eles[0].get('datetime')
            try:
                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
            except Exception:
                parsed_time = time_stamp
            insert_tag = Tag(soup, "p", [("class", "user-inserted")])
            insert_tag.insert(0, parsed_time)
            soup.time.replaceWith(insert_tag)
        return soup

    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        feed_dict = OrderedDict()
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contributors on main page".format(len(contributor_list)))
        for contributor in contributor_list:
            articles = contributor.get_ordered_article_feed()
            feed_dict.update(articles)
        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
        feeds.append(("Columns", list(feed_dict.values())))
        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds

View File

@@ -10,6 +10,7 @@ from collections import OrderedDict
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars


 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw

+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
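
The hook added to the Economist recipe above joins the flytitle and rubric text with '. ' and a trailing period to build the summary. With made-up strings, the assembly looks like this:

# Sample strings only; on a real page these come from the
# 'flytitle-and-title__title' and 'blog-post__rubric' elements.
result = [u'Example flytitle', u'Example rubric text']
summary = u'. '.join(result) + '.'
print(summary)  # Example flytitle. Example rubric text.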

View File

@@ -10,6 +10,7 @@ from collections import OrderedDict
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars


 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw

+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'

View File

@@ -1,17 +1,135 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
-__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
+__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
 import re
 import time
+import urllib2
+from StringIO import StringIO
+from datetime import datetime, timedelta, date
+import traceback
+import sys
+from collections import OrderedDict
 from datetime import datetime, timedelta, date
-from lxml import html
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds import Article
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.utils.date import dt_factory, utcnow, local_tz
+from lxml import html
+from lxml import etree
+
+regex_date_only = re.compile("""(?:January|February|March|April|
+{8}May|June|July|August|September|October|November|
+{8}December)\s[0-9]{1,2},\s20[01][0-9]""")
+regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile('post-\d+')
+
+pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
+                     ('business', ('/business/', ['sports'])),
+                     ('sports', ('/sports/', ['business']))])
+
+base_url = "http://www.chron.com"
+
+# sports has 'core-package sports' class
+xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
+contains(@class, 'wrapper') or
+contains(@class, 'contentGroups') or
+contains(@class, 'headline-list') or
+contains(@class, 'core-package sports') or
+contains(@class, 'news')]
+//a[contains(@class, 'hdn-analytics')]"""
+
+excluded_titles = ["Winning numbers", "TV-radio listings"]
+
+
+def validate_link(page, link, title):
+    other_category = page[1][1]
+    if not title or len(title.strip()) < 5:
+        print("{0} rejected, title too short".format(link))
+        return None
+    if link.split('/')[3] in other_category:
+        print("{0} rejected, covered in other section".format(link))
+        return None
+    for excluded_title in excluded_titles:
+        if title.find(excluded_title) != -1:
+            print("{0} rejected, excluded title".format(link))
+            return None
+    return link, title
+
+
+def get_article_parsed(this_url):
+    page = urllib2.urlopen(this_url)
+    content = page.read()
+    parser = etree.HTMLParser()
+    parsed = html.parse(StringIO(content), parser)
+    return parsed
+
+
+def sort_subject(element_list):
+    # priority of subjects
+    subjects = ['news', 'neighborhood', 'entertainment']
+    subjects.reverse()
+    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
+    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
+    for element in element_list:
+        subj = element[0].split('/')[3]
+        if subject_dict.get(subj) is not None:
+            rank_dict[subject_dict[subj] + 1].append(element)
+        else:
+            rank_dict[0].append(element)
+    # now return in reverse order, sorted
+    combined_list = []
+    for rank in range(len(subjects), -1, -1):
+        article_list = rank_dict[rank]
+        article_list.sort()
+        combined_list.extend(article_list)
+    return combined_list
+
+
+def get_links_from_section_page(page):
+    page_doc = get_article_parsed(base_url + page[1][0])
+    els = page_doc.xpath(xpath_general)
+    element_list = []
+    for el in els:
+        link = el.get('href').split('?')[0]
+        title = el.text
+        if title is None or len(title.strip()) < 5:
+            link_id = link.split('/')[-1][:-3].split('-')[:-1]
+            title = ' '.join(link_id)
+        if link[:4] != 'http':
+            link = base_url + link
+        validated_link = validate_link(page=page, link=link, title=title)
+        if validated_link is not None:
+            element_list.append(validated_link)
+    sorted_element_list = sort_subject(element_list)
+    return [page[0], sorted_element_list]
+
+
+def get_all_links_from_sections():
+    all_sections = []
+    article_set = set()
+    final_dict = OrderedDict()
+    for item in pages.items():
+        print "getting links from {0}".format(item[0])
+        all_sections.append(get_links_from_section_page(item))
+    for section in all_sections:
+        section_id = section[0]
+        article_list = section[1]
+        final_dict[section_id] = []
+        for article in article_list:
+            if article[0] not in article_set:
+                article_set.add(article[0])
+                final_dict[section_id].append(article)
+    return final_dict
+
+
+# noinspection PyAbstractClass
 class HoustonChronicle(BasicNewsRecipe):
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
@@ -22,196 +140,111 @@ class HoustonChronicle(BasicNewsRecipe):
     remove_empty_feeds = True
     timefmt = '[%a, %d %b %Y]'
     timestampfmt = '%Y%m%d%H%M%S'
-    ignore_duplicate_articles = {'url'}
+    # ignore_duplicate_articles = {'url'}  # defaults to None
     extra_css = '.article_date {display: none}'
-    oldest_web_article = 7.0
-    if oldest_web_article is None:
-        earliest_date = date.today()
-    else:
-        earliest_date = date.today() - timedelta(days=oldest_web_article)
-    pages = [('news', '/news/houston-texas/'),
-             ('business', '/business/'),
-             ('sports', '/sports/')]
-    base_url = "http://www.chron.com"
-    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
-//*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""
-
-    def get_links_from_section_page(self, section_url):
-        page_doc = html.parse(self.base_url + section_url)
-        els = page_doc.xpath(self.xpath_str)
-        element_list = []
-        for el in els:
-            link = el.get('href')
-            title = el.text
-            if link[:4] != 'http':
-                link = self.base_url + link
-            if title is not None:
-                element_list.append((link, el.text))
-        return element_list
-
-    def get_article_description_from_doc(self, page_doc):
+    category = 'news, USA'
+    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
+    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
+    remove_tags = [dict(name='div', attrs={'social-title': True}),
+                   dict(name='div', attrs={'class':
+                       ['control-panel', 'gallery-overlay-inner',
+                        'most-popular', 'asset-media mos-playlist',
+                        'asset_media asset-media']}),
+                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
+                                                    'hst-resgalleryitem hidden']}),
+                   dict(name='ul', attrs={'class': 'clearfix'})]
+    # max_articles_per_feed = 5  # for use in testing
+
+    def get_article_description_from_doc(self, soup):
         description_chars_break = 140
         description_max_chars = 300
-        desc_xpath = """//div[contains(@class, 'article-body') or
-contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
-        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-
-        def stringify_children(node):
-            return ''.join([x for x in node.itertext()])
         try:
-            els = page_doc.xpath(desc_xpath)
-            out_text = ""
-            ellipsis = ""
-            for el in els:
-                sentences = re.findall(sentence_regex, stringify_children(el))
-                for sentence in sentences:
-                    if len(out_text) < description_chars_break:
-                        out_text += sentence + " "
-                    else:
-                        if len(out_text) > description_max_chars:
-                            ellipsis = "..."
-                        return out_text[:description_max_chars] + ellipsis
-            return out_text
-        except:
+            els = soup.findAll('p')
+            if len(els) > 0:
+                out_text = ""
+                this_ellipsis = ""
+                for el in els:
+                    if el is not None:
+                        result = []
+                        for descendant in el.contents:
+                            if isinstance(descendant, NavigableString):
+                                result.append(unicode(descendant).strip())
+                        all_text = u' '.join(result).encode('utf-8')
+                        if len(all_text) > 1:
+                            sentences = re.findall(sentence_regex, all_text)
+                            if sentences is not None and len(sentences) > 0:
+                                for sentence in sentences:
+                                    if len(out_text) < description_chars_break:
+                                        out_text += sentence + " "
+                                    else:
+                                        if len(out_text) > description_max_chars:
+                                            this_ellipsis = "..."
+                                        return out_text[:description_max_chars] + this_ellipsis
+                return out_text
+            else:
+                return "No Article description returned"
+        except Exception as ex:
             self.log('Error on Article Description')
+            traceback.print_exc(file=sys.stdout)
+            print(str(ex))
            return ""
-    def get_published_time_from_doc(self, page_doc):
-        regex_date_only = re.compile("""(?:January|February|March|April|
-May|June|July|August|September|October|November|
-December)\s[0-9]{1,2},\s20[01][0-9]""")
-        regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+    @staticmethod
+    def get_published_time_from_doc(page_doc):
         def get_regular_timestamp(date_string):
             try:
                 out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                 return out_date
-            except:
+            except ValueError:
                 return None
-        def get_date_from_string(in_text):
-            match = re.findall(regex_date_only, in_text)
-            if match:
-                try:
-                    out_date = datetime.strptime(match[0], "%B %d, %Y")
-                    match = re.findall(regex_time_only, in_text)
-                    if match:
-                        out_time = datetime.strptime(match[0], "%I:%M %p")
-                        return datetime.combine(out_date.date(), out_time.time())
-                    return out_date
-                except:
-                    return None
-        el = page_doc.xpath("//*[@class='timestamp'][1]")
+        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
+                              ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
-            return get_regular_timestamp(el[0].get('title'))
+            return get_regular_timestamp(el[0].get('datetime'))
-        else:
-            el = page_doc.xpath(
-                "//*[@class='entry-date' or @class='post-date'][1]")
-            if len(el) == 1:
-                return get_date_from_string(el[0].text_content())
-            else:
-                return None
+        else:
+            return None
-    def get_all_data_feeds_from_page(self, page):
-        articles = []
-        exclude_titles_with = ['Winning numbers']
-
-        def title_excluded(title):
-            for text in exclude_titles_with:
-                if title.find(text) != -1:
-                    return True
-            return False
-
-        link_list = self.get_links_from_section_page(page[1])
-        self.log('from section: ', page[0],
-                 " found ", len(link_list), " links")
-        for link in link_list:
-            try:
-                article_doc = html.parse(link[0])
-                description = self.get_article_description_from_doc(
-                    article_doc)
-                parsed_date = self.get_published_time_from_doc(article_doc)
-                if parsed_date is not None and description is not None and \
-                        parsed_date.date() > self.earliest_date and \
-                        not title_excluded(link[1]):
-                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
-                    articles.append({'title': link[1], 'url': link[0],
-                                     'description': intro_date + description,
-                                     'date': ""})
-                    self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
-                             " description of " + str(len(description)) + ' characters at ' + link[0])
-                else:
-                    if parsed_date is None:
-                        msg = " No Timestamp Found"
-                    elif title_excluded(link[1]):
-                        msg = " Title Excluded"
-                    else:
-                        msg = " article older than " + \
-                              str(self.oldest_web_article) + ' days...'
-                    self.log("Skipping article: ", link[0], msg)
-            except:
-                print 'error on fetching ' + link[0]
-                continue
-        return articles
+    def populate_article_metadata(self, article, soup, first):
+        """
+        Called when each HTML page belonging to article is downloaded.
+        Intended to be used to get article metadata like author/summary/etc.
+        from the parsed HTML (soup).
+        :param article: A object of class :class:`calibre.web.feeds.Article`.
+        If you change the summary, remember to also change the text_summary
+        :param soup: Parsed HTML belonging to this article
+        :param first: True iff the parsed HTML is the first page of the article.
+        """
+        summary = self.get_article_description_from_doc(soup)
+        article_date = self.get_published_time_from_doc(soup)
+        if article_date is not None:
+            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
+            article.date = article_timestamp
+            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
+            article.localtime = article.utctime.astimezone(local_tz)
+        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
+        article.summary = "{0}: {1}".format(summary_date, summary)
+        article.text_summary = clean_ascii_chars(article.summary)
     def parse_index(self):
         self.timefmt = ' [%a, %d %b, %Y]'
         self.log('starting parse_index: ', time.strftime(self.timestampfmt))
         feeds = []
-        for page in self.pages:
-            articles = self.get_all_data_feeds_from_page(page)
-            if articles:
-                feeds.append((page[0], articles))
+        sections = get_all_links_from_sections()
+        for section_id, article_list in sections.items():
+            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
+            articles = []
+            for article_info in article_list:
+                self.log("Adding {0} to feed".format(article_info[0]))
+                articles.append({'title': article_info[1], 'url': article_info[0],
+                                 'description': '', 'date': ""})
+            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
+            feeds.append((section_id, articles))
         self.log('finished parse_index: ', time.strftime(self.timestampfmt))
         return feeds
     def preprocess_html(self, soup):
-        tags_to_exclude = [('class', "caption staged"),
-                           ('style', "display:none")]
-        story_tag = soup.find(
-            name='div', attrs={'class': ['article-content', 'article-body']})
-        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})
-
-        def is_excluded(tag_to_check):
-            for attr in tag_to_check.attrs:
-                if attr in tags_to_exclude:
-                    return True
-            return False
-
-        def get_attr_startswith(attrs, this_key, this_valuestart):
-            starts_with = False
-            for attr in attrs:
-                if attr[0] == this_key:
-                    if attr[1].startswith(this_valuestart):
-                        starts_with = True
-            return starts_with
-
-        base_tags = []
-        if story_tag is not None:
-            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs))  # noqa
-        if blog_tag is not None:
-            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs))  # noqa
-        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
-        all_tags = []
-        all_tags.extend(base_tags)
-        if len(base_tags) > 0:
-            for tag in base_tags:
-                all_tags.extend(tag.findAll(True))
-            for tag in base_tags:
-                while tag.parent is not None and not is_excluded(tag):
-                    all_tags.append(tag)
-                    tag = tag.parent
-        for tag in soup.findAll(True):
-            if tag not in all_tags:
-                tag.extract()
         return soup
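
For reference, sort_subject in the revised Houston Chronicle recipe above ranks links by the subject segment of their URL path: 'news' first, then 'neighborhood', then 'entertainment', then everything else, with ties sorted alphabetically. A standalone check, treating the module-level helper as importable and using made-up links:

links = [("http://www.chron.com/entertainment/article/a-1.php", "A"),
         ("http://www.chron.com/news/article/b-2.php", "B"),
         ("http://www.chron.com/sports/article/c-3.php", "C")]
print(sort_subject(links))
# [('http://www.chron.com/news/article/b-2.php', 'B'),
#  ('http://www.chron.com/entertainment/article/a-1.php', 'A'),
#  ('http://www.chron.com/sports/article/c-3.php', 'C')]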