Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
(1) Revisions to existing Houston Chronicle recipe
(2) Added new recipe which scrapes Bloomberg for columnists' blog postings
(3) Revisions to existing Economist recipe; added description metadata via populate_article_metadata
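For context on item (3): populate_article_metadata is the hook that BasicNewsRecipe calls for each downloaded article page, and it is where a recipe can attach a summary to the feed entry. A minimal sketch of the pattern, assuming a BasicNewsRecipe subclass — the recipe name and selector below are illustrative only and are not taken from the diffs that follow:

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = u'Example'

        def populate_article_metadata(self, article, soup, first):
            # soup is the parsed article HTML (BeautifulSoup 3 in calibre recipes);
            # article is the feed entry (calibre.web.feeds.Article) being built
            el = soup.find('p')  # hypothetical selector; real recipes target specific classes
            if el is not None:
                article.summary = self.tag_to_string(el)
                article.text_summary = article.summary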
This commit is contained in:
parent 439da63e81
commit 6f93d75c06
recipes/bloomberg_columns.recipe (new file, 199 lines)
@@ -0,0 +1,199 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime, timedelta, date
from lxml import html, etree
from StringIO import StringIO
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
import traceback
from collections import OrderedDict
import calendar
import sys
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag


contributors_url = "https://www.bloomberg.com/view/contributors"
output_date_format = "%d %b, %H:%M"

hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}


def get_article_parsed(this_url):
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(StringIO(content), parser)
    return parsed


class BloombergContributor:
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # article is title, link, date, description
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

    def populate_article_list(self):
        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            link = headline.attrib['href']
            title = headline.text.strip()
            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
            if len(article_date_eles) > 0:
                article_date_str = article_date_eles[0].text.strip()
                article_date = self.parse_date_str(article_date_str)
            else:
                article_date = None
            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
            if len(summary_eles) > 0:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."
            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                       article_date, self.get_article_timestamp(article_date)))

    @staticmethod
    def get_article_timestamp(article_date):
        # assume all times Eastern...
        # 2nd Sunday March, 1st Sunday Nov
        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
        march_cal = c.monthdatescalendar(article_date.year, 3)
        dst_start = [day for week in march_cal for day in week if
                     day.weekday() == calendar.SUNDAY and
                     day.month == 3][1]
        nov_cal = c.monthdatescalendar(article_date.year, 11)
        dst_end = [day for week in nov_cal for day in week if
                   day.weekday() == calendar.SUNDAY and
                   day.month == 11][0]
        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
        # Eastern Daylight Time (UTC-4) applies between the March and November
        # boundaries, Eastern Standard Time (UTC-5) otherwise
        if dst_start < article_date < dst_end:
            shift = timedelta(hours=4)
        else:
            shift = timedelta(hours=5)
        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())

    def parse_date_str(self, date_str):
        parsed = None
        for date_format in self.date_formats:
            try:
                parsed = datetime.strptime(date_str[0:-4], date_format)
                break
            except Exception:
                pass
        return parsed

    def get_article_list(self):
        return self._article_list

    def get_ordered_article_feed(self):
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            article_dict = {'title': article[0], 'url': article[1],
                            'description': "{0}: {1}".format(self.get_name(), article[2]),
                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'timestamp': article[4]}
            output[article_date] = article_dict
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        return self._name


class BloombergContributors(BasicNewsRecipe):
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    remove_tags = [dict(name='div', attrs=
                        {'class': ['share-article-button ', 'text-to-speech']})]  # note space...
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'
    language = 'en'

    def get_contributors_list(self):
        page_doc = get_article_parsed(contributors_url)
        els = page_doc.xpath(self.xpath_contributor_list)
        contributor_list = []
        for el in els:
            name = el.find("span").text.strip()  # contributor name
            contributor_items = el.attrib['href'].split('/')
            contributor = BloombergContributor(name, contributor_items[4], contributor_items[3])
            contributor_list.append(contributor)
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list

    def postprocess_html(self, soup, first_fetch):
        '''
        :param soup: A `BeautifulSoup
         <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
         instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        Remember: this is BeautifulSoup 3; its interface differs considerably from bs4.
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if len(time_eles) > 0:
            time_stamp = time_eles[0].get('datetime')
            try:
                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
            except:
                parsed_time = time_stamp
            insert_tag = Tag(soup, "p", [("class", "user-inserted")])
            insert_tag.insert(0, parsed_time)
            soup.time.replaceWith(insert_tag)
        return soup

    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        feed_dict = OrderedDict()
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contributors on main page".format(len(contributor_list)))
        for contributor in contributor_list:
            articles = contributor.get_ordered_article_feed()
            feed_dict.update(articles)
        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
        feeds.append(("Columns", list(feed_dict.values())))
        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds
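The recipe's parse_index follows the standard calibre contract: it returns a list of (section title, list of article dicts) pairs, each dict carrying at least 'title' and 'url' (here also 'description', 'date' and 'timestamp'). A minimal sketch of the returned shape, with placeholder values only:

    feeds = [
        ('Columns', [
            {'title': 'Example column',
             'url': 'https://www.bloomberg.com/view/articles/...',
             'description': 'Contributor name: one-line summary',
             'date': 'Contributor name: 01 Jan, 12:00',
             'timestamp': 1514808000.0},
        ]),
    ]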
@@ -10,6 +10,7 @@ from collections import OrderedDict
 
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars
 
 
 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw
 
+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
@@ -10,6 +10,7 @@ from collections import OrderedDict
 
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars
 
 
 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw
 
+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
@@ -1,17 +1,135 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
-__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
+__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
 import re
 import time
+import urllib2
+from StringIO import StringIO
+from datetime import datetime, timedelta, date
+import traceback
+import sys
+from collections import OrderedDict
+
 from datetime import datetime, timedelta, date
-from lxml import html
+
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds import Article
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.utils.date import dt_factory, utcnow, local_tz
+from lxml import html
+from lxml import etree
+
+regex_date_only = re.compile("""(?:January|February|March|April|
+{8}May|June|July|August|September|October|November|
+{8}December)\s[0-9]{1,2},\s20[01][0-9]""")
+regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile('post-\d+')
+
+pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
+                     ('business', ('/business/', ['sports'])),
+                     ('sports', ('/sports/', ['business']))])
+
+base_url = "http://www.chron.com"
+
+# sports has 'core-package sports' class
+xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
+contains(@class, 'wrapper') or
+contains(@class, 'contentGroups') or
+contains(@class, 'headline-list') or
+contains(@class, 'core-package sports') or
+contains(@class, 'news')]
+//a[contains(@class, 'hdn-analytics')]"""
+
+excluded_titles = ["Winning numbers", "TV-radio listings"]
+
+
+def validate_link(page, link, title):
+    other_category = page[1][1]
+    if not title or len(title.strip()) < 5:
+        print("{0} rejected, title too short".format(link))
+        return None
+    if link.split('/')[3] in other_category:
+        print("{0} rejected, covered in other section".format(link))
+        return None
+    for excluded_title in excluded_titles:
+        if title.find(excluded_title) != -1:
+            print("{0} rejected, excluded title".format(link))
+            return None
+    return link, title
+
+
+def get_article_parsed(this_url):
+    page = urllib2.urlopen(this_url)
+    content = page.read()
+    parser = etree.HTMLParser()
+    parsed = html.parse(StringIO(content), parser)
+    return parsed
+
+
+def sort_subject(element_list):
+    # priority of subjects
+    subjects = ['news', 'neighborhood', 'entertainment']
+    subjects.reverse()
+    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
+    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
+    for element in element_list:
+        subj = element[0].split('/')[3]
+        if subject_dict.get(subj) is not None:
+            rank_dict[subject_dict[subj] + 1].append(element)
+        else:
+            rank_dict[0].append(element)
+    # now return in reverse order, sorted
+    combined_list = []
+    for rank in range(len(subjects), -1, -1):
+        article_list = rank_dict[rank]
+        article_list.sort()
+        combined_list.extend(article_list)
+    return combined_list
+
+
+def get_links_from_section_page(page):
+    page_doc = get_article_parsed(base_url + page[1][0])
+    els = page_doc.xpath(xpath_general)
+    element_list = []
+    for el in els:
+        link = el.get('href').split('?')[0]
+        title = el.text
+        if title is None or len(title.strip()) < 5:
+            link_id = link.split('/')[-1][:-3].split('-')[:-1]
+            title = ' '.join(link_id)
+        if link[:4] != 'http':
+            link = base_url + link
+        validated_link = validate_link(page=page, link=link, title=title)
+        if validated_link is not None:
+            element_list.append(validated_link)
+    sorted_element_list = sort_subject(element_list)
+    return [page[0], sorted_element_list]
+
+
+def get_all_links_from_sections():
+    all_sections = []
+    article_set = set()
+    final_dict = OrderedDict()
+    for item in pages.items():
+        print "getting links from {0}".format(item[0])
+        all_sections.append(get_links_from_section_page(item))
+    for section in all_sections:
+        section_id = section[0]
+        article_list = section[1]
+        final_dict[section_id] = []
+        for article in article_list:
+            if article[0] not in article_set:
+                article_set.add(article[0])
+                final_dict[section_id].append(article)
+    return final_dict
 
 
+# noinspection PyAbstractClass
 class HoustonChronicle(BasicNewsRecipe):
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
@@ -22,196 +140,111 @@ class HoustonChronicle(BasicNewsRecipe):
     remove_empty_feeds = True
     timefmt = '[%a, %d %b %Y]'
     timestampfmt = '%Y%m%d%H%M%S'
-    ignore_duplicate_articles = {'url'}
+    # ignore_duplicate_articles = {'url'}  # defaults to None
     extra_css = '.article_date {display: none}'
-    oldest_web_article = 7.0
-
-    if oldest_web_article is None:
-        earliest_date = date.today()
-    else:
-        earliest_date = date.today() - timedelta(days=oldest_web_article)
-
-    pages = [('news', '/news/houston-texas/'),
-             ('business', '/business/'),
-             ('sports', '/sports/')]
-
-    base_url = "http://www.chron.com"
-
-    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
-    //*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""
-
-    def get_links_from_section_page(self, section_url):
-        page_doc = html.parse(self.base_url + section_url)
-        els = page_doc.xpath(self.xpath_str)
-        element_list = []
-        for el in els:
-            link = el.get('href')
-            title = el.text
-            if link[:4] != 'http':
-                link = self.base_url + link
-            if title is not None:
-                element_list.append((link, el.text))
-        return element_list
-
-    def get_article_description_from_doc(self, page_doc):
+    category = 'news, USA'
+    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
+    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
+    remove_tags = [dict(name='div', attrs={'social-title': True}),
+                   dict(name='div', attrs={'class':
+                        ['control-panel', 'gallery-overlay-inner',
+                         'most-popular', 'asset-media mos-playlist',
+                         'asset_media asset-media']}),
+                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
+                                                    'hst-resgalleryitem hidden']}),
+                   dict(name='ul', attrs={'class': 'clearfix'})]
+    # max_articles_per_feed = 5  # for use in testing
+
+    def get_article_description_from_doc(self, soup):
         description_chars_break = 140
         description_max_chars = 300
-        desc_xpath = """//div[contains(@class, 'article-body') or
-            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
-        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-
-        def stringify_children(node):
-            return ''.join([x for x in node.itertext()])
-
         try:
-            els = page_doc.xpath(desc_xpath)
-            out_text = ""
-            ellipsis = ""
-            for el in els:
-                sentences = re.findall(sentence_regex, stringify_children(el))
-                for sentence in sentences:
-                    if len(out_text) < description_chars_break:
-                        out_text += sentence + " "
-                    else:
-                        if len(out_text) > description_max_chars:
-                            ellipsis = "..."
-                        return out_text[:description_max_chars] + ellipsis
-            return out_text
-        except:
+            els = soup.findAll('p')
+            if len(els) > 0:
+                out_text = ""
+                this_ellipsis = ""
+                for el in els:
+                    if el is not None:
+                        result = []
+                        for descendant in el.contents:
+                            if isinstance(descendant, NavigableString):
+                                result.append(unicode(descendant).strip())
+                        all_text = u' '.join(result).encode('utf-8')
+                        if len(all_text) > 1:
+                            sentences = re.findall(sentence_regex, all_text)
+                            if sentences is not None and len(sentences) > 0:
+                                for sentence in sentences:
+                                    if len(out_text) < description_chars_break:
+                                        out_text += sentence + " "
+                                    else:
+                                        if len(out_text) > description_max_chars:
+                                            this_ellipsis = "..."
+                                        return out_text[:description_max_chars] + this_ellipsis
+                return out_text
+            else:
+                return "No Article description returned"
+        except Exception as ex:
             self.log('Error on Article Description')
+            traceback.print_exc(file=sys.stdout)
+            print(str(ex))
             return ""
 
-    def get_published_time_from_doc(self, page_doc):
-        regex_date_only = re.compile("""(?:January|February|March|April|
-May|June|July|August|September|October|November|
-December)\s[0-9]{1,2},\s20[01][0-9]""")
-        regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-
+    @staticmethod
+    def get_published_time_from_doc(page_doc):
 
         def get_regular_timestamp(date_string):
             try:
                 out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                 return out_date
-            except:
+            except ValueError:
                 return None
 
-        def get_date_from_string(in_text):
-            match = re.findall(regex_date_only, in_text)
-            if match:
-                try:
-                    out_date = datetime.strptime(match[0], "%B %d, %Y")
-                    match = re.findall(regex_time_only, in_text)
-                    if match:
-                        out_time = datetime.strptime(match[0], "%I:%M %p")
-                        return datetime.combine(out_date.date(), out_time.time())
-                    return out_date
-                except:
-                    return None
-
-        el = page_doc.xpath("//*[@class='timestamp'][1]")
+        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
+                              ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
-            return get_regular_timestamp(el[0].get('title'))
+            return get_regular_timestamp(el[0].get('datetime'))
         else:
-            el = page_doc.xpath(
-                "//*[@class='entry-date' or @class='post-date'][1]")
-            if len(el) == 1:
-                return get_date_from_string(el[0].text_content())
-            else:
-                return None
-
-    def get_all_data_feeds_from_page(self, page):
-        articles = []
-        exclude_titles_with = ['Winning numbers']
-
-        def title_excluded(title):
-            for text in exclude_titles_with:
-                if title.find(text) != -1:
-                    return True
-            return False
-
-        link_list = self.get_links_from_section_page(page[1])
-        self.log('from section: ', page[0],
-                 " found ", len(link_list), " links")
-        for link in link_list:
-            try:
-                article_doc = html.parse(link[0])
-                description = self.get_article_description_from_doc(
-                    article_doc)
-                parsed_date = self.get_published_time_from_doc(article_doc)
-                if parsed_date is not None and description is not None and \
-                        parsed_date.date() > self.earliest_date and \
-                        not title_excluded(link[1]):
-                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
-                    articles.append({'title': link[1], 'url': link[0],
-                                     'description': intro_date + description,
-                                     'date': ""})
-                    self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
-                             " description of " + str(len(description)) + ' characters at ' + link[0])
-                else:
-                    if parsed_date is None:
-                        msg = " No Timestamp Found"
-                    elif title_excluded(link[1]):
-                        msg = " Title Excluded"
-                    else:
-                        msg = " article older than " + \
-                            str(self.oldest_web_article) + ' days...'
-                    self.log("Skipping article: ", link[0], msg)
-            except:
-                print 'error on fetching ' + link[0]
-                continue
-        return articles
+            return None
+
+    def populate_article_metadata(self, article, soup, first):
+        """
+        Called when each HTML page belonging to article is downloaded.
+        Intended to be used to get article metadata like author/summary/etc.
+        from the parsed HTML (soup).
+
+        :param article: A object of class :class:`calibre.web.feeds.Article`.
+            If you change the summary, remember to also change the text_summary
+        :param soup: Parsed HTML belonging to this article
+        :param first: True iff the parsed HTML is the first page of the article.
+        """
+        summary = self.get_article_description_from_doc(soup)
+        article_date = self.get_published_time_from_doc(soup)
+        if article_date is not None:
+            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
+            article.date = article_timestamp
+            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
+            article.localtime = article.utctime.astimezone(local_tz)
+        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
+        article.summary = "{0}: {1}".format(summary_date, summary)
+        article.text_summary = clean_ascii_chars(article.summary)
 
     def parse_index(self):
 
         self.timefmt = ' [%a, %d %b, %Y]'
         self.log('starting parse_index: ', time.strftime(self.timestampfmt))
         feeds = []
-        for page in self.pages:
-            articles = self.get_all_data_feeds_from_page(page)
-            if articles:
-                feeds.append((page[0], articles))
+        sections = get_all_links_from_sections()
+        for section_id, article_list in sections.items():
+            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
+            articles = []
+            for article_info in article_list:
+                self.log("Adding {0} to feed".format(article_info[0]))
+                articles.append({'title': article_info[1], 'url': article_info[0],
+                                 'description': '', 'date': ""})
+            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
+            feeds.append((section_id, articles))
         self.log('finished parse_index: ', time.strftime(self.timestampfmt))
         return feeds
 
     def preprocess_html(self, soup):
-        tags_to_exclude = [('class', "caption staged"),
-                           ('style', "display:none")]
-        story_tag = soup.find(
-            name='div', attrs={'class': ['article-content', 'article-body']})
-        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})
-
-        def is_excluded(tag_to_check):
-            for attr in tag_to_check.attrs:
-                if attr in tags_to_exclude:
-                    return True
-            return False
-
-        def get_attr_startswith(attrs, this_key, this_valuestart):
-            starts_with = False
-            for attr in attrs:
-                if attr[0] == this_key:
-                    if attr[1].startswith(this_valuestart):
-                        starts_with = True
-            return starts_with
-
-        base_tags = []
-        if story_tag is not None:
-            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs))  # noqa
-        if blog_tag is not None:
-            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs))  # noqa
-
-        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
-        all_tags = []
-        all_tags.extend(base_tags)
-        if len(base_tags) > 0:
-            for tag in base_tags:
-                all_tags.extend(tag.findAll(True))
-
-            for tag in base_tags:
-                while tag.parent is not None and not is_excluded(tag):
-                    all_tags.append(tag)
-                    tag = tag.parent
-        for tag in soup.findAll(True):
-            if tag not in all_tags:
-                tag.extract()
         return soup
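Recipes like these can be exercised outside the calibre GUI with the ebook-convert tool, for example: ebook-convert bloomberg_columns.recipe .epub --test -vv. This invocation follows the calibre recipe-development documentation (it fetches a reduced set of articles with verbose logging) and is not part of this commit.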