diff --git a/recipes/bloomberg_columns.recipe b/recipes/bloomberg_columns.recipe
index fef294ab28..f43848d89d 100644
--- a/recipes/bloomberg_columns.recipe
+++ b/recipes/bloomberg_columns.recipe
@@ -5,18 +5,14 @@ __copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
-import re
-import time
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta
 from lxml import html, etree
 from StringIO import StringIO
 from calibre.web.feeds.recipes import BasicNewsRecipe
 import urllib2
-import traceback
 from collections import OrderedDict
 import calendar
-import sys
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre.ebooks.BeautifulSoup import Tag
 
 contributors_url = "https://www.bloomberg.com/view/contributors"
 
@@ -38,6 +34,7 @@ def get_article_parsed(this_url):
     parsed = html.parse(StringIO(content), parser)
     return parsed
 
+
 class BloombergContributor:
     _name = None
     _url_name = None
@@ -55,20 +52,23 @@ class BloombergContributor:
         return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))
 
     def populate_article_list(self):
-        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
+        list_url = "{0}/{1}/{2}/articles".format(
+            contributors_url, self._url_code, self._url_name)
         parsed_list = get_article_parsed(list_url)
         articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
         for article in articles:
             headline = article.find('a')
             link = headline.attrib['href']
             title = headline.text.strip()
-            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
+            article_date_eles = article.xpath(
+                ".//span[contains(@class, 'time_3qQJR')]")
             if len(article_date_eles) > 0:
                 article_date_str = article_date_eles[0].text.strip()
                 article_date = self.parse_date_str(article_date_str)
             else:
                 article_date = None
-            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
+            summary_eles = article.xpath(
+                ".//p[contains(@class, 'summary_17SO6')]")
             if len(summary_eles) > 0:
                 summary = summary_eles[0].text.strip()
             else:
@@ -82,13 +82,14 @@ class BloombergContributor:
         # 2nd sunday March, 1st Sunday Nov
         c = calendar.Calendar(firstweekday=calendar.SUNDAY)
         march_cal = c.monthdatescalendar(article_date.year, 3)
-        dst_start = [day for week in march_cal for day in week if \
-                     day.weekday() == calendar.SUNDAY and \
-                     day.month == 3][1]
+        dst_start = [
+            day for week in march_cal for day in week
+            if day.weekday() == calendar.SUNDAY and day.month == 3
+        ][1]
         nov_cal = c.monthdatescalendar(article_date.year, 11)
-        dst_end = [day for week in nov_cal for day in week if \
-                   day.weekday() == calendar.SUNDAY and \
-                   day.month == 11][0]
+        dst_end = [day for week in nov_cal for day in week
+                   if day.weekday() == calendar.SUNDAY and day.month == 11
+                   ][0]
         dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
         dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
         if dst_start > article_date > dst_end:
@@ -97,14 +98,13 @@ class BloombergContributor:
             shift = timedelta(hours=5)
         return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
 
-
     def parse_date_str(self, date_str):
         parsed = None
         for date_format in self.date_formats:
             try:
                 parsed = datetime.strptime(date_str[0:-4], date_format)
                 break
-            except Exception as ex:
+            except Exception:
                 pass
         return parsed
 
@@ -126,6 +126,7 @@ class BloombergContributor:
     def get_name(self):
         return self._name
 
+
 class BloombergContributors(BasicNewsRecipe):
     title = u'Bloomberg, Editorial Contributors'
     description = 'Articles from Bloomberg.com contributors'
@@ -135,8 +136,9 @@ class BloombergContributors(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'xmlns']
     keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
-    remove_tags = [dict(name='div', attrs=
-        {'class': ['share-article-button ', 'text-to-speech']})]  # note space...
+    # note space...
+    remove_tags = [
+        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
     oldest_article = 7.0
     ignore_duplicate_articles = {'url'}
     recursions = 0
@@ -150,7 +152,8 @@ class BloombergContributors(BasicNewsRecipe):
         for el in els:
             name = el.find("span").text.strip()  # name
             contibutor_items = el.attrib['href'].split('/')
-            contributor = BloombergContributor(name, contibutor_items[4], contibutor_items[3])
+            contributor = BloombergContributor(
+                name, contibutor_items[4], contibutor_items[3])
             contributor_list.append(contributor)
         for contributor in contributor_list:
             contributor.populate_article_list()
@@ -176,24 +179,26 @@ class BloombergContributors(BasicNewsRecipe):
             insert_tag.insert(0, parsed_time)
             soup.time.replaceWith(insert_tag)
 
-
         return soup
 
     def parse_index(self):
         self.timefmt = ' [%a, %d %b, %Y]'
-        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('starting parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         feeds = []
         feed_dict = OrderedDict()
         contributor_list = self.get_contributors_list()
-        self.log("Found {0:d} contibutors on main page".format(len(contributor_list)))
+        self.log("Found {0:d} contributors on main page".format(
+            len(contributor_list)))
         for contributor in contributor_list:
             articles = contributor.get_ordered_article_feed()
             feed_dict.update(articles)
-        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
-        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
+        feed_dict = OrderedDict(
+            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
+        self.log("Found {0:d} linked articles from contributors".format(
+            len(feed_dict)))
         feeds.append(("Columns", list(feed_dict.values())))
         # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
-        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('finishing parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         return feeds
-
-
diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe
index a203262dbb..a6545ca4ca 100644
--- a/recipes/houston_chronicle.recipe
+++ b/recipes/houston_chronicle.recipe
@@ -9,26 +9,24 @@
 import re
 import time
 import urllib2
 from StringIO import StringIO
-from datetime import datetime, timedelta, date
+from datetime import datetime
 import traceback
 import sys
 from collections import OrderedDict
-from datetime import datetime, timedelta, date
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.web.feeds import Article
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.BeautifulSoup import NavigableString
-from calibre.utils.date import dt_factory, utcnow, local_tz
+from calibre.utils.date import dt_factory, local_tz
 from lxml import html
 from lxml import etree
 
-regex_date_only = re.compile("""(?:January|February|March|April|
+regex_date_only = re.compile(r"""(?:January|February|March|April|
         May|June|July|August|September|October|November|
         December)\s[0-9]{1,2},\s20[01][0-9]""")
-regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-blog_regex = re.compile('post-\d+')
+regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile(r'post-\d+')
 
 pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                      ('business', ('/business/', ['sports'])),
@@ -40,7 +38,7 @@ base_url = "http://www.chron.com"
 xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
                      contains(@class, 'wrapper') or
                      contains(@class, 'contentGroups') or
-                     contains(@class, 'headline-list') or 
+                     contains(@class, 'headline-list') or
                      contains(@class, 'core-package sports') or
                      contains(@class, 'news')]
                      //a[contains(@class, 'hdn-analytics')]"""
@@ -116,7 +114,7 @@ def get_all_links_from_sections():
     article_set = set()
     final_dict = OrderedDict()
     for item in pages.items():
-        print "getting links from {0}".format(item[0])
+        print("getting links from {0}".format(item[0]))
         all_sections.append(get_links_from_section_page(item))
     for section in all_sections:
         section_id = section[0]
@@ -200,8 +198,8 @@ class HoustonChronicle(BasicNewsRecipe):
             except ValueError:
                 return None
 
-        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
-                              ('itemprop', 'datePublished') in this_tag.attrs)
+        el = page_doc.findAll(
+            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
             return get_regular_timestamp(el[0].get('datetime'))
         else:
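
Review note on the DST logic reflowed by the @@ -82,13 +82,14 @@ hunk of
bloomberg_columns.recipe: the list comprehensions find the second Sunday of
March and the first Sunday of November, the US DST boundaries. A minimal
standalone sketch of that calculation, for verification (the helper name
dst_window is mine, not the recipe's):

import calendar
from datetime import datetime

def dst_window(year):
    # monthdatescalendar() pads weeks with days from adjacent months, so
    # filter on day.month before indexing: [1] = 2nd Sunday of March,
    # [0] = 1st Sunday of November.
    c = calendar.Calendar(firstweekday=calendar.SUNDAY)
    dst_start = [day for week in c.monthdatescalendar(year, 3) for day in week
                 if day.weekday() == calendar.SUNDAY and day.month == 3][1]
    dst_end = [day for week in c.monthdatescalendar(year, 11) for day in week
               if day.weekday() == calendar.SUNDAY and day.month == 11][0]
    # Clocks roll forward at 02:00 and back at 01:00, matching the
    # datetime(..., 2) / datetime(..., 1) bounds used in the recipe.
    return (datetime(dst_start.year, dst_start.month, dst_start.day, 2),
            datetime(dst_end.year, dst_end.month, dst_end.day, 1))

print(dst_window(2018))  # DST 2018: starts Mar 11 02:00, ends Nov 4 01:00

Incidentally, the unchanged context line `if dst_start > article_date >
dst_end:` can never be true once both bounds are set, since the March
datetime always precedes the November one; `dst_start < article_date <
dst_end` was probably intended, which may merit a follow-up patch.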