Kovid Goyal 2018-07-13 08:16:38 +05:30
parent 17cc008148
commit b7181cfd37
GPG Key ID: 06BC317B515ACE7C
2 changed files with 43 additions and 40 deletions


@@ -5,18 +5,14 @@ __copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
-import re
-import time
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta
 from lxml import html, etree
 from StringIO import StringIO
 from calibre.web.feeds.recipes import BasicNewsRecipe
 import urllib2
-import traceback
 from collections import OrderedDict
 import calendar
-import sys
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 contributors_url = "https://www.bloomberg.com/view/contributors"
@@ -38,6 +34,7 @@ def get_article_parsed(this_url):
     parsed = html.parse(StringIO(content), parser)
     return parsed

+
 class BloombergContributor:
     _name = None
     _url_name = None
@@ -55,20 +52,23 @@ class BloombergContributor:
         return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

     def populate_article_list(self):
-        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
+        list_url = "{0}/{1}/{2}/articles".format(
+            contributors_url, self._url_code, self._url_name)
         parsed_list = get_article_parsed(list_url)
         articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
         for article in articles:
             headline = article.find('a')
             link = headline.attrib['href']
             title = headline.text.strip()
-            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
+            article_date_eles = article.xpath(
+                ".//span[contains(@class, 'time_3qQJR')]")
             if len(article_date_eles) > 0:
                 article_date_str = article_date_eles[0].text.strip()
                 article_date = self.parse_date_str(article_date_str)
             else:
                 article_date = None
-            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
+            summary_eles = article.xpath(
+                ".//p[contains(@class, 'summary_17SO6')]")
             if len(summary_eles) > 0:
                 summary = summary_eles[0].text.strip()
             else:
@@ -82,13 +82,14 @@ class BloombergContributor:
         # 2nd sunday March, 1st Sunday Nov
         c = calendar.Calendar(firstweekday=calendar.SUNDAY)
         march_cal = c.monthdatescalendar(article_date.year, 3)
-        dst_start = [day for week in march_cal for day in week if \
-                     day.weekday() == calendar.SUNDAY and \
-                     day.month == 3][1]
+        dst_start = [
+            day for week in march_cal for day in week
+            if day.weekday() == calendar.SUNDAY and day.month == 3
+        ][1]
         nov_cal = c.monthdatescalendar(article_date.year, 11)
-        dst_end = [day for week in nov_cal for day in week if \
-                   day.weekday() == calendar.SUNDAY and \
-                   day.month == 11][0]
+        dst_end = [day for week in nov_cal for day in week
+                   if day.weekday() == calendar.SUNDAY and day.month == 11
+                   ][0]
         dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
         dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
         if dst_start > article_date > dst_end:
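The reflowed comprehensions above pick the second Sunday of March and the first Sunday of November (the US DST boundaries) out of the week grids returned by calendar.Calendar.monthdatescalendar(). A standalone sketch of the same lookup, with the year 2018 assumed purely for illustration:

import calendar

c = calendar.Calendar(firstweekday=calendar.SUNDAY)
march_cal = c.monthdatescalendar(2018, 3)
# monthdatescalendar() pads each week with days from the neighbouring
# months, so the day.month test is needed before indexing the Sundays.
dst_start = [day for week in march_cal for day in week
             if day.weekday() == calendar.SUNDAY and day.month == 3][1]
nov_cal = c.monthdatescalendar(2018, 11)
dst_end = [day for week in nov_cal for day in week
           if day.weekday() == calendar.SUNDAY and day.month == 11][0]
print(dst_start, dst_end)  # 2018-03-11 2018-11-04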
@@ -97,14 +98,13 @@ class BloombergContributor:
         shift = timedelta(hours=5)
         return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())

     def parse_date_str(self, date_str):
         parsed = None
         for date_format in self.date_formats:
             try:
                 parsed = datetime.strptime(date_str[0:-4], date_format)
                 break
-            except Exception as ex:
+            except Exception:
                 pass
         return parsed
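parse_date_str() tries each pattern in self.date_formats in turn and keeps the first one strptime accepts; date_str[0:-4] appears to drop a trailing timezone tag. A minimal sketch of the same loop with an assumed format list and sample string (neither is taken from the recipe):

from datetime import datetime

date_formats = ["%b %d, %Y %I:%M %p", "%b %d, %Y"]  # assumed examples

def parse_date_str(date_str):
    parsed = None
    for date_format in date_formats:
        try:
            # [0:-4] strips a trailing tag such as ' EDT' before parsing
            parsed = datetime.strptime(date_str[0:-4], date_format)
            break
        except Exception:
            pass
    return parsed

print(parse_date_str("Jul 13, 2018 8:16 AM EDT"))  # 2018-07-13 08:16:00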
@@ -126,6 +126,7 @@ class BloombergContributor:
     def get_name(self):
         return self._name

+
 class BloombergContributors(BasicNewsRecipe):
     title = u'Bloomberg, Editorial Contributors'
     description = 'Articles from Bloomberg.com contributors'
@@ -135,8 +136,9 @@ class BloombergContributors(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'xmlns']
     keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
-    remove_tags = [dict(name='div', attrs=
-        {'class': ['share-article-button ', 'text-to-speech']})]  # note space...
+    # note space...
+    remove_tags = [
+        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
     oldest_article = 7.0
     ignore_duplicate_articles = {'url'}
     recursions = 0
@@ -150,7 +152,8 @@ class BloombergContributors(BasicNewsRecipe):
         for el in els:
             name = el.find("span").text.strip()  # name
             contibutor_items = el.attrib['href'].split('/')
-            contributor = BloombergContributor(name, contibutor_items[4], contibutor_items[3])
+            contributor = BloombergContributor(
+                name, contibutor_items[4], contibutor_items[3])
             contributor_list.append(contributor)
         for contributor in contributor_list:
             contributor.populate_article_list()
@@ -176,24 +179,26 @@ class BloombergContributors(BasicNewsRecipe):
         insert_tag.insert(0, parsed_time)
         soup.time.replaceWith(insert_tag)
         return soup

     def parse_index(self):
         self.timefmt = ' [%a, %d %b, %Y]'
-        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('starting parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         feeds = []
         feed_dict = OrderedDict()
         contributor_list = self.get_contributors_list()
-        self.log("Found {0:d} contibutors on main page".format(len(contributor_list)))
+        self.log("Found {0:d} contibutors on main page".format(
+            len(contributor_list)))
         for contributor in contributor_list:
             articles = contributor.get_ordered_article_feed()
             feed_dict.update(articles)
-        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
-        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
+        feed_dict = OrderedDict(
+            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
+        self.log("Found {0:d} linked articles from contributors".format(
+            len(feed_dict)))
         feeds.append(("Columns", list(feed_dict.values())))
         # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
-        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('finishing parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         return feeds


@@ -9,26 +9,24 @@ import re
 import time
 import urllib2
 from StringIO import StringIO
-from datetime import datetime, timedelta, date
+from datetime import datetime
 import traceback
 import sys
 from collections import OrderedDict
-from datetime import datetime, timedelta, date
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.web.feeds import Article
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.BeautifulSoup import NavigableString
-from calibre.utils.date import dt_factory, utcnow, local_tz
+from calibre.utils.date import dt_factory, local_tz
 from lxml import html
 from lxml import etree

-regex_date_only = re.compile("""(?:January|February|March|April|
+regex_date_only = re.compile(r"""(?:January|February|March|April|
 {8}May|June|July|August|September|October|November|
 {8}December)\s[0-9]{1,2},\s20[01][0-9]""")
-regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-blog_regex = re.compile('post-\d+')
+regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile(r'post-\d+')

 pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                      ('business', ('/business/', ['sports'])),
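The only substantive change to the patterns above is the added r prefix: sequences such as \s, \d and \w are not valid string escapes, so plain literals rely on Python passing unknown escapes through unchanged, which has raised a DeprecationWarning since Python 3.6. A raw string hands the backslashes to the regex engine verbatim; matching behaviour is otherwise identical, e.g.:

import re

blog_regex = re.compile(r'post-\d+')          # raw string, no escape warning
print(bool(blog_regex.match('post-12345')))   # True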
@@ -40,7 +38,7 @@ base_url = "http://www.chron.com"
 xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
 contains(@class, 'wrapper') or
 contains(@class, 'contentGroups') or
 contains(@class, 'headline-list') or
 contains(@class, 'core-package sports') or
 contains(@class, 'news')]
 //a[contains(@class, 'hdn-analytics')]"""
@@ -116,7 +114,7 @@ def get_all_links_from_sections():
     article_set = set()
     final_dict = OrderedDict()
     for item in pages.items():
-        print "getting links from {0}".format(item[0])
+        print("getting links from {0}".format(item[0]))
         all_sections.append(get_links_from_section_page(item))
     for section in all_sections:
         section_id = section[0]
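The print statement is Python 2 only syntax; with a single argument the parenthesised form above prints the same text on Python 2 and becomes the real print() function on Python 3 (or on Python 2 with the __future__ import). A quick check using the first entry of the pages table defined earlier in this file:

from __future__ import print_function  # no-op on Python 3

item = ('news', ('/news/houston-texas/', ['business', 'sports']))
print("getting links from {0}".format(item[0]))  # getting links from news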
@@ -200,8 +198,8 @@ class HoustonChronicle(BasicNewsRecipe):
         except ValueError:
             return None

-        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
-                              ('itemprop', 'datePublished') in this_tag.attrs)
+        el = page_doc.findAll(
+            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
             return get_regular_timestamp(el[0].get('datetime'))
         else:
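findAll() accepts a callable that is run against every tag; here it matches <time> elements whose itemprop is datePublished. The tuple-membership test suggests the recipe runs against calibre's bundled BeautifulSoup 3 API, where tag.attrs is a list of (name, value) pairs; with stand-alone bs4 the same filter would be written against the attribute dict, roughly:

from bs4 import BeautifulSoup

page_doc = BeautifulSoup(
    '<time itemprop="datePublished" datetime="2018-07-13T08:16:00Z">July 13</time>',
    'html.parser')
el = page_doc.find_all(
    lambda this_tag: this_tag.name == 'time' and this_tag.get('itemprop') == 'datePublished')
print(el[0].get('datetime'))  # 2018-07-13T08:16:00Z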