mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

pep8

commit b7181cfd37, parent 17cc008148
@@ -5,18 +5,14 @@ __copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
-import re
-import time
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta
 from lxml import html, etree
 from StringIO import StringIO
 from calibre.web.feeds.recipes import BasicNewsRecipe
 import urllib2
-import traceback
 from collections import OrderedDict
 import calendar
-import sys
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 contributors_url = "https://www.bloomberg.com/view/contributors"
@@ -38,6 +34,7 @@ def get_article_parsed(this_url):
     parsed = html.parse(StringIO(content), parser)
     return parsed

+
 class BloombergContributor:
     _name = None
     _url_name = None
@@ -55,20 +52,23 @@ class BloombergContributor:
         return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

     def populate_article_list(self):
-        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
+        list_url = "{0}/{1}/{2}/articles".format(
+            contributors_url, self._url_code, self._url_name)
         parsed_list = get_article_parsed(list_url)
         articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
         for article in articles:
             headline = article.find('a')
             link = headline.attrib['href']
             title = headline.text.strip()
-            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
+            article_date_eles = article.xpath(
+                ".//span[contains(@class, 'time_3qQJR')]")
             if len(article_date_eles) > 0:
                 article_date_str = article_date_eles[0].text.strip()
                 article_date = self.parse_date_str(article_date_str)
             else:
                 article_date = None
-            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
+            summary_eles = article.xpath(
+                ".//p[contains(@class, 'summary_17SO6')]")
             if len(summary_eles) > 0:
                 summary = summary_eles[0].text.strip()
             else:
@@ -82,13 +82,14 @@ class BloombergContributor:
         # 2nd sunday March, 1st Sunday Nov
         c = calendar.Calendar(firstweekday=calendar.SUNDAY)
         march_cal = c.monthdatescalendar(article_date.year, 3)
-        dst_start = [day for week in march_cal for day in week if \
-                     day.weekday() == calendar.SUNDAY and \
-                     day.month == 3][1]
+        dst_start = [
+            day for week in march_cal for day in week
+            if day.weekday() == calendar.SUNDAY and day.month == 3
+        ][1]
         nov_cal = c.monthdatescalendar(article_date.year, 11)
-        dst_end = [day for week in nov_cal for day in week if \
-                   day.weekday() == calendar.SUNDAY and \
-                   day.month == 11][0]
+        dst_end = [day for week in nov_cal for day in week
+                   if day.weekday() == calendar.SUNDAY and day.month == 11
+                   ][0]
         dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
         dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
         if dst_start > article_date > dst_end:
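The hunk above only reflows the DST lookup; as a standalone sketch of the same calendar trick (the function name and the 2018 check are illustrative, not part of the recipe), finding the US daylight-saving window looks like this:

import calendar
from datetime import datetime


def us_dst_window(year):
    # With firstweekday=SUNDAY, monthdatescalendar() yields whole weeks of
    # date objects, padded with days from neighbouring months, so filtering
    # on day.month keeps only the Sundays that fall inside the target month.
    c = calendar.Calendar(firstweekday=calendar.SUNDAY)
    march = c.monthdatescalendar(year, 3)
    november = c.monthdatescalendar(year, 11)
    # Second Sunday of March, 2:00 local -> DST starts (US rule).
    dst_start = [day for week in march for day in week
                 if day.weekday() == calendar.SUNDAY and day.month == 3][1]
    # First Sunday of November, 1:00 local -> DST ends.
    dst_end = [day for week in november for day in week
               if day.weekday() == calendar.SUNDAY and day.month == 11][0]
    return (datetime(dst_start.year, dst_start.month, dst_start.day, 2),
            datetime(dst_end.year, dst_end.month, dst_end.day, 1))


print(us_dst_window(2018))  # 2018-03-11 02:00 and 2018-11-04 01:00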
@@ -97,14 +98,13 @@ class BloombergContributor:
             shift = timedelta(hours=5)
         return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())

-
     def parse_date_str(self, date_str):
         parsed = None
         for date_format in self.date_formats:
             try:
                 parsed = datetime.strptime(date_str[0:-4], date_format)
                 break
-            except Exception as ex:
+            except Exception:
                 pass
         return parsed

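The only wording change here is dropping the unused "as ex" binding; the parse loop itself is untouched. A minimal standalone sketch of that pattern, assuming two made-up format strings and a made-up sample value (the recipe keeps its real list in self.date_formats):

from datetime import datetime

# Assumed formats for illustration only; the recipe defines its own date_formats.
date_formats = ["%b %d, %Y %I:%M %p", "%B %d, %Y %I:%M %p"]


def parse_date_str(date_str):
    parsed = None
    for date_format in date_formats:
        try:
            # The recipe slices off a trailing timezone token (e.g. " EST")
            # before handing the string to strptime.
            parsed = datetime.strptime(date_str[0:-4], date_format)
            break
        except Exception:
            pass
    return parsed


print(parse_date_str("Mar 11, 2018 9:30 AM EST"))  # 2018-03-11 09:30:00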
@@ -126,6 +126,7 @@ class BloombergContributor:
     def get_name(self):
         return self._name


+
 class BloombergContributors(BasicNewsRecipe):
     title = u'Bloomberg, Editorial Contributors'
     description = 'Articles from Bloomberg.com contributors'
@@ -135,8 +136,9 @@ class BloombergContributors(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'xmlns']
     keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
-    remove_tags = [dict(name='div', attrs=
-                   {'class': ['share-article-button ', 'text-to-speech']})]  # note space...
+    # note space...
+    remove_tags = [
+        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
     oldest_article = 7.0
     ignore_duplicate_articles = {'url'}
     recursions = 0
@@ -150,7 +152,8 @@ class BloombergContributors(BasicNewsRecipe):
         for el in els:
             name = el.find("span").text.strip()  # name
             contibutor_items = el.attrib['href'].split('/')
-            contributor = BloombergContributor(name, contibutor_items[4], contibutor_items[3])
+            contributor = BloombergContributor(
+                name, contibutor_items[4], contibutor_items[3])
             contributor_list.append(contributor)
         for contributor in contributor_list:
             contributor.populate_article_list()
@@ -176,24 +179,26 @@ class BloombergContributors(BasicNewsRecipe):
             insert_tag.insert(0, parsed_time)
             soup.time.replaceWith(insert_tag)
-
-
         return soup

     def parse_index(self):
         self.timefmt = ' [%a, %d %b, %Y]'
-        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('starting parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         feeds = []
         feed_dict = OrderedDict()
         contributor_list = self.get_contributors_list()
-        self.log("Found {0:d} contibutors on main page".format(len(contributor_list)))
+        self.log("Found {0:d} contibutors on main page".format(
+            len(contributor_list)))
         for contributor in contributor_list:
             articles = contributor.get_ordered_article_feed()
             feed_dict.update(articles)
-        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
-        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
+        feed_dict = OrderedDict(
+            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
+        self.log("Found {0:d} linked articles from contributors".format(
+            len(feed_dict)))
         feeds.append(("Columns", list(feed_dict.values())))
         # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
-        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('finishing parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         return feeds

-
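For reference, the reflowed feed_dict line sorts the collected articles newest-first by their key, which appears to be a publication timestamp. A toy version with made-up data:

from collections import OrderedDict

# Keys stand in for publication timestamps; values stand in for article dicts.
feed_dict = OrderedDict()
feed_dict[1520000000.0] = {'title': 'older column'}
feed_dict[1525000000.0] = {'title': 'newer column'}

feed_dict = OrderedDict(
    sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
print([a['title'] for a in feed_dict.values()])  # ['newer column', 'older column']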
@@ -9,26 +9,24 @@ import re
 import time
 import urllib2
 from StringIO import StringIO
-from datetime import datetime, timedelta, date
+from datetime import datetime
 import traceback
 import sys
 from collections import OrderedDict

-from datetime import datetime, timedelta, date
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.web.feeds import Article
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.BeautifulSoup import NavigableString
-from calibre.utils.date import dt_factory, utcnow, local_tz
+from calibre.utils.date import dt_factory, local_tz
 from lxml import html
 from lxml import etree

-regex_date_only = re.compile("""(?:January|February|March|April|
+regex_date_only = re.compile(r"""(?:January|February|March|April|
 {8}May|June|July|August|September|October|November|
 {8}December)\s[0-9]{1,2},\s20[01][0-9]""")
-regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-blog_regex = re.compile('post-\d+')
+regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile(r'post-\d+')

 pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                      ('business', ('/business/', ['sports'])),
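The regex edits above only add the r prefix. In a plain string, escapes like \d and \w are invalid string escape sequences that newer Python versions warn about, while a raw string passes the backslashes straight to the regex engine. A quick check, with made-up sample inputs:

import re

# Raw strings keep backslash escapes intact for the regex engine.
blog_regex = re.compile(r'post-\d+')
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")

print(bool(blog_regex.search('class="post-12345"')))   # True
print(bool(regex_time_only.search('Updated 9:30 am')))  # True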
@@ -40,7 +38,7 @@ base_url = "http://www.chron.com"
 xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
 contains(@class, 'wrapper') or
 contains(@class, 'contentGroups') or
 contains(@class, 'headline-list') or
 contains(@class, 'core-package sports') or
 contains(@class, 'news')]
 //a[contains(@class, 'hdn-analytics')]"""
@@ -116,7 +114,7 @@ def get_all_links_from_sections():
     article_set = set()
     final_dict = OrderedDict()
     for item in pages.items():
-        print "getting links from {0}".format(item[0])
+        print("getting links from {0}".format(item[0]))
         all_sections.append(get_links_from_section_page(item))
     for section in all_sections:
         section_id = section[0]
@@ -200,8 +198,8 @@ class HoustonChronicle(BasicNewsRecipe):
         except ValueError:
             return None

-        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
-                              ('itemprop', 'datePublished') in this_tag.attrs)
+        el = page_doc.findAll(
+            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
             return get_regular_timestamp(el[0].get('datetime'))
         else: