Kovid Goyal 2018-07-13 08:16:38 +05:30
parent 17cc008148
commit b7181cfd37
GPG Key ID: 06BC317B515ACE7C (no known key found for this signature in database)
2 changed files with 43 additions and 40 deletions

View File

@@ -5,18 +5,14 @@ __copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
bloomberg.com contributors
'''
import re
import time
from datetime import datetime, timedelta, date
from datetime import datetime, timedelta
from lxml import html, etree
from StringIO import StringIO
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
import traceback
from collections import OrderedDict
import calendar
import sys
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.BeautifulSoup import Tag
contributors_url = "https://www.bloomberg.com/view/contributors"
@@ -38,6 +34,7 @@ def get_article_parsed(this_url):
parsed = html.parse(StringIO(content), parser)
return parsed
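For orientation, this hunk shows only the tail of get_article_parsed. A minimal sketch of the whole helper, assuming (the fetch is not shown in this diff) that the page is retrieved with urllib2 and parsed with an explicit UTF-8 lxml parser:

import urllib2
from StringIO import StringIO
from lxml import html

def get_article_parsed(this_url):
    # Fetch the raw page (Python 2 idioms, matching the recipe's imports).
    content = urllib2.urlopen(this_url).read()
    # Wrap the string in a file-like object for lxml; utf-8 is an assumption.
    parser = html.HTMLParser(encoding='utf-8')
    parsed = html.parse(StringIO(content), parser)
    return parsed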
class BloombergContributor:
_name = None
_url_name = None
@@ -55,20 +52,23 @@ class BloombergContributor:
return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))
def populate_article_list(self):
list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
list_url = "{0}/{1}/{2}/articles".format(
contributors_url, self._url_code, self._url_name)
parsed_list = get_article_parsed(list_url)
articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
for article in articles:
headline = article.find('a')
link = headline.attrib['href']
title = headline.text.strip()
article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
article_date_eles = article.xpath(
".//span[contains(@class, 'time_3qQJR')]")
if len(article_date_eles) > 0:
article_date_str = article_date_eles[0].text.strip()
article_date = self.parse_date_str(article_date_str)
else:
article_date = None
summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
summary_eles = article.xpath(
".//p[contains(@class, 'summary_17SO6')]")
if len(summary_eles) > 0:
summary = summary_eles[0].text.strip()
else:
@@ -82,13 +82,14 @@ class BloombergContributor:
# US DST: second Sunday of March through first Sunday of November
c = calendar.Calendar(firstweekday=calendar.SUNDAY)
march_cal = c.monthdatescalendar(article_date.year, 3)
dst_start = [day for week in march_cal for day in week if \
day.weekday() == calendar.SUNDAY and \
day.month == 3][1]
dst_start = [
day for week in march_cal for day in week
if day.weekday() == calendar.SUNDAY and day.month == 3
][1]
nov_cal = c.monthdatescalendar(article_date.year, 11)
dst_end = [day for week in nov_cal for day in week if \
day.weekday() == calendar.SUNDAY and \
day.month == 11][0]
dst_end = [day for week in nov_cal for day in week
if day.weekday() == calendar.SUNDAY and day.month == 11
][0]
dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
if dst_start > article_date > dst_end:
@@ -97,14 +98,13 @@ class BloombergContributor:
shift = timedelta(hours=5)
return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
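The arithmetic above implements the standard US DST rule: clocks go forward on the second Sunday of March and back on the first Sunday of November. A self-contained sketch of the boundary computation this hunk reformats:

import calendar
from datetime import datetime

def us_dst_bounds(year):
    c = calendar.Calendar(firstweekday=calendar.SUNDAY)
    # monthdatescalendar pads weeks with days from adjacent months,
    # so filter on day.month before indexing the Sundays.
    march = c.monthdatescalendar(year, 3)
    second_sunday = [d for week in march for d in week
                     if d.weekday() == calendar.SUNDAY and d.month == 3][1]
    november = c.monthdatescalendar(year, 11)
    first_sunday = [d for week in november for d in week
                    if d.weekday() == calendar.SUNDAY and d.month == 11][0]
    # DST starts at 2:00 and the offset reverts at 1:00, as in the recipe.
    return (datetime(year, 3, second_sunday.day, 2),
            datetime(year, 11, first_sunday.day, 1))

print(us_dst_bounds(2018))  # March 11 and November 4 for 2018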
def parse_date_str(self, date_str):
parsed = None
for date_format in self.date_formats:
try:
parsed = datetime.strptime(date_str[0:-4], date_format)
break
except Exception as ex:
except Exception:
pass
return parsed
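parse_date_str tries each format in turn; the [0:-4] slice appears to drop a four-character zone suffix such as ' EDT'. A usage sketch in which the format string and sample timestamp are assumptions, since the recipe's date_formats list is not part of this diff:

from datetime import datetime

date_formats = ['%b %d, %Y %I:%M %p']  # assumed; the real list is elided

def parse_date_str(date_str):
    for date_format in date_formats:
        try:
            # Drop a 4-character suffix such as ' EDT' before parsing.
            return datetime.strptime(date_str[0:-4], date_format)
        except ValueError:
            pass
    return None

print(parse_date_str('Jul 13, 2018 8:16 AM EDT'))  # 2018-07-13 08:16:00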
@@ -126,6 +126,7 @@ class BloombergContributor:
def get_name(self):
return self._name
class BloombergContributors(BasicNewsRecipe):
title = u'Bloomberg, Editorial Contributors'
description = 'Articles from Bloomberg.com contributors'
@@ -135,8 +136,9 @@ class BloombergContributors(BasicNewsRecipe):
no_stylesheets = True
remove_attributes = ['style', 'xmlns']
keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
remove_tags = [dict(name='div', attrs=
{'class': ['share-article-button ', 'text-to-speech']})]  # note the trailing space in 'share-article-button '
# note the trailing space in 'share-article-button '
remove_tags = [
dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
oldest_article = 7.0
ignore_duplicate_articles = {'url'}
recursions = 0
@@ -150,7 +152,8 @@ class BloombergContributors(BasicNewsRecipe):
for el in els:
name = el.find("span").text.strip()  # contributor's display name
contibutor_items = el.attrib['href'].split('/')
contributor = BloombergContributor(name, contibutor_items[4], contibutor_items[3])
contributor = BloombergContributor(
name, contibutor_items[4], contibutor_items[3])
contributor_list.append(contributor)
for contributor in contributor_list:
contributor.populate_article_list()
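The index-based split assumes contributor hrefs shaped like /view/contributors/<code>/<slug>; that shape is inferred from contributors_url and from how _url_code and _url_name are recombined above, not stated in the diff:

# Hypothetical href in the shape this recipe appears to expect.
href = '/view/contributors/ABC123xyz/sample-writer'
parts = href.split('/')
# parts -> ['', 'view', 'contributors', 'ABC123xyz', 'sample-writer']
print(parts[3], parts[4])  # the two path pieces handed to BloombergContributor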
@@ -176,24 +179,26 @@ class BloombergContributors(BasicNewsRecipe):
insert_tag.insert(0, parsed_time)
soup.time.replaceWith(insert_tag)
return soup
def parse_index(self):
self.timefmt = ' [%a, %d %b, %Y]'
self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
self.log('starting parse_index: {0}'.format(
datetime.now().strftime("%B %d, %Y %I:%M %p")))
feeds = []
feed_dict = OrderedDict()
contributor_list = self.get_contributors_list()
self.log("Found {0:d} contibutors on main page".format(len(contributor_list)))
self.log("Found {0:d} contibutors on main page".format(
len(contributor_list)))
for contributor in contributor_list:
articles = contributor.get_ordered_article_feed()
feed_dict.update(articles)
feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
feed_dict = OrderedDict(
sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
self.log("Found {0:d} linked articles from contributors".format(
len(feed_dict)))
feeds.append(("Columns", list(feed_dict.values())))
# self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
self.log('finishing parse_index: {0}'.format(
datetime.now().strftime("%B %d, %Y %I:%M %p")))
return feeds
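For what the feed_dict re-sort accomplishes: assuming the keys sort chronologically (e.g. date strings or timestamps), entries come back newest-first. A tiny illustration with made-up keys:

from collections import OrderedDict

feed_dict = OrderedDict([('2018-07-10', ['a']), ('2018-07-12', ['b'])])
feed_dict = OrderedDict(
    sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
print(list(feed_dict))  # ['2018-07-12', '2018-07-10'] -- newest first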

View File

@@ -9,26 +9,24 @@ import re
import time
import urllib2
from StringIO import StringIO
from datetime import datetime, timedelta, date
from datetime import datetime
import traceback
import sys
from collections import OrderedDict
from datetime import datetime, timedelta, date
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Article
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, utcnow, local_tz
from calibre.utils.date import dt_factory, local_tz
from lxml import html
from lxml import etree
regex_date_only = re.compile("""(?:January|February|March|April|
regex_date_only = re.compile(r"""(?:January|February|March|April|
{8}May|June|July|August|September|October|November|
{8}December)\s[0-9]{1,2},\s20[01][0-9]""")
regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile('post-\d+')
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile(r'post-\d+')
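The r prefixes added above matter because sequences like \d and \S are invalid string escapes: Python 2 happens to pass them through unchanged, while Python 3 emits a DeprecationWarning (and later versions an error). Raw strings hand the pattern to the regex engine verbatim. A minimal illustration:

import re

# Same pattern either way today, but only the raw string is future-proof.
blog_regex = re.compile(r'post-\d+')
assert blog_regex.search('article post-42 comments') is not None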
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
('business', ('/business/', ['sports'])),
@@ -40,7 +38,7 @@ base_url = "http://www.chron.com"
xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
contains(@class, 'wrapper') or
contains(@class, 'contentGroups') or
contains(@class, 'headline-list') or
contains(@class, 'core-package sports') or
contains(@class, 'news')]
//a[contains(@class, 'hdn-analytics')]"""
@@ -116,7 +114,7 @@ def get_all_links_from_sections():
article_set = set()
final_dict = OrderedDict()
for item in pages.items():
print "getting links from {0}".format(item[0])
print("getting links from {0}".format(item[0]))
all_sections.append(get_links_from_section_page(item))
for section in all_sections:
section_id = section[0]
@@ -200,8 +198,8 @@ class HoustonChronicle(BasicNewsRecipe):
except ValueError:
return None
el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
('itemprop', 'datePublished') in this_tag.attrs)
el = page_doc.findAll(
lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
if len(el) == 1:
return get_regular_timestamp(el[0].get('datetime'))
else: