Kovid Goyal 2018-07-13 08:16:38 +05:30
parent 17cc008148
commit b7181cfd37
GPG Key ID: 06BC317B515ACE7C
2 changed files with 43 additions and 40 deletions


@@ -5,18 +5,14 @@ __copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
-import re
-import time
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta
 from lxml import html, etree
 from StringIO import StringIO
 from calibre.web.feeds.recipes import BasicNewsRecipe
 import urllib2
-import traceback
 from collections import OrderedDict
 import calendar
-import sys
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre.ebooks.BeautifulSoup import Tag


 contributors_url = "https://www.bloomberg.com/view/contributors"
@@ -38,6 +34,7 @@ def get_article_parsed(this_url):
     parsed = html.parse(StringIO(content), parser)
     return parsed

+
 class BloombergContributor:
     _name = None
     _url_name = None
@@ -55,20 +52,23 @@ class BloombergContributor:
         return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

     def populate_article_list(self):
-        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
+        list_url = "{0}/{1}/{2}/articles".format(
+            contributors_url, self._url_code, self._url_name)
         parsed_list = get_article_parsed(list_url)
         articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
         for article in articles:
             headline = article.find('a')
             link = headline.attrib['href']
             title = headline.text.strip()
-            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
+            article_date_eles = article.xpath(
+                ".//span[contains(@class, 'time_3qQJR')]")
             if len(article_date_eles) > 0:
                 article_date_str = article_date_eles[0].text.strip()
                 article_date = self.parse_date_str(article_date_str)
             else:
                 article_date = None
-            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
+            summary_eles = article.xpath(
+                ".//p[contains(@class, 'summary_17SO6')]")
             if len(summary_eles) > 0:
                 summary = summary_eles[0].text.strip()
             else:
@@ -82,13 +82,14 @@ class BloombergContributor:
         # 2nd sunday March, 1st Sunday Nov
         c = calendar.Calendar(firstweekday=calendar.SUNDAY)
         march_cal = c.monthdatescalendar(article_date.year, 3)
-        dst_start = [day for week in march_cal for day in week if \
-                     day.weekday() == calendar.SUNDAY and \
-                     day.month == 3][1]
+        dst_start = [
+            day for week in march_cal for day in week
+            if day.weekday() == calendar.SUNDAY and day.month == 3
+        ][1]
         nov_cal = c.monthdatescalendar(article_date.year, 11)
-        dst_end = [day for week in nov_cal for day in week if \
-                   day.weekday() == calendar.SUNDAY and \
-                   day.month == 11][0]
+        dst_end = [day for week in nov_cal for day in week
+                   if day.weekday() == calendar.SUNDAY and day.month == 11
+                   ][0]
         dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
         dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
         if dst_start > article_date > dst_end:
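The reflowed comprehensions above pick the second Sunday of March and the first Sunday of November (the US DST boundaries) out of the week grids returned by calendar.Calendar.monthdatescalendar(). A standalone sketch of the same lookup, with the year 2018 assumed purely for illustration:

import calendar

c = calendar.Calendar(firstweekday=calendar.SUNDAY)
march_cal = c.monthdatescalendar(2018, 3)
# monthdatescalendar() pads each week with days from the neighbouring
# months, so the day.month test is needed before indexing the Sundays.
dst_start = [day for week in march_cal for day in week
             if day.weekday() == calendar.SUNDAY and day.month == 3][1]
nov_cal = c.monthdatescalendar(2018, 11)
dst_end = [day for week in nov_cal for day in week
           if day.weekday() == calendar.SUNDAY and day.month == 11][0]
print(dst_start, dst_end)  # 2018-03-11 2018-11-04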
@@ -97,14 +98,13 @@ class BloombergContributor:
         shift = timedelta(hours=5)
         return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())

     def parse_date_str(self, date_str):
         parsed = None
         for date_format in self.date_formats:
             try:
                 parsed = datetime.strptime(date_str[0:-4], date_format)
                 break
-            except Exception as ex:
+            except Exception:
                 pass
         return parsed
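parse_date_str() tries each pattern in self.date_formats in turn and keeps the first one strptime accepts; date_str[0:-4] appears to drop a trailing timezone tag. A minimal sketch of the same loop with an assumed format list and sample string (neither is taken from the recipe):

from datetime import datetime

date_formats = ["%b %d, %Y %I:%M %p", "%b %d, %Y"]  # assumed examples

def parse_date_str(date_str):
    parsed = None
    for date_format in date_formats:
        try:
            # [0:-4] strips a trailing tag such as ' EDT' before parsing
            parsed = datetime.strptime(date_str[0:-4], date_format)
            break
        except Exception:
            pass
    return parsed

print(parse_date_str("Jul 13, 2018 8:16 AM EDT"))  # 2018-07-13 08:16:00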
@@ -126,6 +126,7 @@ class BloombergContributor:
     def get_name(self):
         return self._name

+
 class BloombergContributors(BasicNewsRecipe):
     title = u'Bloomberg, Editorial Contributors'
     description = 'Articles from Bloomberg.com contributors'
@@ -135,8 +136,9 @@ class BloombergContributors(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'xmlns']
     keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
-    remove_tags = [dict(name='div', attrs=
-        {'class': ['share-article-button ', 'text-to-speech']})]  # note space...
+    # note space...
+    remove_tags = [
+        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
     oldest_article = 7.0
     ignore_duplicate_articles = {'url'}
     recursions = 0
@@ -150,7 +152,8 @@ class BloombergContributors(BasicNewsRecipe):
         for el in els:
             name = el.find("span").text.strip()  # name
             contibutor_items = el.attrib['href'].split('/')
-            contributor = BloombergContributor(name, contibutor_items[4], contibutor_items[3])
+            contributor = BloombergContributor(
+                name, contibutor_items[4], contibutor_items[3])
             contributor_list.append(contributor)
         for contributor in contributor_list:
             contributor.populate_article_list()
@@ -176,24 +179,26 @@ class BloombergContributors(BasicNewsRecipe):
         insert_tag.insert(0, parsed_time)
         soup.time.replaceWith(insert_tag)
         return soup

     def parse_index(self):
         self.timefmt = ' [%a, %d %b, %Y]'
-        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('starting parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         feeds = []
         feed_dict = OrderedDict()
         contributor_list = self.get_contributors_list()
-        self.log("Found {0:d} contibutors on main page".format(len(contributor_list)))
+        self.log("Found {0:d} contibutors on main page".format(
+            len(contributor_list)))
         for contributor in contributor_list:
             articles = contributor.get_ordered_article_feed()
             feed_dict.update(articles)
-        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
-        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
+        feed_dict = OrderedDict(
+            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
+        self.log("Found {0:d} linked articles from contributors".format(
+            len(feed_dict)))
         feeds.append(("Columns", list(feed_dict.values())))
         # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
-        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        self.log('finishing parse_index: {0}'.format(
+            datetime.now().strftime("%B %d, %Y %I:%M %p")))
         return feeds


@@ -9,26 +9,24 @@ import re
 import time
 import urllib2
 from StringIO import StringIO
-from datetime import datetime, timedelta, date
+from datetime import datetime
 import traceback
 import sys
 from collections import OrderedDict
-from datetime import datetime, timedelta, date
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.web.feeds import Article
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.BeautifulSoup import NavigableString
-from calibre.utils.date import dt_factory, utcnow, local_tz
+from calibre.utils.date import dt_factory, local_tz
 from lxml import html
 from lxml import etree

-regex_date_only = re.compile("""(?:January|February|March|April|
+regex_date_only = re.compile(r"""(?:January|February|March|April|
 {8}May|June|July|August|September|October|November|
 {8}December)\s[0-9]{1,2},\s20[01][0-9]""")
-regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-blog_regex = re.compile('post-\d+')
+regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile(r'post-\d+')

 pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                      ('business', ('/business/', ['sports'])),
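The only substantive change to the patterns above is the added r prefix: sequences such as \s, \d and \w are not valid string escapes, so plain literals rely on Python passing unknown escapes through unchanged, which has raised a DeprecationWarning since Python 3.6. A raw string hands the backslashes to the regex engine verbatim; matching behaviour is otherwise identical, e.g.:

import re

blog_regex = re.compile(r'post-\d+')          # raw string, no escape warning
print(bool(blog_regex.match('post-12345')))   # True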
@@ -40,7 +38,7 @@ base_url = "http://www.chron.com"
 xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
 contains(@class, 'wrapper') or
 contains(@class, 'contentGroups') or
 contains(@class, 'headline-list') or
 contains(@class, 'core-package sports') or
 contains(@class, 'news')]
 //a[contains(@class, 'hdn-analytics')]"""
@@ -116,7 +114,7 @@ def get_all_links_from_sections():
     article_set = set()
     final_dict = OrderedDict()
     for item in pages.items():
-        print "getting links from {0}".format(item[0])
+        print("getting links from {0}".format(item[0]))
         all_sections.append(get_links_from_section_page(item))
     for section in all_sections:
         section_id = section[0]
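The print statement is Python 2 only syntax; with a single argument the parenthesised form above prints the same text on Python 2 and becomes the real print() function on Python 3 (or on Python 2 with the __future__ import). A quick check using the first entry of the pages table defined earlier in this file:

from __future__ import print_function  # no-op on Python 3

item = ('news', ('/news/houston-texas/', ['business', 'sports']))
print("getting links from {0}".format(item[0]))  # getting links from news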
@@ -200,8 +198,8 @@ class HoustonChronicle(BasicNewsRecipe):
         except ValueError:
             return None

-        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
-                              ('itemprop', 'datePublished') in this_tag.attrs)
+        el = page_doc.findAll(
+            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
             return get_regular_timestamp(el[0].get('datetime'))
         else:
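findAll() accepts a callable that is run against every tag; here it matches <time> elements whose itemprop is datePublished. The tuple-membership test suggests the recipe runs against calibre's bundled BeautifulSoup 3 API, where tag.attrs is a list of (name, value) pairs; with stand-alone bs4 the same filter would be written against the attribute dict, roughly:

from bs4 import BeautifulSoup

page_doc = BeautifulSoup(
    '<time itemprop="datePublished" datetime="2018-07-13T08:16:00Z">July 13</time>',
    'html.parser')
el = page_doc.find_all(
    lambda this_tag: this_tag.name == 'time' and this_tag.get('itemprop') == 'datePublished')
print(el[0].get('datetime'))  # 2018-07-13T08:16:00Z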