#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.date import dt_factory, local_tz
from datetime import datetime, timedelta, date
from lxml import html


class HoustonChronicle(BasicNewsRecipe):

    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    # use_embedded_content = False
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    ignore_duplicate_articles = {'url'}
    # 'style' and 'xmlns' were previously assigned in two separate
    # statements, so the second silently overwrote the first
    remove_attributes = ['style', 'xmlns']

    remove_tags = [
        dict(name='div', attrs={'class': 'socialBar'}),
        dict(name='div', attrs={'class': re.compile('post-commentmeta')}),
        dict(name='div', attrs={'class': re.compile('slideshow_wrapper')}),
        dict(name='div', attrs={'class': 'entry-summary'}),
        dict(name='a', attrs={'rel': 'item-license'}),
    ]

    baseUrl = 'http://www.chron.com'

    oldest_web_article = 7.0

    if oldest_web_article is None:
        earliest_date = date.today()
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)

    pages = [('news', '/news/houston-texas/'),
             ('business', '/business/'),
             ('opinion', '/opinion/'),
             ('sports', '/sports/')]

    def getLinksFromSectionPage(self, sectionUrl):
        # Collect (url, title) pairs from a section front page, skipping
        # image-only links and links that open in a new window
        pageDoc = html.parse(sectionUrl)
        els = pageDoc.xpath("""//div[contains(@class, 'scp-item')
            or @class='scp-feature' or contains(@class, 'simplelist')
            or contains(@class, 'scp-blogpromo')]
            //a[@href and not(@target) and not(child::img)]""")
        elList = []
        for el in els:
            link = el.get('href')
            title = el.text
            if link[:4] != 'http':
                link = self.baseUrl + link
            if title is not None:
                elList.append((link, title))
        return elList

    def getArticleDescriptionFromDoc(self, pageDoc):
        # Build a short description from the first sentences of the article
        # body, capped at descriptionMaxChars characters
        descriptionCharsBreak = 140
        descriptionMaxChars = 300
        descXpath = """//div[contains(@class, 'article-body') or
            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
        sentenceRegex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")

        def stringify_children(node):
            return ''.join(node.itertext())

        try:
            els = pageDoc.xpath(descXpath)
            outText = ""
            ellipsis = ""
            for el in els:
                sentences = re.findall(sentenceRegex, stringify_children(el))
                for sentence in sentences:
                    if len(outText) < descriptionCharsBreak:
                        outText += sentence + " "
                    else:
                        if len(outText) > descriptionMaxChars:
                            ellipsis = "..."
                        return outText[:descriptionMaxChars] + ellipsis
            return outText
        except Exception:
            self.log('Error on Article Description')
            return ""

    def getPublishedTimeFromDoc(self, pageDoc):
        # The date pattern is a single raw string: the earlier triple-quoted,
        # line-wrapped version embedded literal newlines and spaces inside the
        # month alternation, so several month names could never match
        regexDateOnly = re.compile(
            r"(?:January|February|March|April|May|June|July|August"
            r"|September|October|November|December)\s[0-9]{1,2},\s20[01][0-9]")
        regexTimeOnly = re.compile(r"[0-9]{1,2}:[0-9]{1,2} \w{2}")

        def getRegularTimestamp(dateString):
            try:
                return datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
            except Exception:
                return None

        def getDateFromString(inText):
            match = re.findall(regexDateOnly, inText)
            if not match:
                return None
            try:
                outDate = datetime.strptime(match[0], "%B %d, %Y")
                match = re.findall(regexTimeOnly, inText)
                if match:
                    outTime = datetime.strptime(match[0], "%I:%M %p")
                    return datetime.combine(outDate.date(), outTime.time())
                return outDate
            except Exception:
                return None

        el = pageDoc.xpath("//*[@class='timestamp'][1]")
        if len(el) == 1:
            return getRegularTimestamp(el[0].get('title'))
        el = pageDoc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
        if len(el) == 1:
            return getDateFromString(el[0].text_content())
        return None

    def getAllFeedDataFromPage(self, page):
        articles = []
        linkList = self.getLinksFromSectionPage(self.baseUrl + page[1])
        self.log('from section: ', page[0], " found ", len(linkList), " links")
        for link in linkList:
            try:
                articleDoc = html.parse(link[0])
                description = self.getArticleDescriptionFromDoc(articleDoc)
                articleDate = self.getPublishedTimeFromDoc(articleDoc)
                if (articleDate is not None and description is not None and
                        articleDate.date() > self.earliest_date):
                    dateText = articleDate.strftime('%a, %d %b')
                    # the full timestamp rides along in the author field and is
                    # recovered later in populate_article_metadata
                    author = articleDate.strftime(self.timestampfmt)
                    articles.append({'title': link[1], 'url': link[0],
                                     'description': description,
                                     'date': dateText, 'author': author})
                    self.log(page[0] + ": " + link[1] + ', from ' + dateText +
                             " description of " + str(len(description)) +
                             ' characters at ' + link[0])
                else:
                    if articleDate is None:
                        msg = " No Timestamp Found"
                    else:
                        msg = (" article older than " +
                               str(self.oldest_web_article) + ' days...')
                    self.log("Skipping article: ", link[0], msg)
            except Exception:
                self.log('error on fetching ' + link[0])
                continue
        return articles

    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        for page in self.pages:
            articles = self.getAllFeedDataFromPage(page)
            if articles:
                feeds.append((page[0], articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds

    def preprocess_html(self, thisSoup):
        # Keep only the article containers, their descendants and ancestors,
        # and any stray paragraphs; strip every other tag from the page
        baseTags = []
        baseTags.extend(thisSoup.findAll(name='div',
                                         attrs={'id': re.compile(r'post-\d+')}))
        baseTags.extend(thisSoup.findAll(name='div',
                                         attrs={'class': 'hnews hentry item'}))
        allTags = []
        allTags.extend(baseTags)
        if len(baseTags) > 0:
            for tag in baseTags:
                allTags.extend(tag.findAll(True))
        paragraphs = thisSoup.findAll(name='p')
        for paragraph in paragraphs:
            if paragraph not in allTags:
                allTags.append(paragraph)
        for tag in baseTags:
            while tag.parent is not None:
                allTags.append(tag)
                tag = tag.parent
        for tag in thisSoup.findAll(True):
            if tag not in allTags:
                tag.extract()
        return thisSoup

    def populate_article_metadata(self, article, soup, first):
        if not first:
            return
        try:
            # recover the timestamp stashed in the author field by
            # getAllFeedDataFromPage and convert it to calibre's date types
            article.date = time.strptime(article.author, self.timestampfmt)
            article.utctime = dt_factory(article.date, assume_utc=False,
                                         as_utc=False)
            article.localtime = article.utctime.astimezone(local_tz)
        except Exception as inst:  # remove after debug
            self.log('Exception: ', article.title)  # remove after debug
            self.log(type(inst))  # remove after debug
            self.log(inst)  # remove after debug
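
# Development note (a sketch, not part of the recipe logic): assuming calibre's
# command-line tools are installed and this file is saved as
# houston_chronicle.recipe (the filename is hypothetical), the recipe can be
# exercised without the GUI via
#
#   ebook-convert houston_chronicle.recipe .epub --test -vv
#
# where --test limits the run to two articles from at most two feeds and -vv
# echoes the self.log() calls above to the console.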