from __future__ import print_function

import os
import time
import traceback
# traceback is imported for debugging via stack dumps.

from calibre.web.feeds.recipes import BasicNewsRecipe
# BeautifulSoup is the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds import feeds_from_index
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending

# To Do: strip ads and graphics; the Current Column entry lacks a title.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is
# covered by other entries.
# Newsletters: Talking Points Memos are covered by category 12.
# Run with: ./ebook-convert  --username xxx --password xxx
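# A fuller invocation might look like this (the recipe filename and output
# path are hypothetical examples, not part of this recipe):
#   ebook-convert oreilly_premium.recipe oreilly_premium.epub --username xxx --password xxx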

# This recipe is derived from BasicNewsRecipe, so it can only overload that
# class's methods.  Some of what we need is otherwise held in the article
# objects, so we have more copying to do than we otherwise would.
class OReillyPremium(BasicNewsRecipe):
    __author__ = 'TMcN'
    description = 'Retrieves Premium and News Letter content from BillOReilly.com.  Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    custom_title = 'Bill O\'Reilly Premium - ' + time.strftime('%d %b %Y')
    title = 'Bill O\'Reilly Premium'
    auto_cleanup = True
    conversion_options = {'linearize_tables': True}
    encoding = 'utf8'
    language = 'en'
    no_stylesheets = True
    needs_subscription = True
    oldest_article = 31
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages.
    recursions = 0
    max_articles_per_feed = 20

    debugMessages = True

    # Each catList entry: Name, URL, Soup findAll tag name and attrs (the
    # last two fields are special cases), articleList.
    catList = [["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive',
                'a', {'class': ['showLinks', 'homeLinks']}, []],
               ["Current Column", 'https://www.billoreilly.com/currentcolumn',
                'span', {'class': ['defaultHeader']}, []]
               ]
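    # For example, catList[0][2] and catList[0][3] are passed straight to
    # soup.findAll() in parseGeneric(), and catList[0][4] receives the list
    # of article dicts built for that category.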

    feeds = [
        (u'No Spin',        u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
        (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
        (u'Blog',           u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
        (u'StratFor',       u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
    ]
    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8
    # is word for the day.

    # Note: Talking Points is broken in the scraping model above; the site
    # changed to be more Ajax-y, so it is now fetched via RSS instead.
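    # The RSS feeds above are fetched by the standard parse_feeds() call in
    # build_index() below and appended to the categories scraped by
    # parse_index().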

    def get_browser(self):
        print("In get_browser")
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br

    # Returns the best-guess print URL.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        printText = None
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to find Print string " +
                  printString + " in " + pageURL)
        if printText:
            tag = printText.parent
            tagURL = baseURL + tag['href']
        return tagURL
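
    # For example (a hypothetical call, for illustration only):
    #   self.extractPrintURL('https://www.billoreilly.com',
    #                        'https://www.billoreilly.com/currentcolumn',
    #                        'Print This Article')
    # returns the target of the "Print This Article" link on that page, or
    # the page URL unchanged if no such link exists.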

    def stripBadChars(self, inString):
        return inString.replace("\'", "")

    def parseGeneric(self, baseURL):
        # Generic parsing of the catList categories.  The site originally
        # had six categories (0-5); only TV Archives and Current Column are
        # still scraped here, the rest now come from the RSS feeds.
        # Each entry: Name, URL, Soup findAll tag name and attrs, articleList.
        fullReturn = []
        for i in range(len(self.catList)):
            articleList = []
            print("In " + self.catList[i][0] + ", index: " + str(i))
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            # TV Archives (i == 0) yields many articles per page, so it
            # loops over the matching tags; Current Column yields a single
            # article, so it does not.
            if i == 0:
                print("Starting TV Archives")
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print("Next DIV:")
                    print(div)
                    a = div
                    summary = div.find(True, attrs={'class': 'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL + re.sub(r'\?.*', '', a['href'])
                    url = baseURL + a['href']
                    url = self.extractPrintURL(baseURL, url, "Print this entry")
                    title = self.tag_to_string(a, use_alt=True).strip()
                    articleList.append(dict(
                        title=title, url=url, date=pubdate,
                        description=description, content=''))
            else:       # Current Column
                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                if titleSpan is None:
                    print("No Current Column Title Span")
                    print(soup)
                    continue
                title = titleSpan.contents[0]
                url = self.extractPrintURL(
                    baseURL, self.catList[i][1], "Print This Article")
            if i == 1:
                if self.debugMessages:
                    print(self.catList[i][0] + " Title:" +
                          title + " at url: " + url)
                summary = soup.find(True, attrs={'class': 'summary'})
                print("At Summary")
                print(summary)
                if summary is not None:
                    description = self.tag_to_string(summary, use_alt=False)
                print("At append")
                articleList.append(dict(
                    title=title, url=url, date=pubdate,
                    description=description, content=''))
            self.catList[i][4] = articleList
            fullReturn.append((self.catList[i][0], articleList))
            print("Returning")
            # print(fullReturn)
        return fullReturn

    def parse_index(self):
        # Parse the category pages into Python Soup
        print("Entering recipe parse_index from:")
        traceback.print_stack()
        print("web")
        baseURL = "https://www.billoreilly.com"
        masterList = self.parseGeneric(baseURL)
        # print(masterList)
        return masterList

    def preprocess_html(self, soup):
        print("In preprocess_html")
        refresh = soup.find('meta', {'http-equiv': 'refresh'})
        if refresh is None:
            return soup
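        # A meta refresh looks like content="0; URL=/some/path"; everything
        # after the first '=' is the path to the real article, so fetch and
        # parse that page instead.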
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com' + content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))

    def build_index(self):
        print("In OReilly build_index()\n\n")
        feedsRSS = []
        self.report_progress(0, ('Fetching feeds...'))
        # try:
        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        self.report_progress(0, ('Got feeds from index page'))
        # except NotImplementedError:
        #     feeds = self.parse_feeds()
        # Now add regular feeds.
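        # parse_feeds() is inherited from BasicNewsRecipe; it downloads the
        # RSS feeds listed in the class-level `feeds` attribute above.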
        feedsRSS = self.parse_feeds()
        print("feedsRSS is type " + feedsRSS.__class__.__name__)

        for articles in feedsRSS:
            print("articles is type " + articles.__class__.__name__)
            print("Title:" + articles.title)
            feeds.append(articles)
        if not feeds:
            raise ValueError('No articles found, aborting')

        self.report_progress(0, ('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, ('Generating masthead...'))
        self.masthead_path = None

        try:
            murl = self.get_masthead_url()
        except:
            self.log.exception('Failed to get masthead url')
            murl = None

        if murl is not None:
            # Try downloading the user-supplied masthead_url
            # Failure sets self.masthead_path to None
            self.download_masthead(murl)
        if self.masthead_path is None:
            self.log.info("Synthesizing mastheadImage")
            self.masthead_path = os.path.join(
                self.output_dir, 'mastheadImage.jpg')
            try:
                self.default_masthead_image(self.masthead_path)
            except:
                self.log.exception('Failed to generate default masthead image')
                self.masthead_path = None

        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1

        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)

        self.jobs = []

        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()

        self.feed_objects = feeds
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                art_dir = os.path.join(feed_dir, 'article_%d' % a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                except:
                    self.log.exception(
                        'Failed to find print version for: ' + article.url)
                    url = None
                if not url:
                    continue
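                # Choose the fetch function: embedded feed content when the
                # recipe (or the feed itself) says so, the obfuscation
                # handler when articles_are_obfuscated is set, otherwise a
                # plain article fetch.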
                func, arg = (self.fetch_embedded_article, article) \
                    if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()) \
                    else \
                    ((self.fetch_obfuscated_article if self.articles_are_obfuscated
                      else self.fetch_article), url)
                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                                  {}, (f, a), self.article_downloaded,
                                  self.error_in_article_download)
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)
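
        # Each article download is queued as a WorkRequest and run on a
        # thread pool of simultaneous_downloads workers; poll() raises
        # NoResultsPending once every queued request has finished.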
        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)

        self.report_progress(
            0, ('Starting download [%d thread(s)]...') % self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break
        for f, feed in enumerate(feeds):
            print("Writing feeds for " + feed.title)
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        self.create_opf(feeds)
        self.report_progress(1, ('Feeds downloaded to %s') % index)

        return index