calibre/recipes/thecodelesscode.recipe

#!/usr/bin/env python

from datetime import date
from lxml import etree

__copyright__ = '2015, April King <april@twoevils.org>'
__license__ = 'GPL v3'
__version__ = '1.2'

'''
http://www.thecodelesscode.com/
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs


class CodelessCode(BasicNewsRecipe):
    # Recipe that turns http://www.thecodelesscode.com into a single ebook.
    #
    # Attributes read by the methods of this class: title, chapters, credits,
    # index, path_remappings, publisher and url. __author__ is only used to
    # build the credits list below.
    # NOTE(review): the remaining attributes are presumably options consumed
    # by BasicNewsRecipe / calibre itself; their exact meaning is defined by
    # the calibre recipe API, not by anything in this file.
    __author__ = 'April King'
    title = u'The Codeless Code'
    category = 'fiction, programming, technology'
    # Koan title -> last path segment of its URL (ie, Mousetrap -> 182).
    # Filled by parse_index(); preprocess_html() checks titles against it.
    # Class-level dict, so it is shared by every instance of the recipe.
    chapters = {}    # ie, Mousetrap -> 182
    compress_news_images = True
    compress_news_images_max_size = 100
    cover_url = 'http://www.thecodelesscode.com/pages/case-9/Lotus-050.jpg'
    # XML snippets that postprocess_book() parses and inserts at the start of
    # every <div> of index.html, in the order listed here.
    credits = [u'<h2 class="chapter_title">{0}</h2>'.format(title),
               u'<p>By <em>Qi</em></p>',
               u'<p>An illustrated collection of (sometimes violent) fables concerning the Art and Philosophy of software development, written in the spirit of Zen kōans</p>',  # noqa
               u'<p>eBook conversion courtesy of <em>{0}</em></p>'.format(__author__)]
    description = u'The Art and Philosophy of software development, written in the spirit of Zen kōans'
    # One CSS string; the backslashes continue the string literal, so the
    # leading whitespace of each continuation line is part of the value.
    extra_css             = '.article_date { display: none; float: right; } \
                             .chapter_title { font-size: 1.75em; margin-top: 0; } \
                             .chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } \
                             h2 { margin-top: 0; } \
                             .image_wrapper { text-align: center; }'
    # Contents page fetched by parse_index() to discover the koans.
    index = 'http://www.thecodelesscode.com/contents'
    language = 'en'
    max_articles_per_feed = 1000  # I can only wish
    # Looked up by postprocess_book() to rewrite hrefs
    # (IE, /case/182 -> articles_72/index.html).
    # NOTE(review): nothing in this class adds entries to it, so unless it is
    # populated elsewhere the remapping loop never rewrites anything -- confirm.
    path_remappings = {}    # IE, /case/182 -> articles_72/index.html
    publication_type = 'blog'
    # Also written into the book's creator and publisher metadata by
    # postprocess_book().
    publisher = 'Qi'
    resolve_internal_links = True
    scale_news_images = (600, 400)
    simultaneous_downloads = 1
    # Site root: prefixed to the hrefs found by parse_index() and stripped
    # again from URLs in canonicalize_internal_url().
    url = 'http://www.thecodelesscode.com'

    def parse_index(self):
        """Build the single feed of koans from the site's contents page.

        Every table row of the contents page whose first link points at a
        koan becomes one article. As a side effect ``self.chapters`` is
        filled with ``title -> last URL path segment`` (ie, Mousetrap -> 182).

        Returns:
            A one-element list ``[(self.title, articles)]``. The rows are
            reversed after collection so that, per the original note, the
            oldest koans come first.
        """
        koans = []

        # Retrieve the contents page, containing the ToC
        soup = self.index_to_soup(self.index)

        for row in soup.findAll('tr'):
            # BS has some trouble with the weird layout
            tag = row.find('a')
            if tag is None:
                continue

            # An anchor without an href cannot be a koan link; indexing it
            # directly would raise KeyError and abort the whole download.
            href = tag.get('href')
            if not href or 'random' in href:
                continue

            # .string can be None (e.g. when the anchor holds several child
            # nodes); fall back to the anchor's text so the membership test
            # below cannot fail with TypeError and None never becomes a
            # chapter key.
            title = tag.string
            if title is None:
                title = self.tag_to_string(tag)

            if u'The Applicant' in title:
                continue  # Only the main story

            # Rows without a usable toc-date cell (per the original note, the
            # most recent koan) get the current date instead.
            date_cell = row.find('td', attrs={'class': 'toc-date'})
            koan_date = date_cell.string if date_cell is not None else None
            if koan_date is None:
                koan_date = date.today().isoformat()

            url = self.url + href

            koans.append({
                'content': '',
                'date': koan_date,
                'description': '',
                'title': title,
                'url': url,
            })

            # ie, Mousetrap -> 182
            self.chapters[title] = url.split('/')[-1]

        # Oldest koans first
        koans.reverse()

        # Log and then get out of here
        self.log("Found {0} koans".format(len(koans)))
        return [(self.title, koans)]

    def preprocess_html(self, soup):
        """Reduce a downloaded koan page to its title plus the story content.

        A new soup is built from the page title (wrapped in a
        ``chapter_title`` div when the title is a key of ``self.chapters``)
        followed by the children of the ``<div class="story koan">`` element.
        Links are then cleaned up and top-level images are wrapped in an
        ``image_wrapper`` div.

        Args:
            soup: parsed article page.

        Returns:
            The newly built soup, or ``soup`` unchanged when the page has no
            story element.
        """
        # Look the title up step by step so a page without the expected
        # heading no longer raises AttributeError.
        heading = soup.find('h1', attrs={'class': 'title'})
        title_link = None
        if heading is not None:
            title_link = heading.find('a', attrs={'class': 'subtle'})
        title = title_link.string if title_link is not None else None

        # Load up the actual story
        story = soup.find('div', attrs={'class': 'story koan'})
        if story is None:
            # Nothing to extract: hand the page back untouched instead of
            # crashing on story.contents.
            self.log("No koan story found in page; leaving it unprocessed")
            return soup

        # Add a title at the beginning of each chapter
        if title is None:
            heading_markup = ''
        elif title in self.chapters:
            heading_markup = '<div class="chapter_title">{0}</div>'.format(title)
        else:
            heading_markup = title

        # Move the story's children behind the parsed title. Appending in
        # document order matches the former reversed insert(1, ...) whenever
        # the parsed title has exactly one top-level node, and keeps the
        # children in order when it has none.
        story_children = list(story.contents)
        koan = bs(heading_markup)
        for child in story_children:
            koan.append(child)

        # Remove all anchors that don't contain /case/, leaving them as just their text
        # Note that we'll come back and clean up /case/ links when the URLs are remapped
        # during postprocess_book()
        for anchor in koan.findAll('a'):
            # Anchors without an href used to raise KeyError; treat them like
            # any other non-/case/ link and keep only their content.
            href = anchor.get('href') or ''
            if '/case/' in href:
                continue
            if 'note' in href:
                anchor.replaceWith('')
                continue

            # Replace the anchor by a soup holding its former children.
            linktext = bs()
            for child in list(anchor.contents):
                linktext.append(child)
            anchor.replaceWith(linktext)

        # Find all the images, and wrap them up in an image_wrapper div.
        # Iterating over a snapshot is safe because the wrapper is put back
        # at the image's old index -- assuming insert() first moves the image
        # out of koan, which the original index-based loop relied on as well.
        for position, node in enumerate(list(koan.contents)):
            # Nodes without a name attribute (carriage returns) are skipped
            if getattr(node, 'name', None) != u'img':
                continue
            div = bs('<div class="image_wrapper"></div>')
            div.div.insert(0, node)
            koan.insert(position, div)

        return koan

    def canonicalize_internal_url(self, url, is_link=True):
        """Strip the site root from *url*, then defer to the base class.

        Only the text after the last occurrence of ``self.url`` is kept; a URL
        that does not contain ``self.url`` is passed on unchanged. The result
        is whatever ``BasicNewsRecipe.canonicalize_internal_url`` returns for
        that remainder and the given ``is_link`` flag.
        """
        _, _, site_relative = url.rpartition(self.url)
        return BasicNewsRecipe.canonicalize_internal_url(
            self, site_relative, is_link=is_link)

    def postprocess_book(self, oeb, opts, log):
        """Patch links, the index page and the metadata of the finished book.

        Three passes over ``oeb``:

        1. In every manifest item of media type ``text/html``, each ``href``
           that is a key of ``self.path_remappings`` is replaced by ``'../'``
           plus the mapped value, and the change is logged.
        2. In ``index.html`` all ``ul`` and ``p`` elements are removed, then
           the ``self.credits`` snippets are inserted at the start of every
           ``div``.
        3. Creator, description, publication type, publisher and title
           metadata are overwritten.

        ``opts`` and ``log`` are not used here; logging goes through
        ``self.log``.
        """
        # Pass 1: fix all the broken hrefs, if possible
        for entry in oeb.manifest.items:
            if entry.media_type != 'text/html':
                continue

            for element in entry.data.xpath('//*[@href]'):
                old_href = element.get('href')
                if old_href not in self.path_remappings:
                    continue

                element.set('href', '../' + self.path_remappings[old_href])
                new_href = element.get('href')
                self.log(
                    "Remapped href {0} --> {1}".format(old_href, new_href))

        # Pass 2: remove the superfluous extra feed page at the beginning of
        # the book, replacing it with the proper credits
        index_page = oeb.manifest.hrefs['index.html']

        # Lists first, then paragraphs -- this has to happen before the
        # credits go in, since the credits contain <p> elements themselves.
        for query in ('//*[local-name()="ul"]', '//*[local-name()="p"]'):
            for unwanted in index_page.data.xpath(query):
                unwanted.getparent().remove(unwanted)

        for container in index_page.data.xpath('//*[local-name()="div"]'):
            # Inserting at position 0 in reverse order leaves the credits in
            # their listed order.
            for snippet in reversed(self.credits):
                parser = etree.XMLParser(
                    recover=True, no_network=True, resolve_entities=False)
                container.insert(0, etree.fromstring(snippet, parser=parser))

        # Pass 3: change the creator from "calibre" to the actual author
        # Also, we don't need the date in the ebook's title
        metadata = oeb.metadata.items
        metadata['creator'][0].value = self.publisher

        # Keep only the text before the generated list of articles
        description = metadata['description'][0]
        description.value = description.value.split(
            '\n\nArticles in this issue')[0]

        # NOTE(review): publication_type is set to the book title rather than
        # to self.publication_type -- presumably deliberate, confirm.
        metadata['publication_type'][0].value = self.title
        metadata['publisher'][0].value = self.publisher
        metadata['title'][0].value = self.title