calibre/recipes/thecodelesscode.recipe
2019-12-29 22:02:16 +05:30

183 lines
7.2 KiB
Python

#!/usr/bin/env python2
from datetime import date
from lxml import etree
__copyright__ = '2015, April King <april@twoevils.org>'
__license__ = 'GPL v3'
__version__ = '1.2'
'''
http://www.thecodelesscode.com/
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs
class CodelessCode(BasicNewsRecipe):
    """Recipe that gathers the koans of The Codeless Code into one ebook.

    ``parse_index`` scrapes the table of contents, ``preprocess_html``
    reduces each downloaded page to a title followed by the story body, and
    ``postprocess_book`` rewrites the generated index page and the book
    metadata.
    """

    __author__ = 'April King'
    title = u'The Codeless Code'
    category = 'fiction, programming, technology'
    # Koan title -> last path segment of its URL, e.g. Mousetrap -> 182.
    # NOTE: class-level dict, so it is shared by every instance of the recipe.
    chapters = {}
    compress_news_images = True
    compress_news_images_max_size = 100
    cover_url = 'http://www.thecodelesscode.com/pages/case-9/Lotus-050.jpg'
    # XML fragments inserted at the top of the index page by postprocess_book().
    credits = [u'<h2 class="chapter_title">{0}</h2>'.format(title),
               u'<p>By <em>Qi</em></p>',
               u'<p>An illustrated collection of (sometimes violent) fables concerning the Art and Philosophy of software development, written in the spirit of Zen kōans</p>',  # noqa
               u'<p>eBook conversion courtesy of <em>{0}</em></p>'.format(__author__)]
    description = u'The Art and Philosophy of software development, written in the spirit of Zen kōans'
    extra_css = (
        '.article_date { display: none; float: right; } '
        '.chapter_title { font-size: 1.75em; margin-top: 0; } '
        '.chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } '
        'h2 { margin-top: 0; } '
        '.image_wrapper { text-align: center; }'
    )
    # Contents page that parse_index() scrapes for the list of koans.
    index = 'http://www.thecodelesscode.com/contents'
    language = 'en'
    max_articles_per_feed = 1000  # I can only wish
    # Original href -> path inside the book, e.g. /case/182 ->
    # articles_72/index.html. Read by postprocess_book(); nothing in this
    # class adds entries to it.
    path_remappings = {}
    publication_type = 'blog'
    publisher = 'Qi'
    resolve_internal_links = True
    scale_news_images = (600, 400)
    simultaneous_downloads = 1
    # Site root; prepended to the relative hrefs found on the contents page.
    url = 'http://www.thecodelesscode.com'

    def parse_index(self):
        """Build the article list from the site's contents page.

        Every ``<tr>`` that contains a link yields one koan, except links
        whose href contains ``random`` and titles containing
        ``The Applicant``. As a side effect ``self.chapters`` is filled with
        ``title -> last URL path segment``.

        Returns:
            A one-element list ``[(self.title, koans)]`` where ``koans`` is a
            list of article dicts in the reverse of the page order.
        """
        koans = []
        today = date.isoformat(date.today())

        # Retrieve the contents page, containing the ToC
        soup = self.index_to_soup(self.index)

        for row in soup.findAll('tr'):
            # BS has some trouble with the weird layout
            tag = row.find('a')
            if tag is None:
                continue

            # Anchors without an href cannot point at a koan; skipping them
            # avoids a KeyError on tag['href'].
            href = tag.get('href')
            if not href or 'random' in href:
                continue

            # Minor coding error causes calibre to glitch; use the current
            # date for the most recent title (also used when the date cell
            # holds no plain string).
            date_cell = row.find('td', attrs={'class': 'toc-date'})
            koan_date = date_cell.string if date_cell is not None else None
            if not koan_date:
                koan_date = today

            # .string is None when the anchor wraps nested markup; fall back
            # to the concatenated text so the substring test below is safe.
            title = tag.string
            if title is None:
                title = u''.join(tag.findAll(text=True)).strip()

            if u'The Applicant' in title:
                continue  # Only the main story

            url = self.url + href
            koans.append({
                'content': '',
                'date': koan_date,
                'description': '',
                'title': title,
                'url': url,
            })

            # e.g. Mousetrap -> 182
            self.chapters[title] = url.split('/')[-1]

        # Oldest koans first
        koans.reverse()

        # Log and then get out of here
        self.log("Found {0} koans".format(len(koans)))
        return [(self.title, koans)]

    def preprocess_html(self, soup):
        """Reduce a downloaded koan page to its title and story body.

        The result is a fresh soup made of the page title (wrapped in a
        ``chapter_title`` div when the title is a known chapter) followed by
        the children of the ``story koan`` div. Anchors whose href contains
        ``/case/`` are kept, anchors whose href contains ``note`` are
        dropped, and every other anchor is replaced by its own contents.
        Top-level images are wrapped in an ``image_wrapper`` div.

        Args:
            soup: Parsed page as handed over by calibre.

        Returns:
            The rebuilt soup, or ``soup`` unchanged when the page lacks the
            title heading or the story container.
        """
        heading = soup.find('h1', attrs={'class': 'title'})
        link = None
        if heading is not None:
            link = heading.find('a', attrs={'class': 'subtle'})
        # Load up the actual story
        story = soup.find('div', attrs={'class': 'story koan'})

        if link is None or story is None:
            # Not laid out like a koan page: keep it as-is rather than
            # failing with an AttributeError.
            self.log("Unexpected page layout; leaving page unmodified")
            return soup

        title = link.string or u''

        # Add a title at the beginning of each chapter
        if title in self.chapters:
            title = '<div class="chapter_title">{0}</div>'.format(title)

        # Kind of a hack-y way to get .children in BS3 <a><b><c></c></b></a>
        # -> <b><c></c></b>
        # Snapshot first: insert() moves each node out of the story div.
        children = list(story.contents)
        koan = bs(title)
        for child in reversed(children):
            koan.insert(1, child)

        # Remove all anchors that don't contain /case/, leaving them as just
        # their text. Note that we'll come back and clean up /case/ links
        # when the URLs are remapped during postprocess_book()
        for anchor in koan.findAll('a'):
            # Anchors without an href are treated like any other non-koan
            # link and unwrapped.
            href = anchor.get('href', '')
            if '/case/' in href:
                continue
            if 'note' in href:
                anchor.replaceWith('')
                continue

            # Again, a hacky way to get the contents of the tag, thanks
            # to BS3
            unwrapped = bs()
            for child in reversed(list(anchor.contents)):
                unwrapped.insert(1, child)
            anchor.replaceWith(unwrapped)

        # Find all the images, and wrap them up in an image_wrapper div.
        # Index-based on purpose: each image is moved into a wrapper that is
        # then inserted back at the same position.
        for i in range(len(koan.contents)):
            node = koan.contents[i]
            # Nodes without a name attribute (carriage returns) are skipped.
            if getattr(node, 'name', None) != u'img':
                continue
            div = bs('<div class="image_wrapper"></div>')
            div.div.insert(0, node)
            koan.insert(i, div)

        return koan

    def canonicalize_internal_url(self, url, is_link=True):
        """Strip the site root from ``url`` and defer to the base class.

        Only the text after the last occurrence of ``self.url`` is passed to
        ``BasicNewsRecipe.canonicalize_internal_url``.
        """
        url = url.split(self.url)[-1]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)

    def postprocess_book(self, oeb, opts, log):
        """Fix internal links, rewrite the index page and set book metadata.

        Args:
            oeb: The book object; its ``manifest`` and ``metadata`` are
                modified in place.
            opts: Unused here.
            log: Unused here; messages go through ``self.log``.
        """
        # Go through each internal representation of each HTML file, and fix
        # all the broken hrefs, if possible
        for item in oeb.manifest.items:
            if item.media_type != 'text/html':
                continue
            for node in item.data.xpath('//*[@href]'):
                naughty_href = node.get('href')
                if naughty_href in self.path_remappings:
                    node.set('href', '../' +
                             self.path_remappings[naughty_href])
                    href = node.get('href')
                    self.log(
                        "Remapped href {0} --> {1}".format(naughty_href, href))

        # Remove the superfluous extra feed page at the beginning of the
        # book, replacing it with the proper credits
        index_page = oeb.manifest.hrefs['index.html'].data
        for item in index_page.xpath('//*[local-name()="ul"]'):
            item.getparent().remove(item)
        for item in index_page.xpath('//*[local-name()="p"]'):
            item.getparent().remove(item)

        # One parser is enough for all the credit fragments.
        parser = etree.XMLParser(
            recover=True, no_network=True, resolve_entities=False)
        for item in index_page.xpath('//*[local-name()="div"]'):
            # Inserting at position 0 in reverse keeps the credits in order.
            for credit in self.credits[::-1]:
                item.insert(0, etree.fromstring(credit, parser=parser))

        # Change the creator from "calibre" to the actual author
        # Also, we don't need the date in the ebook's title
        metadata = oeb.metadata.items
        metadata['creator'][0].value = self.publisher
        metadata['description'][0].value = metadata[
            'description'][0].value.split('\n\nArticles in this issue')[0]
        metadata['publication_type'][0].value = self.title
        metadata['publisher'][0].value = self.publisher
        metadata['title'][0].value = self.title