mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
183 lines
7.2 KiB
Python
183 lines
7.2 KiB
Python
#!/usr/bin/env python2
|
|
|
|
from datetime import date
|
|
from lxml import etree
|
|
|
|
__copyright__ = '2015, April King <april@twoevils.org>'
|
|
__license__ = 'GPL v3'
|
|
__version__ = '1.2'
|
|
|
|
'''
|
|
http://www.thecodelesscode.com/
|
|
'''
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs
|
|
|
|
|
|
class CodelessCode(BasicNewsRecipe):
    """Recipe that turns The Codeless Code (thecodelesscode.com) into an ebook.

    The contents page is scraped for the list of koans (``parse_index``),
    each downloaded page is reduced to its story body with an optional
    chapter heading (``preprocess_html``), and the finished book has its
    hrefs, front page and metadata tidied up (``postprocess_book``).
    """

    __author__ = 'April King'
    title = u'The Codeless Code'
    category = 'fiction, programming, technology'
    # Koan title -> last path segment of its URL, ie, Mousetrap -> 182.
    # NOTE(review): class-level dict, so it is shared by every instance.
    chapters = {}
    compress_news_images = True
    compress_news_images_max_size = 100
    cover_url = 'http://www.thecodelesscode.com/pages/case-9/Lotus-050.jpg'
    # HTML fragments inserted into the book's front page by postprocess_book()
    credits = [
        u'<h2 class="chapter_title">{0}</h2>'.format(title),
        u'<p>By <em>Qi</em></p>',
        u'<p>An illustrated collection of (sometimes violent) fables concerning the Art and Philosophy of software development, written in the spirit of Zen kōans</p>',  # noqa
        u'<p>eBook conversion courtesy of <em>{0}</em></p>'.format(__author__),
    ]
    description = u'The Art and Philosophy of software development, written in the spirit of Zen kōans'
    # One rule per literal; the pieces are joined by a single space.
    extra_css = (
        '.article_date { display: none; float: right; } '
        '.chapter_title { font-size: 1.75em; margin-top: 0; } '
        '.chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } '
        'h2 { margin-top: 0; } '
        '.image_wrapper { text-align: center; }'
    )
    index = 'http://www.thecodelesscode.com/contents'
    language = 'en'
    max_articles_per_feed = 1000  # I can only wish
    # Site path -> path inside the book, IE, /case/182 -> articles_72/index.html
    # NOTE(review): nothing in this class fills this mapping; it is only read
    # by postprocess_book().
    path_remappings = {}
    publication_type = 'blog'
    publisher = 'Qi'
    resolve_internal_links = True
    scale_news_images = (600, 400)
    simultaneous_downloads = 1
    url = 'http://www.thecodelesscode.com'

    @staticmethod
    def _adopt_children(target, nodes):
        """Move every node in *nodes* to the end of *target*, keeping order."""
        for node in nodes:
            target.append(node)

    def parse_index(self):
        """Scrape the contents page and return the feed list.

        Returns a one-element list ``[(self.title, koans)]`` where ``koans``
        is a list of article dicts (``content``, ``date``, ``description``,
        ``title``, ``url``) ordered oldest first. As a side effect every
        accepted koan is recorded in ``self.chapters``.
        """
        koans = []
        today = date.today().isoformat()

        # Retrieve the contents page, containing the ToC
        soup = self.index_to_soup(self.index)

        for row in soup.findAll('tr'):
            # BS has some trouble with the weird layout
            link = row.find('a')
            if link is None:
                continue

            # Anchors without an href cannot be turned into an article URL
            href = link.get('href')
            if not href or 'random' in href:
                continue

            # .string is None when the anchor wraps nested markup; such a
            # row has no usable title, so skip it instead of failing below
            title = link.string
            if title is None:
                self.log("Skipping untitled contents entry {0}".format(href))
                continue
            if u'The Applicant' in title:
                continue  # Only the main story

            # Minor coding error causes calibre to glitch; use the current
            # date for the most recent title (and for empty date cells)
            date_cell = row.find('td', attrs={'class': 'toc-date'})
            koan_date = None if date_cell is None else date_cell.string
            if koan_date is None:
                koan_date = today

            url = self.url + href
            koans.append({
                'content': '',
                'date': koan_date,
                'description': '',
                'title': title,
                'url': url,
            })

            # ie, Mousetrap -> 182
            self.chapters[title] = url.split('/')[-1]

        # Oldest koans first
        koans.reverse()

        # Log and then get out of here
        self.log("Found {0} koans".format(len(koans)))
        return [(self.title, koans)]

    def preprocess_html(self, soup):
        """Reduce a downloaded koan page to its heading, story and images.

        Builds a new soup from the page title (wrapped in a
        ``chapter_title`` div when the title is a known chapter) followed by
        the children of the ``story koan`` div. Anchors are then cleaned up
        and top-level images wrapped in ``image_wrapper`` divs.

        Returns the new soup, or ``soup`` unchanged when the page has no
        ``story koan`` div.
        """
        heading = soup.find('h1', attrs={'class': 'title'})
        link = None if heading is None else heading.find(
            'a', attrs={'class': 'subtle'})
        title = None if link is None else link.string

        # Load up the actual story
        story = soup.find('div', attrs={'class': 'story koan'})
        if story is None:
            self.log("No koan story found on page; leaving it unchanged")
            return soup

        # Add a title at the beginning of each chapter
        if title is None:
            title = u''
        elif title in self.chapters:
            # unicode literal: koan titles may contain non-ASCII characters
            title = u'<div class="chapter_title">{0}</div>'.format(title)

        # Snapshot the children first: moving a node takes it out of `story`
        story_nodes = list(story.contents)
        koan = bs(title)
        self._adopt_children(koan, story_nodes)

        # Remove all anchors that don't contain /case/, leaving them as just their text
        # Note that we'll come back and clean up /case/ links when the URLs are remapped
        # during postprocess_book()
        for anchor in koan.findAll('a'):
            href = anchor.get('href') or ''
            if '/case/' in href:
                continue
            if 'note' in href:
                anchor.replaceWith('')
                continue
            # Replace the anchor with a bare container holding its children,
            # appended in document order so multi-node link text stays intact
            link_text = bs()
            self._adopt_children(link_text, list(anchor.contents))
            anchor.replaceWith(link_text)

        # Find all the images, and wrap them up in an image_wrapper div.
        # Each wrap swaps one top-level node for another at the same index,
        # so iterating over a snapshot keeps the positions valid.
        for position, node in enumerate(list(koan.contents)):
            # text nodes (carriage returns) are not images
            if getattr(node, 'name', None) != u'img':
                continue
            wrapper = bs('<div class="image_wrapper"></div>')
            wrapper.div.insert(0, node)
            koan.insert(position, wrapper)

        return koan

    def canonicalize_internal_url(self, url, is_link=True):
        """Drop the site prefix from *url*, then defer to BasicNewsRecipe."""
        url = url.split(self.url)[-1]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)

    def postprocess_book(self, oeb, opts, log):
        """Fix hrefs, rebuild the front page and correct the book metadata.

        ``opts`` and ``log`` are accepted for the hook signature but unused;
        messages go through ``self.log``.
        """
        # Go through each internal representation of each HTML file, and fix
        # all the broken hrefs, if possible
        for item in oeb.manifest.items:
            if item.media_type != 'text/html':
                continue
            for node in item.data.xpath('//*[@href]'):
                naughty_href = node.get('href')
                if naughty_href not in self.path_remappings:
                    continue
                node.set('href', '../' + self.path_remappings[naughty_href])
                href = node.get('href')
                self.log(
                    "Remapped href {0} --> {1}".format(naughty_href, href))

        # Remove the superfluous extra feed page at the beginning of the book, replacing it
        # with the proper credits
        front_page = oeb.manifest.hrefs['index.html'].data
        for unwanted in ('//*[local-name()="ul"]', '//*[local-name()="p"]'):
            for element in front_page.xpath(unwanted):
                element.getparent().remove(element)

        # One parser serves every fragment; the fragments themselves are
        # parsed anew per div because an element can only have one parent
        parser = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
        for container in front_page.xpath('//*[local-name()="div"]'):
            # Inserting at 0 in reverse leaves the credits in listed order
            for credit in reversed(self.credits):
                container.insert(0, etree.fromstring(credit, parser=parser))

        # Change the creator from "calibre" to the actual author
        # Also, we don't need the date in the ebook's title
        metadata = oeb.metadata.items
        metadata['creator'][0].value = self.publisher
        metadata['description'][0].value = metadata[
            'description'][0].value.split('\n\nArticles in this issue')[0]
        metadata['publication_type'][0].value = self.title
        metadata['publisher'][0].value = self.publisher
        metadata['title'][0].value = self.title
|