From 7ff5ed6b62561c9920caccca5c98200943759066 Mon Sep 17 00:00:00 2001 From: April King Date: Sat, 11 Apr 2015 08:33:48 -0500 Subject: [PATCH 1/5] Adding The Codeless Code (thecodelesscode.com) --- recipes/thecodelesscode.recipe | 179 +++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 recipes/thecodelesscode.recipe diff --git a/recipes/thecodelesscode.recipe b/recipes/thecodelesscode.recipe new file mode 100644 index 0000000000..af0ea57a3f --- /dev/null +++ b/recipes/thecodelesscode.recipe @@ -0,0 +1,179 @@ +#!/usr/bin/env python2 + +from datetime import date +from lxml import etree + +__copyright__ = '2015, April King ' +__license__ = 'GPL v3' +__version__ = '1.2' + +''' +http://www.thecodelesscode.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs + +class CodelessCode(BasicNewsRecipe): + __author__ = 'April King' + title = u'The Codeless Code' + category = 'fiction, programming, technology' + chapters = {} # ie, Mousetrap -> 182 + compress_news_images = True + compress_news_images_max_size = 100 + cover_url = 'http://www.thecodelesscode.com/pages/case-9/Lotus-050.jpg' + credits = [ u'

{0}

'.format(title), + u'

By Qi

', + u'

An illustrated collection of (sometimes violent) fables concerning the Art and Philosophy of software development, written in the spirit of Zen kōans

', + u'

eBook conversion courtesy of {0}

'.format(__author__) ] + description = u'The Art and Philosophy of software development, written in the spirit of Zen kōans' + extra_css = '.article_date { display: none; float: right; } \ + .chapter_title { font-size: 1.75em; margin-top: 0; } \ + .chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } \ + h2 { margin-top: 0; } \ + .image_wrapper { text-align: center; }' + index = 'http://www.thecodelesscode.com/contents' + language = 'en' + max_articles_per_feed = 1000 # I can only wish + path_remappings = {} # IE, /case/182 -> articles_72/index.html + publication_type = 'blog' + publisher = 'Qi' + scale_news_images = (600, 400) + simultaneous_downloads = 1 + url = 'http://www.thecodelesscode.com' + + def create_opf(self, feeds, dir=None): + ''' + Generate a mapping of the original URL, ie, http://thecodelesscode.com/case/100 to the + internal Calibre file system, eg, ../article_7/index_u39.html + ''' + for feed in feeds: + for article in feed: + orig_path = article.orig_url.split(self.url, 2)[-1] # http://thecodelesscode.com/case/100 -> /case/100 + article_id = article.id.split('#')[-1] # internal id#10 -> 10 + article_path = article.url.split('index')[0] + 'index.html' # article_X/index.html -> article_X/ + + self.path_remappings[orig_path] = article_path + + BasicNewsRecipe.create_opf(self, feeds, dir=dir) + + def parse_index(self): + koans = [] + + # Retrieve the contents page, containing the ToC + soup = self.index_to_soup(self.index) + + for koan in soup.findAll('tr'): + # BS has some trouble with the weird layout + tag = koan.find('a') + + if tag == None: continue + if 'random' in tag['href']: continue + + # Minor coding error causes calibre to glitch; use the current date for the most recent title + koan_date = koan.find('td', attrs={'class' : 'toc-date' }) + if koan_date == None: + koan_date = date.isoformat(date.today()) + else: + koan_date = koan_date.string + + title = tag.string + url = self.url + tag['href'] + + if u'The Applicant' in title: continue # Only the main story + + koans.append({ + 'content': '', + 'date': koan_date, + 'description': '', + 'title': title, + 'url' : url, + }) + + # ie, Mousetrap -> 182 + self.chapters[title] = url.split('/')[-1] + + # Oldest koans first + koans.reverse() + + # Log and then get out of here + self.log("Found {0} koans".format(len(koans))) + return( [(self.title, koans)] ) + + def preprocess_html(self, soup): + title = soup.find('h1', attrs = {'class': 'title'}).find('a', attrs = {'class' : 'subtle'}).string + + # Add a title at the beginning of each chapter + if title in self.chapters: + title = '
{0}
'.format(title) + + # Load up the actual story + koan = soup.find('div', attrs = {'class' : 'story koan'}) + + # Kind of a hack-y way to get .children in BS3 -> + contents = list(koan.contents) + koan = bs(title) + + for i in reversed(contents): + koan.insert(1, i) + + # Remove all anchors that don't contain /case/, leaving them as just their text + # Note that we'll come back and clean up /case/ links when the URLs are remapped + # during postprocess_book() + anchors = koan.findAll('a') + if anchors != []: + for anchor in anchors: + if '/case/' in anchor['href']: + pass + elif 'note' in anchor['href']: + anchor.replaceWith('') + else: + # Again, a hacky way to get the contents of the tag, thanks to BS3 + contents = list(anchor.contents) + linktext = bs() + for i in reversed(contents): + linktext.insert(1, i) + anchor.replaceWith(linktext) + + # Find all the images, and wrap them up in an image_wrapper div + for i in range(0, len(koan.contents), 1): + if not hasattr(koan.contents[i], 'name'): continue # skip carriage returns + if koan.contents[i].name == u'img': + div = bs('
') + div.div.insert(0, koan.contents[i]) + koan.insert(i, div) + + return(koan) + + def postprocess_book(self, oeb, opts, log): + # Go through each internal representation of each HTML file, and fix all the broken hrefs, if possible + for item in oeb.manifest.items: + if item.media_type == 'text/html': + + for node in item.data.xpath('//*[@href]'): + naughty_href = node.get('href') + + if naughty_href in self.path_remappings: + node.set('href', '../' + self.path_remappings[ naughty_href ] ) + href = node.get('href') + self.log("Remapped href {0} --> {1}".format(naughty_href, href)) + + # Remove the superfluous extra feed page at the beginning of the book, replacing it + # with the proper credits + for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="ul"]'): + item.getparent().remove(item) + + for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="p"]'): + item.getparent().remove(item) + + for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="div"]'): + for credit in self.credits[::-1]: + item.insert(0, etree.fromstring(credit)) + + # Change the creator from "calibre" to the actual author + # Also, we don't need the date in the ebook's title + oeb.metadata.items['creator'][0].value = self.publisher + oeb.metadata.items['description'][0].value = oeb.metadata.items['description'][0].value.split('\n\nArticles in this issue')[0] + oeb.metadata.items['publication_type'][0].value = self.title + oeb.metadata.items['publisher'][0].value = self.publisher + oeb.metadata.items['title'][0].value = self.title From fb3f9d78981dafa4e814c19303cebeff215ed983 Mon Sep 17 00:00:00 2001 From: April King Date: Sat, 11 Apr 2015 15:12:19 -0500 Subject: [PATCH 2/5] Revert "Adding The Codeless Code (thecodelesscode.com)" This reverts commit 7ff5ed6b62561c9920caccca5c98200943759066. --- recipes/thecodelesscode.recipe | 179 --------------------------------- 1 file changed, 179 deletions(-) delete mode 100644 recipes/thecodelesscode.recipe diff --git a/recipes/thecodelesscode.recipe b/recipes/thecodelesscode.recipe deleted file mode 100644 index af0ea57a3f..0000000000 --- a/recipes/thecodelesscode.recipe +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python2 - -from datetime import date -from lxml import etree - -__copyright__ = '2015, April King ' -__license__ = 'GPL v3' -__version__ = '1.2' - -''' -http://www.thecodelesscode.com/ -''' - -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs - -class CodelessCode(BasicNewsRecipe): - __author__ = 'April King' - title = u'The Codeless Code' - category = 'fiction, programming, technology' - chapters = {} # ie, Mousetrap -> 182 - compress_news_images = True - compress_news_images_max_size = 100 - cover_url = 'http://www.thecodelesscode.com/pages/case-9/Lotus-050.jpg' - credits = [ u'

{0}

'.format(title), - u'

By Qi

', - u'

An illustrated collection of (sometimes violent) fables concerning the Art and Philosophy of software development, written in the spirit of Zen kōans

', - u'

eBook conversion courtesy of {0}

'.format(__author__) ] - description = u'The Art and Philosophy of software development, written in the spirit of Zen kōans' - extra_css = '.article_date { display: none; float: right; } \ - .chapter_title { font-size: 1.75em; margin-top: 0; } \ - .chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } \ - h2 { margin-top: 0; } \ - .image_wrapper { text-align: center; }' - index = 'http://www.thecodelesscode.com/contents' - language = 'en' - max_articles_per_feed = 1000 # I can only wish - path_remappings = {} # IE, /case/182 -> articles_72/index.html - publication_type = 'blog' - publisher = 'Qi' - scale_news_images = (600, 400) - simultaneous_downloads = 1 - url = 'http://www.thecodelesscode.com' - - def create_opf(self, feeds, dir=None): - ''' - Generate a mapping of the original URL, ie, http://thecodelesscode.com/case/100 to the - internal Calibre file system, eg, ../article_7/index_u39.html - ''' - for feed in feeds: - for article in feed: - orig_path = article.orig_url.split(self.url, 2)[-1] # http://thecodelesscode.com/case/100 -> /case/100 - article_id = article.id.split('#')[-1] # internal id#10 -> 10 - article_path = article.url.split('index')[0] + 'index.html' # article_X/index.html -> article_X/ - - self.path_remappings[orig_path] = article_path - - BasicNewsRecipe.create_opf(self, feeds, dir=dir) - - def parse_index(self): - koans = [] - - # Retrieve the contents page, containing the ToC - soup = self.index_to_soup(self.index) - - for koan in soup.findAll('tr'): - # BS has some trouble with the weird layout - tag = koan.find('a') - - if tag == None: continue - if 'random' in tag['href']: continue - - # Minor coding error causes calibre to glitch; use the current date for the most recent title - koan_date = koan.find('td', attrs={'class' : 'toc-date' }) - if koan_date == None: - koan_date = date.isoformat(date.today()) - else: - koan_date = koan_date.string - - title = tag.string - url = self.url + tag['href'] - - if u'The Applicant' in title: continue # Only the main story - - koans.append({ - 'content': '', - 'date': koan_date, - 'description': '', - 'title': title, - 'url' : url, - }) - - # ie, Mousetrap -> 182 - self.chapters[title] = url.split('/')[-1] - - # Oldest koans first - koans.reverse() - - # Log and then get out of here - self.log("Found {0} koans".format(len(koans))) - return( [(self.title, koans)] ) - - def preprocess_html(self, soup): - title = soup.find('h1', attrs = {'class': 'title'}).find('a', attrs = {'class' : 'subtle'}).string - - # Add a title at the beginning of each chapter - if title in self.chapters: - title = '
{0}
'.format(title) - - # Load up the actual story - koan = soup.find('div', attrs = {'class' : 'story koan'}) - - # Kind of a hack-y way to get .children in BS3 -> - contents = list(koan.contents) - koan = bs(title) - - for i in reversed(contents): - koan.insert(1, i) - - # Remove all anchors that don't contain /case/, leaving them as just their text - # Note that we'll come back and clean up /case/ links when the URLs are remapped - # during postprocess_book() - anchors = koan.findAll('a') - if anchors != []: - for anchor in anchors: - if '/case/' in anchor['href']: - pass - elif 'note' in anchor['href']: - anchor.replaceWith('') - else: - # Again, a hacky way to get the contents of the tag, thanks to BS3 - contents = list(anchor.contents) - linktext = bs() - for i in reversed(contents): - linktext.insert(1, i) - anchor.replaceWith(linktext) - - # Find all the images, and wrap them up in an image_wrapper div - for i in range(0, len(koan.contents), 1): - if not hasattr(koan.contents[i], 'name'): continue # skip carriage returns - if koan.contents[i].name == u'img': - div = bs('
') - div.div.insert(0, koan.contents[i]) - koan.insert(i, div) - - return(koan) - - def postprocess_book(self, oeb, opts, log): - # Go through each internal representation of each HTML file, and fix all the broken hrefs, if possible - for item in oeb.manifest.items: - if item.media_type == 'text/html': - - for node in item.data.xpath('//*[@href]'): - naughty_href = node.get('href') - - if naughty_href in self.path_remappings: - node.set('href', '../' + self.path_remappings[ naughty_href ] ) - href = node.get('href') - self.log("Remapped href {0} --> {1}".format(naughty_href, href)) - - # Remove the superfluous extra feed page at the beginning of the book, replacing it - # with the proper credits - for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="ul"]'): - item.getparent().remove(item) - - for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="p"]'): - item.getparent().remove(item) - - for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="div"]'): - for credit in self.credits[::-1]: - item.insert(0, etree.fromstring(credit)) - - # Change the creator from "calibre" to the actual author - # Also, we don't need the date in the ebook's title - oeb.metadata.items['creator'][0].value = self.publisher - oeb.metadata.items['description'][0].value = oeb.metadata.items['description'][0].value.split('\n\nArticles in this issue')[0] - oeb.metadata.items['publication_type'][0].value = self.title - oeb.metadata.items['publisher'][0].value = self.publisher - oeb.metadata.items['title'][0].value = self.title From e5b4af0f6ca22fcf31fdba3a1478d7d4b6e53360 Mon Sep 17 00:00:00 2001 From: April King Date: Sat, 11 Apr 2015 15:15:21 -0500 Subject: [PATCH 3/5] Revert "Revert "Adding The Codeless Code (thecodelesscode.com)"" This reverts commit fb3f9d78981dafa4e814c19303cebeff215ed983. --- recipes/thecodelesscode.recipe | 179 +++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 recipes/thecodelesscode.recipe diff --git a/recipes/thecodelesscode.recipe b/recipes/thecodelesscode.recipe new file mode 100644 index 0000000000..af0ea57a3f --- /dev/null +++ b/recipes/thecodelesscode.recipe @@ -0,0 +1,179 @@ +#!/usr/bin/env python2 + +from datetime import date +from lxml import etree + +__copyright__ = '2015, April King ' +__license__ = 'GPL v3' +__version__ = '1.2' + +''' +http://www.thecodelesscode.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs + +class CodelessCode(BasicNewsRecipe): + __author__ = 'April King' + title = u'The Codeless Code' + category = 'fiction, programming, technology' + chapters = {} # ie, Mousetrap -> 182 + compress_news_images = True + compress_news_images_max_size = 100 + cover_url = 'http://www.thecodelesscode.com/pages/case-9/Lotus-050.jpg' + credits = [ u'

{0}

'.format(title), + u'

By Qi

', + u'

An illustrated collection of (sometimes violent) fables concerning the Art and Philosophy of software development, written in the spirit of Zen kōans

', + u'

eBook conversion courtesy of {0}

'.format(__author__) ] + description = u'The Art and Philosophy of software development, written in the spirit of Zen kōans' + extra_css = '.article_date { display: none; float: right; } \ + .chapter_title { font-size: 1.75em; margin-top: 0; } \ + .chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } \ + h2 { margin-top: 0; } \ + .image_wrapper { text-align: center; }' + index = 'http://www.thecodelesscode.com/contents' + language = 'en' + max_articles_per_feed = 1000 # I can only wish + path_remappings = {} # IE, /case/182 -> articles_72/index.html + publication_type = 'blog' + publisher = 'Qi' + scale_news_images = (600, 400) + simultaneous_downloads = 1 + url = 'http://www.thecodelesscode.com' + + def create_opf(self, feeds, dir=None): + ''' + Generate a mapping of the original URL, ie, http://thecodelesscode.com/case/100 to the + internal Calibre file system, eg, ../article_7/index_u39.html + ''' + for feed in feeds: + for article in feed: + orig_path = article.orig_url.split(self.url, 2)[-1] # http://thecodelesscode.com/case/100 -> /case/100 + article_id = article.id.split('#')[-1] # internal id#10 -> 10 + article_path = article.url.split('index')[0] + 'index.html' # article_X/index.html -> article_X/ + + self.path_remappings[orig_path] = article_path + + BasicNewsRecipe.create_opf(self, feeds, dir=dir) + + def parse_index(self): + koans = [] + + # Retrieve the contents page, containing the ToC + soup = self.index_to_soup(self.index) + + for koan in soup.findAll('tr'): + # BS has some trouble with the weird layout + tag = koan.find('a') + + if tag == None: continue + if 'random' in tag['href']: continue + + # Minor coding error causes calibre to glitch; use the current date for the most recent title + koan_date = koan.find('td', attrs={'class' : 'toc-date' }) + if koan_date == None: + koan_date = date.isoformat(date.today()) + else: + koan_date = koan_date.string + + title = tag.string + url = self.url + tag['href'] + + if u'The Applicant' in title: continue # Only the main story + + koans.append({ + 'content': '', + 'date': koan_date, + 'description': '', + 'title': title, + 'url' : url, + }) + + # ie, Mousetrap -> 182 + self.chapters[title] = url.split('/')[-1] + + # Oldest koans first + koans.reverse() + + # Log and then get out of here + self.log("Found {0} koans".format(len(koans))) + return( [(self.title, koans)] ) + + def preprocess_html(self, soup): + title = soup.find('h1', attrs = {'class': 'title'}).find('a', attrs = {'class' : 'subtle'}).string + + # Add a title at the beginning of each chapter + if title in self.chapters: + title = '
{0}
'.format(title) + + # Load up the actual story + koan = soup.find('div', attrs = {'class' : 'story koan'}) + + # Kind of a hack-y way to get .children in BS3 -> + contents = list(koan.contents) + koan = bs(title) + + for i in reversed(contents): + koan.insert(1, i) + + # Remove all anchors that don't contain /case/, leaving them as just their text + # Note that we'll come back and clean up /case/ links when the URLs are remapped + # during postprocess_book() + anchors = koan.findAll('a') + if anchors != []: + for anchor in anchors: + if '/case/' in anchor['href']: + pass + elif 'note' in anchor['href']: + anchor.replaceWith('') + else: + # Again, a hacky way to get the contents of the tag, thanks to BS3 + contents = list(anchor.contents) + linktext = bs() + for i in reversed(contents): + linktext.insert(1, i) + anchor.replaceWith(linktext) + + # Find all the images, and wrap them up in an image_wrapper div + for i in range(0, len(koan.contents), 1): + if not hasattr(koan.contents[i], 'name'): continue # skip carriage returns + if koan.contents[i].name == u'img': + div = bs('
') + div.div.insert(0, koan.contents[i]) + koan.insert(i, div) + + return(koan) + + def postprocess_book(self, oeb, opts, log): + # Go through each internal representation of each HTML file, and fix all the broken hrefs, if possible + for item in oeb.manifest.items: + if item.media_type == 'text/html': + + for node in item.data.xpath('//*[@href]'): + naughty_href = node.get('href') + + if naughty_href in self.path_remappings: + node.set('href', '../' + self.path_remappings[ naughty_href ] ) + href = node.get('href') + self.log("Remapped href {0} --> {1}".format(naughty_href, href)) + + # Remove the superfluous extra feed page at the beginning of the book, replacing it + # with the proper credits + for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="ul"]'): + item.getparent().remove(item) + + for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="p"]'): + item.getparent().remove(item) + + for item in oeb.manifest.hrefs['index.html'].data.xpath('//*[local-name()="div"]'): + for credit in self.credits[::-1]: + item.insert(0, etree.fromstring(credit)) + + # Change the creator from "calibre" to the actual author + # Also, we don't need the date in the ebook's title + oeb.metadata.items['creator'][0].value = self.publisher + oeb.metadata.items['description'][0].value = oeb.metadata.items['description'][0].value.split('\n\nArticles in this issue')[0] + oeb.metadata.items['publication_type'][0].value = self.title + oeb.metadata.items['publisher'][0].value = self.publisher + oeb.metadata.items['title'][0].value = self.title From 055a103f05e0d75162566ce9b79d17a1d6d4fbcc Mon Sep 17 00:00:00 2001 From: April King Date: Sat, 11 Apr 2015 16:22:58 -0500 Subject: [PATCH 4/5] Updated to use the latest Calibre API, introduced on 4/10: kovidgoyal/calibre@16efc96 --- recipes/thecodelesscode.recipe | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/recipes/thecodelesscode.recipe b/recipes/thecodelesscode.recipe index af0ea57a3f..dd19fbe11d 100644 --- a/recipes/thecodelesscode.recipe +++ b/recipes/thecodelesscode.recipe @@ -38,25 +38,11 @@ class CodelessCode(BasicNewsRecipe): path_remappings = {} # IE, /case/182 -> articles_72/index.html publication_type = 'blog' publisher = 'Qi' + resolve_internal_links = True scale_news_images = (600, 400) simultaneous_downloads = 1 url = 'http://www.thecodelesscode.com' - def create_opf(self, feeds, dir=None): - ''' - Generate a mapping of the original URL, ie, http://thecodelesscode.com/case/100 to the - internal Calibre file system, eg, ../article_7/index_u39.html - ''' - for feed in feeds: - for article in feed: - orig_path = article.orig_url.split(self.url, 2)[-1] # http://thecodelesscode.com/case/100 -> /case/100 - article_id = article.id.split('#')[-1] # internal id#10 -> 10 - article_path = article.url.split('index')[0] + 'index.html' # article_X/index.html -> article_X/ - - self.path_remappings[orig_path] = article_path - - BasicNewsRecipe.create_opf(self, feeds, dir=dir) - def parse_index(self): koans = [] @@ -145,6 +131,10 @@ class CodelessCode(BasicNewsRecipe): return(koan) + def canonicalize_internal_url(self, url, is_link=True): + url = url.split(self.url)[-1] + return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link) + def postprocess_book(self, oeb, opts, log): # Go through each internal representation of each HTML file, and fix all the broken hrefs, if possible for item in oeb.manifest.items: From b41cc304d7f850e98d3b6a60c55afd265f49496a Mon Sep 17 00:00:00 2001 From: April King Date: Sat, 11 Apr 2015 16:26:58 -0500 Subject: [PATCH 5/5] fixed some bad spacing --- recipes/thecodelesscode.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/thecodelesscode.recipe b/recipes/thecodelesscode.recipe index dd19fbe11d..057ddc26aa 100644 --- a/recipes/thecodelesscode.recipe +++ b/recipes/thecodelesscode.recipe @@ -31,7 +31,7 @@ class CodelessCode(BasicNewsRecipe): .chapter_title { font-size: 1.75em; margin-top: 0; } \ .chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } \ h2 { margin-top: 0; } \ - .image_wrapper { text-align: center; }' + .image_wrapper { text-align: center; }' index = 'http://www.thecodelesscode.com/contents' language = 'en' max_articles_per_feed = 1000 # I can only wish