mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-31 18:47:02 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			183 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			183 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | |
| 
 | |
| from datetime import date
 | |
| from lxml import etree
 | |
| 
 | |
| __copyright__ = '2015, April King <april@twoevils.org>'
 | |
| __license__ = 'GPL v3'
 | |
| __version__ = '1.2'
 | |
| 
 | |
| '''
 | |
| http://www.thecodelesscode.com/
 | |
| '''
 | |
| 
 | |
| from calibre.web.feeds.news import BasicNewsRecipe
 | |
| from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs
 | |
| 
 | |
| 
 | |
class CodelessCode(BasicNewsRecipe):
    """Calibre recipe that turns the koans of The Codeless Code into an ebook.

    The contents page at ``index`` is scraped for the list of koans
    (``parse_index``), each koan page is reduced to its story body with a
    chapter heading (``preprocess_html``), and the generated book is tidied
    up afterwards (``postprocess_book``).
    """

    __author__ = 'April King'
    title = u'The Codeless Code'
    category = 'fiction, programming, technology'
    # Maps koan title -> last URL path segment, ie, Mousetrap -> 182.
    # Filled in by parse_index() and consulted by preprocess_html().
    # NOTE: class-level dict, so it is shared by every instance of the recipe.
    chapters = {}
    compress_news_images = True
    compress_news_images_max_size = 100
    cover_url = 'http://www.thecodelesscode.com/pages/case-9/Lotus-050.jpg'
    # HTML fragments inserted at the top of the book's index page by
    # postprocess_book().
    credits = [u'<h2 class="chapter_title">{0}</h2>'.format(title),
               u'<p>By <em>Qi</em></p>',
               u'<p>An illustrated collection of (sometimes violent) fables concerning the Art and Philosophy of software development, written in the spirit of Zen kōans</p>',  # noqa
               u'<p>eBook conversion courtesy of <em>{0}</em></p>'.format(__author__)]
    description = u'The Art and Philosophy of software development, written in the spirit of Zen kōans'
    extra_css             = '.article_date { display: none; float: right; } \
                             .chapter_title { font-size: 1.75em; margin-top: 0; } \
                             .chapter_title::first-letter { font-size: 1.35em; font-weight: 500; letter-spacing: -.05em; } \
                             h2 { margin-top: 0; } \
                             .image_wrapper { text-align: center; }'
    index = 'http://www.thecodelesscode.com/contents'
    language = 'en'
    max_articles_per_feed = 1000  # I can only wish
    # Intended mapping of site paths to book paths,
    # IE, /case/182 -> articles_72/index.html
    # NOTE(review): nothing in this class ever adds entries to this dict, so
    # the href remapping in postprocess_book() is a no-op unless it is
    # populated from elsewhere -- confirm.
    path_remappings = {}
    publication_type = 'blog'
    publisher = 'Qi'
    resolve_internal_links = True
    scale_news_images = (600, 400)
    simultaneous_downloads = 1
    url = 'http://www.thecodelesscode.com'

    def parse_index(self):
        """Scrape the contents page and build the list of koans to download.

        Also records each koan's title in ``self.chapters`` so that
        ``preprocess_html`` can recognise chapter pages.

        Returns:
            A one-element list ``[(self.title, koans)]`` where ``koans`` is a
            list of article dicts (``content``, ``date``, ``description``,
            ``title``, ``url``), ordered oldest first.
        """
        koans = []

        # Retrieve the contents page, containing the ToC
        soup = self.index_to_soup(self.index)

        for row in soup.findAll('tr'):
            # BS has some trouble with the weird layout
            tag = row.find('a')
            if tag is None:
                continue

            # An anchor without an href cannot be turned into an article, and
            # indexing tag['href'] on it would raise KeyError.
            href = tag.get('href')
            if not href or 'random' in href:
                continue

            # tag.string is None when the anchor wraps nested markup; fall
            # back to the flattened text so the checks below see a string.
            title = tag.string or self.tag_to_string(tag)
            if u'The Applicant' in title:
                continue  # Only the main story

            # Minor coding error causes calibre to glitch; use the current date
            # for the most recent title (and for any row with an empty date)
            date_cell = row.find('td', attrs={'class': 'toc-date'})
            koan_date = date_cell.string if date_cell is not None else None
            if not koan_date:
                koan_date = date.isoformat(date.today())

            url = self.url + href

            koans.append({
                'content': '',
                'date': koan_date,
                'description': '',
                'title': title,
                'url': url,
            })

            # ie, Mousetrap -> 182
            self.chapters[title] = url.split('/')[-1]

        # Oldest koans first
        koans.reverse()

        # Log and then get out of here
        self.log("Found {0} koans".format(len(koans)))
        return [(self.title, koans)]

    def preprocess_html(self, soup):
        """Reduce a downloaded koan page to its title and story body.

        Args:
            soup: The parsed page as handed over by calibre.

        Returns:
            A new soup holding the chapter title followed by the story
            content, with non-``/case/`` links flattened to text and images
            wrapped in ``image_wrapper`` divs. If the page has no
            ``story koan`` div, ``soup`` is returned unchanged.
        """
        # Load up the actual story
        story = soup.find('div', attrs={'class': 'story koan'})
        if story is None:
            # Not a koan page (or the site layout changed): leave it alone
            # rather than failing on story.contents below.
            self.log("No koan story found in page; leaving it unmodified")
            return soup

        # Either lookup may come back empty; degrade to an empty title.
        heading = soup.find('h1', attrs={'class': 'title'})
        link = None
        if heading is not None:
            link = heading.find('a', attrs={'class': 'subtle'})
        title = (link.string if link is not None else None) or ''

        # Add a title at the beginning of each chapter
        if title in self.chapters:
            title = '<div class="chapter_title">{0}</div>'.format(title)

        # Kind of a hack-y way to get .children in BS3  <a><b><c></c></b></a>
        # -> <b><c></c></b>
        # Inserting the children in reverse at position 1 keeps their original
        # order, placed right after the title.
        contents = list(story.contents)
        koan = bs(title)
        for child in reversed(contents):
            koan.insert(1, child)

        # Remove all anchors that don't contain /case/, leaving them as just their text
        # Note that we'll come back and clean up /case/ links when the URLs are remapped
        # during postprocess_book()
        for anchor in koan.findAll('a'):
            # Named anchors have no href; treat them like any other
            # non-/case/ link instead of raising KeyError.
            href = anchor.get('href') or ''
            if '/case/' in href:
                continue
            if 'note' in href:
                anchor.replaceWith('')
                continue

            # Again, a hacky way to get the contents of the tag, thanks
            # to BS3
            linktext = bs()
            for child in reversed(list(anchor.contents)):
                linktext.insert(1, child)
            anchor.replaceWith(linktext)

        # Find all the images, and wrap them up in an image_wrapper div.
        # Index-based on purpose: each image is moved into its wrapper and the
        # wrapper is inserted back at the same position.
        for i in range(len(koan.contents)):
            if not hasattr(koan.contents[i], 'name'):
                continue  # skip carriage returns
            if koan.contents[i].name == u'img':
                div = bs('<div class="image_wrapper"></div>')
                div.div.insert(0, koan.contents[i])
                koan.insert(i, div)

        return koan

    def canonicalize_internal_url(self, url, is_link=True):
        """Strip the site prefix from ``url`` before the base-class handling.

        Everything up to and including the last occurrence of ``self.url`` is
        dropped, then the result is passed to
        ``BasicNewsRecipe.canonicalize_internal_url``.
        """
        url = url.split(self.url)[-1]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)

    def postprocess_book(self, oeb, opts, log):
        """Fix up links, the index page and the metadata of the finished book.

        Args:
            oeb: The book being produced; its manifest and metadata are
                modified in place.
            opts: Unused here.
            log: Unused here; messages go through ``self.log``.
        """
        # Go through each internal representation of each HTML file, and fix
        # all the broken hrefs, if possible
        for item in oeb.manifest.items:
            if item.media_type != 'text/html':
                continue

            for node in item.data.xpath('//*[@href]'):
                naughty_href = node.get('href')
                if naughty_href not in self.path_remappings:
                    continue

                href = '../' + self.path_remappings[naughty_href]
                node.set('href', href)
                self.log(
                    "Remapped href {0} --> {1}".format(naughty_href, href))

        # Remove the superfluous extra feed page at the beginning of the book, replacing it
        # with the proper credits
        index_root = oeb.manifest.hrefs['index.html'].data

        for tag_name in ('ul', 'p'):
            for item in index_root.xpath('//*[local-name()="{0}"]'.format(tag_name)):
                item.getparent().remove(item)

        # One parser is enough for all of the credit fragments.
        parser = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
        for container in index_root.xpath('//*[local-name()="div"]'):
            # Insert in reverse at position 0 so the credits end up in order.
            for credit in reversed(self.credits):
                container.insert(0, etree.fromstring(credit, parser=parser))

        # Change the creator from "calibre" to the actual author
        # Also, we don't need the date in the ebook's title
        metadata = oeb.metadata.items
        metadata['creator'][0].value = self.publisher
        metadata['description'][0].value = metadata[
            'description'][0].value.split('\n\nArticles in this issue')[0]
        # NOTE(review): this stores the title, not self.publication_type, in
        # the publication_type field -- kept as-is, confirm it is intended.
        metadata['publication_type'][0].value = self.title
        metadata['publisher'][0].value = self.publisher
        metadata['title'][0].value = self.title
 |