This commit is contained in:
Kovid Goyal 2008-09-23 18:18:32 -07:00
parent 63e59dd3be
commit f3eebb473f
5 changed files with 144 additions and 11 deletions

View File

@ -1,8 +1,37 @@
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
'''
Conversion of HTML/OPF files follows several stages:
* All links in the HTML files or in the OPF manifest are
followed to build up a list of HTML files to be converted.
This stage is implemented by
:func:`calibre.ebooks.html.traverse` and
:class:`calibre.ebooks.html.HTMLFile`.
* The HTML is pre-processed to make it more semantic.
All links in the HTML files to other resources like images,
stylesheets, etc. are relativized. The resources are copied
into the `resources` sub directory. This is accomplished by
:class:`calibre.ebooks.html.PreProcessor` and
:class:`calibre.ebooks.html.Parser`.
* The HTML is processed. Various operations are performed.
All style declarations are extracted and consolidated into
a single style sheet. Chapters are auto-detected and marked.
Various font related manipulations are performed. See
:class:`HTMLProcessor`.
* The processed HTML is saved and the
:mod:`calibre.ebooks.epub.split` module is used to split up
large HTML files into smaller chunks.
* The EPUB container is created.
'''
import os, sys, re, cStringIO import os, sys, re, cStringIO
from lxml.etree import XPath from lxml.etree import XPath

View File

@ -463,6 +463,9 @@ class Processor(Parser):
return Parser.save(self) return Parser.save(self)
def populate_toc(self, toc): def populate_toc(self, toc):
'''
Populate the Table of Contents from detected chapters and links.
'''
def add_item(href, fragment, text, target, type='link'): def add_item(href, fragment, text, target, type='link'):
for entry in toc.flat(): for entry in toc.flat():

View File

@ -17,11 +17,41 @@ E-book Format Conversion
What formats does |app| support conversion to/from? What formats does |app| support conversion to/from?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|app| supports the conversion of the following formats to LRF: HTML, LIT, MOBI, PRC, EPUB, CBR, CBZ, RTF, TXT, PDF and LRS. It also supports the conversion of LRF to LRS and HTML(forthcoming). Note that calibre does not support the conversion of DRMed ebooks. |app| supports the conversion of the following formats:
What are the best formats to convert to LRF? +----------------------------+------------------------------------------+
| | **Output formats** |
| +------------------+-----------------------+
| | EPUB | LRF |
+===================+========+==================+=======================+
| | MOBI | ✔ | ✔ |
| | | | |
| | LIT | ✔ | ✔ |
| | | | |
| | PRC | ✔ | ✔ |
| | | | |
| | EPUB | ✔ | ✔ |
| | | | |
| | HTML | ✔ | ✔ |
| | | | |
| **Input formats** | CBR | ✔ | ✔ |
| | | | |
| | CBZ | ✔ | ✔ |
| | | | |
| | RTF | ✔ | ✔ |
| | | | |
| | TXT | ✔ | ✔ |
| | | | |
| | PDF | ✔ | ✔ |
| | | | |
| | LRS | | ✔ |
+-------------------+--------+------------------+-----------------------+
What are the best source formats to convert?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order of decreasing preference: LIT, MOBI, HTML, PRC, RTF, TXT, PDF In order of decreasing preference: LIT, MOBI, EPUB, HTML, PRC, RTF, TXT, PDF
Why does the PDF conversion lose some images/tables? Why does the PDF conversion lose some images/tables?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -0,0 +1,33 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
discovermagazine.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DiscoverMagazine(BasicNewsRecipe):
    '''
    Recipe listing the RSS feeds of Discover Magazine
    (discovermagazine.com topic and column feeds, plus one blog feed).
    '''

    # Recipe metadata.
    __author__ = 'Mike Diaz'
    title = u'Discover Magazine'
    description = u'Science, Technology and the Future'

    # Download limits read by BasicNewsRecipe.
    # NOTE(review): oldest_article is presumably an age in days -- confirm
    # against the BasicNewsRecipe documentation.
    max_articles_per_feed = 20
    oldest_article = 33

    # (section title, feed URL) pairs, in the order they are declared.
    feeds = [
        # Topic feeds
        (u'Technology',
         u'http://discovermagazine.com/topics/technology/rss.xml'),
        (u'Health - Medicine',
         u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
        (u'Mind Brain',
         u'http://discovermagazine.com/topics/mind-brain/rss.xml'),
        (u'Space',
         u'http://discovermagazine.com/topics/space/rss.xml'),
        (u'Human Origins',
         u'http://discovermagazine.com/topics/human-origins/rss.xml'),
        (u'Living World',
         u'http://discovermagazine.com/topics/living-world/rss.xml'),
        (u'Environment',
         u'http://discovermagazine.com/topics/environment/rss.xml'),
        (u'Physics & Math',
         u'http://discovermagazine.com/topics/physics-math/rss.xml'),
        # Column feeds
        (u'Vital Signs',
         u'http://discovermagazine.com/columns/vital-signs/rss.xml'),
        (u"20 Things you didn't know about...",
         u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'),
        (u'Fuzzy Math',
         u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'),
        (u'The Brain',
         u'http://discovermagazine.com/columns/the-brain/rss.xml'),
        (u'Stupid Science Word of the Month',
         u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'),
        # Blog feed
        (u'Science Not Fiction',
         u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php'),
    ]

View File

@ -0,0 +1,38 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
sciam.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ScientificAmerican(BasicNewsRecipe):
    '''
    Recipe listing the RSS feeds of Scientific American (sciam.com),
    together with the tag filters and LRF conversion options it sets.
    '''

    # Recipe metadata.
    __author__ = 'Kovid Goyal'
    title = u'Scientific American'
    description = u'Popular science'

    # Download limits read by BasicNewsRecipe.
    # NOTE(review): oldest_article is presumably an age in days -- confirm
    # against the BasicNewsRecipe documentation.
    max_articles_per_feed = 100
    oldest_article = 30
    use_embedded_content = False

    # Tag specifications handed to BasicNewsRecipe for trimming each page;
    # their exact semantics are defined by the base class, not here.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'headline'}}
    remove_tags_after = {'id': 'article'}
    remove_tags = [{'id': 'sharetools'}, {'id': 'reddit'}]

    # Extra command line options for the html2lrf conversion step.
    html2lrf_options = ['--base-font-size', '8']

    # (section title, feed URL) pairs, in the order they are declared.
    feeds = [
        (u'Latest News',
         u'http://rss.sciam.com/ScientificAmerican-News'),
        (u'Global',
         u'http://rss.sciam.com/ScientificAmerican-Global'),
        (u'Health',
         u'http://rss.sciam.com/sciam/health'),
        (u'Space',
         u'http://rss.sciam.com/sciam/space'),
        (u'Technology',
         u'http://rss.sciam.com/sciam/technology'),
        (u'Biology',
         u'http://rss.sciam.com/sciam/biology'),
        (u'Mind & Brain',
         u'http://rss.sciam.com/sciam/mind-and-brain'),
        (u"What's Next",
         u'http://rss.sciam.com/sciam/whats-next'),
        (u'Archeology and Paleontology',
         u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
        (u'Physics',
         u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
        (u'Math',
         u'http://rss.sciam.com/sciam/math'),
        (u'History of Science',
         u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
        (u'Chemistry',
         u'http://rss.sciam.com/sciam/chemistry'),
        (u'Mind Matters',
         u'http://rss.sciam.com/ScientificAmerican-MindBlog'),
    ]