diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index dc4489d67e..d6d46476df 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -1,8 +1,37 @@
from __future__ import with_statement
-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
+
+'''
+Conversion of HTML/OPF files follows several stages:
+
+ * All links in the HTML files or in the OPF manifest are
+ followed to build up a list of HTML files to be converted.
+ This stage is implemented by
+ :func:`calibre.ebooks.html.traverse` and
+ :class:`calibre.ebooks.html.HTMLFile`.
+
+ * The HTML is pre-processed to make it more semantic.
+ All links in the HTML files to other resources like images,
+ stylesheets, etc. are relativized. The resources are copied
+ into the `resources` sub directory. This is accomplished by
+ :class:`calibre.ebooks.html.PreProcessor` and
+ :class:`calibre.ebooks.html.Parser`.
+
+ * The HTML is processed. Various operations are performed.
+ All style declarations are extracted and consolidated into
+ a single style sheet. Chapters are auto-detected and marked.
+ Various font related manipulations are performed. See
+ :class:`HTMLProcessor`.
+
+ * The processed HTML is saved and the
+ :mod:`calibre.ebooks.epub.split` module is used to split up
+ large HTML files into smaller chunks.
+
+ * The EPUB container is created.
+'''
+
import os, sys, re, cStringIO
from lxml.etree import XPath
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index 38f2157b07..575a2ac82d 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -27,9 +27,9 @@ from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryF
from calibre.utils.zipfile import ZipFile
def tostring(root, pretty_print=False):
- return html.tostring(root, encoding='utf-8', method='xml',
+ return html.tostring(root, encoding='utf-8', method='xml',
pretty_print=pretty_print,
- include_meta_content_type=True)
+ include_meta_content_type=True)
class Link(object):
@@ -337,7 +337,7 @@ class Parser(PreProcessor, LoggingInterface):
if self.root.get(bad, None) is not None:
self.root.attrib.pop(bad)
- def save_path(self):
+ def save_path(self):
return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
def save(self):
@@ -463,6 +463,9 @@ class Processor(Parser):
return Parser.save(self)
def populate_toc(self, toc):
+ '''
+ Populate the Table of Contents from detected chapters and links.
+ '''
def add_item(href, fragment, text, target, type='link'):
for entry in toc.flat():
@@ -602,9 +605,9 @@ class Processor(Parser):
def do_layout(self):
self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt}\n'
- self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
+ self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
-def config(defaults=None, config_name='html',
+def config(defaults=None, config_name='html',
desc=_('Options to control the traversal of HTML')):
if defaults is None:
c = Config(config_name, desc)
@@ -613,7 +616,7 @@ def config(defaults=None, config_name='html',
c.add_opt('output', ['-o', '--output'], default=None,
help=_('The output directory. Default is the current directory.'))
- c.add_opt('encoding', ['--encoding'], default=None,
+ c.add_opt('encoding', ['--encoding'], default=None,
help=_('Character encoding for HTML files. Default is to auto detect.'))
c.add_opt('zip', ['--zip'], default=False,
help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))
@@ -674,7 +677,7 @@ def get_filelist(htmlfile, opts):
except:
pass
if not filelist:
- filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
+ filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose, encoding=opts.encoding)\
[0 if opts.breadth_first else 1]
if opts.verbose:
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 686d06dca6..0f723dd201 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -17,11 +17,41 @@ E-book Format Conversion
What formats does |app| support conversion to/from?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| supports the conversion of the following formats to LRF: HTML, LIT, MOBI, PRC, EPUB, CBR, CBZ, RTF, TXT, PDF and LRS. It also supports the conversion of LRF to LRS and HTML(forthcoming). Note that calibre does not support the conversion of DRMed ebooks.
+|app| supports the conversion of the following formats:
-What are the best formats to convert to LRF?
++----------------------------+------------------------------------------+
+| | **Output formats** |
+| +------------------+-----------------------+
+| | EPUB | LRF |
++===================+========+==================+=======================+
+| | MOBI | ✔ | ✔ |
+| | | | |
+| | LIT | ✔ | ✔ |
+| | | | |
+| | PRC | ✔ | ✔ |
+| | | | |
+| | EPUB | ✔ | ✔ |
+| | | | |
+| | HTML | ✔ | ✔ |
+| | | | |
+| **Input formats** | CBR | ✔ | ✔ |
+| | | | |
+| | CBZ | ✔ | ✔ |
+| | | | |
+| | RTF | ✔ | ✔ |
+| | | | |
+| | TXT | ✔ | ✔ |
+| | | | |
+| | PDF | ✔ | ✔ |
+| | | | |
+| | LRS | | ✔ |
++-------------------+--------+------------------+-----------------------+
+
+
+
+What are the best source formats to convert?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-In order of decreasing preference: LIT, MOBI, HTML, PRC, RTF, TXT, PDF
+In order of decreasing preference: LIT, MOBI, EPUB, HTML, PRC, RTF, TXT, PDF
Why does the PDF conversion lose some images/tables?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/calibre/web/feeds/recipes/discover_magazine.py b/src/calibre/web/feeds/recipes/discover_magazine.py
new file mode 100644
index 0000000000..0e3753834b
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/discover_magazine.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+discovermagazine.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DiscoverMagazine(BasicNewsRecipe):
+ title = u'Discover Magazine'
+ description = u'Science, Technology and the Future'
+ __author__ = 'Mike Diaz'
+ oldest_article = 33
+ max_articles_per_feed = 20
+ feeds = [
+ (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'),
+ (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
+ (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'),
+ (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'),
+ (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'),
+ (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'),
+ (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'),
+ (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'),
+ (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'),
+ (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'),
+ (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'),
+ (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'),
+ (u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'),
+ (u'Science Not Fiction', u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php')
+ ]
\ No newline at end of file
diff --git a/src/calibre/web/feeds/recipes/scientific_american.py b/src/calibre/web/feeds/recipes/scientific_american.py
new file mode 100644
index 0000000000..6c6c679bc5
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/scientific_american.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+sciam.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ScientificAmerican(BasicNewsRecipe):
+ title = u'Scientific American'
+ description = u'Popular science'
+ __author__ = 'Kovid Goyal'
+ oldest_article = 30
+ max_articles_per_feed = 100
+ use_embedded_content = False
+ remove_tags_before = dict(name='div', attrs={'class':'headline'})
+ remove_tags_after = dict(id='article')
+ remove_tags = [dict(id='sharetools'), dict(id='reddit')]
+ html2lrf_options = ['--base-font-size', '8']
+ feeds = [
+ (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
+ (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
+ (u'Health', u'http://rss.sciam.com/sciam/health'),
+ (u'Space', u'http://rss.sciam.com/sciam/space'),
+ (u'Technology', u'http://rss.sciam.com/sciam/technology'),
+ (u'Biology', u'http://rss.sciam.com/sciam/biology'),
+ (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'),
+ (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'),
+ (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
+ (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
+ (u'Math', u'http://rss.sciam.com/sciam/math'),
+ (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
+ (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
+ (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
+ ]