From f3eebb473f6b46e6b895f37808fd58f14f2e3d74 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 23 Sep 2008 18:18:32 -0700
Subject: [PATCH] IGN:...

---
 src/calibre/ebooks/epub/from_html.py          | 31 ++++++++++++++-
 src/calibre/ebooks/html.py                    | 17 +++++----
 src/calibre/manual/faq.rst                    | 36 ++++++++++++++++--
 .../web/feeds/recipes/discover_magazine.py    | 33 ++++++++++++++++
 .../web/feeds/recipes/scientific_american.py  | 38 +++++++++++++++++++
 5 files changed, 144 insertions(+), 11 deletions(-)
 create mode 100644 src/calibre/web/feeds/recipes/discover_magazine.py
 create mode 100644 src/calibre/web/feeds/recipes/scientific_american.py

diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index dc4489d67e..d6d46476df 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -1,8 +1,37 @@
 from __future__ import with_statement
-
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
+
+'''
+Conversion of HTML/OPF files follows several stages:
+
+    * All links in the HTML files or in the OPF manifest are
+    followed to build up a list of HTML files to be converted.
+    This stage is implemented by 
+    :func:`calibre.ebooks.html.traverse` and
+    :class:`calibre.ebooks.html.HTMLFile`.
+
+    * The HTML is pre-processed to make it more semantic. 
+    All links in the HTML files to other resources like images,
+    stylesheets, etc. are relativized. The resources are copied 
+    into the `resources` sub directory. This is accomplished by
+    :class:`calibre.ebooks.html.PreProcessor` and 
+    :class:`calibre.ebooks.html.Parser`.
+
+    * The HTML is processed. Various operations are performed.
+    All style declarations are extracted and consolidated into 
+    a single style sheet. Chapters are auto-detected and marked.
+    Various font related manipulations are performed. See
+    :class:`HTMLProcessor`.
+
+    * The processed HTML is saved and the 
+    :mod:`calibre.ebooks.epub.split` module is used to split up
+    large HTML files into smaller chunks.
+
+    * The EPUB container is created.
+'''
+
 import os, sys, re, cStringIO
 
 from lxml.etree import XPath
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index 38f2157b07..575a2ac82d 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -27,9 +27,9 @@ from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryF
 from calibre.utils.zipfile import ZipFile
 
 def tostring(root, pretty_print=False):
-    return html.tostring(root, encoding='utf-8', method='xml', 
+    return html.tostring(root, encoding='utf-8', method='xml',
                   pretty_print=pretty_print,
-                  include_meta_content_type=True) 
+                  include_meta_content_type=True)
 
 
 class Link(object):
@@ -337,7 +337,7 @@ class Parser(PreProcessor, LoggingInterface):
             if self.root.get(bad, None) is not None:
                 self.root.attrib.pop(bad)
         
-    def save_path(self):    
+    def save_path(self):
         return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
     
     def save(self):
@@ -463,6 +463,9 @@ class Processor(Parser):
         return Parser.save(self)
     
     def populate_toc(self, toc):
+        '''
+        Populate the Table of Contents from detected chapters and links.
+        '''
         
         def add_item(href, fragment, text, target, type='link'):
             for entry in toc.flat():
@@ -602,9 +605,9 @@ class Processor(Parser):
         
     def do_layout(self):
         self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt}\n'
-        self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)    
+        self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
 
-def config(defaults=None, config_name='html', 
+def config(defaults=None, config_name='html',
            desc=_('Options to control the traversal of HTML')):
     if defaults is None:
         c = Config(config_name, desc)
@@ -613,7 +616,7 @@ def config(defaults=None, config_name='html',
         
     c.add_opt('output', ['-o', '--output'], default=None,
              help=_('The output directory. Default is the current directory.'))
-    c.add_opt('encoding', ['--encoding'], default=None, 
+    c.add_opt('encoding', ['--encoding'], default=None,
               help=_('Character encoding for HTML files. Default is to auto detect.'))
     c.add_opt('zip', ['--zip'], default=False,
               help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))
@@ -674,7 +677,7 @@ def get_filelist(htmlfile, opts):
         except:
             pass
     if not filelist:
-        filelist = traverse(htmlfile, max_levels=int(opts.max_levels), 
+        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                             verbose=opts.verbose, encoding=opts.encoding)\
                     [0 if opts.breadth_first else 1]
     if opts.verbose:
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 686d06dca6..0f723dd201 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -17,11 +17,41 @@ E-book Format Conversion
 
 What formats does |app| support conversion to/from?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| supports the conversion of the following formats to LRF: HTML, LIT, MOBI, PRC, EPUB, CBR, CBZ, RTF, TXT, PDF and LRS. It also supports the conversion of LRF to LRS and HTML(forthcoming). Note that calibre does not support the conversion of DRMed ebooks.
+|app| supports the conversion of the following formats:
 
-What are the best formats to convert to LRF?
++----------------------------+------------------------------------------+
+|                            |          **Output formats**              |
+|                            +------------------+-----------------------+
+|                            |      EPUB        |         LRF           |
++===================+========+==================+=======================+      
+|                   |  MOBI  |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  LIT   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  PRC   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  EPUB  |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  HTML  |       ✔          |          ✔            |
+|                   |        |                  |                       |
+| **Input formats** |  CBR   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  CBZ   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  RTF   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  TXT   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  PDF   |       ✔          |          ✔            | 
+|                   |        |                  |                       |
+|                   |  LRS   |                  |          ✔            |
++-------------------+--------+------------------+-----------------------+
+           
+
+
+What are the best source formats to convert?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-In order of decreasing preference: LIT, MOBI, HTML, PRC, RTF, TXT, PDF 
+In order of decreasing preference: LIT, MOBI, EPUB, HTML, PRC, RTF, TXT, PDF 
 
 Why does the PDF conversion lose some images/tables?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/calibre/web/feeds/recipes/discover_magazine.py b/src/calibre/web/feeds/recipes/discover_magazine.py
new file mode 100644
index 0000000000..0e3753834b
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/discover_magazine.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+discovermagazine.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DiscoverMagazine(BasicNewsRecipe):
+    title = u'Discover Magazine'
+    description = u'Science, Technology and the Future' 
+    __author__ = 'Mike Diaz' 
+    oldest_article = 33 
+    max_articles_per_feed = 20 
+    feeds = [
+             (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'), 
+             (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'), 
+             (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'), 
+             (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'), 
+             (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'), 
+             (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'), 
+             (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'), 
+             (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'), 
+             (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'), 
+             (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'), 
+             (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'), 
+             (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'), 
+             (u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'), 
+             (u'Science Not Fiction', u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php')
+            ]
\ No newline at end of file
diff --git a/src/calibre/web/feeds/recipes/scientific_american.py b/src/calibre/web/feeds/recipes/scientific_american.py
new file mode 100644
index 0000000000..6c6c679bc5
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/scientific_american.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+sciam.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ScientificAmerican(BasicNewsRecipe):
+    title = u'Scientific American'
+    description = u'Popular science' 
+    __author__ = 'Kovid Goyal'
+    oldest_article = 30 
+    max_articles_per_feed = 100
+    use_embedded_content   = False
+    remove_tags_before = dict(name='div', attrs={'class':'headline'})
+    remove_tags_after  = dict(id='article')
+    remove_tags        = [dict(id='sharetools'), dict(id='reddit')]
+    html2lrf_options = ['--base-font-size', '8']
+    feeds = [
+             (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), 
+             (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), 
+             (u'Health', u'http://rss.sciam.com/sciam/health'), 
+             (u'Space', u'http://rss.sciam.com/sciam/space'), 
+             (u'Technology', u'http://rss.sciam.com/sciam/technology'), 
+             (u'Biology', u'http://rss.sciam.com/sciam/biology'), 
+             (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), 
+             (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), 
+             (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), 
+             (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), 
+             (u'Math', u'http://rss.sciam.com/sciam/math'), 
+             (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), 
+             (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), 
+             (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
+            ]