IGN:...

2025-07-09 03:04:10 -04:00 · 2008-09-23 18:18:32 -07:00 · 2008-09-23 18:18:32 -07:00 · f3eebb473f
commit f3eebb473f
parent 63e59dd3be
5 changed files with 144 additions and 11 deletions
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -1,8 +1,37 @@
 from __future__ import with_statement
-
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
+
+'''
+Conversion of HTML/OPF files follows several stages:
+
+    * All links in the HTML files or in the OPF manifest are
+      followed to build up a list of HTML files to be converted.
+      This stage is implemented by
+      :func:`calibre.ebooks.html.traverse` and
+      :class:`calibre.ebooks.html.HTMLFile`.
+
+    * The HTML is pre-processed to make it more semantic.
+      All links in the HTML files to other resources like images,
+      stylesheets, etc. are relativized. The resources are copied
+      into the ``resources`` subdirectory. This is accomplished by
+      :class:`calibre.ebooks.html.PreProcessor` and
+      :class:`calibre.ebooks.html.Parser`.
+
+    * The HTML is processed. Various operations are performed.
+      All style declarations are extracted and consolidated into
+      a single style sheet. Chapters are auto-detected and marked.
+      Various font-related manipulations are performed. See
+      :class:`HTMLProcessor`.
+
+    * The processed HTML is saved and the
+      :mod:`calibre.ebooks.epub.split` module is used to split up
+      large HTML files into smaller chunks.
+
+    * The EPUB container is created.
+'''
+
 import os, sys, re, cStringIO

 from lxml.etree import XPath
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -463,6 +463,9 @@ class Processor(Parser):
        return Parser.save(self)
    
    def populate_toc(self, toc):
+        '''
+        Populate the Table of Contents from detected chapters and links.
+        '''
        
        def add_item(href, fragment, text, target, type='link'):
            for entry in toc.flat():
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -17,11 +17,41 @@ E-book Format Conversion

 What formats does |app| support conversion to/from?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| supports the conversion of the following formats to LRF: HTML, LIT, MOBI, PRC, EPUB, CBR, CBZ, RTF, TXT, PDF and LRS. It also supports the conversion of LRF to LRS and HTML(forthcoming). Note that calibre does not support the conversion of DRMed ebooks.
+|app| supports the conversion of the following formats:

-What are the best formats to convert to LRF?
++----------------------------+------------------------------------------+
+|                            |          **Output formats**              |
+|                            +------------------+-----------------------+
+|                            |      EPUB        |         LRF           |
++===================+========+==================+=======================+
+|                   |  MOBI  |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  LIT   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  PRC   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  EPUB  |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  HTML  |       ✔          |          ✔            |
+|                   |        |                  |                       |
+| **Input formats** |  CBR   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  CBZ   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  RTF   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  TXT   |       ✔          |          ✔            |
+|                   |        |                  |                       |
+|                   |  PDF   |       ✔          |          ✔            | 
+|                   |        |                  |                       |
+|                   |  LRS   |                  |          ✔            |
++-------------------+--------+------------------+-----------------------+
+           
+
+
+What are the best source formats to convert?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-In order of decreasing preference: LIT, MOBI, HTML, PRC, RTF, TXT, PDF 
+In order of decreasing preference: LIT, MOBI, EPUB, HTML, PRC, RTF, TXT, PDF 

 Why does the PDF conversion lose some images/tables?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/src/calibre/web/feeds/recipes/discover_magazine.py
+++ b/src/calibre/web/feeds/recipes/discover_magazine.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+discovermagazine.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DiscoverMagazine(BasicNewsRecipe):
+    title = u'Discover Magazine'
+    description = u'Science, Technology and the Future' 
+    __author__ = 'Mike Diaz' 
+    oldest_article = 33 
+    max_articles_per_feed = 20 
+    feeds = [
+             (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'), 
+             (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'), 
+             (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'), 
+             (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'), 
+             (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'), 
+             (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'), 
+             (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'), 
+             (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'), 
+             (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'), 
+             (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'), 
+             (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'), 
+             (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'), 
+             (u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'), 
+             (u'Science Not Fiction', u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php')
+            ]
--- a/src/calibre/web/feeds/recipes/scientific_american.py
+++ b/src/calibre/web/feeds/recipes/scientific_american.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+sciam.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ScientificAmerican(BasicNewsRecipe):
+    title = u'Scientific American'
+    description = u'Popular science' 
+    __author__ = 'Kovid Goyal'
+    oldest_article = 30 
+    max_articles_per_feed = 100
+    use_embedded_content   = False
+    remove_tags_before = dict(name='div', attrs={'class':'headline'})
+    remove_tags_after  = dict(id='article')
+    remove_tags        = [dict(id='sharetools'), dict(id='reddit')]
+    html2lrf_options = ['--base-font-size', '8']
+    feeds = [
+             (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), 
+             (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), 
+             (u'Health', u'http://rss.sciam.com/sciam/health'), 
+             (u'Space', u'http://rss.sciam.com/sciam/space'), 
+             (u'Technology', u'http://rss.sciam.com/sciam/technology'), 
+             (u'Biology', u'http://rss.sciam.com/sciam/biology'), 
+             (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), 
+             (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), 
+             (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), 
+             (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), 
+             (u'Math', u'http://rss.sciam.com/sciam/math'), 
+             (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), 
+             (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), 
+             (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
+            ]