From f3eebb473f6b46e6b895f37808fd58f14f2e3d74 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Sep 2008 18:18:32 -0700 Subject: [PATCH] IGN:... --- src/calibre/ebooks/epub/from_html.py | 31 ++++++++++++++- src/calibre/ebooks/html.py | 17 +++++---- src/calibre/manual/faq.rst | 36 ++++++++++++++++-- .../web/feeds/recipes/discover_magazine.py | 33 ++++++++++++++++ .../web/feeds/recipes/scientific_american.py | 38 +++++++++++++++++++ 5 files changed, 144 insertions(+), 11 deletions(-) create mode 100644 src/calibre/web/feeds/recipes/discover_magazine.py create mode 100644 src/calibre/web/feeds/recipes/scientific_american.py diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index dc4489d67e..d6d46476df 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -1,8 +1,37 @@ from __future__ import with_statement - __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' + +''' +Conversion of HTML/OPF files follows several stages: + + * All links in the HTML files or in the OPF manifest are + followed to build up a list of HTML files to be converted. + This stage is implemented by + :func:`calibre.ebooks.html.traverse` and + :class:`calibre.ebooks.html.HTMLFile`. + + * The HTML is pre-processed to make it more semantic. + All links in the HTML files to other resources like images, + stylesheets, etc. are relativized. The resources are copied + into the `resources` sub directory. This is accomplished by + :class:`calibre.ebooks.html.PreProcessor` and + :class:`calibre.ebooks.html.Parser`. + + * The HTML is processed. Various operations are performed. + All style declarations are extracted and consolidated into + a single style sheet. Chapters are auto-detected and marked. + Various font related manipulations are performed. See + :class:`HTMLProcessor`. 
+ + * The processed HTML is saved and the + :mod:`calibre.ebooks.epub.split` module is used to split up + large HTML files into smaller chunks. + + * The EPUB container is created. +''' + import os, sys, re, cStringIO from lxml.etree import XPath diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 38f2157b07..575a2ac82d 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -27,9 +27,9 @@ from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryF from calibre.utils.zipfile import ZipFile def tostring(root, pretty_print=False): - return html.tostring(root, encoding='utf-8', method='xml', + return html.tostring(root, encoding='utf-8', method='xml', pretty_print=pretty_print, - include_meta_content_type=True) + include_meta_content_type=True) class Link(object): @@ -337,7 +337,7 @@ class Parser(PreProcessor, LoggingInterface): if self.root.get(bad, None) is not None: self.root.attrib.pop(bad) - def save_path(self): + def save_path(self): return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]) def save(self): @@ -463,6 +463,9 @@ class Processor(Parser): return Parser.save(self) def populate_toc(self, toc): + ''' + Populate the Table of Contents from detected chapters and links. 
+ ''' def add_item(href, fragment, text, target, type='link'): for entry in toc.flat(): @@ -602,9 +605,9 @@ class Processor(Parser): def do_layout(self): self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt}\n' - self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right) + self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right) -def config(defaults=None, config_name='html', +def config(defaults=None, config_name='html', desc=_('Options to control the traversal of HTML')): if defaults is None: c = Config(config_name, desc) @@ -613,7 +616,7 @@ def config(defaults=None, config_name='html', c.add_opt('output', ['-o', '--output'], default=None, help=_('The output directory. Default is the current directory.')) - c.add_opt('encoding', ['--encoding'], default=None, + c.add_opt('encoding', ['--encoding'], default=None, help=_('Character encoding for HTML files. Default is to auto detect.')) c.add_opt('zip', ['--zip'], default=False, help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.')) @@ -674,7 +677,7 @@ def get_filelist(htmlfile, opts): except: pass if not filelist: - filelist = traverse(htmlfile, max_levels=int(opts.max_levels), + filelist = traverse(htmlfile, max_levels=int(opts.max_levels), verbose=opts.verbose, encoding=opts.encoding)\ [0 if opts.breadth_first else 1] if opts.verbose: diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 686d06dca6..0f723dd201 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -17,11 +17,41 @@ E-book Format Conversion What formats does |app| support conversion to/from? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -|app| supports the conversion of the following formats to LRF: HTML, LIT, MOBI, PRC, EPUB, CBR, CBZ, RTF, TXT, PDF and LRS. It also supports the conversion of LRF to LRS and HTML(forthcoming). Note that calibre does not support the conversion of DRMed ebooks. +|app| supports the conversion of the following formats: -What are the best formats to convert to LRF? ++----------------------------+------------------------------------------+ +| | **Output formats** | +| +------------------+-----------------------+ +| | EPUB | LRF | ++===================+========+==================+=======================+ +| | MOBI | ✔ | ✔ | +| | | | | +| | LIT | ✔ | ✔ | +| | | | | +| | PRC | ✔ | ✔ | +| | | | | +| | EPUB | ✔ | ✔ | +| | | | | +| | HTML | ✔ | ✔ | +| | | | | +| **Input formats** | CBR | ✔ | ✔ | +| | | | | +| | CBZ | ✔ | ✔ | +| | | | | +| | RTF | ✔ | ✔ | +| | | | | +| | TXT | ✔ | ✔ | +| | | | | +| | PDF | ✔ | ✔ | +| | | | | +| | LRS | | ✔ | ++-------------------+--------+------------------+-----------------------+ + + + +What are the best source formats to convert? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In order of decreasing preference: LIT, MOBI, HTML, PRC, RTF, TXT, PDF +In order of decreasing preference: LIT, MOBI, EPUB, HTML, PRC, RTF, TXT, PDF Why does the PDF conversion lose some images/tables? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/calibre/web/feeds/recipes/discover_magazine.py b/src/calibre/web/feeds/recipes/discover_magazine.py new file mode 100644 index 0000000000..0e3753834b --- /dev/null +++ b/src/calibre/web/feeds/recipes/discover_magazine.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +discovermagazine.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class DiscoverMagazine(BasicNewsRecipe): + title = u'Discover Magazine' + description = u'Science, Technology and the Future' + __author__ = 'Mike Diaz' + oldest_article = 33 + max_articles_per_feed = 20 + feeds = [ + (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'), + (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'), + (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'), + (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'), + (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'), + (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'), + (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'), + (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'), + (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'), + (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'), + (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'), + (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'), + (u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'), + (u'Science Not Fiction', u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php') + ] \ No newline at end of file 
diff --git a/src/calibre/web/feeds/recipes/scientific_american.py b/src/calibre/web/feeds/recipes/scientific_american.py new file mode 100644 index 0000000000..6c6c679bc5 --- /dev/null +++ b/src/calibre/web/feeds/recipes/scientific_american.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +sciam.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ScientificAmerican(BasicNewsRecipe): + title = u'Scientific American' + description = u'Popular science' + __author__ = 'Kovid Goyal' + oldest_article = 30 + max_articles_per_feed = 100 + use_embedded_content = False + remove_tags_before = dict(name='div', attrs={'class':'headline'}) + remove_tags_after = dict(id='article') + remove_tags = [dict(id='sharetools'), dict(id='reddit')] + html2lrf_options = ['--base-font-size', '8'] + feeds = [ + (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), + (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), + (u'Health', u'http://rss.sciam.com/sciam/health'), + (u'Space', u'http://rss.sciam.com/sciam/space'), + (u'Technology', u'http://rss.sciam.com/sciam/technology'), + (u'Biology', u'http://rss.sciam.com/sciam/biology'), + (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), + (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), + (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), + (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), + (u'Math', u'http://rss.sciam.com/sciam/math'), + (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), + (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), + (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog') + ]