From bc115198c765671d11a165be5b5dd9b8b049b6e8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 12 Sep 2008 23:56:58 -0700
Subject: [PATCH] IGN:...

---
 src/calibre/ebooks/epub/from_html.py | 28 +++++++++++++++++++++++++---
 src/calibre/ebooks/html.py           |  1 +
 src/calibre/ebooks/metadata/toc.py   |  2 +-
 3 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index 525b78772f..59cd871dc7 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -5,6 +5,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 import os, sys, re, shutil, cStringIO
 from lxml.etree import XPath
+from lxml import etree
 
 from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist,\
     opf_traverse, create_metadata, rebase_toc
@@ -15,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation
 
 class HTMLProcessor(Parser):
     
-    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
+    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, toc=None):
         Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, 
                         name='html2epub')
         if opts.verbose > 2:
@@ -26,6 +27,9 @@ class HTMLProcessor(Parser):
         if opts.verbose > 2:
             self.debug_tree('nocss')
         
+        if toc is not None:
+            self.populate_toc(toc)
+        
         self.collect_font_statistics()
         
         self.split()
@@ -37,6 +41,23 @@ class HTMLProcessor(Parser):
             style += ';page-break-before: always'
             elem.set(style, style)
         
+    def save(self):  # Append self.css as a <style type="text/css"> element, then call Parser.save
+        head = self.root.xpath('//head')
+        if head:
+            head = head[0]  # use the first <head> match
+        else:
+            head = self.root.xpath('//body')  # no <head>: fall back to the first <body> match
+            head = head[0] if head else self.root  # no <body> either: attach to the root element
+        style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
+        style.text='\n'+self.css
+        style.tail = '\n\n'
+        Parser.save(self)
+    
+    def populate_toc(self, toc):  # NOTE: stub in this patch -- only the guard below exists; toc is never modified
+        if self.level >= self.opts.max_toc_recursion:  # stop once self.level reaches opts.max_toc_recursion
+            return
+        
+        
     def collect_font_statistics(self):
         '''
         Collect font statistics to figure out the base font size used in this
@@ -50,8 +71,9 @@ class HTMLProcessor(Parser):
     
     def split(self):
         ''' Split into individual flows to accommodate Adobe's incompetence '''
-        # TODO: Split on page breaks, keeping track of anchors (a.name and id)
-        # and preserving tree structure so that CSS continues to apply
+        # TODO: Only split files larger than 300K (as specified in the profile).
+        # Split on page breaks first, then on <h1>-<h6> tags, then on
+        # <div> and finally on <p>.
         pass
             
 
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index 3e3531697b..be3f7201e3 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface):
             css.append('#%s {%s}'%(id, 'page-break-before:always'))
                      
         self.raw_css = '\n\n'.join(css)
+        self.css = unicode(self.raw_css)
         # TODO: Figure out what to do about CSS imports from linked stylesheets    
 
 def config(defaults=None, config_name='html', 
diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py
index 25f7ca7010..cd28b9799e 100644
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import os, glob, sys
+import os, glob
 from urlparse import urlparse
 from urllib import unquote