From bc115198c765671d11a165be5b5dd9b8b049b6e8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 12 Sep 2008 23:56:58 -0700 Subject: [PATCH] IGN:... --- src/calibre/ebooks/epub/from_html.py | 28 +++++++++++++++++++++++++--- src/calibre/ebooks/html.py | 1 + src/calibre/ebooks/metadata/toc.py | 2 +- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 525b78772f..59cd871dc7 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -5,6 +5,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' import os, sys, re, shutil, cStringIO from lxml.etree import XPath +from lxml import etree from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist,\ opf_traverse, create_metadata, rebase_toc @@ -15,7 +16,7 @@ from calibre.ebooks.metadata import MetaInformation class HTMLProcessor(Parser): - def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles): + def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, toc=None): Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='html2epub') if opts.verbose > 2: @@ -26,6 +27,9 @@ class HTMLProcessor(Parser): if opts.verbose > 2: self.debug_tree('nocss') + if toc is not None: + self.populate_toc(toc) + self.collect_font_statistics() self.split() @@ -37,6 +41,23 @@ class HTMLProcessor(Parser): style += ';page-break-before: always' elem.set(style, style) + def save(self): + head = self.root.xpath('//head') + if head: + head = head[0] + else: + head = self.root.xpath('//body') + head = head[0] if head else self.root + style = etree.SubElement(head, 'style', attrib={'type':'text/css'}) + style.text='\n'+self.css + style.tail = '\n\n' + Parser.save(self) + + def populate_toc(self, toc): + if self.level >= self.opts.max_toc_recursion: + return + + def collect_font_statistics(self): ''' Collect font statistics 
to figure out the base font size used in this @@ -50,8 +71,9 @@ class HTMLProcessor(Parser): def split(self): ''' Split into individual flows to accommodate Adobe's incompetence ''' - # TODO: Split on page breaks, keeping track of anchors (a.name and id) - # and preserving tree structure so that CSS continues to apply + # TODO: Only split file larger than 300K (as specified in profile) + # Split on page breaks first and then on <h1-6> tags and then on + # <div> and finally on <p>. pass diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 3e3531697b..be3f7201e3 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -447,6 +447,7 @@ class Parser(PreProcessor, LoggingInterface): css.append('#%s {%s}'%(id, 'page-break-before:always')) self.raw_css = '\n\n'.join(css) + self.css = unicode(self.raw_css) # TODO: Figure out what to do about CSS imports from linked stylesheets def config(defaults=None, config_name='html', diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 25f7ca7010..cd28b9799e 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -1,7 +1,7 @@ #!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os, glob, sys +import os, glob from urlparse import urlparse from urllib import unquote