diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 6530e5f16c..c531a15e34 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin): def convert(self, stream, options, file_ext, log, accelerators): ''' This method must be implemented in sub-classes. It must return - the path to the created OPF file. All output should be contained in - the current directory. If this plugin creates files outside the current + the path to the created OPF file or an :class:`OEBBook` instance. + All output should be contained in the current directory. + If this plugin creates files outside the current directory they must be deleted/marked for deletion before this method returns. diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 6142cb555a..41d5f0abd9 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -299,21 +299,15 @@ OptionRecommendation(name='language', # Create an OEBBook from the input file. The input plugin does all the # heavy lifting. - from calibre.ebooks.oeb.reader import OEBReader - from calibre.ebooks.oeb.base import OEBBook accelerators = {} tdir = PersistentTemporaryDirectory('_plumber') - opfpath = self.input_plugin(open(self.input, 'rb'), self.opts, + self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts, self.input_fmt, self.log, accelerators, tdir) - html_preprocessor = HTMLPreProcessor() - self.reader = OEBReader() - self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor) - # Read OEB Book into OEBBook - self.log.info('Parsing all content...') - self.reader(self.oeb, opfpath) + if not hasattr(self.oeb, 'manifest'): + self.oeb = create_oebbook(self.log, self.oeb) self.opts.source = self.opts.input_profile self.opts.dest = self.opts.output_profile @@ -340,7 +334,20 @@ OptionRecommendation(name='language', trimmer(self.oeb, self.opts) self.log.info('Creating %s...'%self.output_plugin.name) - self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts, - self.log) + self.output_plugin.convert(self.oeb, self.output, self.input_plugin, + self.opts, self.log) +def create_oebbook(log, opfpath): + ''' + Create an OEBBook from an OPF file. + ''' + from calibre.ebooks.oeb.reader import OEBReader + from calibre.ebooks.oeb.base import OEBBook + html_preprocessor = HTMLPreProcessor() + reader = OEBReader() + oeb = OEBBook(log, html_preprocessor=html_preprocessor) + # Read OEB Book into OEBBook + log.info('Parsing all content...') + reader(oeb, opfpath) + return oeb diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 0be88da070..2bc076a8ad 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -10,23 +10,23 @@ import sys, textwrap, re, os, uuid from itertools import cycle from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED -from calibre.ebooks.html import config as common_config, tostring +from calibre.ebooks.html import tostring from lxml import etree class DefaultProfile(object): - + flow_size = sys.maxint screen_size = None remove_special_chars = False remove_object_tags = False - + class PRS505(DefaultProfile): - + flow_size = 270000 screen_size = (590, 765) remove_special_chars = re.compile(u'[\u200b\u00ad]') remove_object_tags = True - + PROFILES = { 'PRS505' : PRS505, @@ -64,11 +64,11 @@ def config(defaults=None, name='epub'): c = Config(name, desc) else: c = StringConfig(defaults, desc) - + c.update(common_config()) c.remove_opt('output') c.remove_opt('zip') - + c.add_opt('output', ['-o', '--output'], default=None, help=_('The output EPUB file. If not specified, it is ' 'derived from the input file name.')) @@ -81,22 +81,22 @@ def config(defaults=None, name='epub'): help=_('Either the path to a CSS stylesheet or raw CSS. ' 'This CSS will override any existing CSS ' 'declarations in the source files.')) - structure = c.add_group('structure detection', + structure = c.add_group('structure detection', _('Control auto-detection of document structure.')) - structure('chapter', ['--chapter'], + structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and " "re:test(., 'chapter|book|section|part', 'i')] | " "//*[@class = 'chapter']", help=_('''\ An XPath expression to detect chapter titles. The default is to consider

or -

tags that contain the words "chapter","book","section" or "part" as chapter titles as -well as any tags that have class="chapter". +

tags that contain the words "chapter","book","section" or "part" as chapter titles as +well as any tags that have class="chapter". The expression used must evaluate to a list of elements. To disable chapter detection, use the expression "/". See the XPath Tutorial in the calibre User Manual for further help on using this feature. ''').replace('\n', ' ')) structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'], - default='pagebreak', + default='pagebreak', help=_('Specify how to mark detected chapters. A value of ' '"pagebreak" will insert page breaks before chapters. ' 'A value of "rule" will insert a line before chapters. ' @@ -129,13 +129,13 @@ help on using this feature. help=_('XPath expression to find the name of each page in the ' 'pagination map relative to its boundary element. ' 'Default is to number all pages staring with 1.')) - toc = c.add_group('toc', + toc = c.add_group('toc', _('''\ Control the automatic generation of a Table of Contents. If an OPF file is detected and it specifies a Table of Contents, then that will be used rather than trying to auto-generate a Table of Contents. ''').replace('\n', ' ')) - toc('max_toc_links', ['--max-toc-links'], default=50, + toc('max_toc_links', ['--max-toc-links'], default=50, help=_('Maximum number of links to insert into the TOC. Set to 0 ' 'to disable. Default is: %default. Links are only added to the ' 'TOC if less than the --toc-threshold number of chapters were detected.')) @@ -166,15 +166,15 @@ to auto-generate a Table of Contents. help=_('Normally, if the source file already has a Table of Contents, ' 'it is used in preference to the auto-generated one. ' 'With this option, the auto-generated one is always used.')) - + layout = c.add_group('page layout', _('Control page layout')) - layout('margin_top', ['--margin-top'], default=5.0, + layout('margin_top', ['--margin-top'], default=5.0, help=_('Set the top margin in pts. Default is %default')) - layout('margin_bottom', ['--margin-bottom'], default=5.0, + layout('margin_bottom', ['--margin-bottom'], default=5.0, help=_('Set the bottom margin in pts. Default is %default')) - layout('margin_left', ['--margin-left'], default=5.0, + layout('margin_left', ['--margin-left'], default=5.0, help=_('Set the left margin in pts. Default is %default')) - layout('margin_right', ['--margin-right'], default=5.0, + layout('margin_right', ['--margin-right'], default=5.0, help=_('Set the right margin in pts. Default is %default')) layout('base_font_size2', ['--base-font-size'], default=12.0, help=_('The base font size in pts. Default is %defaultpt. ' @@ -195,12 +195,12 @@ to auto-generate a Table of Contents. 'This is only neccessary if the HTML files contain CSS that ' 'uses sibling selectors. Enabling this greatly slows down ' 'processing of large HTML files.')) - + c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', help=_('Print generated OPF file to stdout')) c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', help=_('Print generated NCX file to stdout')) - c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', + c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', default=False, help=_('Keep intermediate files during processing by html2epub')) c.add_opt('extract_to', ['--extract-to'], group='debug', default=None, diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py index 5d0887f2d0..67e6066ed1 100644 --- a/src/calibre/ebooks/epub/fonts.py +++ b/src/calibre/ebooks/epub/fonts.py @@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector from lxml import etree from lxml.html import HtmlElement -from calibre.ebooks.html import fromstring +from calibre.ebooks.html_old import fromstring from calibre.ebooks.epub import rules from cssutils import CSSParser @@ -24,7 +24,7 @@ absolute_size = r'(?P(x?x-)?(small|large)|medium)' relative_size = r'(?Psmaller|larger)' font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I) -line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) +line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num)) PTU = { 'in' : 72., @@ -37,12 +37,12 @@ PTU = { DEFAULT_FONT_SIZE = 12 class Rationalizer(object): - + @classmethod def specificity(cls, s): '''Map CSS specificity tuple to a single integer''' - return sum([10**(4-i) + x for i,x in enumerate(s)]) - + return sum([10**(4-i) + x for i,x in enumerate(s)]) + @classmethod def compute_font_size(cls, elem): ''' @@ -59,7 +59,7 @@ class Rationalizer(object): elem.computed_font_size = sfs(parent.computed_font_size) else: elem.computed_font_size = sfs - + @classmethod def calculate_font_size(cls, style): 'Return font size in pts from style object. For relative units returns a callable' @@ -69,7 +69,7 @@ class Rationalizer(object): fs = match.group() if style.fontSize: fs = style.fontSize - + match = font_size_pat.search(fs) if match is None: return None @@ -89,8 +89,8 @@ class Rationalizer(object): return 12 * x if match.get('zero', False): return 0. - return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) - + return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8) + @classmethod def resolve_rules(cls, stylesheets): for sheet in stylesheets: @@ -104,12 +104,12 @@ class Rationalizer(object): if font_size is not None: for s in r.selectorList: sheet.fs_rules.append([CSSSelector(s.selectorText), font_size]) - orig = line_height_pat.search(r.style.lineHeight) + orig = line_height_pat.search(r.style.lineHeight) if orig is not None: for s in r.selectorList: sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]]) - - + + @classmethod def apply_font_size_rules(cls, stylesheets, root): 'Add a ``specified_font_size`` attribute to every element that has a specified font size' @@ -119,7 +119,7 @@ class Rationalizer(object): elems = selector(root) for elem in elems: elem.specified_font_size = font_size - + @classmethod def remove_font_size_information(cls, stylesheets): for r in rules(stylesheets): @@ -134,17 +134,17 @@ class Rationalizer(object): r.style.removeProperty('font') if line_height_pat.search(r.style.lineHeight) is not None: r.style.removeProperty('line-height') - + @classmethod def compute_font_sizes(cls, root, stylesheets, base=12): stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')] cls.apply_font_size_rules(stylesheets, root) - + # Compute the effective font size of all tags root.computed_font_size = DEFAULT_FONT_SIZE for elem in root.iter(etree.Element): cls.compute_font_size(elem) - + extra_css = {} if base > 0: # Calculate the "base" (i.e. most common) font size @@ -157,20 +157,20 @@ class Rationalizer(object): if t: t = t.strip() if t: font_sizes[elem.computed_font_size] += len(t) - + t = getattr(elem, 'tail', '') if t: t = t.strip() if t: parent = elem.getparent() if parent.tag not in IGNORE: font_sizes[parent.computed_font_size] += len(t) - + try: most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0] scale = base/most_common if most_common > 0 else 1. except ValueError: scale = 1. - + # rescale absolute line-heights counter = 0 for sheet in stylesheets: @@ -181,17 +181,17 @@ class Rationalizer(object): if not extra_css.has_key(elem.get('id')): extra_css[elem.get('id')] = [] extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale)) - - - + + + # Rescale all computed font sizes for elem in body.iter(etree.Element): if isinstance(elem, HtmlElement): elem.computed_font_size *= scale - - # Remove all font size specifications from the last stylesheet + + # Remove all font size specifications from the last stylesheet cls.remove_font_size_information(stylesheets[-1:]) - + # Create the CSS to implement the rescaled font sizes for elem in body.iter(etree.Element): cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent())) @@ -201,12 +201,12 @@ class Rationalizer(object): if not extra_css.has_key(elem.get('id')): extra_css[elem.get('id')] = [] extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs))) - + css = CSSParser(loglevel=logging.ERROR).parseString('') for id, r in extra_css.items(): css.add('#%s {%s}'%(id, ';'.join(r))) return css - + @classmethod def rationalize(cls, stylesheets, root, opts): logger = logging.getLogger('html2epub') @@ -229,7 +229,7 @@ class Rationalizer(object): ################################################################################ class FontTest(unittest.TestCase): - + def setUp(self): from calibre.ebooks.epub import config self.opts = config(defaults='').parse() @@ -246,10 +246,10 @@ class FontTest(unittest.TestCase):

Some other text.

The longest piece of single font size text in this entire file. Used to test resizing.

- + ''' self.root = fromstring(self.html) - + def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): root1 = copy.deepcopy(self.root) root1.computed_font_size = DEFAULT_FONT_SIZE @@ -262,39 +262,39 @@ class FontTest(unittest.TestCase): for elem in root2.iter(etree.Element): Rationalizer.compute_font_size(elem) for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): - self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, + self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\ (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size)) return stylesheet2.cssText - + def testStripping(self): 'Test that any original entries are removed from the CSS' css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }' css = CSSParser(loglevel=logging.ERROR).parseString(css) Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css]) - self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), + self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), 'p{font:bolditalic}') - + def testIdentity(self): 'Test that no unnecessary font size changes are made' extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}') self.assertEqual(extra_css.strip(), '') - + def testRelativization(self): 'Test conversion of absolute to relative sizes' self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}') - + def testResizing(self): 'Test resizing of fonts' self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}') - + def suite(): return unittest.TestLoader().loadTestsFromTestCase(FontTest) - + def test(): unittest.TextTestRunner(verbosity=2).run(suite()) if __name__ == '__main__': - sys.exit(test()) - \ No newline at end of file + sys.exit(test()) + diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 318cf5cc02..0ce4629062 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -38,7 +38,7 @@ from lxml.etree import XPath from lxml import html, etree from PyQt4.Qt import QApplication, QPixmap -from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\ +from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\ opf_traverse, create_metadata, rebase_toc, Link, parser from calibre.ebooks.epub import config as common_config, tostring from calibre.ptempfile import TemporaryDirectory diff --git a/src/calibre/ebooks/epub/iterator.py b/src/calibre/ebooks/epub/iterator.py index e55d402bef..5d47c93ea3 100644 --- a/src/calibre/ebooks/epub/iterator.py +++ b/src/calibre/ebooks/epub/iterator.py @@ -16,7 +16,7 @@ from calibre.ebooks.epub import config from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.html import create_dir +from calibre.ebooks.html_old import create_dir from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.config import DynamicConfig diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py index c39fe6d181..8ff62a1c4b 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/epub/split.py @@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en' Split the flows in an epub file to conform to size limitations. ''' -import os, math, logging, functools, collections, re, copy, sys +import os, math, functools, collections, re, copy, sys from lxml.etree import XPath as _XPath from lxml import etree, html @@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' class SplitError(ValueError): - + def __init__(self, path, root): size = len(tostring(root))/1024. - ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% + ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% (os.path.basename(path), size)) - + class Splitter(object): - + def __init__(self, path, opts, stylesheet_map, opf): self.setup_cli_handler(opts.verbose) self.path = path @@ -44,10 +44,10 @@ class Splitter(object): self.orig_size = os.stat(content(path)).st_size self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) root = html.fromstring(open(content(path)).read()) - + self.page_breaks, self.trees = [], [] self.split_size = 0 - + # Split on page breaks self.splitting_on_page_breaks = True if not opts.dont_split_on_page_breaks: @@ -59,7 +59,7 @@ class Splitter(object): else: self.trees = [root.getroottree()] trees = list(self.trees) - + # Split any remaining over-sized trees self.splitting_on_page_breaks = False if self.opts.profile.flow_size < sys.maxint: @@ -67,7 +67,7 @@ class Splitter(object): self.log_info('\tLooking for large trees...') for i, tree in enumerate(list(trees)): self.trees = [] - size = len(tostring(tree.getroot())) + size = len(tostring(tree.getroot())) if size > self.opts.profile.flow_size: lt_found = True try: @@ -81,7 +81,7 @@ class Splitter(object): trees[i:i+1] = list(self.trees) if not lt_found: self.log_info('\tNo large trees found') - + self.trees = trees self.was_split = len(self.trees) > 1 if self.was_split: @@ -91,17 +91,17 @@ class Splitter(object): for f in self.files: self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.) self.fix_opf(opf) - + self.trees = None - - + + def split_text(self, text, root, size): self.log_debug('\t\t\tSplitting text of length: %d'%len(text)) rest = text.replace('\r', '') parts = re.split('\n\n', rest) self.log_debug('\t\t\t\tFound %d parts'%len(parts)) if max(map(len, parts)) > size: - raise SplitError('Cannot split as file contains a
 tag with a very large paragraph', root) 
+            raise SplitError('Cannot split as file contains a 
 tag with a very large paragraph', root)
         ans = []
         buf = ''
         for part in parts:
@@ -111,8 +111,8 @@ class Splitter(object):
                 ans.append(buf)
                 buf = part
         return ans
-            
-    
+
+
     def split_to_size(self, tree):
         self.log_debug('\t\tSplitting...')
         root = tree.getroot()
@@ -134,7 +134,7 @@ class Splitter(object):
                 p = pre.getparent()
                 i = p.index(pre)
                 p[i:i+1] = new_pres
-        
+
         split_point, before = self.find_split_point(root)
         if split_point is None or self.split_size > 6*self.orig_size:
             if not self.always_remove:
@@ -142,7 +142,7 @@ class Splitter(object):
                                 'structure preservation. This may cause '
                                 'incorrect rendering.'))
             raise SplitError(self.path, root)
-        
+
         for t in self.do_split(tree, split_point, before):
             r = t.getroot()
             if self.is_page_empty(r):
@@ -151,12 +151,12 @@ class Splitter(object):
             if size <= self.opts.profile.flow_size:
                 self.trees.append(t)
                 #print tostring(t.getroot(), pretty_print=True)
-                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', 
+                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
                                len(self.trees), size/1024.)
                 self.split_size += size
             else:
                 self.split_to_size(t)
-    
+
     def is_page_empty(self, root):
         body = root.find('body')
         if body is None:
@@ -170,14 +170,14 @@ class Splitter(object):
             if img.get('style', '') != 'display:none':
                 return False
         return True
-                
+
     def do_split(self, tree, split_point, before):
         '''
-        Split ``tree`` into a *before* and *after* tree at ``split_point``, 
-        preserving tag structure, but not duplicating any text. 
+        Split ``tree`` into a *before* and *after* tree at ``split_point``,
+        preserving tag structure, but not duplicating any text.
         All tags that have had their text and tail
         removed have the attribute ``calibre_split`` set to 1.
-        
+
         :param before: If True tree is split before split_point, otherwise after split_point
         :return: before_tree, after_tree
         '''
@@ -188,7 +188,7 @@ class Splitter(object):
         body, body2  = root.body, root2.body
         split_point  = root.xpath(path)[0]
         split_point2 = root2.xpath(path)[0]
-        
+
         def nix_element(elem, top=True):
             if self.always_remove:
                 parent = elem.getparent()
@@ -198,18 +198,18 @@ class Splitter(object):
                 else:
                     index = parent.index(elem)
                     parent[index:index+1] = list(elem.iterchildren())
-                
+
             else:
                 elem.text = u''
                 elem.tail = u''
                 elem.set(SPLIT_ATTR, '1')
                 if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
                     elem.set('style', 'display:none')
-        
+
         def fix_split_point(sp):
             if not self.splitting_on_page_breaks:
-                sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid') 
-        
+                sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
+
         # Tree 1
         hit_split_point = False
         for elem in list(body.iterdescendants(etree.Element)):
@@ -223,8 +223,8 @@ class Splitter(object):
                 continue
             if hit_split_point:
                 nix_element(elem)
-            
-            
+
+
         # Tree 2
         hit_split_point = False
         for elem in list(body2.iterdescendants(etree.Element)):
@@ -238,17 +238,17 @@ class Splitter(object):
                 continue
             if not hit_split_point:
                 nix_element(elem, top=False)
-        
+
         return tree, tree2
-                
-    
+
+
     def split_on_page_breaks(self, orig_tree):
         ordered_ids = []
         for elem in orig_tree.xpath('//*[@id]'):
             id = elem.get('id')
             if id in self.page_break_ids:
                 ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
-                
+
         self.trees = []
         tree = orig_tree
         for pattern, before in ordered_ids:
@@ -260,13 +260,13 @@ class Splitter(object):
                 tree = after
         self.trees.append(tree)
         self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
-                
-            
-                
+
+
+
     def find_page_breaks(self, stylesheets, root):
         '''
         Find all elements that have either page-break-before or page-break-after set.
-        Populates `self.page_breaks` with id based XPath selectors (for elements that don't 
+        Populates `self.page_breaks` with id based XPath selectors (for elements that don't
         have ids, an id is created).
         '''
         page_break_selectors = set([])
@@ -283,16 +283,16 @@ class Splitter(object):
                     page_break_selectors.add((CSSSelector(rule.selectorText), False))
             except:
                 pass
-            
+
         page_breaks = set([])
         for selector, before in page_break_selectors:
             for elem in selector(root):
                 elem.pb_before = before
                 page_breaks.add(elem)
-                
+
         for i, elem in enumerate(root.iter()):
             elem.pb_order = i
-            
+
         page_breaks = list(page_breaks)
         page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
         self.page_break_ids = []
@@ -300,12 +300,12 @@ class Splitter(object):
             x.set('id', x.get('id', 'calibre_pb_%d'%i))
             id = x.get('id')
             self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
-            self.page_break_ids.append(id)                        
-        
-        
+            self.page_break_ids.append(id)
+
+
     def find_split_point(self, root):
         '''
-        Find the tag at which to split the tree rooted at `root`. 
+        Find the tag at which to split the tree rooted at `root`.
         Search order is:
             * Heading tags
             * 
tags @@ -314,7 +314,7 @@ class Splitter(object): *

tags *
tags *

  • tags - + We try to split in the "middle" of the file (as defined by tag counts. ''' def pick_elem(elems): @@ -325,18 +325,18 @@ class Splitter(object): i = int(math.floor(len(elems)/2.)) elems[i].set(SPLIT_POINT_ATTR, '1') return elems[i] - + for path in ( - '//*[re:match(name(), "h[1-6]", "i")]', + '//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//pre', - '//hr', + '//hr', '//p', '//div', '//br', '//li', ): - elems = root.xpath(path, + elems = root.xpath(path, namespaces={'re':'http://exslt.org/regular-expressions'}) elem = pick_elem(elems) if elem is not None: @@ -345,9 +345,9 @@ class Splitter(object): except: continue return elem, True - + return None, True - + def commit(self): ''' Commit all changes caused by the split. This removes the previously @@ -357,7 +357,7 @@ class Splitter(object): ''' self.anchor_map = collections.defaultdict(lambda :self.base%0) self.files = [] - + for i, tree in enumerate(self.trees): root = tree.getroot() self.files.append(self.base%i) @@ -367,7 +367,7 @@ class Splitter(object): for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)): elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_POINT_ATTR, '0') - + for current, tree in zip(self.files, self.trees): for a in tree.getroot().xpath('//a[@href]'): href = a.get('href').strip() @@ -375,10 +375,10 @@ class Splitter(object): anchor = href[1:] file = self.anchor_map[anchor] if file != current: - a.set('href', file+href) + a.set('href', file+href) open(content(current), 'wb').\ write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print)) - + os.remove(content(self.path)) @@ -391,12 +391,12 @@ class Splitter(object): id_map = {} for item in items: id_map[item.get('id')] = opf.replace_manifest_item(item, new_items) - + for id in id_map.keys(): opf.replace_spine_items_by_idref(id, id_map[id]) - + for ref in opf.iterguide(): - href = ref.get('href', '') + href = ref.get('href', '') if href.startswith('content/'+self.path): href = href.split('#') frag = None @@ -408,8 +408,8 @@ class Splitter(object): new_file = self.anchor_map[frag] ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag))) - - + + def fix_content_links(html_files, changes, opts): split_files = [f.path for f in changes] anchor_maps = [f.anchor_map for f in changes] @@ -420,7 +420,7 @@ def fix_content_links(html_files, changes, opts): files[i:i+1] = changes[j].files except ValueError: continue - + for htmlfile in files: changed = False root = html.fromstring(open(content(htmlfile), 'rb').read()) @@ -439,7 +439,7 @@ def fix_content_links(html_files, changes, opts): frag = ('#'+anchor) if anchor else '' a.set('href', newf+frag) changed = True - + if changed: open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print)) @@ -448,7 +448,7 @@ def fix_ncx(path, changes): anchor_maps = [f.anchor_map for f in changes] tree = etree.parse(path) changed = False - for content in tree.getroot().xpath('//x:content[@src]', + for content in tree.getroot().xpath('//x:content[@src]', namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}): href = content.get('src') if not href.startswith('#'): @@ -481,21 +481,21 @@ def find_html_files(opf): if os.path.exists(content(f)): html_files.append(f) return html_files - + def split(pathtoopf, opts, stylesheet_map): pathtoopf = os.path.abspath(pathtoopf) opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) - + with CurrentDir(os.path.dirname(pathtoopf)): html_files = find_html_files(opf) changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files] changes = [c for c in changes if c.was_split] - + fix_content_links(html_files, changes, opts) for item in opf.itermanifest(): if item.get('media-type', '') == 'application/x-dtbncx+xml': fix_ncx(item.get('href'), changes) - break + break open(pathtoopf, 'wb').write(opf.render()) diff --git a/src/calibre/ebooks/html/__init__.py b/src/calibre/ebooks/html/__init__.py new file mode 100644 index 0000000000..9a8f8e2d20 --- /dev/null +++ b/src/calibre/ebooks/html/__init__.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re + +from lxml.etree import tostring as _tostring + +def tostring(root, strip_comments=False, pretty_print=False): + ''' + Serialize processed XHTML. + ''' + root.set('xmlns', 'http://www.w3.org/1999/xhtml') + root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink') + for x in root.iter(): + if x.tag.rpartition('}')[-1].lower() == 'svg': + x.set('xmlns', 'http://www.w3.org/2000/svg') + + ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print) + if strip_comments: + ans = re.compile(r'', re.DOTALL).sub('', ans) + ans = '\n'+ans + + return ans + + diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py new file mode 100644 index 0000000000..dd9aa0285c --- /dev/null +++ b/src/calibre/ebooks/html/input.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +''' +Input plugin for HTML or OPF ebooks. +''' + +import os, re, sys, cStringIO +from urlparse import urlparse, urlunparse +from urllib import unquote + +from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.metadata.meta import get_metadata +from calibre.ebooks.metadata.opf2 import OPF, OPFCreator +from calibre.ebooks.metadata import MetaInformation +from calibre.ebooks.chardet import xml_to_unicode +from calibre.customize.conversion import OptionRecommendation +from calibre import unicode_path + +class Link(object): + ''' + Represents a link in a HTML file. + ''' + + @classmethod + def url_to_local_path(cls, url, base): + path = urlunparse(('', '', url.path, url.params, url.query, '')) + path = unquote(path) + if os.path.isabs(path): + return path + return os.path.abspath(os.path.join(base, path)) + + def __init__(self, url, base): + ''' + :param url: The url this link points to. Must be an unquoted unicode string. + :param base: The base directory that relative URLs are with respect to. + Must be a unicode string. + ''' + assert isinstance(url, unicode) and isinstance(base, unicode) + self.url = url + self.parsed_url = urlparse(self.url) + self.is_local = self.parsed_url.scheme in ('', 'file') + self.is_internal = self.is_local and not bool(self.parsed_url.path) + self.path = None + self.fragment = unquote(self.parsed_url.fragment) + if self.is_local and not self.is_internal: + self.path = self.url_to_local_path(self.parsed_url, base) + + def __hash__(self): + if self.path is None: + return hash(self.url) + return hash(self.path) + + def __eq__(self, other): + return self.path == getattr(other, 'path', other) + + def __str__(self): + return u'Link: %s --> %s'%(self.url, self.path) + + +class IgnoreFile(Exception): + + def __init__(self, msg, errno): + Exception.__init__(self, msg) + self.doesnt_exist = errno == 2 + self.errno = errno + +class HTMLFile(object): + ''' + Contains basic information about an HTML file. This + includes a list of links to other files as well as + the encoding of each file. Also tries to detect if the file is not a HTML + file in which case :member:`is_binary` is set to True. + + The encoding of the file is available as :member:`encoding`. + ''' + + HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) + TITLE_PAT = re.compile('([^<>]+)', re.IGNORECASE) + LINK_PAT = re.compile( + r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))', + re.DOTALL|re.IGNORECASE) + + def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): + ''' + :param level: The level of this file. Should be 0 for the root file. + :param encoding: Use `encoding` to decode HTML. + :param referrer: The :class:`HTMLFile` that first refers to this file. + ''' + self.path = unicode_path(path_to_html_file, abs=True) + self.title = os.path.splitext(os.path.basename(self.path))[0] + self.base = os.path.dirname(self.path) + self.level = level + self.referrer = referrer + self.links = [] + + try: + with open(self.path, 'rb') as f: + src = f.read() + except IOError, err: + msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err)) + if level == 0: + raise IOError(msg) + raise IgnoreFile(msg, err.errno) + + self.is_binary = not bool(self.HTML_PAT.search(src[:1024])) + if not self.is_binary: + if encoding is None: + encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] + self.encoding = encoding + else: + self.encoding = encoding + + src = src.decode(encoding, 'replace') + match = self.TITLE_PAT.search(src) + self.title = match.group(1) if match is not None else self.title + self.find_links(src) + + + + def __eq__(self, other): + return self.path == getattr(other, 'path', other) + + def __str__(self): + return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path) + + def __repr__(self): + return str(self) + + + def find_links(self, src): + for match in self.LINK_PAT.finditer(src): + url = None + for i in ('url1', 'url2', 'url3'): + url = match.group(i) + if url: + break + link = self.resolve(url) + if link not in self.links: + self.links.append(link) + + def resolve(self, url): + return Link(url, self.base) + + +def depth_first(root, flat, visited=set([])): + yield root + visited.add(root) + for link in root.links: + if link.path is not None and link not in visited: + try: + index = flat.index(link) + except ValueError: # Can happen if max_levels is used + continue + hf = flat[index] + if hf not in visited: + yield hf + visited.add(hf) + for hf in depth_first(hf, flat, visited): + if hf not in visited: + yield hf + visited.add(hf) + + +def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None): + ''' + Recursively traverse all links in the HTML file. + + :param max_levels: Maximum levels of recursion. Must be non-negative. 0 + implies that no links in the root HTML file are followed. + :param encoding: Specify character encoding of HTML files. If `None` it is + auto-detected. + :return: A pair of lists (breadth_first, depth_first). Each list contains + :class:`HTMLFile` objects. + ''' + assert max_levels >= 0 + level = 0 + flat = [HTMLFile(path_to_html_file, level, encoding, verbose)] + next_level = list(flat) + while level < max_levels and len(next_level) > 0: + level += 1 + nl = [] + for hf in next_level: + rejects = [] + for link in hf.links: + if link.path is None or link.path in flat: + continue + try: + nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf) + if nf.is_binary: + raise IgnoreFile('%s is a binary file'%nf.path, -1) + nl.append(nf) + flat.append(nf) + except IgnoreFile, err: + rejects.append(link) + if not err.doesnt_exist or verbose > 1: + print repr(err) + for link in rejects: + hf.links.remove(link) + + next_level = list(nl) + orec = sys.getrecursionlimit() + sys.setrecursionlimit(500000) + try: + return flat, list(depth_first(flat[0], flat)) + finally: + sys.setrecursionlimit(orec) + + +def opf_traverse(opf_reader, verbose=0, encoding=None): + ''' + Return a list of :class:`HTMLFile` objects in the order specified by the + `` element of the OPF. + + :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance. + :param encoding: Specify character encoding of HTML files. If `None` it is + auto-detected. + ''' + if not opf_reader.spine: + raise ValueError('OPF does not have a spine') + flat = [] + for path in opf_reader.spine.items(): + path = os.path.abspath(path) + if path not in flat: + flat.append(os.path.abspath(path)) + for item in opf_reader.manifest: + if 'html' in item.mime_type: + path = os.path.abspath(item.path) + if path not in flat: + flat.append(path) + for i, path in enumerate(flat): + if not os.path.exists(path): + path = path.replace('&', '%26') + if os.path.exists(path): + flat[i] = path + for item in opf_reader.itermanifest(): + item.set('href', item.get('href').replace('&', '%26')) + ans = [] + for path in flat: + if os.path.exists(path): + ans.append(HTMLFile(path, 0, encoding, verbose)) + else: + print 'WARNING: OPF spine item %s does not exist'%path + ans = [f for f in ans if not f.is_binary] + return ans + +def search_for_opf(dir): + for f in os.listdir(dir): + if f.lower().endswith('.opf'): + return OPF(open(os.path.join(dir, f), 'rb'), dir) + +def get_filelist(htmlfile, dir, opts, log): + ''' + Build list of files referenced by html file or try to detect and use an + OPF file instead. + ''' + print 'Building file list...' + opf = search_for_opf(dir) + filelist = None + if opf is not None: + try: + filelist = opf_traverse(opf, verbose=opts.verbose, + encoding=opts.input_encoding) + except: + pass + if not filelist: + filelist = traverse(htmlfile, max_levels=int(opts.max_levels), + verbose=opts.verbose, + encoding=opts.input_encoding)\ + [0 if opts.breadth_first else 1] + if opts.verbose: + log.debug('\tFound files...') + for f in filelist: + log.debug('\t\t', f) + return opf, filelist + + +class HTMLInput(InputFormatPlugin): + + name = 'HTML Input' + author = 'Kovid Goyal' + description = 'Convert HTML and OPF files to an OEB' + file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm']) + + options = set([ + OptionRecommendation(name='breadth_first', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Traverse links in HTML files breadth first. Normally, ' + 'they are traversed depth first.' + ) + ), + + OptionRecommendation(name='max_levels', + recommended_value=5, level=OptionRecommendation.LOW, + help=_('Maximum levels of recursion when following links in ' + 'HTML files. Must be non-negative. 0 implies that no ' + 'links in the root HTML file are followed. Default is ' + '%default.' + ) + ), + + ]) + + def convert(self, stream, opts, file_ext, log, + accelerators): + basedir = os.getcwd() + if hasattr(stream, 'name'): + basedir = os.path.dirname(stream.name) + if file_ext == 'opf': + opf = OPF(stream, basedir) + filelist = opf_traverse(opf, verbose=opts.verbose, + encoding=opts.input_encoding) + mi = MetaInformation(opf) + else: + opf, filelist = get_filelist(stream.name, basedir, opts, log) + mi = MetaInformation(opf) + mi.smart_update(get_metadata(stream, 'html')) + + mi = OPFCreator(os.getcwdu(), mi) + mi.guide = None + entries = [(f.path, 'application/xhtml+xml') for f in filelist] + mi.create_manifest(entries) + mi.create_spine([f.path for f in filelist]) + + tocbuf = cStringIO.StringIO() + mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx') + toc = tocbuf.getvalue() + if toc: + open('toc.ncx', 'wb').write(toc) + + from calibre.ebooks.conversion.plumber import create_oebbook + return create_oebbook(log, os.path.abspath('metadata.opf')) + + + + diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html_old.py similarity index 100% rename from src/calibre/ebooks/html.py rename to src/calibre/ebooks/html_old.py diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index f6b5a9bd1a..4b7648d81f 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -683,26 +683,6 @@ class OPF(object): return property(fget=fget, fset=fset) - @dynamic_property - def title_sort(self): - - def fget(self): - matches = self.title_path(self.metadata) - if matches: - for match in matches: - ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None) - if not ans: - ans = match.get('file-as', None) - if ans: - return ans - - def fset(self, val): - matches = self.title_path(self.metadata) - if matches: - matches[0].set('file-as', unicode(val)) - - return property(fget=fget, fset=fset) - @dynamic_property def tags(self): @@ -943,9 +923,10 @@ class OPFCreator(MetaInformation): from calibre.resources import opf_template from calibre.utils.genshi.template import MarkupTemplate template = MarkupTemplate(opf_template) + toc = getattr(self, 'toc', None) if self.manifest: self.manifest.set_basedir(self.base_path) - if ncx_manifest_entry is not None: + if ncx_manifest_entry is not None and toc is not None: if not os.path.isabs(ncx_manifest_entry): ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry) remove = [i for i in self.manifest if i.id == 'ncx'] @@ -965,7 +946,6 @@ class OPFCreator(MetaInformation): opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml') opf_stream.write(opf) opf_stream.flush() - toc = getattr(self, 'toc', None) if toc is not None and ncx_stream is not None: toc.render(ncx_stream, self.application_id) ncx_stream.flush() @@ -1030,19 +1010,8 @@ class OPFTest(unittest.TestCase): self.opf.smart_update(MetaInformation(self.opf)) self.testReading() - def testCreator(self): - opf = OPFCreator(os.getcwd(), self.opf) - buf = cStringIO.StringIO() - opf.render(buf) - raw = buf.getvalue() - self.testReading(opf=OPF(cStringIO.StringIO(raw), os.getcwd())) - - def testSmartUpdate(self): - self.opf.smart_update(self.opf) - self.testReading() - def suite(): return unittest.TestLoader().loadTestsFromTestCase(OPFTest) def test(): - unittest.TextTestRunner(verbosity=2).run(suite()) \ No newline at end of file + unittest.TextTestRunner(verbosity=2).run(suite()) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 8f2e24a831..2eb45c9161 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin): with open(f, 'wb') as q: q.write(html.tostring(root, encoding='utf-8', method='xml', include_meta_content_type=False)) - accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'} + accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'} return mr.created_opf_path diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index a78b5085d9..6032ae549a 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -522,7 +522,7 @@ class MobiReader(object): else: raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type)) if self.book_header.ancient and '