mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-31 02:27:01 -04:00 
			
		
		
		
	ODT Input: Speed up conversion of ODT files that define huge amounts of redundant style information. Fixes #777468 (Conversion from ODT to EPUB extremely slow)
This commit is contained in:
		
							parent
							
								
									36ba0bd52a
								
							
						
					
					
						commit
						e15ee70a1d
					
				| @ -7,6 +7,8 @@ __docformat__ = 'restructuredtext en' | |||||||
| Convert an ODT file into a Open Ebook | Convert an ODT file into a Open Ebook | ||||||
| ''' | ''' | ||||||
| import os | import os | ||||||
|  | 
 | ||||||
|  | from lxml import etree | ||||||
| from odf.odf2xhtml import ODF2XHTML | from odf.odf2xhtml import ODF2XHTML | ||||||
| 
 | 
 | ||||||
| from calibre import CurrentDir, walk | from calibre import CurrentDir, walk | ||||||
| @ -23,7 +25,48 @@ class Extract(ODF2XHTML): | |||||||
|                 with open(name, 'wb') as f: |                 with open(name, 'wb') as f: | ||||||
|                     f.write(data) |                     f.write(data) | ||||||
| 
 | 
 | ||||||
def filter_css(self, html, log):
    """Collapse redundant class rules in the embedded stylesheet of ``html``.

    The markup is parsed as XML, the text of the first
    ``<style type="text/css">`` element is rewritten by
    :meth:`do_filter_css`, and every element whose ``class`` attribute
    names one of the merged classes gets the generated replacement
    class names appended to that attribute.

    :param html: Markup to process; it must be well-formed XML, since it
                 is parsed with ``etree.fromstring``.
    :param log: Conversion log. Currently unused by this method.
    :return: The re-serialized markup (with an XML declaration) as the
             same string type as ``html``, or ``html`` itself, untouched,
             when there is no text/css ``<style>`` element or it is empty.
    """
    root = etree.fromstring(html)
    styles = root.xpath('//*[local-name() = "style" and @type="text/css"]')
    if not styles:
        return html

    style = styles[0]  # only the first stylesheet is filtered
    css = style.text
    if not css:
        return html

    style.text, sel_map = self.do_filter_css(css)
    for elem in root.xpath('//*[@class]'):
        orig = elem.get('class')
        extra = []
        for cls in orig.split():
            extra.extend(sel_map.get(cls, ()))
        if extra:
            # The original classes are kept: they may still be targeted
            # by rules that were not merged.
            elem.set('class', orig + ' ' + ' '.join(extra))

    raw = etree.tostring(root, encoding='utf-8', xml_declaration=True)
    # Serializing with a named encoding yields a byte string. The caller
    # goes on to call ``.encode('utf-8')`` on the result, so hand back
    # text when text was passed in instead of relying on an implicit
    # default-encoding decode of non-ASCII bytes.
    if isinstance(html, bytes):
        return raw
    return raw.decode('utf-8')
|  | 
 | ||||||
def do_filter_css(self, css):
    """Replace each group of class selectors sharing a rule by one class.

    A style rule is rewritten only when its selector list has more than
    one entry and every entry is a bare class selector, e.g.
    ``.a, .b, .c { ... }``. Such a rule gets the single generated
    selector ``.c_odtN`` (N counts up from 0). All other rules are left
    exactly as parsed.

    :param css: Text of the stylesheet to filter.
    :return: ``(css_text, sel_map)``. ``css_text`` is the serialized,
             rewritten stylesheet. ``sel_map`` maps each original class
             name (without the leading dot) to the list of generated
             class names that elements using it must also carry.
    """
    from cssutils import parseString
    from cssutils.css import CSSRule

    def class_name(selector_text):
        """Return the name of a bare ``.name`` selector, else ``None``."""
        if not selector_text.startswith('.'):
            return None
        name = selector_text[1:]
        # Whitespace or any of these characters means a combinator,
        # pseudo-class, attribute test, id or second class is present.
        # Such a selector cannot be reproduced by adding one class to the
        # matching elements, so its rule must not be rewritten.
        if not name or any(ch in ' \t\r\n\f.#:[>+~*' for ch in name):
            return None
        return name

    sheet = parseString(css)
    sel_map = {}
    count = 0
    # Materialize the rule list first, as rules are modified in the loop.
    for rule in list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)):
        names = [class_name(sel.selectorText) for sel in rule.selectorList]
        if len(names) < 2 or None in names:
            continue
        replace_name = 'c_odt%d' % count
        count += 1
        for name in names:
            sel_map.setdefault(name, []).append(replace_name)
        rule.selectorText = '.' + replace_name
    return sheet.cssText, sel_map
|  | 
 | ||||||
|  |     def __call__(self, stream, odir, log): | ||||||
|         from calibre.utils.zipfile import ZipFile |         from calibre.utils.zipfile import ZipFile | ||||||
|         from calibre.ebooks.metadata.meta import get_metadata |         from calibre.ebooks.metadata.meta import get_metadata | ||||||
|         from calibre.ebooks.metadata.opf2 import OPFCreator |         from calibre.ebooks.metadata.opf2 import OPFCreator | ||||||
| @ -32,13 +75,17 @@ class Extract(ODF2XHTML): | |||||||
|         if not os.path.exists(odir): |         if not os.path.exists(odir): | ||||||
|             os.makedirs(odir) |             os.makedirs(odir) | ||||||
|         with CurrentDir(odir): |         with CurrentDir(odir): | ||||||
|             print 'Extracting ODT file...' |             log('Extracting ODT file...') | ||||||
|             html = self.odf2xhtml(stream) |             html = self.odf2xhtml(stream) | ||||||
|             # A blanket img specification like this causes problems |             # A blanket img specification like this causes problems | ||||||
|             # with EPUB output as the contaiing element often has |             # with EPUB output as the containing element often has | ||||||
|             # an absolute height and width set that is larger than |             # an absolute height and width set that is larger than | ||||||
|             # the available screen real estate |             # the available screen real estate | ||||||
|             html = html.replace('img { width: 100%; height: 100%; }', '') |             html = html.replace('img { width: 100%; height: 100%; }', '') | ||||||
|  |             try: | ||||||
|  |                 html = self.filter_css(html, log) | ||||||
|  |             except: | ||||||
|  |                 log.exception('Failed to filter CSS, conversion may be slow') | ||||||
|             with open('index.xhtml', 'wb') as f: |             with open('index.xhtml', 'wb') as f: | ||||||
|                 f.write(html.encode('utf-8')) |                 f.write(html.encode('utf-8')) | ||||||
|             zf = ZipFile(stream, 'r') |             zf = ZipFile(stream, 'r') | ||||||
| @ -67,7 +114,7 @@ class ODTInput(InputFormatPlugin): | |||||||
| 
 | 
 | ||||||
def convert(self, stream, options, file_ext, log,
            accelerators):
    """Run the ODT extractor on ``stream`` with ``'.'`` as output directory.

    ``options``, ``file_ext`` and ``accelerators`` are not used here.
    Returns whatever the ``Extract`` call returns.
    """
    extractor = Extract()
    return extractor(stream, '.', log)
| 
 | 
 | ||||||
|     def postprocess_book(self, oeb, opts, log): |     def postprocess_book(self, oeb, opts, log): | ||||||
|         # Fix <p><div> constructs as the asinine epubchecker complains |         # Fix <p><div> constructs as the asinine epubchecker complains | ||||||
|  | |||||||
| @ -841,11 +841,19 @@ ol, ul { padding-left: 2em; } | |||||||
|             self.styledict[name] = styles |             self.styledict[name] = styles | ||||||
|         # Write the styles to HTML |         # Write the styles to HTML | ||||||
|         self.writeout(self.default_styles) |         self.writeout(self.default_styles) | ||||||
|  |         # Changed by Kovid to not write out endless copies of the same style | ||||||
|  |         css_styles = {} | ||||||
|         for name in self.stylestack: |         for name in self.stylestack: | ||||||
|             styles = self.styledict.get(name) |             styles = self.styledict.get(name) | ||||||
|             css2 = self.cs.convert_styles(styles) |             css2 = tuple(self.cs.convert_styles(styles).iteritems()) | ||||||
|             self.writeout("%s {\n" % name) |             if css2 in css_styles: | ||||||
|             for style, val in css2.items(): |                 css_styles[css2].append(name) | ||||||
|  |             else: | ||||||
|  |                 css_styles[css2] = [name] | ||||||
|  | 
 | ||||||
|  |         for css2, names in css_styles.iteritems(): | ||||||
|  |             self.writeout("%s {\n" % ', '.join(names)) | ||||||
|  |             for style, val in css2: | ||||||
|                 self.writeout("\t%s: %s;\n" % (style, val) ) |                 self.writeout("\t%s: %s;\n" % (style, val) ) | ||||||
|             self.writeout("}\n") |             self.writeout("}\n") | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user