From e15ee70a1ded03bf4b84f5a30f8fce89aeefa56e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 4 May 2011 18:56:07 -0600 Subject: [PATCH 1/6] ODT Input: Speed up conversion of ODT files that define huge amounts of redundant style information. Fixes #777468 (Conversion from ODT to EPUB extremely slow) --- src/calibre/ebooks/odt/input.py | 55 ++++++++++++++++++++++++++++++--- src/odf/odf2xhtml.py | 14 +++++++-- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/odt/input.py b/src/calibre/ebooks/odt/input.py index 1184148e80..10553dac2b 100644 --- a/src/calibre/ebooks/odt/input.py +++ b/src/calibre/ebooks/odt/input.py @@ -7,6 +7,8 @@ __docformat__ = 'restructuredtext en' Convert an ODT file into a Open Ebook ''' import os + +from lxml import etree from odf.odf2xhtml import ODF2XHTML from calibre import CurrentDir, walk @@ -23,7 +25,48 @@ class Extract(ODF2XHTML): with open(name, 'wb') as f: f.write(data) - def __call__(self, stream, odir): + def filter_css(self, html, log): + root = etree.fromstring(html) + style = root.xpath('//*[local-name() = "style" and @type="text/css"]') + if style: + style = style[0] + css = style.text + if css: + style.text, sel_map = self.do_filter_css(css) + for x in root.xpath('//*[@class]'): + extra = [] + orig = x.get('class') + for cls in orig.split(): + extra.extend(sel_map.get(cls, [])) + if extra: + x.set('class', orig + ' ' + ' '.join(extra)) + html = etree.tostring(root, encoding='utf-8', + xml_declaration=True) + return html + + def do_filter_css(self, css): + from cssutils import parseString + from cssutils.css import CSSRule + sheet = parseString(css) + rules = list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) + sel_map = {} + count = 0 + for r in rules: + # Check if we have only class selectors for this rule + nc = [x for x in r.selectorList if not + x.selectorText.startswith('.')] + if len(r.selectorList) > 1 and not nc: + replace_name = 'c_odt%d'%count + count += 1 + for sel in 
r.selectorList: + s = sel.selectorText[1:] + if s not in sel_map: + sel_map[s] = [] + sel_map[s].append(replace_name) + r.selectorText = '.'+replace_name + return sheet.cssText, sel_map + + def __call__(self, stream, odir, log): from calibre.utils.zipfile import ZipFile from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.opf2 import OPFCreator @@ -32,13 +75,17 @@ class Extract(ODF2XHTML): if not os.path.exists(odir): os.makedirs(odir) with CurrentDir(odir): - print 'Extracting ODT file...' + log('Extracting ODT file...') html = self.odf2xhtml(stream) # A blanket img specification like this causes problems - # with EPUB output as the contaiing element often has + # with EPUB output as the containing element often has # an absolute height and width set that is larger than # the available screen real estate html = html.replace('img { width: 100%; height: 100%; }', '') + try: + html = self.filter_css(html, log) + except: + log.exception('Failed to filter CSS, conversion may be slow') with open('index.xhtml', 'wb') as f: f.write(html.encode('utf-8')) zf = ZipFile(stream, 'r') @@ -67,7 +114,7 @@ class ODTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - return Extract()(stream, '.') + return Extract()(stream, '.', log) def postprocess_book(self, oeb, opts, log): # Fix
<p><div>
constructs as the asinine epubchecker complains diff --git a/src/odf/odf2xhtml.py b/src/odf/odf2xhtml.py index 26da9d9905..a04aa48bf7 100644 --- a/src/odf/odf2xhtml.py +++ b/src/odf/odf2xhtml.py @@ -841,11 +841,19 @@ ol, ul { padding-left: 2em; } self.styledict[name] = styles # Write the styles to HTML self.writeout(self.default_styles) + # Changed by Kovid to not write out endless copies of the same style + css_styles = {} for name in self.stylestack: styles = self.styledict.get(name) - css2 = self.cs.convert_styles(styles) - self.writeout("%s {\n" % name) - for style, val in css2.items(): + css2 = tuple(self.cs.convert_styles(styles).iteritems()) + if css2 in css_styles: + css_styles[css2].append(name) + else: + css_styles[css2] = [name] + + for css2, names in css_styles.iteritems(): + self.writeout("%s {\n" % ', '.join(names)) + for style, val in css2: self.writeout("\t%s: %s;\n" % (style, val) ) self.writeout("}\n") From b461f5bc26030f3f8ce43d02facc3914ade45c5e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 4 May 2011 20:30:34 -0600 Subject: [PATCH 2/6] ... --- src/calibre/ebooks/odt/input.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/odt/input.py b/src/calibre/ebooks/odt/input.py index 10553dac2b..e724acb981 100644 --- a/src/calibre/ebooks/odt/input.py +++ b/src/calibre/ebooks/odt/input.py @@ -56,6 +56,9 @@ class Extract(ODF2XHTML): nc = [x for x in r.selectorList if not x.selectorText.startswith('.')] if len(r.selectorList) > 1 and not nc: + # Replace all the class selectors with a single class selector + # This will be added to the class attribute of all elements + # that have one of these selectors. 
replace_name = 'c_odt%d'%count count += 1 for sel in r.selectorList: From 6dc8803ac569d739f1b53a76faaf6a510c80eb3f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 5 May 2011 09:21:31 -0600 Subject: [PATCH 3/6] Add uri->url identifiers mapping --- src/calibre/ebooks/metadata/sources/identify.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index 3d4807ac02..1bd071d6f9 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -13,6 +13,7 @@ from Queue import Queue, Empty from threading import Thread from io import BytesIO from operator import attrgetter +from urlparse import urlparse from calibre.customize.ui import metadata_plugins, all_metadata_plugins from calibre.ebooks.metadata.sources.base import create_log, msprefs @@ -458,6 +459,14 @@ def urls_from_identifiers(identifiers): # {{{ if oclc: ans.append(('OCLC', 'oclc', oclc, 'http://www.worldcat.org/oclc/'+oclc)) + url = identifiers.get('uri', None) + if url is None: + url = identifiers.get('url', None) + if url and url.startswith('http'): + url = url[:8].replace('|', ':') + url[8:].replace('|', ',') + parts = urlparse(url) + name = parts.netloc + ans.append((name, 'url', url, url)) return ans # }}} From d371225d5d36dcc294fdda40a1b551a986f548a1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 5 May 2011 14:20:51 -0600 Subject: [PATCH 4/6] ... 
--- src/calibre/gui2/actions/choose_library.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/actions/choose_library.py b/src/calibre/gui2/actions/choose_library.py index f6b19fc4aa..9fd156b802 100644 --- a/src/calibre/gui2/actions/choose_library.py +++ b/src/calibre/gui2/actions/choose_library.py @@ -246,7 +246,8 @@ class ChooseLibraryAction(InterfaceAction): def delete_requested(self, name, location): loc = location.replace('/', os.sep) if not question_dialog(self.gui, _('Are you sure?'), '

<p>'+ - _('All files from

<br><br><b>%s</b><br><br>

will be ' + _('All files (not just ebooks) ' + 'from

<br><br><b>%s</b><br><br>

will be ' 'permanently deleted. Are you sure?') % loc, show_copy_button=False): return From 1647e17684a07b29501b4a2ef4bfdd59e989f390 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 5 May 2011 14:34:41 -0600 Subject: [PATCH 5/6] Fix two zero byte files being left behind after calibre quits on windows --- setup/installer/windows/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup/installer/windows/main.c b/setup/installer/windows/main.c index d76850504e..780be94330 100644 --- a/setup/installer/windows/main.c +++ b/setup/installer/windows/main.c @@ -23,6 +23,9 @@ wWinMain(HINSTANCE Inst, HINSTANCE PrevInst, ret = execute_python_entrypoint(BASENAME, MODULE, FUNCTION, stdout_redirect, stderr_redirect); + if (stdout != NULL) fclose(stdout); + if (stderr != NULL) fclose(stderr); + DeleteFile(stdout_redirect); DeleteFile(stderr_redirect); From e277a9218930b9aef5a60a963a45fa070b7c1461 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 5 May 2011 14:44:54 -0600 Subject: [PATCH 6/6] Fix temp file leak when restarting calibre from within itself --- src/calibre/gui2/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index eadfa55549..645ce3b228 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -299,13 +299,13 @@ def run_gui(opts, args, actions, listener, app, gui_debug=None): if getattr(runner.main, 'debug_on_restart', False): run_in_debug_mode() else: + import subprocess print 'Restarting with:', e, sys.argv if hasattr(sys, 'frameworks_dir'): app = os.path.dirname(os.path.dirname(sys.frameworks_dir)) - import subprocess subprocess.Popen('sleep 3s; open '+app, shell=True) else: - os.execvp(e, sys.argv) + subprocess.Popen([e] + sys.argv[1:]) else: if iswindows: try: