diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index d6d46476df..99c8430c33 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -65,6 +65,11 @@ class HTMLProcessor(Processor): self.extract_css() if opts.verbose > 2: self.debug_tree('nocss') + + def save(self): + for meta in list(self.root.xpath('//meta')): + meta.getparent().remove(meta) + Processor.save(self) #self.collect_font_statistics() diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 1433976113..d04f6ba844 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -27,11 +27,10 @@ from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryF from calibre.utils.zipfile import ZipFile def tostring(root, pretty_print=False): - return html.tostring(root, encoding='utf-8', method='xml', - pretty_print=pretty_print, - include_meta_content_type=True) - - + return html.tostring(root, encoding='utf-8', method='xml', + include_meta_content_type=True, + pretty_print=pretty_print) + class Link(object): ''' Represents a link in a HTML file. @@ -313,6 +312,14 @@ class PreProcessor(object): return html class Parser(PreProcessor, LoggingInterface): +# SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont' +# SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in +# [ +# (r'<(?P%s)(?P(\s+[^<>]*){0,1})(?', +# '<\g\g />'), +# (), +# ] +# ] def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): LoggingInterface.__init__(self, logging.getLogger(name)) @@ -347,10 +354,10 @@ class Parser(PreProcessor, LoggingInterface): Save processed HTML into the content directory. Should be called after all HTML processing is finished. ''' + ans = tostring(self.root, pretty_print=self.opts.pretty_print) + ans = re.compile(r'', re.IGNORECASE).sub('', ans[:1000]) + ans[1000:] + with open(self.save_path(), 'wb') as f: - ans = tostring(self.root, pretty_print=self.opts.pretty_print) - ans = re.compile(r'', re.IGNORECASE).sub('', ans) - ans = re.compile(r']*?>', re.IGNORECASE).sub('\n\n', ans) f.write(ans) return f.name @@ -369,15 +376,14 @@ class Parser(PreProcessor, LoggingInterface): if self.opts.verbose: self.log_exception('lxml based parsing failed') self.root = soupparser.fromstring(src) - self.head = self.body = None - head = self.root.xpath('//head') - if head: - self.head = head[0] - body = self.root.xpath('//body') - if body: - self.body = body[0] + head = self.root.xpath('./head') + self.head = head[0] if head else etree.SubElement(self.root, 'head') + self.body = self.root.body for a in self.root.xpath('//a[@name]'): a.set('id', a.get('name')) + if not self.head.xpath('./title'): + title = etree.SubElement(self.head, 'title') + title.text = _('Unknown') def debug_tree(self, name): ''' @@ -455,11 +461,11 @@ class Processor(Parser): def save(self): - head = self.head if self.head is not None else self.body style_path = os.path.basename(self.save_path())+'.css' - style = etree.SubElement(head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', - 'href':'resources/'+style_path}) - style.tail = '\n\n' + style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', + 'href':'resources/'+style_path, + 'charset':'UTF-8'}) + style.tail = '\n' style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path) open(style_path, 'wb').write(self.css.encode('utf-8')) return Parser.save(self) @@ -584,6 +590,7 @@ class Processor(Parser): if cn: cn += ' ' cn += classname font.set('class', cn) + font.tag = 'span' for elem in self.root.xpath('//*[@style]'): setting = elem.get('style') diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index af81ea91c5..a01812c00f 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -806,7 +806,7 @@ class Main(MainWindow, Ui_MainWindow): if job.exception is not None: self.job_exception(job) return - to_device = self.device_connected and fmt in self.device_manager.device_class.FORMATS + to_device = self.device_connected and fmt.lower() in self.device_manager.device_class.FORMATS self._add_books([pt.name], to_device) if to_device: self.status_bar.showMessage(_('News fetched. Uploading to device.'), 2000)