From 3c404a7a6612de4edf66540645ff4905a5bcba91 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 16 Sep 2008 21:50:00 -0700 Subject: [PATCH] IGN:html2epub now works when passed OPF files --- src/calibre/ebooks/chardet/__init__.py | 5 +- src/calibre/ebooks/epub/__init__.py | 4 ++ src/calibre/ebooks/epub/from_html.py | 14 +++-- src/calibre/ebooks/html.py | 62 +++++++++++++------- src/calibre/ebooks/metadata/__init__.py | 4 +- src/calibre/ebooks/metadata/library_thing.py | 2 +- src/calibre/ebooks/metadata/ncx.xml | 7 ++- src/calibre/ebooks/metadata/opf.py | 13 +++- src/calibre/ebooks/metadata/opf.xml | 11 ++-- src/calibre/ebooks/metadata/toc.py | 11 ++-- src/calibre/web/feeds/news.py | 15 ++++- upload.py | 15 +++-- 12 files changed, 110 insertions(+), 53 deletions(-) diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index 26fd84bee2..03d8fc2ea0 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -75,7 +75,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti if encoding == 'ascii': encoding = 'utf-8' - raw = raw.decode(encoding, 'replace') + try: + raw = raw.decode(encoding, 'replace') + except LookupError: + raw = raw.decode('utf-8', 'replace') if resolve_entities: from calibre import entity_to_unicode from functools import partial diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index bcbc82f6c9..885ed4c650 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -53,6 +53,8 @@ The expression used must evaluate to a list of elements. To disable chapter dete use the expression "/". See the XPath Tutorial in the calibre User Manual for further help on using this feature. ''').replace('\n', ' ')) + structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both'], + default='pagebreak', help=_('Specify how to mark detected chapters. A value of "pagebreak" will insert page breaks before chapters. A value of "rule" will insert a line before chapters. A value of "none" will disable chapter marking and a value of "both" will use both page breaks and lines to mark chapters.')) toc = c.add_group('toc', _('''\ @@ -69,5 +71,7 @@ to auto-generate a Table of Contents. c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', help=_('Print generated OPF file to stdout')) + c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', + help=_('Print generated NCX file to stdout')) return c \ No newline at end of file diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 32a86df4ed..cddeb5ba72 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -23,8 +23,9 @@ class HTMLProcessor(Processor): if opts.verbose > 2: self.debug_tree('parsed') self.detect_chapters() - self.extract_css() + + self.extract_css() if opts.verbose > 2: self.debug_tree('nocss') @@ -97,8 +98,8 @@ def convert(htmlfile, opts, notification=None): resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] if mi.cover and os.access(mi.cover, os.R_OK): - shutil.copyfile(mi.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))) - cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)) + shutil.copyfile(mi.cover, os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1])) + cpath = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1]) shutil.copyfile(opf.cover, cpath) resources.append(cpath) mi.cover = cpath @@ -107,21 +108,22 @@ def convert(htmlfile, opts, notification=None): mi = create_metadata(tdir, mi, spine, resources) buf = cStringIO.StringIO() if mi.toc: - rebase_toc(mi.toc, htmlfile_map, opts.output) + rebase_toc(mi.toc, htmlfile_map, tdir) if mi.toc is None or len(mi.toc) < 2: mi.toc = generated_toc for item in mi.manifest: if getattr(item, 'mime_type', None) == 'text/html': item.mime_type = 'application/xhtml+xml' with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f: - mi.render(f, buf) + mi.render(f, buf, 'toc.ncx') if opts.show_opf: print open(os.path.join(tdir, 'metadata.opf')).read() toc = buf.getvalue() if toc: with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f: f.write(toc) - + if opts.show_ncx: + print toc epub = initialize_container(opts.output) epub.add_dir(tdir) print 'Output written to', opts.output diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 742a7d3856..e0a44707e8 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -13,7 +13,8 @@ from urlparse import urlparse from urllib import unquote from lxml import html, etree -from lxml.etree import XPath +from lxml.html import soupparser, HTMLParser +from lxml.etree import XPath, XMLParser get_text = XPath("//text()") from calibre import LoggingInterface, unicode_path @@ -297,6 +298,8 @@ class PreProcessor(object): class Parser(PreProcessor, LoggingInterface): + PARSER = HTMLParser(recover=True) + def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): LoggingInterface.__init__(self, logging.getLogger(name)) self.setup_cli_handler(opts.verbose) @@ -318,6 +321,11 @@ class Parser(PreProcessor, LoggingInterface): self.parse_html() self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) + for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates + if self.root.get(bad, None) is not None: + self.root.attrib.pop(bad) + + def save(self): ''' @@ -325,28 +333,30 @@ class Parser(PreProcessor, LoggingInterface): Should be called after all HTML processing is finished. ''' with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f: - f.write(html.tostring(self.root, - encoding='utf-8', method='xml', - include_meta_content_type=True, - pretty_print=self.opts.pretty_print) - ) + ans = html.tostring(self.root, encoding='utf-8', method='xml', + pretty_print=self.opts.pretty_print, + include_meta_content_type=True) + ans = re.compile(r'', re.IGNORECASE).sub('', ans) + f.write(ans) return f.name def parse_html(self): ''' Create lxml ElementTree from HTML ''' self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:])) - src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace') + src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip() src = self.preprocess(src) # lxml chokes on unicode input when it contains encoding declarations for pat in ENCODING_PATS: src = pat.sub('', src) try: - self.root = html.document_fromstring(src) + self.root = etree.HTML(src, self.PARSER) + if self.root is None: + raise ValueError('%s is empty'%self.htmlfile.path) except: if self.opts.verbose: self.log_exception('lxml based parsing failed') - self.root = html.soupparser.fromstring() + self.root = soupparser.fromstring(src) self.head = self.body = None head = self.root.xpath('//head') if head: @@ -404,19 +414,27 @@ class Processor(Parser): def detect_chapters(self): self.detected_chapters = self.opts.chapter(self.root) for elem in self.detected_chapters: - style = elem.get('style', '').strip() - if style and not style.endswith(';'): - style += '; ' - style += 'page-break-before: always' - elem.set(style, style) + if self.opts.chapter_mark in ('both', 'pagebreak'): + style = elem.get('style', '').strip() + if style and not style.endswith(';'): + style += '; ' + style += 'page-break-before: always' + elem.set('style', style) + if self.opts.chapter_mark in ('both', 'rule'): + hr = etree.Element('hr') + if elem.getprevious() is None: + elem.getparent()[:0] = [hr] + else: + insert = None + for i, c in enumerate(elem.getparent()): + if c is elem: + insert = i + break + elem.getparent()[insert:insert] = [hr] + def save(self): - head = self.root.xpath('//head') - if head: - head = head[0] - else: - head = self.root.xpath('//body') - head = head[0] if head else self.root + head = self.head if self.head is not None else self.body style = etree.SubElement(head, 'style', attrib={'type':'text/css'}) style.text='\n'+self.css style.tail = '\n\n' @@ -589,7 +607,7 @@ def search_for_opf(dir): def get_filelist(htmlfile, opts): ''' - Build list of files references by html file or try to detect and use an + Build list of files referenced by html file or try to detect and use an OPF file instead. ''' print 'Building file list...' @@ -672,7 +690,7 @@ def rebase_toc(toc, htmlfile_map, basepath, root=True): fix_entry(entry) if root: toc.base_path = basepath - + def create_dir(htmlfile, opts): ''' Create a directory that contains the open ebook diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 332d056124..934617a416 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -43,7 +43,7 @@ class Resource(object): def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True): self._href = None - self._basedir = None + self._basedir = basedir self.path = None self.fragment = '' try: @@ -55,7 +55,7 @@ class Resource(object): if is_path: path = href_or_path if not os.path.isabs(path): - path = os.path.abspath(os.path.join(path, basedir)) + path = os.path.abspath(os.path.join(basedir, path)) if isinstance(path, str): path = path.decode(sys.getfilesystemencoding()) self.path = path diff --git a/src/calibre/ebooks/metadata/library_thing.py b/src/calibre/ebooks/metadata/library_thing.py index f93ffafd66..fdecf3fa99 100644 --- a/src/calibre/ebooks/metadata/library_thing.py +++ b/src/calibre/ebooks/metadata/library_thing.py @@ -39,7 +39,7 @@ def cover_from_isbn(isbn, timeout=5.): _timeout = socket.getdefaulttimeout() socket.setdefaulttimeout(timeout) try: - src = browser.open('http://www.librarything.com/isbn/'+isbn).read() + src = browser.open('http://www.librarything.com/isbn/'+isbn).read().decode('utf-8', 'replace') s = BeautifulSoup(src) url = s.find('td', attrs={'class':'left'}) if url is None: diff --git a/src/calibre/ebooks/metadata/ncx.xml b/src/calibre/ebooks/metadata/ncx.xml index 9e8ff00414..55395805cd 100644 --- a/src/calibre/ebooks/metadata/ncx.xml +++ b/src/calibre/ebooks/metadata/ncx.xml @@ -1,7 +1,10 @@ + + @@ -14,7 +17,7 @@ Table of Contents - ${'%*s'%(4*level,'')} + ${'%*s'%(4*level,'')} ${'%*s'%(4*level,'')} ${'%*s'%(4*level,'')}${np.text} ${'%*s'%(4*level,'')} diff --git a/src/calibre/ebooks/metadata/opf.py b/src/calibre/ebooks/metadata/opf.py index 1bff99ecf4..58e52e877d 100644 --- a/src/calibre/ebooks/metadata/opf.py +++ b/src/calibre/ebooks/metadata/opf.py @@ -483,7 +483,7 @@ class OPFCreator(MetaInformation): Set the toc. You must call :method:`create_spine` before calling this method. - `toc`: A :class:`TOC` object + :param toc: A :class:`TOC` object ''' self.toc = toc @@ -491,12 +491,21 @@ class OPFCreator(MetaInformation): self.guide = Guide.from_opf_guide(guide_element, self.base_path) self.guide.set_basedir(self.base_path) - def render(self, opf_stream, ncx_stream=None): + def render(self, opf_stream, ncx_stream=None, ncx_manifest_entry=None): from calibre.resources import opf_template from calibre.utils.genshi.template import MarkupTemplate template = MarkupTemplate(opf_template) if self.manifest: self.manifest.set_basedir(self.base_path) + if ncx_manifest_entry is not None: + if not os.path.isabs(ncx_manifest_entry): + ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry) + remove = [i for i in self.manifest if i.id == 'ncx'] + for item in remove: + self.manifest.remove(item) + self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path)) + self.manifest[-1].id = 'ncx' + self.manifest[-1].mime_type = 'application/x-dtbncx+xml' if not self.guide: self.guide = Guide() if self.cover: diff --git a/src/calibre/ebooks/metadata/opf.xml b/src/calibre/ebooks/metadata/opf.xml index 3a6cfec58c..61920c40f6 100644 --- a/src/calibre/ebooks/metadata/opf.xml +++ b/src/calibre/ebooks/metadata/opf.xml @@ -23,6 +23,12 @@ + + + + + + @@ -36,10 +42,5 @@ - - - - - diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 2eb6402a41..8f4d2d2ecd 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -29,8 +29,9 @@ class TOC(list): self.base_path = base_path self.play_order = play_order - def add_item(self, href, fragment, text): - play_order = (self[-1].play_order if len(self) else self.play_order) + 1 + def add_item(self, href, fragment, text, play_order=None): + if play_order is None: + play_order = (self[-1].play_order if len(self) else self.play_order) + 1 self.append(TOC(href=href, fragment=fragment, text=text, parent=self, base_path=self.base_path, play_order=play_order)) return self[-1] @@ -113,14 +114,16 @@ class TOC(list): soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0]) def process_navpoint(np, dest): - play_order = np.get('playOrder', 1) + play_order = np.get('playOrder', None) + if play_order is None: + play_order = int(np.get('playorder', 1)) href = fragment = text = None nl = np.find('navlabel') if nl is not None: text = u'' for txt in nl.findAll('text'): text += ''.join([unicode(s) for s in txt.findAll(text=True)]) - content = elem.find('content') + content = np.find('content') if content is None or not content.has_key('src') or not txt: return diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index fa83bfb694..31735d6c44 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -719,6 +719,8 @@ class BasicNewsRecipe(object, LoggingInterface): entries = ['index.html'] toc = TOC(base_path=dir) + self.play_order_counter = 0 + self.play_order_map = {} def feed_index(num, parent): f = feeds[num] @@ -726,7 +728,12 @@ class BasicNewsRecipe(object, LoggingInterface): if getattr(a, 'downloaded', False): adir = 'feed_%d/article_%d/'%(num, j) entries.append('%sindex.html'%adir) - parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article')) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), + play_order=po) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) @@ -752,7 +759,11 @@ class BasicNewsRecipe(object, LoggingInterface): if len(feeds) > 1: for i, f in enumerate(feeds): entries.append('feed_%d/index.html'%i) - feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title)) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po)) else: entries.append('feed_%d/index.html'%0) feed_index(0, toc) diff --git a/upload.py b/upload.py index 70c2d7db3f..7d1e86ac19 100644 --- a/upload.py +++ b/upload.py @@ -206,11 +206,11 @@ def upload_user_manual(): check_call('scp -r src/calibre/manual/.build/html/* divok:%s'%USER_MANUAL) def build_src_tarball(): - check_call('bzr export dist/calibre-%s.tar.bz2'%__version__) + check_call('bzr export dist/calibre-%s.tar.gz'%__version__) def upload_src_tarball(): - check_call('ssh divok rm -f %s/calibre-\*.tar.bz2'%DOWNLOADS) - check_call('scp dist/calibre-*.tar.bz2 divok:%s/'%DOWNLOADS) + check_call('ssh divok rm -f %s/calibre-\*.tar.gz'%DOWNLOADS) + check_call('scp dist/calibre-*.tar.gz divok:%s/'%DOWNLOADS) def stage_one(): check_call('sudo rm -rf build', shell=True) @@ -226,16 +226,19 @@ def stage_one(): def stage_two(): subprocess.check_call('rm -rf dist/*', shell=True) build_installers() - build_src_tarball() def stage_three(): print 'Uploading installers...' upload_installers() print 'Uploading to PyPI' - upload_src_tarball() upload_docs() upload_user_manual() - check_call('python setup.py register bdist_egg --exclude-source-files upload') + check_call('rm -f dist/*') + check_call('python setup.py register') + check_call('python setup.py bdist_egg --exclude-source-files') + build_src_tarball() + upload_src_tarball() + check_call('python setup.py upload') check_call('''rm -rf dist/* build/*''') check_call('''ssh divok bzr update /var/www/calibre.kovidgoyal.net/calibre/''')