IGN:html2epub now works when passed OPF files

2025-07-09 03:04:10 -04:00 · 2008-09-16 21:50:00 -07:00 · 2008-09-16 21:50:00 -07:00 · 3c404a7a66
commit 3c404a7a66
parent 5e236b8edb
12 changed files with 110 additions and 53 deletions
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -75,7 +75,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti
    if encoding == 'ascii':
        encoding = 'utf-8'
    
+    try:
        raw = raw.decode(encoding, 'replace')
+    except LookupError:
+        raw = raw.decode('utf-8', 'replace')
    if resolve_entities:
        from calibre import entity_to_unicode
        from functools import partial
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -53,6 +53,8 @@ The expression used must evaluate to a list of elements. To disable chapter dete
 use the expression "/". See the XPath Tutorial in the calibre User Manual for further
 help on using this feature.
 ''').replace('\n', ' '))
+    structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both'],
+              default='pagebreak', help=_('Specify how to mark detected chapters. A value of "pagebreak" will insert page breaks before chapters. A value of "rule" will insert a line before chapters. A value of "none" will disable chapter marking and a value of "both" will use both page breaks and lines to mark chapters.'))
    
    toc = c.add_group('toc', 
        _('''\
@@ -69,5 +71,7 @@ to auto-generate a Table of Contents.
    
    c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
              help=_('Print generated OPF file to stdout'))
+    c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
+              help=_('Print generated NCX file to stdout'))
    
    return c
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -23,8 +23,9 @@ class HTMLProcessor(Processor):
        if opts.verbose > 2:
            self.debug_tree('parsed')
        self.detect_chapters()
-        self.extract_css()
        
+        
+        self.extract_css()
        if opts.verbose > 2:
            self.debug_tree('nocss')
        
@@ -97,8 +98,8 @@ def convert(htmlfile, opts, notification=None):
        resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
        
        if mi.cover and os.access(mi.cover, os.R_OK):
-            shutil.copyfile(mi.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
-            cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
+            shutil.copyfile(mi.cover, os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1]))
+            cpath = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1])
            shutil.copyfile(opf.cover, cpath)
            resources.append(cpath)
            mi.cover = cpath
@@ -107,21 +108,22 @@ def convert(htmlfile, opts, notification=None):
        mi = create_metadata(tdir, mi, spine, resources)
        buf = cStringIO.StringIO()
        if mi.toc:
-            rebase_toc(mi.toc, htmlfile_map, opts.output)
+            rebase_toc(mi.toc, htmlfile_map, tdir)
        if mi.toc is None or len(mi.toc) < 2:
            mi.toc = generated_toc
        for item in mi.manifest:
            if getattr(item, 'mime_type', None) == 'text/html':
                item.mime_type = 'application/xhtml+xml'
        with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
-            mi.render(f, buf)
+            mi.render(f, buf, 'toc.ncx')
        if opts.show_opf:
            print open(os.path.join(tdir, 'metadata.opf')).read()
        toc = buf.getvalue()
        if toc:
            with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
                f.write(toc)
-                
+            if opts.show_ncx:
+                print toc
        epub = initialize_container(opts.output)
        epub.add_dir(tdir)
        print 'Output written to', opts.output
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -13,7 +13,8 @@ from urlparse import urlparse
 from urllib import unquote

 from lxml import html, etree
-from lxml.etree import XPath
+from lxml.html import soupparser, HTMLParser
+from lxml.etree import XPath, XMLParser
 get_text = XPath("//text()")

 from calibre import LoggingInterface, unicode_path
@@ -297,6 +298,8 @@ class PreProcessor(object):
    
 class Parser(PreProcessor, LoggingInterface):
    
+    PARSER = HTMLParser(recover=True)
+    
    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
        LoggingInterface.__init__(self, logging.getLogger(name))
        self.setup_cli_handler(opts.verbose)
@@ -318,6 +321,11 @@ class Parser(PreProcessor, LoggingInterface):
        
        self.parse_html()
        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
+        for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
+            if self.root.get(bad, None) is not None:
+                self.root.attrib.pop(bad)
+        
+        
        
    def save(self):
        '''
@@ -325,28 +333,30 @@ class Parser(PreProcessor, LoggingInterface):
        Should be called after all HTML processing is finished.
        '''
        with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
-            f.write(html.tostring(self.root, 
-                        encoding='utf-8', method='xml',
-                        include_meta_content_type=True,
-                        pretty_print=self.opts.pretty_print)
-                    )
+            ans = html.tostring(self.root, encoding='utf-8', method='xml', 
+                                pretty_print=self.opts.pretty_print,
+                                include_meta_content_type=True)
+            ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
+            f.write(ans)
            return f.name


    def parse_html(self):
        ''' Create lxml ElementTree from HTML '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
-        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
+        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in ENCODING_PATS:
            src = pat.sub('', src)
        try:
-            self.root = html.document_fromstring(src)
+            self.root = etree.HTML(src, self.PARSER)
+            if self.root is None:
+                raise ValueError('%s is empty'%self.htmlfile.path)
        except:
            if self.opts.verbose:
                self.log_exception('lxml based parsing failed')
-            self.root = html.soupparser.fromstring()
+            self.root = soupparser.fromstring(src)
        self.head = self.body = None
        head = self.root.xpath('//head')
        if head:
@@ -404,19 +414,27 @@ class Processor(Parser):
    def detect_chapters(self):
        self.detected_chapters = self.opts.chapter(self.root)
        for elem in self.detected_chapters:
+            if self.opts.chapter_mark in ('both', 'pagebreak'):
                style = elem.get('style', '').strip()
                if style and not style.endswith(';'):
                    style += '; '
                style += 'page-break-before: always'
-            elem.set(style, style)
+                elem.set('style', style)
+            if self.opts.chapter_mark in ('both', 'rule'):
+                hr = etree.Element('hr')
+                if elem.getprevious() is None:
+                    elem.getparent()[:0] = [hr]
+                else:
+                    insert = None
+                    for i, c in enumerate(elem.getparent()):
+                        if c is elem:
+                            insert = i
+                            break
+                    elem.getparent()[insert:insert] = [hr]
+                    
        
    def save(self):
-        head = self.root.xpath('//head')
-        if head:
-            head = head[0]
-        else:
-            head = self.root.xpath('//body')
-            head = head[0] if head else self.root
+        head = self.head if self.head is not None else self.body
        style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
        style.text='\n'+self.css
        style.tail = '\n\n'
@@ -589,7 +607,7 @@ def search_for_opf(dir):

 def get_filelist(htmlfile, opts):
    '''
-    Build list of files references by html file or try to detect and use an
+    Build list of files referenced by html file or try to detect and use an
    OPF file instead.
    '''
    print 'Building file list...'
--- a/src/calibre/ebooks/metadata/__init__.py
+++ b/src/calibre/ebooks/metadata/__init__.py
@@ -43,7 +43,7 @@ class Resource(object):
    
    def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True):
        self._href = None
-        self._basedir = None
+        self._basedir = basedir
        self.path = None
        self.fragment = ''
        try:
@@ -55,7 +55,7 @@ class Resource(object):
        if is_path:
            path = href_or_path
            if not os.path.isabs(path):
-                path = os.path.abspath(os.path.join(path, basedir))
+                path = os.path.abspath(os.path.join(basedir, path))
            if isinstance(path, str):
                path = path.decode(sys.getfilesystemencoding())
            self.path = path
--- a/src/calibre/ebooks/metadata/library_thing.py
+++ b/src/calibre/ebooks/metadata/library_thing.py
@@ -39,7 +39,7 @@ def cover_from_isbn(isbn, timeout=5.):
    _timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)                
    try:
-        src = browser.open('http://www.librarything.com/isbn/'+isbn).read()
+        src = browser.open('http://www.librarything.com/isbn/'+isbn).read().decode('utf-8', 'replace')
        s = BeautifulSoup(src)
        url = s.find('td', attrs={'class':'left'})
        if url is None:
--- a/src/calibre/ebooks/metadata/ncx.xml
+++ b/src/calibre/ebooks/metadata/ncx.xml
@@ -1,7 +1,10 @@
+<?xml version="1.0"  encoding="UTF-8"?>
+<?python
+from uuid import uuid4
+?>
 <ncx version="2005-1" 
     xml:lang="en" 
     xmlns="http://www.daisy.org/z3986/2005/ncx/"
-     encoding="UTF-8"
     xmlns:py="http://genshi.edgewall.org/"
 >
    <head>
@@ -14,7 +17,7 @@
    <docTitle><text>Table of Contents</text></docTitle>
    
    <py:def function="navpoint(np, level)">
-        ${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
+        ${'%*s'%(4*level,'')}<navPoint id="${str(uuid4())}" playOrder="${str(np.play_order)}">
            ${'%*s'%(4*level,'')}<navLabel>
                ${'%*s'%(4*level,'')}<text>${np.text}</text>
            ${'%*s'%(4*level,'')}</navLabel>
--- a/src/calibre/ebooks/metadata/opf.py
+++ b/src/calibre/ebooks/metadata/opf.py
@@ -483,7 +483,7 @@ class OPFCreator(MetaInformation):
        Set the toc. You must call :method:`create_spine` before calling this
        method.
        
-        `toc`: A :class:`TOC` object
+        :param toc: A :class:`TOC` object
        '''
        self.toc = toc
        
@@ -491,12 +491,21 @@ class OPFCreator(MetaInformation):
        self.guide = Guide.from_opf_guide(guide_element, self.base_path)
        self.guide.set_basedir(self.base_path)
            
-    def render(self, opf_stream, ncx_stream=None):
+    def render(self, opf_stream, ncx_stream=None, ncx_manifest_entry=None):
        from calibre.resources import opf_template
        from calibre.utils.genshi.template import MarkupTemplate
        template = MarkupTemplate(opf_template)
        if self.manifest:
            self.manifest.set_basedir(self.base_path)
+            if ncx_manifest_entry is not None:
+                if not os.path.isabs(ncx_manifest_entry):
+                    ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
+                remove = [i for i in self.manifest if i.id == 'ncx']
+                for item in remove:
+                    self.manifest.remove(item)
+                self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path))
+                self.manifest[-1].id = 'ncx'
+                self.manifest[-1].mime_type = 'application/x-dtbncx+xml'
        if not self.guide:
            self.guide = Guide()
        if self.cover:
--- a/src/calibre/ebooks/metadata/opf.xml
+++ b/src/calibre/ebooks/metadata/opf.xml
@@ -23,6 +23,12 @@
        </py:for>
    </metadata>
    
+    <manifest py:if="getattr(mi, 'manifest', None)">
+        <py:for each="ref in mi.manifest">
+        <item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" /> 
+        </py:for>
+    </manifest>
+    
    <guide py:if="getattr(mi, 'guide', None)">
    <py:for each="ref in mi.guide">
        <reference type="${ref.type}" href="${ref.href()}" py:with="attrs={'title': ref.title if ref.title else None}" py:attrs="attrs" /> 
@@ -36,10 +42,5 @@
        </py:for>
    </spine> 
    
-    <manifest py:if="getattr(mi, 'manifest', None)">
-        <py:for each="ref in mi.manifest">
-        <item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" /> 
-        </py:for>
-    </manifest>
       
 </package>
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@@ -29,7 +29,8 @@ class TOC(list):
        self.base_path = base_path
        self.play_order = play_order
        
-    def add_item(self, href, fragment, text):
+    def add_item(self, href, fragment, text, play_order=None):
+        if play_order is None:
            play_order = (self[-1].play_order if len(self) else self.play_order) + 1
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                        base_path=self.base_path, play_order=play_order))
@@ -113,14 +114,16 @@ class TOC(list):
        soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
        
        def process_navpoint(np, dest):
-            play_order = np.get('playOrder', 1)
+            play_order = np.get('playOrder', None)
+            if play_order is None:
+                play_order = int(np.get('playorder', 1))
            href = fragment = text = None
            nl = np.find('navlabel')
            if nl is not None:
                text = u''
                for txt in nl.findAll('text'):
                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
-                content = elem.find('content')
+                content = np.find('content')
                if content is None or not content.has_key('src') or not txt:
                    return
                
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -719,6 +719,8 @@ class BasicNewsRecipe(object, LoggingInterface):
        
        entries = ['index.html']
        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
        
        def feed_index(num, parent):
            f = feeds[num]
@@ -726,7 +728,12 @@ class BasicNewsRecipe(object, LoggingInterface):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    entries.append('%sindex.html'%adir)
-                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                                    play_order=po)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
@@ -752,7 +759,11 @@ class BasicNewsRecipe(object, LoggingInterface):
        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
-                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)
--- a/upload.py
+++ b/upload.py
@@ -206,11 +206,11 @@ def upload_user_manual():
    check_call('scp -r src/calibre/manual/.build/html/* divok:%s'%USER_MANUAL)
    
 def build_src_tarball():
-    check_call('bzr export dist/calibre-%s.tar.bz2'%__version__)
+    check_call('bzr export dist/calibre-%s.tar.gz'%__version__)

 def upload_src_tarball():
-    check_call('ssh divok rm -f %s/calibre-\*.tar.bz2'%DOWNLOADS)
-    check_call('scp dist/calibre-*.tar.bz2 divok:%s/'%DOWNLOADS)
+    check_call('ssh divok rm -f %s/calibre-\*.tar.gz'%DOWNLOADS)
+    check_call('scp dist/calibre-*.tar.gz divok:%s/'%DOWNLOADS)

 def stage_one():
    check_call('sudo rm -rf build', shell=True)
@@ -226,16 +226,19 @@ def stage_one():
 def stage_two():
    subprocess.check_call('rm -rf dist/*', shell=True)
    build_installers()
-    build_src_tarball()

 def stage_three():
    print 'Uploading installers...'
    upload_installers()
    print 'Uploading to PyPI'
-    upload_src_tarball()
    upload_docs()
    upload_user_manual()
-    check_call('python setup.py register bdist_egg --exclude-source-files upload')
+    check_call('rm -f dist/*')
+    check_call('python setup.py register')
+    check_call('python setup.py bdist_egg --exclude-source-files')
+    build_src_tarball()
+    upload_src_tarball()
+    check_call('python setup.py upload')
    check_call('''rm -rf dist/* build/*''')
    check_call('''ssh divok bzr update /var/www/calibre.kovidgoyal.net/calibre/''')