More robust OPF parsing and improved TOC creation in html2epub

2025-07-09 03:04:10 -04:00 · 2008-09-23 13:03:26 -07:00 · 2008-09-23 13:03:26 -07:00 · 501cc90bfa
commit 501cc90bfa
parent e3b8a1b3bf
7 changed files with 120 additions and 80 deletions
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -62,10 +62,11 @@ def config(defaults=None):
    c.add_opt('override_css', ['--override-css'], default=None,
              help=_('Either the path to a CSS stylesheet or raw CSS. This CSS will override any existing CSS declarations in the source files.'))
    structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
-    structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section|part', 'i')]",
+    structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section|part', 'i')] | //*[@class = 'chapter']",
            help=_('''\
 An XPath expression to detect chapter titles. The default is to consider <h1> or
-<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles. 
+<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as 
+well as any tags that have class="chapter". 
 The expression used must evaluate to a list of elements. To disable chapter detection,
 use the expression "/". See the XPath Tutorial in the calibre User Manual for further
 help on using this feature.
@@ -84,12 +85,12 @@ Control the automatic generation of a Table of Contents. If an OPF file is detec
 and it specifies a Table of Contents, then that will be used rather than trying
 to auto-generate a Table of Contents.
 ''').replace('\n', ' '))
-    toc('max_toc_recursion', ['--max-toc-recursion'], default=1, 
-        help=_('Number of levels of HTML files to try to autodetect TOC entries from. Set to 0 to disable all TOC autodetection. Default is %default.'))
-    toc('max_toc_links', ['--max-toc-links'], default=40, 
-        help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. Default is: %default.'))
+    toc('max_toc_links', ['--max-toc-links'], default=50, 
+        help=_('Maximum number of links to insert into the TOC. Set to 0 to disable. Default is: %default. Links are only added to the TOC if less than the --toc-threshold number of chapters were detected.'))
    toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
        help=_("Don't add auto-detected chapters to the Table of Contents."))
+    toc('toc_threshold', ['--toc-threshold'], default=6,
+        help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.'))
    toc('use_auto_toc', ['--use-auto-toc'], default=False,
        help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.'))
    
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -69,13 +69,19 @@ the <spine> element of the OPF file.
 def parse_content(filelist, opts, tdir):
    os.makedirs(os.path.join(tdir, 'content', 'resources'))
    resource_map = {}
-    toc = TOC(base_path=tdir)
+    toc = TOC(base_path=tdir, type='root')
    for htmlfile in filelist:
        hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), 
                           resource_map, filelist)
        hp.populate_toc(toc)
        hp.save()
    
+    if toc.count('chapter') > opts.toc_threshold:
+        toc.purge(['file', 'link', 'unknown'])
+    if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
+        toc.purge(['link', 'unknown'])
+    toc.purge(['link'], max=opts.max_toc_links)
+    
    return resource_map, hp.htmlfile_map, toc

 def convert(htmlfile, opts, notification=None):
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -118,7 +118,7 @@ class HTMLFile(object):
            raise IgnoreFile(msg, err.errno)
        
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
-        
+        self.title = None
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
@@ -126,8 +126,7 @@ class HTMLFile(object):

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
-            if match is not None:
-                self.title = match.group(1)
+            self.title = match.group(1) if match is not None else None
            self.find_links(src)
                
        
@@ -460,8 +459,28 @@ class Processor(Parser):
        return Parser.save(self)
    
    def populate_toc(self, toc):
-        if self.level >= self.opts.max_toc_recursion:
-            return
+        
+        def add_item(href, fragment, text, target, type='link'):
+            for entry in toc.flat():
+                if entry.href == href and entry.fragment == fragment:
+                    return entry
+            if len(text) > 50:
+                text = text[:50] + u'\u2026'
+            return target.add_item(href, fragment, text, type=type)
+        
+        # Add chapters to TOC
+        counter = 0
+        if not self.opts.no_chapters_in_toc:
+            for elem in getattr(self, 'detected_chapters', []):
+                text = (u''.join(elem.xpath('string()'))).strip()
+                if text:
+                    name = self.htmlfile_map[self.htmlfile.path]
+                    href = 'content/'+name
+                    counter += 1
+                    id = elem.get('id', 'calibre_chapter_%d'%counter)
+                    elem.set('id', id)
+                    add_item(href, id, text, toc, type='chapter')
+        
        
        referrer = toc
        if self.htmlfile.referrer is not None:
@@ -472,19 +491,12 @@ class Processor(Parser):
                    referrer = i
                    break
        
-        def add_item(href, fragment, text, target):
-            for entry in toc.flat():
-                if entry.href == href and entry.fragment == fragment:
-                    return entry
-            if len(text) > 50:
-                text = text[:50] + u'\u2026'
-            return target.add_item(href, fragment, text)
            
        name = self.htmlfile_map[self.htmlfile.path]
        href = 'content/'+name
        
        if referrer.href != href: # Happens for root file
-            target = add_item(href, None, self.htmlfile.title, referrer)
+            target = add_item(href, None, unicode(self.htmlfile.title), referrer, type='file')
        
        # Add links to TOC
        if int(self.opts.max_toc_links) > 0:
@@ -502,18 +514,6 @@ class Processor(Parser):
                            name = self.htmlfile_map[self.htmlfile.referrer.path]
                        add_item(href, fragment, text, target)
                        
-        # Add chapters to TOC
-        if not self.opts.no_chapters_in_toc:
-            counter = 0
-            for elem in getattr(self, 'detected_chapters', []):
-                text = (u''.join(elem.xpath('string()'))).strip()
-                if text:
-                    name = self.htmlfile_map[self.htmlfile.path]
-                    href = 'content/'+name
-                    counter += 1
-                    id = elem.get('id', 'calibre_chapter_%d'%counter)
-                    elem.set('id', id)
-                    add_item(href, id, text, target)
                    
        
    def extract_css(self):
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -393,7 +393,9 @@ class OPF(object):
    NAMESPACES       = {
                        None  : "http://www.idpf.org/2007/opf",
                        'dc'  : "http://purl.org/dc/elements/1.1/",
+                        'dc1' : 'http://purl.org/dc/elements/1.0/',
                        'opf' : "http://www.idpf.org/2007/opf",
+                        'oebpackage' : 'http://openebook.org/namespaces/oeb-package/1.0/',
                       }
    xpn = NAMESPACES.copy()
    xpn.pop(None)
@@ -402,16 +404,15 @@ class OPF(object):
    TEXT             = XPath('string()')
    
    
-    metadata_path   = XPath('/opf:package/opf:metadata')
-    metadata_elem_path = XPath('/opf:package/opf:metadata/*[re:match(name(), $name, "i")]')
-    authors_path    = XPath('/opf:package/opf:metadata/*' + \
-        '[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]')
-    tags_path       = XPath('/opf:package/opf:metadata/*[re:match(name(), "subject", "i")]')
-    isbn_path       = XPath('/opf:package/opf:metadata/*[re:match(name(), "identifier", "i") and '+
+    metadata_path   = XPath('descendant::*[re:match(name(), "metadata", "i")]')
+    metadata_elem_path = XPath('descendant::*[re:match(name(), $name, "i")]')
+    authors_path    = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]')
+    tags_path       = XPath('descendant::*[re:match(name(), "subject", "i")]')
+    isbn_path       = XPath('descendant::*[re:match(name(), "identifier", "i") and '+
                            '(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]')
-    manifest_path   = XPath('/opf:package/*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]') 
-    spine_path      = XPath('/opf:package/*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]')
-    guide_path      = XPath('/opf:package/*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]')
+    manifest_path   = XPath('descendant::*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]') 
+    spine_path      = XPath('descendant::*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]')
+    guide_path      = XPath('descendant::*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]')
    
    title           = MetadataField('title')
    publisher       = MetadataField('publisher')
@@ -424,25 +425,27 @@ class OPF(object):
    
    
    def __init__(self, stream, basedir=os.getcwdu()):
+        if not hasattr(stream, 'read'):
+            stream = open(stream, 'rb')
        self.basedir  = self.base_dir = basedir
        raw, self.encoding = xml_to_unicode(stream.read(), strip_encoding_pats=True, resolve_entities=True)
        
-        self.tree     = etree.fromstring(raw, self.PARSER)
-        self.metadata = self.metadata_path(self.tree)
+        self.root     = etree.fromstring(raw, self.PARSER)
+        self.metadata = self.metadata_path(self.root)
        if not self.metadata:
            raise ValueError('Malformed OPF file: No <metadata> element')
        self.metadata      = self.metadata[0]
        self.unquote_urls()
        self.manifest = Manifest()
-        m = self.manifest_path(self.tree)
+        m = self.manifest_path(self.root)
        if m:
            self.manifest = Manifest.from_opf_manifest_element(m, basedir)
        self.spine = None
-        s = self.spine_path(self.tree)
+        s = self.spine_path(self.root)
        if s:
            self.spine = Spine.from_opf_spine_element(s, self.manifest)
        self.guide = None
-        guide = self.guide_path(self.tree)
+        guide = self.guide_path(self.root)
        if guide:
            self.guide = Guide.from_opf_guide(guide, basedir)
        self.cover_data = (None, None)
@@ -452,7 +455,7 @@ class OPF(object):
        return u''.join(self.TEXT(elem))
    
    def itermanifest(self):
-        return self.manifest_path(self.tree)
+        return self.manifest_path(self.root)
    
    def create_manifest_item(self, href, media_type):
        ids = [i.get('id', None) for i in self.itermanifest()]
@@ -478,7 +481,7 @@ class OPF(object):
        return [i.get('id') for i in items]
    
    def iterspine(self):
-        return self.spine_path(self.tree)
+        return self.spine_path(self.root)
    
    def create_spine_item(self, idref):
        ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref)
@@ -487,14 +490,14 @@ class OPF(object):
    
    def replace_spine_items_by_idref(self, idref, new_idrefs):
        items = list(map(self.create_spine_item, new_idrefs))
-        spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.tree)[0]
+        spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.root)[0]
        old = [i for i in self.iterspine() if i.get('idref', None) == idref]
        for x in old:
            i = spine.index(x)
            spine[i:i+1] = items
    
    def iterguide(self):
-        return self.guide_path(self.tree)
+        return self.guide_path(self.root)
    
    def unquote_urls(self):
        for item in self.itermanifest():
@@ -507,12 +510,12 @@ class OPF(object):
        
        def fget(self):
            ans = []
-            for elem in self.authors_path(self.tree):
+            for elem in self.authors_path(self.metadata):
                ans.extend([x.strip() for x in self.get_text(elem).split(',')])
            return ans
        
        def fset(self, val):
-            remove = list(self.authors_path(self.tree))
+            remove = list(self.authors_path(self.metadata))
            for elem in remove:
                self.metadata.remove(elem)
            for author in val:
@@ -526,13 +529,13 @@ class OPF(object):
    def author_sort():
        
        def fget(self):
-            matches = self.authors_path(self.tree)
+            matches = self.authors_path(self.metadata)
            if matches:
                ans = matches[0].get('opf:file-as', None)
                return ans if ans else matches[0].get('file-as', None)
            
        def fset(self, val):
-            matches = self.authors_path(self.tree)
+            matches = self.authors_path(self.metadata)
            if matches:
                matches[0].set('file-as', unicode(val))
            
@@ -543,12 +546,12 @@ class OPF(object):
        
        def fget(self):
            ans = []
-            for tag in self.tags_path(self.tree):
+            for tag in self.tags_path(self.metadata):
                ans.append(self.get_text(tag))
            return ans
        
        def fset(self, val):
-            for tag in list(self.tags_path(self.tree)):
+            for tag in list(self.tags_path(self.metadata)):
                self.metadata.remove(tag)
            for tag in val:
                elem = self.create_metadata_element('subject', ns='dc')
@@ -560,11 +563,11 @@ class OPF(object):
    def isbn():
        
        def fget(self):
-            for match in self.isbn_path(self.tree):
+            for match in self.isbn_path(self.metadata):
                return match.text if match.text else None
            
        def fset(self, val):
-            matches = self.isbn_path(self.tree)
+            matches = self.isbn_path(self.metadata)
            if not matches:
                matches = [self.create_metadata_element('identifier', ns='dc',
                                                attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})]
@@ -572,9 +575,9 @@ class OPF(object):
        return property(fget=fget, fset=fset)
    
    def get_metadata_element(self, name):
-        matches = self.metadata_elem_path(self.tree, name=name)
+        matches = self.metadata_elem_path(self.metadata, name=name)
        if matches:
-            return matches[0]
+            return matches[-1]
        
    def create_metadata_element(self, name, attrib=None, ns='opf'):
        elem = etree.SubElement(self.metadata, '{%s}%s'%(self.NAMESPACES[ns], name), 
@@ -583,7 +586,7 @@ class OPF(object):
        return elem
        
    def render(self, encoding='utf-8'):
-        return etree.tostring(self.tree, encoding='utf-8', pretty_print=True)
+        return etree.tostring(self.root, encoding='utf-8', pretty_print=True)
    
    def smart_update(self, mi):
        for attr in ('author_sort', 'title_sort', 'comments', 'category',
@@ -716,7 +719,13 @@ class OPFTest(unittest.TestCase):
    <creator opf:role="aut" file-as="Monkey">Monkey Kitchen, Next</creator>
    <dc:subject>One</dc:subject><dc:subject>Two</dc:subject>
    <dc:identifier scheme="ISBN">123456789</dc:identifier>
+    <x-metadata>
+        <series>A one book series</series>
+    </x-metadata>
 </metadata>
+<manifest>
+    <item id="1" href="a%20%7E%20b" media-type="text/txt" />
+</manifest>
 </package>
 '''
        )
@@ -729,14 +738,14 @@ class OPFTest(unittest.TestCase):
        self.assertEqual(opf.author_sort, 'Monkey')
        self.assertEqual(opf.tags, ['One', 'Two'])
        self.assertEqual(opf.isbn, '123456789')
-        self.assertEqual(opf.series, None)
+        self.assertEqual(opf.series, 'A one book series')
        self.assertEqual(opf.series_index, None)
-        
+        self.assertEqual(list(opf.itermanifest())[0].get('href'), 'a ~ b')
        
    def testWriting(self):
        for test in [('title', 'New & Title'), ('authors', ['One', 'Two']),
                     ('author_sort', "Kitchen"), ('tags', ['Three']),
-                     ('isbn', 'a'), ('rating', 3)]:
+                     ('isbn', 'a'), ('rating', 3), ('series_index', 1)]:
            setattr(self.opf, *test)
            self.assertEqual(getattr(self.opf, test[0]), test[1])
        
@@ -748,10 +757,5 @@ def suite():
 def test():
    unittest.TextTestRunner(verbosity=2).run(suite())

-
-
-def main(args=sys.argv):
-    return 0
-
 if __name__ == '__main__':
    sys.exit(test())
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@@ -21,7 +21,7 @@ class NCXSoup(BeautifulStoneSoup):
 class TOC(list):
    
    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0, 
-                 base_path=os.getcwd()):
+                 base_path=os.getcwd(), type='unknown'):
        self.href = href
        self.fragment = fragment
        if not self.fragment:
@@ -30,12 +30,32 @@ class TOC(list):
        self.parent = parent
        self.base_path = base_path
        self.play_order = play_order
+        self.type = type
        
-    def add_item(self, href, fragment, text, play_order=None):
+    def count(self, type):
+        return len([i for i in self.flat() if i.type == type])
+    
+    def purge(self, types, max=0):
+        remove = []
+        for entry in self.flat():
+            if entry.type in types:
+                remove.append(entry)
+        remove = remove[max:]
+        for entry in remove:
+            if entry.parent is None:
+                continue
+            entry.parent.remove(entry)
+        return remove
+    
+    def remove(self, entry):
+        list.remove(self, entry)
+        entry.parent = None
+        
+    def add_item(self, href, fragment, text, play_order=None, type='unknown'):
        if play_order is None:
            play_order = (self[-1].play_order if len(self) else self.play_order) + 1
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
-                        base_path=self.base_path, play_order=play_order))
+                        base_path=self.base_path, play_order=play_order, type=type))
        return self[-1]
    
    def top_level_items(self):
--- a/src/calibre/gui2/dialogs/epub.ui
+++ b/src/calibre/gui2/dialogs/epub.ui
@@ -77,7 +77,7 @@
     <item>
      <widget class="QStackedWidget" name="stack" >
       <property name="currentIndex" >
-        <number>0</number>
+        <number>3</number>
       </property>
       <widget class="QWidget" name="metadata_page" >
        <layout class="QGridLayout" name="gridLayout_4" >
@@ -619,15 +619,15 @@ p, li { white-space: pre-wrap; }
             </widget>
            </item>
            <item row="3" column="1" >
-             <widget class="QSpinBox" name="opt_max_toc_recursion" />
+             <widget class="QSpinBox" name="opt_toc_threshold" />
            </item>
            <item row="3" column="0" >
             <widget class="QLabel" name="label_16" >
              <property name="text" >
-               <string>Table of Contents &amp;recursion</string>
+               <string>Chapter &amp;threshold</string>
              </property>
              <property name="buddy" >
-               <cstring>opt_max_toc_recursion</cstring>
+               <cstring>opt_toc_threshold</cstring>
              </property>
             </widget>
            </item>
--- a/src/calibre/gui2/status.py
+++ b/src/calibre/gui2/status.py
@@ -1,6 +1,6 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import re
+import re, collections

 from PyQt4.QtGui import QStatusBar, QMovie, QLabel, QFrame, QHBoxLayout, QPixmap, \
                        QVBoxLayout, QSizePolicy, QToolButton, QIcon
@@ -48,6 +48,13 @@ class BookInfoDisplay(QFrame):
        def mouseReleaseEvent(self, ev):
            self.emit(SIGNAL('mr(int)'), 1)
            
+    WEIGHTS = collections.defaultdict(lambda : 100)
+    WEIGHTS[_('Path')] = 0
+    WEIGHTS[_('Formats')] = 1
+    WEIGHTS[_('Comments')] = 2
+    WEIGHTS[_('Series')] = 3
+    WEIGHTS[_('Tags')] = 4
+    
    def __init__(self, clear_message):
        QFrame.__init__(self)
        self.setCursor(Qt.PointingHandCursor)
@@ -74,7 +81,9 @@ class BookInfoDisplay(QFrame):
        rows = u''
        self.book_data.setText('')
        self.data = data.copy()
-        for key in data.keys():
+        keys = data.keys()
+        keys.sort(cmp=lambda x, y: cmp(self.WEIGHTS[x], self.WEIGHTS[y]))
+        for key in keys:
            txt = data[key]
            #txt = '<br />\n'.join(textwrap.wrap(txt, 120))
            if isinstance(key, str):