diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index c8d3e4dd3a..487202be87 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -62,10 +62,11 @@ def config(defaults=None): c.add_opt('override_css', ['--override-css'], default=None, help=_('Either the path to a CSS stylesheet or raw CSS. This CSS will override any existing CSS declarations in the source files.')) structure = c.add_group('structure detection', _('Control auto-detection of document structure.')) - structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section|part', 'i')]", + structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section|part', 'i')] | //*[@class = 'chapter']", help=_('''\ An XPath expression to detect chapter titles. The default is to consider

<h1> or -<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles. +<h2>
tags that contain the words "chapter","book","section" or "part" as chapter titles as +well as any tags that have class="chapter". The expression used must evaluate to a list of elements. To disable chapter detection, use the expression "/". See the XPath Tutorial in the calibre User Manual for further help on using this feature. @@ -84,12 +85,12 @@ Control the automatic generation of a Table of Contents. If an OPF file is detec and it specifies a Table of Contents, then that will be used rather than trying to auto-generate a Table of Contents. ''').replace('\n', ' ')) - toc('max_toc_recursion', ['--max-toc-recursion'], default=1, - help=_('Number of levels of HTML files to try to autodetect TOC entries from. Set to 0 to disable all TOC autodetection. Default is %default.')) - toc('max_toc_links', ['--max-toc-links'], default=40, - help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. Default is: %default.')) + toc('max_toc_links', ['--max-toc-links'], default=50, + help=_('Maximum number of links to insert into the TOC. Set to 0 to disable. Default is: %default. Links are only added to the TOC if less than the --toc-threshold number of chapters were detected.')) toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, help=_("Don't add auto-detected chapters to the Table of Contents.")) + toc('toc_threshold', ['--toc-threshold'], default=6, + help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.')) toc('use_auto_toc', ['--use-auto-toc'], default=False, help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. 
With this option, the autodetected one is always used.')) diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index c3e29f761a..ef90128caf 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -69,13 +69,19 @@ the element of the OPF file. def parse_content(filelist, opts, tdir): os.makedirs(os.path.join(tdir, 'content', 'resources')) resource_map = {} - toc = TOC(base_path=tdir) + toc = TOC(base_path=tdir, type='root') for htmlfile in filelist: hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), resource_map, filelist) hp.populate_toc(toc) hp.save() - + + if toc.count('chapter') > opts.toc_threshold: + toc.purge(['file', 'link', 'unknown']) + if toc.count('chapter') + toc.count('file') > opts.toc_threshold: + toc.purge(['link', 'unknown']) + toc.purge(['link'], max=opts.max_toc_links) + return resource_map, hp.htmlfile_map, toc def convert(htmlfile, opts, notification=None): diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 65d73fe503..20d0546fd0 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -118,7 +118,7 @@ class HTMLFile(object): raise IgnoreFile(msg, err.errno) self.is_binary = not bool(self.HTML_PAT.search(src[:1024])) - + self.title = None if not self.is_binary: if encoding is None: encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] @@ -126,8 +126,7 @@ class HTMLFile(object): src = src.decode(encoding, 'replace') match = self.TITLE_PAT.search(src) - if match is not None: - self.title = match.group(1) + self.title = match.group(1) if match is not None else None self.find_links(src) @@ -460,8 +459,28 @@ class Processor(Parser): return Parser.save(self) def populate_toc(self, toc): - if self.level >= self.opts.max_toc_recursion: - return + + def add_item(href, fragment, text, target, type='link'): + for entry in toc.flat(): + if entry.href == href and entry.fragment == fragment: + return entry + if len(text) > 
50: + text = text[:50] + u'\u2026' + return target.add_item(href, fragment, text, type=type) + + # Add chapters to TOC + counter = 0 + if not self.opts.no_chapters_in_toc: + for elem in getattr(self, 'detected_chapters', []): + text = (u''.join(elem.xpath('string()'))).strip() + if text: + name = self.htmlfile_map[self.htmlfile.path] + href = 'content/'+name + counter += 1 + id = elem.get('id', 'calibre_chapter_%d'%counter) + elem.set('id', id) + add_item(href, id, text, toc, type='chapter') + referrer = toc if self.htmlfile.referrer is not None: @@ -472,20 +491,13 @@ class Processor(Parser): referrer = i break - def add_item(href, fragment, text, target): - for entry in toc.flat(): - if entry.href == href and entry.fragment == fragment: - return entry - if len(text) > 50: - text = text[:50] + u'\u2026' - return target.add_item(href, fragment, text) name = self.htmlfile_map[self.htmlfile.path] href = 'content/'+name if referrer.href != href: # Happens for root file - target = add_item(href, None, self.htmlfile.title, referrer) - + target = add_item(href, None, unicode(self.htmlfile.title), referrer, type='file') + # Add links to TOC if int(self.opts.max_toc_links) > 0: for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]: @@ -502,18 +514,6 @@ class Processor(Parser): name = self.htmlfile_map[self.htmlfile.referrer.path] add_item(href, fragment, text, target) - # Add chapters to TOC - if not self.opts.no_chapters_in_toc: - counter = 0 - for elem in getattr(self, 'detected_chapters', []): - text = (u''.join(elem.xpath('string()'))).strip() - if text: - name = self.htmlfile_map[self.htmlfile.path] - href = 'content/'+name - counter += 1 - id = elem.get('id', 'calibre_chapter_%d'%counter) - elem.set('id', id) - add_item(href, id, text, target) def extract_css(self): diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 18b3ddd5cf..19b49eda40 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ 
b/src/calibre/ebooks/metadata/opf2.py @@ -393,7 +393,9 @@ class OPF(object): NAMESPACES = { None : "http://www.idpf.org/2007/opf", 'dc' : "http://purl.org/dc/elements/1.1/", + 'dc1' : 'http://purl.org/dc/elements/1.0/', 'opf' : "http://www.idpf.org/2007/opf", + 'oebpackage' : 'http://openebook.org/namespaces/oeb-package/1.0/', } xpn = NAMESPACES.copy() xpn.pop(None) @@ -402,16 +404,15 @@ class OPF(object): TEXT = XPath('string()') - metadata_path = XPath('/opf:package/opf:metadata') - metadata_elem_path = XPath('/opf:package/opf:metadata/*[re:match(name(), $name, "i")]') - authors_path = XPath('/opf:package/opf:metadata/*' + \ - '[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]') - tags_path = XPath('/opf:package/opf:metadata/*[re:match(name(), "subject", "i")]') - isbn_path = XPath('/opf:package/opf:metadata/*[re:match(name(), "identifier", "i") and '+ + metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]') + metadata_elem_path = XPath('descendant::*[re:match(name(), $name, "i")]') + authors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]') + tags_path = XPath('descendant::*[re:match(name(), "subject", "i")]') + isbn_path = XPath('descendant::*[re:match(name(), "identifier", "i") and '+ '(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]') - manifest_path = XPath('/opf:package/*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]') - spine_path = XPath('/opf:package/*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]') - guide_path = XPath('/opf:package/*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]') + manifest_path = XPath('descendant::*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]') + spine_path = XPath('descendant::*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]') + guide_path = XPath('descendant::*[re:match(name(), "guide", 
"i")]/*[re:match(name(), "reference", "i")]') title = MetadataField('title') publisher = MetadataField('publisher') @@ -424,25 +425,27 @@ class OPF(object): def __init__(self, stream, basedir=os.getcwdu()): + if not hasattr(stream, 'read'): + stream = open(stream, 'rb') self.basedir = self.base_dir = basedir raw, self.encoding = xml_to_unicode(stream.read(), strip_encoding_pats=True, resolve_entities=True) - self.tree = etree.fromstring(raw, self.PARSER) - self.metadata = self.metadata_path(self.tree) + self.root = etree.fromstring(raw, self.PARSER) + self.metadata = self.metadata_path(self.root) if not self.metadata: raise ValueError('Malformed OPF file: No element') self.metadata = self.metadata[0] self.unquote_urls() self.manifest = Manifest() - m = self.manifest_path(self.tree) + m = self.manifest_path(self.root) if m: self.manifest = Manifest.from_opf_manifest_element(m, basedir) self.spine = None - s = self.spine_path(self.tree) + s = self.spine_path(self.root) if s: self.spine = Spine.from_opf_spine_element(s, self.manifest) self.guide = None - guide = self.guide_path(self.tree) + guide = self.guide_path(self.root) if guide: self.guide = Guide.from_opf_guide(guide, basedir) self.cover_data = (None, None) @@ -452,7 +455,7 @@ class OPF(object): return u''.join(self.TEXT(elem)) def itermanifest(self): - return self.manifest_path(self.tree) + return self.manifest_path(self.root) def create_manifest_item(self, href, media_type): ids = [i.get('id', None) for i in self.itermanifest()] @@ -478,7 +481,7 @@ class OPF(object): return [i.get('id') for i in items] def iterspine(self): - return self.spine_path(self.tree) + return self.spine_path(self.root) def create_spine_item(self, idref): ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref) @@ -487,14 +490,14 @@ class OPF(object): def replace_spine_items_by_idref(self, idref, new_idrefs): items = list(map(self.create_spine_item, new_idrefs)) - spine = self.XPath('/opf:package/*[re:match(name(), 
"spine", "i")]')(self.tree)[0] + spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.root)[0] old = [i for i in self.iterspine() if i.get('idref', None) == idref] for x in old: i = spine.index(x) spine[i:i+1] = items def iterguide(self): - return self.guide_path(self.tree) + return self.guide_path(self.root) def unquote_urls(self): for item in self.itermanifest(): @@ -507,12 +510,12 @@ class OPF(object): def fget(self): ans = [] - for elem in self.authors_path(self.tree): + for elem in self.authors_path(self.metadata): ans.extend([x.strip() for x in self.get_text(elem).split(',')]) return ans def fset(self, val): - remove = list(self.authors_path(self.tree)) + remove = list(self.authors_path(self.metadata)) for elem in remove: self.metadata.remove(elem) for author in val: @@ -526,13 +529,13 @@ class OPF(object): def author_sort(): def fget(self): - matches = self.authors_path(self.tree) + matches = self.authors_path(self.metadata) if matches: ans = matches[0].get('opf:file-as', None) return ans if ans else matches[0].get('file-as', None) def fset(self, val): - matches = self.authors_path(self.tree) + matches = self.authors_path(self.metadata) if matches: matches[0].set('file-as', unicode(val)) @@ -543,12 +546,12 @@ class OPF(object): def fget(self): ans = [] - for tag in self.tags_path(self.tree): + for tag in self.tags_path(self.metadata): ans.append(self.get_text(tag)) return ans def fset(self, val): - for tag in list(self.tags_path(self.tree)): + for tag in list(self.tags_path(self.metadata)): self.metadata.remove(tag) for tag in val: elem = self.create_metadata_element('subject', ns='dc') @@ -560,11 +563,11 @@ class OPF(object): def isbn(): def fget(self): - for match in self.isbn_path(self.tree): + for match in self.isbn_path(self.metadata): return match.text if match.text else None def fset(self, val): - matches = self.isbn_path(self.tree) + matches = self.isbn_path(self.metadata) if not matches: matches = 
[self.create_metadata_element('identifier', ns='dc', attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})] @@ -572,9 +575,9 @@ class OPF(object): return property(fget=fget, fset=fset) def get_metadata_element(self, name): - matches = self.metadata_elem_path(self.tree, name=name) + matches = self.metadata_elem_path(self.metadata, name=name) if matches: - return matches[0] + return matches[-1] def create_metadata_element(self, name, attrib=None, ns='opf'): elem = etree.SubElement(self.metadata, '{%s}%s'%(self.NAMESPACES[ns], name), @@ -583,7 +586,7 @@ class OPF(object): return elem def render(self, encoding='utf-8'): - return etree.tostring(self.tree, encoding='utf-8', pretty_print=True) + return etree.tostring(self.root, encoding='utf-8', pretty_print=True) def smart_update(self, mi): for attr in ('author_sort', 'title_sort', 'comments', 'category', @@ -716,7 +719,13 @@ class OPFTest(unittest.TestCase): Monkey Kitchen, Next OneTwo 123456789 + + A one book series + + + + ''' ) @@ -729,14 +738,14 @@ class OPFTest(unittest.TestCase): self.assertEqual(opf.author_sort, 'Monkey') self.assertEqual(opf.tags, ['One', 'Two']) self.assertEqual(opf.isbn, '123456789') - self.assertEqual(opf.series, None) + self.assertEqual(opf.series, 'A one book series') self.assertEqual(opf.series_index, None) - + self.assertEqual(list(opf.itermanifest())[0].get('href'), 'a ~ b') def testWriting(self): for test in [('title', 'New & Title'), ('authors', ['One', 'Two']), ('author_sort', "Kitchen"), ('tags', ['Three']), - ('isbn', 'a'), ('rating', 3)]: + ('isbn', 'a'), ('rating', 3), ('series_index', 1)]: setattr(self.opf, *test) self.assertEqual(getattr(self.opf, test[0]), test[1]) @@ -748,10 +757,5 @@ def suite(): def test(): unittest.TextTestRunner(verbosity=2).run(suite()) - - -def main(args=sys.argv): - return 0 - if __name__ == '__main__': sys.exit(test()) \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 
5e1a51619e..0583b88242 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -21,7 +21,7 @@ class NCXSoup(BeautifulStoneSoup): class TOC(list): def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0, - base_path=os.getcwd()): + base_path=os.getcwd(), type='unknown'): self.href = href self.fragment = fragment if not self.fragment: @@ -30,12 +30,32 @@ class TOC(list): self.parent = parent self.base_path = base_path self.play_order = play_order + self.type = type - def add_item(self, href, fragment, text, play_order=None): + def count(self, type): + return len([i for i in self.flat() if i.type == type]) + + def purge(self, types, max=0): + remove = [] + for entry in self.flat(): + if entry.type in types: + remove.append(entry) + remove = remove[max:] + for entry in remove: + if entry.parent is None: + continue + entry.parent.remove(entry) + return remove + + def remove(self, entry): + list.remove(self, entry) + entry.parent = None + + def add_item(self, href, fragment, text, play_order=None, type='unknown'): if play_order is None: play_order = (self[-1].play_order if len(self) else self.play_order) + 1 self.append(TOC(href=href, fragment=fragment, text=text, parent=self, - base_path=self.base_path, play_order=play_order)) + base_path=self.base_path, play_order=play_order, type=type)) return self[-1] def top_level_items(self): diff --git a/src/calibre/gui2/dialogs/epub.ui b/src/calibre/gui2/dialogs/epub.ui index 6f13a1fde3..4e748e5af9 100644 --- a/src/calibre/gui2/dialogs/epub.ui +++ b/src/calibre/gui2/dialogs/epub.ui @@ -77,7 +77,7 @@ - 0 + 3 @@ -619,15 +619,15 @@ p, li { white-space: pre-wrap; } - + - Table of Contents &recursion + Chapter &threshold - opt_max_toc_recursion + opt_toc_threshold diff --git a/src/calibre/gui2/status.py b/src/calibre/gui2/status.py index e141812033..e66bb197a9 100644 --- a/src/calibre/gui2/status.py +++ b/src/calibre/gui2/status.py @@ -1,6 +1,6 @@ __license__ = 'GPL v3' 
__copyright__ = '2008, Kovid Goyal ' -import re +import re, collections from PyQt4.QtGui import QStatusBar, QMovie, QLabel, QFrame, QHBoxLayout, QPixmap, \ QVBoxLayout, QSizePolicy, QToolButton, QIcon @@ -47,6 +47,13 @@ class BookInfoDisplay(QFrame): def mouseReleaseEvent(self, ev): self.emit(SIGNAL('mr(int)'), 1) + + WEIGHTS = collections.defaultdict(lambda : 100) + WEIGHTS[_('Path')] = 0 + WEIGHTS[_('Formats')] = 1 + WEIGHTS[_('Comments')] = 2 + WEIGHTS[_('Series')] = 3 + WEIGHTS[_('Tags')] = 4 def __init__(self, clear_message): QFrame.__init__(self) @@ -74,7 +81,9 @@ class BookInfoDisplay(QFrame): rows = u'' self.book_data.setText('') self.data = data.copy() - for key in data.keys(): + keys = data.keys() + keys.sort(cmp=lambda x, y: cmp(self.WEIGHTS[x], self.WEIGHTS[y])) + for key in keys: txt = data[key] #txt = '
\n'.join(textwrap.wrap(txt, 120)) if isinstance(key, str):