mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More robust OPF parsing and improved TOC creation in html2epub
This commit is contained in:
parent
e3b8a1b3bf
commit
501cc90bfa
@ -62,10 +62,11 @@ def config(defaults=None):
|
||||
c.add_opt('override_css', ['--override-css'], default=None,
|
||||
help=_('Either the path to a CSS stylesheet or raw CSS. This CSS will override any existing CSS declarations in the source files.'))
|
||||
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
|
||||
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section|part', 'i')]",
|
||||
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section|part', 'i')] | //*[@class = 'chapter']",
|
||||
help=_('''\
|
||||
An XPath expression to detect chapter titles. The default is to consider <h1> or
|
||||
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles.
|
||||
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
|
||||
well as any tags that have class="chapter".
|
||||
The expression used must evaluate to a list of elements. To disable chapter detection,
|
||||
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
|
||||
help on using this feature.
|
||||
@ -84,12 +85,12 @@ Control the automatic generation of a Table of Contents. If an OPF file is detec
|
||||
and it specifies a Table of Contents, then that will be used rather than trying
|
||||
to auto-generate a Table of Contents.
|
||||
''').replace('\n', ' '))
|
||||
toc('max_toc_recursion', ['--max-toc-recursion'], default=1,
|
||||
help=_('Number of levels of HTML files to try to autodetect TOC entries from. Set to 0 to disable all TOC autodetection. Default is %default.'))
|
||||
toc('max_toc_links', ['--max-toc-links'], default=40,
|
||||
help=_('Maximum number of links from each HTML file to insert into the TOC. Set to 0 to disable. Default is: %default.'))
|
||||
toc('max_toc_links', ['--max-toc-links'], default=50,
|
||||
help=_('Maximum number of links to insert into the TOC. Set to 0 to disable. Default is: %default. Links are only added to the TOC if less than the --toc-threshold number of chapters were detected.'))
|
||||
toc('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
|
||||
help=_("Don't add auto-detected chapters to the Table of Contents."))
|
||||
toc('toc_threshold', ['--toc-threshold'], default=6,
|
||||
help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.'))
|
||||
toc('use_auto_toc', ['--use-auto-toc'], default=False,
|
||||
help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.'))
|
||||
|
||||
|
@ -69,13 +69,19 @@ the <spine> element of the OPF file.
|
||||
def parse_content(filelist, opts, tdir):
|
||||
os.makedirs(os.path.join(tdir, 'content', 'resources'))
|
||||
resource_map = {}
|
||||
toc = TOC(base_path=tdir)
|
||||
toc = TOC(base_path=tdir, type='root')
|
||||
for htmlfile in filelist:
|
||||
hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
|
||||
resource_map, filelist)
|
||||
hp.populate_toc(toc)
|
||||
hp.save()
|
||||
|
||||
if toc.count('chapter') > opts.toc_threshold:
|
||||
toc.purge(['file', 'link', 'unknown'])
|
||||
if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
|
||||
toc.purge(['link', 'unknown'])
|
||||
toc.purge(['link'], max=opts.max_toc_links)
|
||||
|
||||
return resource_map, hp.htmlfile_map, toc
|
||||
|
||||
def convert(htmlfile, opts, notification=None):
|
||||
|
@ -118,7 +118,7 @@ class HTMLFile(object):
|
||||
raise IgnoreFile(msg, err.errno)
|
||||
|
||||
self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
|
||||
|
||||
self.title = None
|
||||
if not self.is_binary:
|
||||
if encoding is None:
|
||||
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
||||
@ -126,8 +126,7 @@ class HTMLFile(object):
|
||||
|
||||
src = src.decode(encoding, 'replace')
|
||||
match = self.TITLE_PAT.search(src)
|
||||
if match is not None:
|
||||
self.title = match.group(1)
|
||||
self.title = match.group(1) if match is not None else None
|
||||
self.find_links(src)
|
||||
|
||||
|
||||
@ -460,8 +459,28 @@ class Processor(Parser):
|
||||
return Parser.save(self)
|
||||
|
||||
def populate_toc(self, toc):
|
||||
if self.level >= self.opts.max_toc_recursion:
|
||||
return
|
||||
|
||||
def add_item(href, fragment, text, target, type='link'):
|
||||
for entry in toc.flat():
|
||||
if entry.href == href and entry.fragment == fragment:
|
||||
return entry
|
||||
if len(text) > 50:
|
||||
text = text[:50] + u'\u2026'
|
||||
return target.add_item(href, fragment, text, type=type)
|
||||
|
||||
# Add chapters to TOC
|
||||
counter = 0
|
||||
if not self.opts.no_chapters_in_toc:
|
||||
for elem in getattr(self, 'detected_chapters', []):
|
||||
text = (u''.join(elem.xpath('string()'))).strip()
|
||||
if text:
|
||||
name = self.htmlfile_map[self.htmlfile.path]
|
||||
href = 'content/'+name
|
||||
counter += 1
|
||||
id = elem.get('id', 'calibre_chapter_%d'%counter)
|
||||
elem.set('id', id)
|
||||
add_item(href, id, text, toc, type='chapter')
|
||||
|
||||
|
||||
referrer = toc
|
||||
if self.htmlfile.referrer is not None:
|
||||
@ -472,19 +491,12 @@ class Processor(Parser):
|
||||
referrer = i
|
||||
break
|
||||
|
||||
def add_item(href, fragment, text, target):
|
||||
for entry in toc.flat():
|
||||
if entry.href == href and entry.fragment == fragment:
|
||||
return entry
|
||||
if len(text) > 50:
|
||||
text = text[:50] + u'\u2026'
|
||||
return target.add_item(href, fragment, text)
|
||||
|
||||
name = self.htmlfile_map[self.htmlfile.path]
|
||||
href = 'content/'+name
|
||||
|
||||
if referrer.href != href: # Happens for root file
|
||||
target = add_item(href, None, self.htmlfile.title, referrer)
|
||||
target = add_item(href, None, unicode(self.htmlfile.title), referrer, type='file')
|
||||
|
||||
# Add links to TOC
|
||||
if int(self.opts.max_toc_links) > 0:
|
||||
@ -502,18 +514,6 @@ class Processor(Parser):
|
||||
name = self.htmlfile_map[self.htmlfile.referrer.path]
|
||||
add_item(href, fragment, text, target)
|
||||
|
||||
# Add chapters to TOC
|
||||
if not self.opts.no_chapters_in_toc:
|
||||
counter = 0
|
||||
for elem in getattr(self, 'detected_chapters', []):
|
||||
text = (u''.join(elem.xpath('string()'))).strip()
|
||||
if text:
|
||||
name = self.htmlfile_map[self.htmlfile.path]
|
||||
href = 'content/'+name
|
||||
counter += 1
|
||||
id = elem.get('id', 'calibre_chapter_%d'%counter)
|
||||
elem.set('id', id)
|
||||
add_item(href, id, text, target)
|
||||
|
||||
|
||||
def extract_css(self):
|
||||
|
@ -393,7 +393,9 @@ class OPF(object):
|
||||
NAMESPACES = {
|
||||
None : "http://www.idpf.org/2007/opf",
|
||||
'dc' : "http://purl.org/dc/elements/1.1/",
|
||||
'dc1' : 'http://purl.org/dc/elements/1.0/',
|
||||
'opf' : "http://www.idpf.org/2007/opf",
|
||||
'oebpackage' : 'http://openebook.org/namespaces/oeb-package/1.0/',
|
||||
}
|
||||
xpn = NAMESPACES.copy()
|
||||
xpn.pop(None)
|
||||
@ -402,16 +404,15 @@ class OPF(object):
|
||||
TEXT = XPath('string()')
|
||||
|
||||
|
||||
metadata_path = XPath('/opf:package/opf:metadata')
|
||||
metadata_elem_path = XPath('/opf:package/opf:metadata/*[re:match(name(), $name, "i")]')
|
||||
authors_path = XPath('/opf:package/opf:metadata/*' + \
|
||||
'[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]')
|
||||
tags_path = XPath('/opf:package/opf:metadata/*[re:match(name(), "subject", "i")]')
|
||||
isbn_path = XPath('/opf:package/opf:metadata/*[re:match(name(), "identifier", "i") and '+
|
||||
metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]')
|
||||
metadata_elem_path = XPath('descendant::*[re:match(name(), $name, "i")]')
|
||||
authors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut")]')
|
||||
tags_path = XPath('descendant::*[re:match(name(), "subject", "i")]')
|
||||
isbn_path = XPath('descendant::*[re:match(name(), "identifier", "i") and '+
|
||||
'(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]')
|
||||
manifest_path = XPath('/opf:package/*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]')
|
||||
spine_path = XPath('/opf:package/*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]')
|
||||
guide_path = XPath('/opf:package/*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]')
|
||||
manifest_path = XPath('descendant::*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]')
|
||||
spine_path = XPath('descendant::*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]')
|
||||
guide_path = XPath('descendant::*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]')
|
||||
|
||||
title = MetadataField('title')
|
||||
publisher = MetadataField('publisher')
|
||||
@ -424,25 +425,27 @@ class OPF(object):
|
||||
|
||||
|
||||
def __init__(self, stream, basedir=os.getcwdu()):
|
||||
if not hasattr(stream, 'read'):
|
||||
stream = open(stream, 'rb')
|
||||
self.basedir = self.base_dir = basedir
|
||||
raw, self.encoding = xml_to_unicode(stream.read(), strip_encoding_pats=True, resolve_entities=True)
|
||||
|
||||
self.tree = etree.fromstring(raw, self.PARSER)
|
||||
self.metadata = self.metadata_path(self.tree)
|
||||
self.root = etree.fromstring(raw, self.PARSER)
|
||||
self.metadata = self.metadata_path(self.root)
|
||||
if not self.metadata:
|
||||
raise ValueError('Malformed OPF file: No <metadata> element')
|
||||
self.metadata = self.metadata[0]
|
||||
self.unquote_urls()
|
||||
self.manifest = Manifest()
|
||||
m = self.manifest_path(self.tree)
|
||||
m = self.manifest_path(self.root)
|
||||
if m:
|
||||
self.manifest = Manifest.from_opf_manifest_element(m, basedir)
|
||||
self.spine = None
|
||||
s = self.spine_path(self.tree)
|
||||
s = self.spine_path(self.root)
|
||||
if s:
|
||||
self.spine = Spine.from_opf_spine_element(s, self.manifest)
|
||||
self.guide = None
|
||||
guide = self.guide_path(self.tree)
|
||||
guide = self.guide_path(self.root)
|
||||
if guide:
|
||||
self.guide = Guide.from_opf_guide(guide, basedir)
|
||||
self.cover_data = (None, None)
|
||||
@ -452,7 +455,7 @@ class OPF(object):
|
||||
return u''.join(self.TEXT(elem))
|
||||
|
||||
def itermanifest(self):
|
||||
return self.manifest_path(self.tree)
|
||||
return self.manifest_path(self.root)
|
||||
|
||||
def create_manifest_item(self, href, media_type):
|
||||
ids = [i.get('id', None) for i in self.itermanifest()]
|
||||
@ -478,7 +481,7 @@ class OPF(object):
|
||||
return [i.get('id') for i in items]
|
||||
|
||||
def iterspine(self):
|
||||
return self.spine_path(self.tree)
|
||||
return self.spine_path(self.root)
|
||||
|
||||
def create_spine_item(self, idref):
|
||||
ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref)
|
||||
@ -487,14 +490,14 @@ class OPF(object):
|
||||
|
||||
def replace_spine_items_by_idref(self, idref, new_idrefs):
|
||||
items = list(map(self.create_spine_item, new_idrefs))
|
||||
spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.tree)[0]
|
||||
spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.root)[0]
|
||||
old = [i for i in self.iterspine() if i.get('idref', None) == idref]
|
||||
for x in old:
|
||||
i = spine.index(x)
|
||||
spine[i:i+1] = items
|
||||
|
||||
def iterguide(self):
|
||||
return self.guide_path(self.tree)
|
||||
return self.guide_path(self.root)
|
||||
|
||||
def unquote_urls(self):
|
||||
for item in self.itermanifest():
|
||||
@ -507,12 +510,12 @@ class OPF(object):
|
||||
|
||||
def fget(self):
|
||||
ans = []
|
||||
for elem in self.authors_path(self.tree):
|
||||
for elem in self.authors_path(self.metadata):
|
||||
ans.extend([x.strip() for x in self.get_text(elem).split(',')])
|
||||
return ans
|
||||
|
||||
def fset(self, val):
|
||||
remove = list(self.authors_path(self.tree))
|
||||
remove = list(self.authors_path(self.metadata))
|
||||
for elem in remove:
|
||||
self.metadata.remove(elem)
|
||||
for author in val:
|
||||
@ -526,13 +529,13 @@ class OPF(object):
|
||||
def author_sort():
|
||||
|
||||
def fget(self):
|
||||
matches = self.authors_path(self.tree)
|
||||
matches = self.authors_path(self.metadata)
|
||||
if matches:
|
||||
ans = matches[0].get('opf:file-as', None)
|
||||
return ans if ans else matches[0].get('file-as', None)
|
||||
|
||||
def fset(self, val):
|
||||
matches = self.authors_path(self.tree)
|
||||
matches = self.authors_path(self.metadata)
|
||||
if matches:
|
||||
matches[0].set('file-as', unicode(val))
|
||||
|
||||
@ -543,12 +546,12 @@ class OPF(object):
|
||||
|
||||
def fget(self):
|
||||
ans = []
|
||||
for tag in self.tags_path(self.tree):
|
||||
for tag in self.tags_path(self.metadata):
|
||||
ans.append(self.get_text(tag))
|
||||
return ans
|
||||
|
||||
def fset(self, val):
|
||||
for tag in list(self.tags_path(self.tree)):
|
||||
for tag in list(self.tags_path(self.metadata)):
|
||||
self.metadata.remove(tag)
|
||||
for tag in val:
|
||||
elem = self.create_metadata_element('subject', ns='dc')
|
||||
@ -560,11 +563,11 @@ class OPF(object):
|
||||
def isbn():
|
||||
|
||||
def fget(self):
|
||||
for match in self.isbn_path(self.tree):
|
||||
for match in self.isbn_path(self.metadata):
|
||||
return match.text if match.text else None
|
||||
|
||||
def fset(self, val):
|
||||
matches = self.isbn_path(self.tree)
|
||||
matches = self.isbn_path(self.metadata)
|
||||
if not matches:
|
||||
matches = [self.create_metadata_element('identifier', ns='dc',
|
||||
attrib={'{%s}scheme'%self.NAMESPACES['opf']:'ISBN'})]
|
||||
@ -572,9 +575,9 @@ class OPF(object):
|
||||
return property(fget=fget, fset=fset)
|
||||
|
||||
def get_metadata_element(self, name):
|
||||
matches = self.metadata_elem_path(self.tree, name=name)
|
||||
matches = self.metadata_elem_path(self.metadata, name=name)
|
||||
if matches:
|
||||
return matches[0]
|
||||
return matches[-1]
|
||||
|
||||
def create_metadata_element(self, name, attrib=None, ns='opf'):
|
||||
elem = etree.SubElement(self.metadata, '{%s}%s'%(self.NAMESPACES[ns], name),
|
||||
@ -583,7 +586,7 @@ class OPF(object):
|
||||
return elem
|
||||
|
||||
def render(self, encoding='utf-8'):
|
||||
return etree.tostring(self.tree, encoding='utf-8', pretty_print=True)
|
||||
return etree.tostring(self.root, encoding='utf-8', pretty_print=True)
|
||||
|
||||
def smart_update(self, mi):
|
||||
for attr in ('author_sort', 'title_sort', 'comments', 'category',
|
||||
@ -716,7 +719,13 @@ class OPFTest(unittest.TestCase):
|
||||
<creator opf:role="aut" file-as="Monkey">Monkey Kitchen, Next</creator>
|
||||
<dc:subject>One</dc:subject><dc:subject>Two</dc:subject>
|
||||
<dc:identifier scheme="ISBN">123456789</dc:identifier>
|
||||
<x-metadata>
|
||||
<series>A one book series</series>
|
||||
</x-metadata>
|
||||
</metadata>
|
||||
<manifest>
|
||||
<item id="1" href="a%20%7E%20b" media-type="text/txt" />
|
||||
</manifest>
|
||||
</package>
|
||||
'''
|
||||
)
|
||||
@ -729,14 +738,14 @@ class OPFTest(unittest.TestCase):
|
||||
self.assertEqual(opf.author_sort, 'Monkey')
|
||||
self.assertEqual(opf.tags, ['One', 'Two'])
|
||||
self.assertEqual(opf.isbn, '123456789')
|
||||
self.assertEqual(opf.series, None)
|
||||
self.assertEqual(opf.series, 'A one book series')
|
||||
self.assertEqual(opf.series_index, None)
|
||||
|
||||
self.assertEqual(list(opf.itermanifest())[0].get('href'), 'a ~ b')
|
||||
|
||||
def testWriting(self):
|
||||
for test in [('title', 'New & Title'), ('authors', ['One', 'Two']),
|
||||
('author_sort', "Kitchen"), ('tags', ['Three']),
|
||||
('isbn', 'a'), ('rating', 3)]:
|
||||
('isbn', 'a'), ('rating', 3), ('series_index', 1)]:
|
||||
setattr(self.opf, *test)
|
||||
self.assertEqual(getattr(self.opf, test[0]), test[1])
|
||||
|
||||
@ -748,10 +757,5 @@ def suite():
|
||||
def test():
|
||||
unittest.TextTestRunner(verbosity=2).run(suite())
|
||||
|
||||
|
||||
|
||||
def main(args=sys.argv):
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(test())
|
@ -21,7 +21,7 @@ class NCXSoup(BeautifulStoneSoup):
|
||||
class TOC(list):
|
||||
|
||||
def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
|
||||
base_path=os.getcwd()):
|
||||
base_path=os.getcwd(), type='unknown'):
|
||||
self.href = href
|
||||
self.fragment = fragment
|
||||
if not self.fragment:
|
||||
@ -30,12 +30,32 @@ class TOC(list):
|
||||
self.parent = parent
|
||||
self.base_path = base_path
|
||||
self.play_order = play_order
|
||||
self.type = type
|
||||
|
||||
def add_item(self, href, fragment, text, play_order=None):
|
||||
def count(self, type):
    """Return how many entries in this TOC tree have the given *type*.

    Every entry yielded by ``self.flat()`` is inspected and compared
    against *type* via its ``type`` attribute.
    """
    # Count lazily instead of building an intermediate list.
    return sum(1 for entry in self.flat() if entry.type == type)
|
||||
|
||||
def purge(self, types, max=0):
    """Detach entries of the given *types* from the tree.

    :param types: container of type names; an entry matches when
                  ``entry.type in types``.
    :param max: number of matching entries to keep, counted in
                ``self.flat()`` order. The default of 0 removes every match.
    :return: list of the entries that were actually detached from
             their parent.
    """
    # Materialise the matches first so the tree is not mutated while
    # ``self.flat()`` is still being walked.
    matching = [entry for entry in self.flat() if entry.type in types]
    removed = []
    for entry in matching[max:]:
        if entry.parent is None:
            # Nothing to detach from; leave the entry alone and do not
            # report it as removed.
            continue
        entry.parent.remove(entry)
        removed.append(entry)
    return removed
|
||||
|
||||
def remove(self, entry):
    """Remove *entry* from this node's children and clear its parent link.

    Delegates to ``list.remove``, so a ``ValueError`` is raised when
    *entry* is not a direct child of this node; in that case the
    entry's ``parent`` attribute is left untouched.
    """
    list.remove(self, entry)
    # The entry no longer belongs to this node.
    entry.parent = None
|
||||
|
||||
def add_item(self, href, fragment, text, play_order=None, type='unknown'):
|
||||
if play_order is None:
|
||||
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
|
||||
self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
|
||||
base_path=self.base_path, play_order=play_order))
|
||||
base_path=self.base_path, play_order=play_order, type=type))
|
||||
return self[-1]
|
||||
|
||||
def top_level_items(self):
|
||||
|
@ -77,7 +77,7 @@
|
||||
<item>
|
||||
<widget class="QStackedWidget" name="stack" >
|
||||
<property name="currentIndex" >
|
||||
<number>0</number>
|
||||
<number>3</number>
|
||||
</property>
|
||||
<widget class="QWidget" name="metadata_page" >
|
||||
<layout class="QGridLayout" name="gridLayout_4" >
|
||||
@ -619,15 +619,15 @@ p, li { white-space: pre-wrap; }
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="1" >
|
||||
<widget class="QSpinBox" name="opt_max_toc_recursion" />
|
||||
<widget class="QSpinBox" name="opt_toc_threshold" />
|
||||
</item>
|
||||
<item row="3" column="0" >
|
||||
<widget class="QLabel" name="label_16" >
|
||||
<property name="text" >
|
||||
<string>Table of Contents &recursion</string>
|
||||
<string>Chapter &threshold</string>
|
||||
</property>
|
||||
<property name="buddy" >
|
||||
<cstring>opt_max_toc_recursion</cstring>
|
||||
<cstring>opt_toc_threshold</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
|
@ -1,6 +1,6 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import re
|
||||
import re, collections
|
||||
|
||||
from PyQt4.QtGui import QStatusBar, QMovie, QLabel, QFrame, QHBoxLayout, QPixmap, \
|
||||
QVBoxLayout, QSizePolicy, QToolButton, QIcon
|
||||
@ -48,6 +48,13 @@ class BookInfoDisplay(QFrame):
|
||||
def mouseReleaseEvent(self, ev):
|
||||
self.emit(SIGNAL('mr(int)'), 1)
|
||||
|
||||
WEIGHTS = collections.defaultdict(lambda : 100)
|
||||
WEIGHTS[_('Path')] = 0
|
||||
WEIGHTS[_('Formats')] = 1
|
||||
WEIGHTS[_('Comments')] = 2
|
||||
WEIGHTS[_('Series')] = 3
|
||||
WEIGHTS[_('Tags')] = 4
|
||||
|
||||
def __init__(self, clear_message):
|
||||
QFrame.__init__(self)
|
||||
self.setCursor(Qt.PointingHandCursor)
|
||||
@ -74,7 +81,9 @@ class BookInfoDisplay(QFrame):
|
||||
rows = u''
|
||||
self.book_data.setText('')
|
||||
self.data = data.copy()
|
||||
for key in data.keys():
|
||||
keys = data.keys()
|
||||
keys.sort(cmp=lambda x, y: cmp(self.WEIGHTS[x], self.WEIGHTS[y]))
|
||||
for key in keys:
|
||||
txt = data[key]
|
||||
#txt = '<br />\n'.join(textwrap.wrap(txt, 120))
|
||||
if isinstance(key, str):
|
||||
|
Loading…
x
Reference in New Issue
Block a user