IGN:html2epub now works when passed OPF files

This commit is contained in:
Kovid Goyal 2008-09-16 21:50:00 -07:00
parent 5e236b8edb
commit 3c404a7a66
12 changed files with 110 additions and 53 deletions

View File

@ -75,7 +75,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti
if encoding == 'ascii':
encoding = 'utf-8'
raw = raw.decode(encoding, 'replace')
try:
raw = raw.decode(encoding, 'replace')
except LookupError:
raw = raw.decode('utf-8', 'replace')
if resolve_entities:
from calibre import entity_to_unicode
from functools import partial

View File

@ -53,6 +53,8 @@ The expression used must evaluate to a list of elements. To disable chapter dete
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
help on using this feature.
''').replace('\n', ' '))
# Register the --chapter-mark option. 'none' has to be an accepted choice
# because the help text documents it as the way to disable chapter marking.
structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'],
    default='pagebreak', help=_('Specify how to mark detected chapters. A value of "pagebreak" will insert page breaks before chapters. A value of "rule" will insert a line before chapters. A value of "none" will disable chapter marking and a value of "both" will use both page breaks and lines to mark chapters.'))
toc = c.add_group('toc',
_('''\
@ -69,5 +71,7 @@ to auto-generate a Table of Contents.
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
help=_('Print generated OPF file to stdout'))
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
help=_('Print generated NCX file to stdout'))
return c

View File

@ -23,8 +23,9 @@ class HTMLProcessor(Processor):
if opts.verbose > 2:
self.debug_tree('parsed')
self.detect_chapters()
self.extract_css()
self.extract_css()
if opts.verbose > 2:
self.debug_tree('nocss')
@ -97,8 +98,8 @@ def convert(htmlfile, opts, notification=None):
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
if mi.cover and os.access(mi.cover, os.R_OK):
shutil.copyfile(mi.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
shutil.copyfile(mi.cover, os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1]))
cpath = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1])
shutil.copyfile(opf.cover, cpath)
resources.append(cpath)
mi.cover = cpath
@ -107,21 +108,22 @@ def convert(htmlfile, opts, notification=None):
mi = create_metadata(tdir, mi, spine, resources)
buf = cStringIO.StringIO()
if mi.toc:
rebase_toc(mi.toc, htmlfile_map, opts.output)
rebase_toc(mi.toc, htmlfile_map, tdir)
if mi.toc is None or len(mi.toc) < 2:
mi.toc = generated_toc
for item in mi.manifest:
if getattr(item, 'mime_type', None) == 'text/html':
item.mime_type = 'application/xhtml+xml'
with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
mi.render(f, buf)
mi.render(f, buf, 'toc.ncx')
if opts.show_opf:
print open(os.path.join(tdir, 'metadata.opf')).read()
toc = buf.getvalue()
if toc:
with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
f.write(toc)
if opts.show_ncx:
print toc
epub = initialize_container(opts.output)
epub.add_dir(tdir)
print 'Output written to', opts.output

View File

@ -13,7 +13,8 @@ from urlparse import urlparse
from urllib import unquote
from lxml import html, etree
from lxml.etree import XPath
from lxml.html import soupparser, HTMLParser
from lxml.etree import XPath, XMLParser
get_text = XPath("//text()")
from calibre import LoggingInterface, unicode_path
@ -297,6 +298,8 @@ class PreProcessor(object):
class Parser(PreProcessor, LoggingInterface):
PARSER = HTMLParser(recover=True)
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
LoggingInterface.__init__(self, logging.getLogger(name))
self.setup_cli_handler(opts.verbose)
@ -318,6 +321,11 @@ class Parser(PreProcessor, LoggingInterface):
self.parse_html()
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
if self.root.get(bad, None) is not None:
self.root.attrib.pop(bad)
def save(self):
'''
@ -325,28 +333,30 @@ class Parser(PreProcessor, LoggingInterface):
Should be called after all HTML processing is finished.
'''
with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
f.write(html.tostring(self.root,
encoding='utf-8', method='xml',
include_meta_content_type=True,
pretty_print=self.opts.pretty_print)
)
ans = html.tostring(self.root, encoding='utf-8', method='xml',
pretty_print=self.opts.pretty_print,
include_meta_content_type=True)
ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
f.write(ans)
return f.name
def parse_html(self):
''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
src = self.preprocess(src)
# lxml chokes on unicode input when it contains encoding declarations
for pat in ENCODING_PATS:
src = pat.sub('', src)
try:
self.root = html.document_fromstring(src)
self.root = etree.HTML(src, self.PARSER)
if self.root is None:
raise ValueError('%s is empty'%self.htmlfile.path)
except:
if self.opts.verbose:
self.log_exception('lxml based parsing failed')
self.root = html.soupparser.fromstring()
self.root = soupparser.fromstring(src)
self.head = self.body = None
head = self.root.xpath('//head')
if head:
@ -404,19 +414,27 @@ class Processor(Parser):
def detect_chapters(self):
    '''
    Mark every element matched by the chapter detection expression.

    ``self.opts.chapter`` is called with the document root and the elements
    it returns are stored in ``self.detected_chapters``. What happens to
    each of them depends on ``self.opts.chapter_mark``:

      * ``'pagebreak'``: ``page-break-before: always`` is appended to the
        element's ``style`` attribute.
      * ``'rule'``: an ``<hr>`` element is inserted directly before the
        element.
      * ``'both'``: both of the above.

    Any other value leaves the detected elements untouched.
    '''
    self.detected_chapters = self.opts.chapter(self.root)
    mark = self.opts.chapter_mark
    for elem in self.detected_chapters:
        if mark in ('both', 'pagebreak'):
            style = elem.get('style', '').strip()
            if style and not style.endswith(';'):
                style += '; '
            style += 'page-break-before: always'
            # The attribute name is the literal 'style', not the CSS text.
            elem.set('style', style)
        # An element without a parent has no sibling slot for the rule,
        # so it is skipped instead of failing.
        if mark in ('both', 'rule') and elem.getparent() is not None:
            # addprevious() places the rule immediately before elem, which
            # covers both the first-child and the later-sibling case.
            elem.addprevious(etree.Element('hr'))
def save(self):
head = self.root.xpath('//head')
if head:
head = head[0]
else:
head = self.root.xpath('//body')
head = head[0] if head else self.root
head = self.head if self.head is not None else self.body
style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
style.text='\n'+self.css
style.tail = '\n\n'
@ -589,7 +607,7 @@ def search_for_opf(dir):
def get_filelist(htmlfile, opts):
'''
Build list of files references by html file or try to detect and use an
Build list of files referenced by html file or try to detect and use an
OPF file instead.
'''
print 'Building file list...'

View File

@ -43,7 +43,7 @@ class Resource(object):
def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True):
self._href = None
self._basedir = None
self._basedir = basedir
self.path = None
self.fragment = ''
try:
@ -55,7 +55,7 @@ class Resource(object):
if is_path:
path = href_or_path
if not os.path.isabs(path):
path = os.path.abspath(os.path.join(path, basedir))
path = os.path.abspath(os.path.join(basedir, path))
if isinstance(path, str):
path = path.decode(sys.getfilesystemencoding())
self.path = path

View File

@ -39,7 +39,7 @@ def cover_from_isbn(isbn, timeout=5.):
_timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(timeout)
try:
src = browser.open('http://www.librarything.com/isbn/'+isbn).read()
src = browser.open('http://www.librarything.com/isbn/'+isbn).read().decode('utf-8', 'replace')
s = BeautifulSoup(src)
url = s.find('td', attrs={'class':'left'})
if url is None:

View File

@ -1,7 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<?python
from uuid import uuid4
?>
<ncx version="2005-1"
xml:lang="en"
xmlns="http://www.daisy.org/z3986/2005/ncx/"
encoding="UTF-8"
xmlns:py="http://genshi.edgewall.org/"
>
<head>
@ -14,7 +17,7 @@
<docTitle><text>Table of Contents</text></docTitle>
<py:def function="navpoint(np, level)">
${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
${'%*s'%(4*level,'')}<navPoint id="${str(uuid4())}" playOrder="${str(np.play_order)}">
${'%*s'%(4*level,'')}<navLabel>
${'%*s'%(4*level,'')}<text>${np.text}</text>
${'%*s'%(4*level,'')}</navLabel>

View File

@ -483,7 +483,7 @@ class OPFCreator(MetaInformation):
Set the toc. You must call :method:`create_spine` before calling this
method.
`toc`: A :class:`TOC` object
:param toc: A :class:`TOC` object
'''
self.toc = toc
@ -491,12 +491,21 @@ class OPFCreator(MetaInformation):
self.guide = Guide.from_opf_guide(guide_element, self.base_path)
self.guide.set_basedir(self.base_path)
def render(self, opf_stream, ncx_stream=None):
def render(self, opf_stream, ncx_stream=None, ncx_manifest_entry=None):
from calibre.resources import opf_template
from calibre.utils.genshi.template import MarkupTemplate
template = MarkupTemplate(opf_template)
if self.manifest:
self.manifest.set_basedir(self.base_path)
if ncx_manifest_entry is not None:
if not os.path.isabs(ncx_manifest_entry):
ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
remove = [i for i in self.manifest if i.id == 'ncx']
for item in remove:
self.manifest.remove(item)
self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path))
self.manifest[-1].id = 'ncx'
self.manifest[-1].mime_type = 'application/x-dtbncx+xml'
if not self.guide:
self.guide = Guide()
if self.cover:

View File

@ -23,6 +23,12 @@
</py:for>
</metadata>
<manifest py:if="getattr(mi, 'manifest', None)">
<py:for each="ref in mi.manifest">
<item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" />
</py:for>
</manifest>
<guide py:if="getattr(mi, 'guide', None)">
<py:for each="ref in mi.guide">
<reference type="${ref.type}" href="${ref.href()}" py:with="attrs={'title': ref.title if ref.title else None}" py:attrs="attrs" />
@ -36,10 +42,5 @@
</py:for>
</spine>
<manifest py:if="getattr(mi, 'manifest', None)">
<py:for each="ref in mi.manifest">
<item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" />
</py:for>
</manifest>
</package>

View File

@ -29,8 +29,9 @@ class TOC(list):
self.base_path = base_path
self.play_order = play_order
def add_item(self, href, fragment, text):
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
def add_item(self, href, fragment, text, play_order=None):
    '''
    Append a new child entry to this TOC and return it.

    :param href: passed through to the new entry as its ``href``
    :param fragment: passed through to the new entry as its ``fragment``
    :param text: passed through to the new entry as its ``text``
    :param play_order: play order for the new entry. When None it is one
        more than the play order of the last child, or of this node when
        it has no children yet.
    '''
    if play_order is None:
        previous = self[-1] if len(self) else self
        play_order = previous.play_order + 1
    child = TOC(href=href, fragment=fragment, text=text, parent=self,
                base_path=self.base_path, play_order=play_order)
    self.append(child)
    return self[-1]
@ -113,14 +114,16 @@ class TOC(list):
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
def process_navpoint(np, dest):
play_order = np.get('playOrder', 1)
play_order = np.get('playOrder', None)
if play_order is None:
play_order = int(np.get('playorder', 1))
href = fragment = text = None
nl = np.find('navlabel')
if nl is not None:
text = u''
for txt in nl.findAll('text'):
text += ''.join([unicode(s) for s in txt.findAll(text=True)])
content = elem.find('content')
content = np.find('content')
if content is None or not content.has_key('src') or not txt:
return

View File

@ -719,6 +719,8 @@ class BasicNewsRecipe(object, LoggingInterface):
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
@ -726,7 +728,12 @@ class BasicNewsRecipe(object, LoggingInterface):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(num, j)
entries.append('%sindex.html'%adir)
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -752,7 +759,11 @@ class BasicNewsRecipe(object, LoggingInterface):
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po))
else:
entries.append('feed_%d/index.html'%0)
feed_index(0, toc)

View File

@ -206,11 +206,11 @@ def upload_user_manual():
check_call('scp -r src/calibre/manual/.build/html/* divok:%s'%USER_MANUAL)
def build_src_tarball():
check_call('bzr export dist/calibre-%s.tar.bz2'%__version__)
check_call('bzr export dist/calibre-%s.tar.gz'%__version__)
def upload_src_tarball():
check_call('ssh divok rm -f %s/calibre-\*.tar.bz2'%DOWNLOADS)
check_call('scp dist/calibre-*.tar.bz2 divok:%s/'%DOWNLOADS)
check_call('ssh divok rm -f %s/calibre-\*.tar.gz'%DOWNLOADS)
check_call('scp dist/calibre-*.tar.gz divok:%s/'%DOWNLOADS)
def stage_one():
check_call('sudo rm -rf build', shell=True)
@ -226,16 +226,19 @@ def stage_one():
def stage_two():
subprocess.check_call('rm -rf dist/*', shell=True)
build_installers()
build_src_tarball()
def stage_three():
print 'Uploading installers...'
upload_installers()
print 'Uploading to PyPI'
upload_src_tarball()
upload_docs()
upload_user_manual()
check_call('python setup.py register bdist_egg --exclude-source-files upload')
check_call('rm -f dist/*')
check_call('python setup.py register')
check_call('python setup.py bdist_egg --exclude-source-files')
build_src_tarball()
upload_src_tarball()
check_call('python setup.py upload')
check_call('''rm -rf dist/* build/*''')
check_call('''ssh divok bzr update /var/www/calibre.kovidgoyal.net/calibre/''')