IGN:html2epub now works when passed OPF files

This commit is contained in:
Kovid Goyal 2008-09-16 21:50:00 -07:00
parent 5e236b8edb
commit 3c404a7a66
12 changed files with 110 additions and 53 deletions

View File

@ -75,7 +75,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti
if encoding == 'ascii': if encoding == 'ascii':
encoding = 'utf-8' encoding = 'utf-8'
raw = raw.decode(encoding, 'replace') try:
raw = raw.decode(encoding, 'replace')
except LookupError:
raw = raw.decode('utf-8', 'replace')
if resolve_entities: if resolve_entities:
from calibre import entity_to_unicode from calibre import entity_to_unicode
from functools import partial from functools import partial

View File

@ -53,6 +53,8 @@ The expression used must evaluate to a list of elements. To disable chapter dete
use the expression "/". See the XPath Tutorial in the calibre User Manual for further use the expression "/". See the XPath Tutorial in the calibre User Manual for further
help on using this feature. help on using this feature.
''').replace('\n', ' ')) ''').replace('\n', ' '))
structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both'],
default='pagebreak', help=_('Specify how to mark detected chapters. A value of "pagebreak" will insert page breaks before chapters. A value of "rule" will insert a line before chapters. A value of "none" will disable chapter marking and a value of "both" will use both page breaks and lines to mark chapters.'))
toc = c.add_group('toc', toc = c.add_group('toc',
_('''\ _('''\
@ -69,5 +71,7 @@ to auto-generate a Table of Contents.
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug', c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
help=_('Print generated OPF file to stdout')) help=_('Print generated OPF file to stdout'))
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
help=_('Print generated NCX file to stdout'))
return c return c

View File

@ -23,8 +23,9 @@ class HTMLProcessor(Processor):
if opts.verbose > 2: if opts.verbose > 2:
self.debug_tree('parsed') self.debug_tree('parsed')
self.detect_chapters() self.detect_chapters()
self.extract_css()
self.extract_css()
if opts.verbose > 2: if opts.verbose > 2:
self.debug_tree('nocss') self.debug_tree('nocss')
@ -97,8 +98,8 @@ def convert(htmlfile, opts, notification=None):
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
if mi.cover and os.access(mi.cover, os.R_OK): if mi.cover and os.access(mi.cover, os.R_OK):
shutil.copyfile(mi.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))) shutil.copyfile(mi.cover, os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1]))
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)) cpath = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1])
shutil.copyfile(opf.cover, cpath) shutil.copyfile(opf.cover, cpath)
resources.append(cpath) resources.append(cpath)
mi.cover = cpath mi.cover = cpath
@ -107,21 +108,22 @@ def convert(htmlfile, opts, notification=None):
mi = create_metadata(tdir, mi, spine, resources) mi = create_metadata(tdir, mi, spine, resources)
buf = cStringIO.StringIO() buf = cStringIO.StringIO()
if mi.toc: if mi.toc:
rebase_toc(mi.toc, htmlfile_map, opts.output) rebase_toc(mi.toc, htmlfile_map, tdir)
if mi.toc is None or len(mi.toc) < 2: if mi.toc is None or len(mi.toc) < 2:
mi.toc = generated_toc mi.toc = generated_toc
for item in mi.manifest: for item in mi.manifest:
if getattr(item, 'mime_type', None) == 'text/html': if getattr(item, 'mime_type', None) == 'text/html':
item.mime_type = 'application/xhtml+xml' item.mime_type = 'application/xhtml+xml'
with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f: with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
mi.render(f, buf) mi.render(f, buf, 'toc.ncx')
if opts.show_opf: if opts.show_opf:
print open(os.path.join(tdir, 'metadata.opf')).read() print open(os.path.join(tdir, 'metadata.opf')).read()
toc = buf.getvalue() toc = buf.getvalue()
if toc: if toc:
with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f: with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
f.write(toc) f.write(toc)
if opts.show_ncx:
print toc
epub = initialize_container(opts.output) epub = initialize_container(opts.output)
epub.add_dir(tdir) epub.add_dir(tdir)
print 'Output written to', opts.output print 'Output written to', opts.output

View File

@ -13,7 +13,8 @@ from urlparse import urlparse
from urllib import unquote from urllib import unquote
from lxml import html, etree from lxml import html, etree
from lxml.etree import XPath from lxml.html import soupparser, HTMLParser
from lxml.etree import XPath, XMLParser
get_text = XPath("//text()") get_text = XPath("//text()")
from calibre import LoggingInterface, unicode_path from calibre import LoggingInterface, unicode_path
@ -297,6 +298,8 @@ class PreProcessor(object):
class Parser(PreProcessor, LoggingInterface): class Parser(PreProcessor, LoggingInterface):
PARSER = HTMLParser(recover=True)
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'): def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
LoggingInterface.__init__(self, logging.getLogger(name)) LoggingInterface.__init__(self, logging.getLogger(name))
self.setup_cli_handler(opts.verbose) self.setup_cli_handler(opts.verbose)
@ -318,6 +321,11 @@ class Parser(PreProcessor, LoggingInterface):
self.parse_html() self.parse_html()
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
if self.root.get(bad, None) is not None:
self.root.attrib.pop(bad)
def save(self): def save(self):
''' '''
@ -325,28 +333,30 @@ class Parser(PreProcessor, LoggingInterface):
Should be called after all HTML processing is finished. Should be called after all HTML processing is finished.
''' '''
with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f: with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
f.write(html.tostring(self.root, ans = html.tostring(self.root, encoding='utf-8', method='xml',
encoding='utf-8', method='xml', pretty_print=self.opts.pretty_print,
include_meta_content_type=True, include_meta_content_type=True)
pretty_print=self.opts.pretty_print) ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
) f.write(ans)
return f.name return f.name
def parse_html(self): def parse_html(self):
''' Create lxml ElementTree from HTML ''' ''' Create lxml ElementTree from HTML '''
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:])) self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace') src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
src = self.preprocess(src) src = self.preprocess(src)
# lxml chokes on unicode input when it contains encoding declarations # lxml chokes on unicode input when it contains encoding declarations
for pat in ENCODING_PATS: for pat in ENCODING_PATS:
src = pat.sub('', src) src = pat.sub('', src)
try: try:
self.root = html.document_fromstring(src) self.root = etree.HTML(src, self.PARSER)
if self.root is None:
raise ValueError('%s is empty'%self.htmlfile.path)
except: except:
if self.opts.verbose: if self.opts.verbose:
self.log_exception('lxml based parsing failed') self.log_exception('lxml based parsing failed')
self.root = html.soupparser.fromstring() self.root = soupparser.fromstring(src)
self.head = self.body = None self.head = self.body = None
head = self.root.xpath('//head') head = self.root.xpath('//head')
if head: if head:
@ -404,19 +414,27 @@ class Processor(Parser):
def detect_chapters(self): def detect_chapters(self):
self.detected_chapters = self.opts.chapter(self.root) self.detected_chapters = self.opts.chapter(self.root)
for elem in self.detected_chapters: for elem in self.detected_chapters:
style = elem.get('style', '').strip() if self.opts.chapter_mark in ('both', 'pagebreak'):
if style and not style.endswith(';'): style = elem.get('style', '').strip()
style += '; ' if style and not style.endswith(';'):
style += 'page-break-before: always' style += '; '
elem.set(style, style) style += 'page-break-before: always'
elem.set('style', style)
if self.opts.chapter_mark in ('both', 'rule'):
hr = etree.Element('hr')
if elem.getprevious() is None:
elem.getparent()[:0] = [hr]
else:
insert = None
for i, c in enumerate(elem.getparent()):
if c is elem:
insert = i
break
elem.getparent()[insert:insert] = [hr]
def save(self): def save(self):
head = self.root.xpath('//head') head = self.head if self.head is not None else self.body
if head:
head = head[0]
else:
head = self.root.xpath('//body')
head = head[0] if head else self.root
style = etree.SubElement(head, 'style', attrib={'type':'text/css'}) style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
style.text='\n'+self.css style.text='\n'+self.css
style.tail = '\n\n' style.tail = '\n\n'
@ -589,7 +607,7 @@ def search_for_opf(dir):
def get_filelist(htmlfile, opts): def get_filelist(htmlfile, opts):
''' '''
Build list of files references by html file or try to detect and use an Build list of files referenced by html file or try to detect and use an
OPF file instead. OPF file instead.
''' '''
print 'Building file list...' print 'Building file list...'
@ -672,7 +690,7 @@ def rebase_toc(toc, htmlfile_map, basepath, root=True):
fix_entry(entry) fix_entry(entry)
if root: if root:
toc.base_path = basepath toc.base_path = basepath
def create_dir(htmlfile, opts): def create_dir(htmlfile, opts):
''' '''
Create a directory that contains the open ebook Create a directory that contains the open ebook

View File

@ -43,7 +43,7 @@ class Resource(object):
def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True): def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True):
self._href = None self._href = None
self._basedir = None self._basedir = basedir
self.path = None self.path = None
self.fragment = '' self.fragment = ''
try: try:
@ -55,7 +55,7 @@ class Resource(object):
if is_path: if is_path:
path = href_or_path path = href_or_path
if not os.path.isabs(path): if not os.path.isabs(path):
path = os.path.abspath(os.path.join(path, basedir)) path = os.path.abspath(os.path.join(basedir, path))
if isinstance(path, str): if isinstance(path, str):
path = path.decode(sys.getfilesystemencoding()) path = path.decode(sys.getfilesystemencoding())
self.path = path self.path = path

View File

@ -39,7 +39,7 @@ def cover_from_isbn(isbn, timeout=5.):
_timeout = socket.getdefaulttimeout() _timeout = socket.getdefaulttimeout()
socket.setdefaulttimeout(timeout) socket.setdefaulttimeout(timeout)
try: try:
src = browser.open('http://www.librarything.com/isbn/'+isbn).read() src = browser.open('http://www.librarything.com/isbn/'+isbn).read().decode('utf-8', 'replace')
s = BeautifulSoup(src) s = BeautifulSoup(src)
url = s.find('td', attrs={'class':'left'}) url = s.find('td', attrs={'class':'left'})
if url is None: if url is None:

View File

@ -1,7 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<?python
from uuid import uuid4
?>
<ncx version="2005-1" <ncx version="2005-1"
xml:lang="en" xml:lang="en"
xmlns="http://www.daisy.org/z3986/2005/ncx/" xmlns="http://www.daisy.org/z3986/2005/ncx/"
encoding="UTF-8"
xmlns:py="http://genshi.edgewall.org/" xmlns:py="http://genshi.edgewall.org/"
> >
<head> <head>
@ -14,7 +17,7 @@
<docTitle><text>Table of Contents</text></docTitle> <docTitle><text>Table of Contents</text></docTitle>
<py:def function="navpoint(np, level)"> <py:def function="navpoint(np, level)">
${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}"> ${'%*s'%(4*level,'')}<navPoint id="${str(uuid4())}" playOrder="${str(np.play_order)}">
${'%*s'%(4*level,'')}<navLabel> ${'%*s'%(4*level,'')}<navLabel>
${'%*s'%(4*level,'')}<text>${np.text}</text> ${'%*s'%(4*level,'')}<text>${np.text}</text>
${'%*s'%(4*level,'')}</navLabel> ${'%*s'%(4*level,'')}</navLabel>

View File

@ -483,7 +483,7 @@ class OPFCreator(MetaInformation):
Set the toc. You must call :method:`create_spine` before calling this Set the toc. You must call :method:`create_spine` before calling this
method. method.
`toc`: A :class:`TOC` object :param toc: A :class:`TOC` object
''' '''
self.toc = toc self.toc = toc
@ -491,12 +491,21 @@ class OPFCreator(MetaInformation):
self.guide = Guide.from_opf_guide(guide_element, self.base_path) self.guide = Guide.from_opf_guide(guide_element, self.base_path)
self.guide.set_basedir(self.base_path) self.guide.set_basedir(self.base_path)
def render(self, opf_stream, ncx_stream=None): def render(self, opf_stream, ncx_stream=None, ncx_manifest_entry=None):
from calibre.resources import opf_template from calibre.resources import opf_template
from calibre.utils.genshi.template import MarkupTemplate from calibre.utils.genshi.template import MarkupTemplate
template = MarkupTemplate(opf_template) template = MarkupTemplate(opf_template)
if self.manifest: if self.manifest:
self.manifest.set_basedir(self.base_path) self.manifest.set_basedir(self.base_path)
if ncx_manifest_entry is not None:
if not os.path.isabs(ncx_manifest_entry):
ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
remove = [i for i in self.manifest if i.id == 'ncx']
for item in remove:
self.manifest.remove(item)
self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path))
self.manifest[-1].id = 'ncx'
self.manifest[-1].mime_type = 'application/x-dtbncx+xml'
if not self.guide: if not self.guide:
self.guide = Guide() self.guide = Guide()
if self.cover: if self.cover:

View File

@ -23,6 +23,12 @@
</py:for> </py:for>
</metadata> </metadata>
<manifest py:if="getattr(mi, 'manifest', None)">
<py:for each="ref in mi.manifest">
<item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" />
</py:for>
</manifest>
<guide py:if="getattr(mi, 'guide', None)"> <guide py:if="getattr(mi, 'guide', None)">
<py:for each="ref in mi.guide"> <py:for each="ref in mi.guide">
<reference type="${ref.type}" href="${ref.href()}" py:with="attrs={'title': ref.title if ref.title else None}" py:attrs="attrs" /> <reference type="${ref.type}" href="${ref.href()}" py:with="attrs={'title': ref.title if ref.title else None}" py:attrs="attrs" />
@ -36,10 +42,5 @@
</py:for> </py:for>
</spine> </spine>
<manifest py:if="getattr(mi, 'manifest', None)">
<py:for each="ref in mi.manifest">
<item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" />
</py:for>
</manifest>
</package> </package>

View File

@ -29,8 +29,9 @@ class TOC(list):
self.base_path = base_path self.base_path = base_path
self.play_order = play_order self.play_order = play_order
def add_item(self, href, fragment, text): def add_item(self, href, fragment, text, play_order=None):
play_order = (self[-1].play_order if len(self) else self.play_order) + 1 if play_order is None:
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
self.append(TOC(href=href, fragment=fragment, text=text, parent=self, self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
base_path=self.base_path, play_order=play_order)) base_path=self.base_path, play_order=play_order))
return self[-1] return self[-1]
@ -113,14 +114,16 @@ class TOC(list):
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0]) soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
def process_navpoint(np, dest): def process_navpoint(np, dest):
play_order = np.get('playOrder', 1) play_order = np.get('playOrder', None)
if play_order is None:
play_order = int(np.get('playorder', 1))
href = fragment = text = None href = fragment = text = None
nl = np.find('navlabel') nl = np.find('navlabel')
if nl is not None: if nl is not None:
text = u'' text = u''
for txt in nl.findAll('text'): for txt in nl.findAll('text'):
text += ''.join([unicode(s) for s in txt.findAll(text=True)]) text += ''.join([unicode(s) for s in txt.findAll(text=True)])
content = elem.find('content') content = np.find('content')
if content is None or not content.has_key('src') or not txt: if content is None or not content.has_key('src') or not txt:
return return

View File

@ -719,6 +719,8 @@ class BasicNewsRecipe(object, LoggingInterface):
entries = ['index.html'] entries = ['index.html']
toc = TOC(base_path=dir) toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent): def feed_index(num, parent):
f = feeds[num] f = feeds[num]
@ -726,7 +728,12 @@ class BasicNewsRecipe(object, LoggingInterface):
if getattr(a, 'downloaded', False): if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(num, j) adir = 'feed_%d/article_%d/'%(num, j)
entries.append('%sindex.html'%adir) entries.append('%sindex.html'%adir)
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article')) po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages: for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp]) prefix = os.path.commonprefix([opf_path, sp])
@ -752,7 +759,11 @@ class BasicNewsRecipe(object, LoggingInterface):
if len(feeds) > 1: if len(feeds) > 1:
for i, f in enumerate(feeds): for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i) entries.append('feed_%d/index.html'%i)
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title)) po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po))
else: else:
entries.append('feed_%d/index.html'%0) entries.append('feed_%d/index.html'%0)
feed_index(0, toc) feed_index(0, toc)

View File

@ -206,11 +206,11 @@ def upload_user_manual():
check_call('scp -r src/calibre/manual/.build/html/* divok:%s'%USER_MANUAL) check_call('scp -r src/calibre/manual/.build/html/* divok:%s'%USER_MANUAL)
def build_src_tarball(): def build_src_tarball():
check_call('bzr export dist/calibre-%s.tar.bz2'%__version__) check_call('bzr export dist/calibre-%s.tar.gz'%__version__)
def upload_src_tarball(): def upload_src_tarball():
check_call('ssh divok rm -f %s/calibre-\*.tar.bz2'%DOWNLOADS) check_call('ssh divok rm -f %s/calibre-\*.tar.gz'%DOWNLOADS)
check_call('scp dist/calibre-*.tar.bz2 divok:%s/'%DOWNLOADS) check_call('scp dist/calibre-*.tar.gz divok:%s/'%DOWNLOADS)
def stage_one(): def stage_one():
check_call('sudo rm -rf build', shell=True) check_call('sudo rm -rf build', shell=True)
@ -226,16 +226,19 @@ def stage_one():
def stage_two(): def stage_two():
subprocess.check_call('rm -rf dist/*', shell=True) subprocess.check_call('rm -rf dist/*', shell=True)
build_installers() build_installers()
build_src_tarball()
def stage_three(): def stage_three():
print 'Uploading installers...' print 'Uploading installers...'
upload_installers() upload_installers()
print 'Uploading to PyPI' print 'Uploading to PyPI'
upload_src_tarball()
upload_docs() upload_docs()
upload_user_manual() upload_user_manual()
check_call('python setup.py register bdist_egg --exclude-source-files upload') check_call('rm -f dist/*')
check_call('python setup.py register')
check_call('python setup.py bdist_egg --exclude-source-files')
build_src_tarball()
upload_src_tarball()
check_call('python setup.py upload')
check_call('''rm -rf dist/* build/*''') check_call('''rm -rf dist/* build/*''')
check_call('''ssh divok bzr update /var/www/calibre.kovidgoyal.net/calibre/''') check_call('''ssh divok bzr update /var/www/calibre.kovidgoyal.net/calibre/''')