mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:html2epub now works when passed OPF files
This commit is contained in:
parent
5e236b8edb
commit
3c404a7a66
@ -75,7 +75,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti
|
||||
if encoding == 'ascii':
|
||||
encoding = 'utf-8'
|
||||
|
||||
try:
|
||||
raw = raw.decode(encoding, 'replace')
|
||||
except LookupError:
|
||||
raw = raw.decode('utf-8', 'replace')
|
||||
if resolve_entities:
|
||||
from calibre import entity_to_unicode
|
||||
from functools import partial
|
||||
|
@ -53,6 +53,8 @@ The expression used must evaluate to a list of elements. To disable chapter dete
|
||||
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
|
||||
help on using this feature.
|
||||
''').replace('\n', ' '))
|
||||
structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both'],
|
||||
default='pagebreak', help=_('Specify how to mark detected chapters. A value of "pagebreak" will insert page breaks before chapters. A value of "rule" will insert a line before chapters. A value of "none" will disable chapter marking and a value of "both" will use both page breaks and lines to mark chapters.'))
|
||||
|
||||
toc = c.add_group('toc',
|
||||
_('''\
|
||||
@ -69,5 +71,7 @@ to auto-generate a Table of Contents.
|
||||
|
||||
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
|
||||
help=_('Print generated OPF file to stdout'))
|
||||
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
|
||||
help=_('Print generated NCX file to stdout'))
|
||||
|
||||
return c
|
@ -23,8 +23,9 @@ class HTMLProcessor(Processor):
|
||||
if opts.verbose > 2:
|
||||
self.debug_tree('parsed')
|
||||
self.detect_chapters()
|
||||
self.extract_css()
|
||||
|
||||
|
||||
self.extract_css()
|
||||
if opts.verbose > 2:
|
||||
self.debug_tree('nocss')
|
||||
|
||||
@ -97,8 +98,8 @@ def convert(htmlfile, opts, notification=None):
|
||||
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
|
||||
|
||||
if mi.cover and os.access(mi.cover, os.R_OK):
|
||||
shutil.copyfile(mi.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
|
||||
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
|
||||
shutil.copyfile(mi.cover, os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1]))
|
||||
cpath = os.path.join(tdir, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[1])
|
||||
shutil.copyfile(opf.cover, cpath)
|
||||
resources.append(cpath)
|
||||
mi.cover = cpath
|
||||
@ -107,21 +108,22 @@ def convert(htmlfile, opts, notification=None):
|
||||
mi = create_metadata(tdir, mi, spine, resources)
|
||||
buf = cStringIO.StringIO()
|
||||
if mi.toc:
|
||||
rebase_toc(mi.toc, htmlfile_map, opts.output)
|
||||
rebase_toc(mi.toc, htmlfile_map, tdir)
|
||||
if mi.toc is None or len(mi.toc) < 2:
|
||||
mi.toc = generated_toc
|
||||
for item in mi.manifest:
|
||||
if getattr(item, 'mime_type', None) == 'text/html':
|
||||
item.mime_type = 'application/xhtml+xml'
|
||||
with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
|
||||
mi.render(f, buf)
|
||||
mi.render(f, buf, 'toc.ncx')
|
||||
if opts.show_opf:
|
||||
print open(os.path.join(tdir, 'metadata.opf')).read()
|
||||
toc = buf.getvalue()
|
||||
if toc:
|
||||
with open(os.path.join(tdir, 'toc.ncx'), 'wb') as f:
|
||||
f.write(toc)
|
||||
|
||||
if opts.show_ncx:
|
||||
print toc
|
||||
epub = initialize_container(opts.output)
|
||||
epub.add_dir(tdir)
|
||||
print 'Output written to', opts.output
|
||||
|
@ -13,7 +13,8 @@ from urlparse import urlparse
|
||||
from urllib import unquote
|
||||
|
||||
from lxml import html, etree
|
||||
from lxml.etree import XPath
|
||||
from lxml.html import soupparser, HTMLParser
|
||||
from lxml.etree import XPath, XMLParser
|
||||
get_text = XPath("//text()")
|
||||
|
||||
from calibre import LoggingInterface, unicode_path
|
||||
@ -297,6 +298,8 @@ class PreProcessor(object):
|
||||
|
||||
class Parser(PreProcessor, LoggingInterface):
|
||||
|
||||
PARSER = HTMLParser(recover=True)
|
||||
|
||||
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
|
||||
LoggingInterface.__init__(self, logging.getLogger(name))
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
@ -318,6 +321,11 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
|
||||
self.parse_html()
|
||||
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
|
||||
for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
|
||||
if self.root.get(bad, None) is not None:
|
||||
self.root.attrib.pop(bad)
|
||||
|
||||
|
||||
|
||||
def save(self):
|
||||
'''
|
||||
@ -325,28 +333,30 @@ class Parser(PreProcessor, LoggingInterface):
|
||||
Should be called after all HTML processing is finished.
|
||||
'''
|
||||
with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
|
||||
f.write(html.tostring(self.root,
|
||||
encoding='utf-8', method='xml',
|
||||
include_meta_content_type=True,
|
||||
pretty_print=self.opts.pretty_print)
|
||||
)
|
||||
ans = html.tostring(self.root, encoding='utf-8', method='xml',
|
||||
pretty_print=self.opts.pretty_print,
|
||||
include_meta_content_type=True)
|
||||
ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
|
||||
f.write(ans)
|
||||
return f.name
|
||||
|
||||
|
||||
def parse_html(self):
|
||||
''' Create lxml ElementTree from HTML '''
|
||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
|
||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace').strip()
|
||||
src = self.preprocess(src)
|
||||
# lxml chokes on unicode input when it contains encoding declarations
|
||||
for pat in ENCODING_PATS:
|
||||
src = pat.sub('', src)
|
||||
try:
|
||||
self.root = html.document_fromstring(src)
|
||||
self.root = etree.HTML(src, self.PARSER)
|
||||
if self.root is None:
|
||||
raise ValueError('%s is empty'%self.htmlfile.path)
|
||||
except:
|
||||
if self.opts.verbose:
|
||||
self.log_exception('lxml based parsing failed')
|
||||
self.root = html.soupparser.fromstring()
|
||||
self.root = soupparser.fromstring(src)
|
||||
self.head = self.body = None
|
||||
head = self.root.xpath('//head')
|
||||
if head:
|
||||
@ -404,19 +414,27 @@ class Processor(Parser):
|
||||
def detect_chapters(self):
|
||||
self.detected_chapters = self.opts.chapter(self.root)
|
||||
for elem in self.detected_chapters:
|
||||
if self.opts.chapter_mark in ('both', 'pagebreak'):
|
||||
style = elem.get('style', '').strip()
|
||||
if style and not style.endswith(';'):
|
||||
style += '; '
|
||||
style += 'page-break-before: always'
|
||||
elem.set(style, style)
|
||||
elem.set('style', style)
|
||||
if self.opts.chapter_mark in ('both', 'rule'):
|
||||
hr = etree.Element('hr')
|
||||
if elem.getprevious() is None:
|
||||
elem.getparent()[:0] = [hr]
|
||||
else:
|
||||
insert = None
|
||||
for i, c in enumerate(elem.getparent()):
|
||||
if c is elem:
|
||||
insert = i
|
||||
break
|
||||
elem.getparent()[insert:insert] = [hr]
|
||||
|
||||
|
||||
def save(self):
|
||||
head = self.root.xpath('//head')
|
||||
if head:
|
||||
head = head[0]
|
||||
else:
|
||||
head = self.root.xpath('//body')
|
||||
head = head[0] if head else self.root
|
||||
head = self.head if self.head is not None else self.body
|
||||
style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
|
||||
style.text='\n'+self.css
|
||||
style.tail = '\n\n'
|
||||
@ -589,7 +607,7 @@ def search_for_opf(dir):
|
||||
|
||||
def get_filelist(htmlfile, opts):
|
||||
'''
|
||||
Build list of files references by html file or try to detect and use an
|
||||
Build list of files referenced by html file or try to detect and use an
|
||||
OPF file instead.
|
||||
'''
|
||||
print 'Building file list...'
|
||||
|
@ -43,7 +43,7 @@ class Resource(object):
|
||||
|
||||
def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True):
|
||||
self._href = None
|
||||
self._basedir = None
|
||||
self._basedir = basedir
|
||||
self.path = None
|
||||
self.fragment = ''
|
||||
try:
|
||||
@ -55,7 +55,7 @@ class Resource(object):
|
||||
if is_path:
|
||||
path = href_or_path
|
||||
if not os.path.isabs(path):
|
||||
path = os.path.abspath(os.path.join(path, basedir))
|
||||
path = os.path.abspath(os.path.join(basedir, path))
|
||||
if isinstance(path, str):
|
||||
path = path.decode(sys.getfilesystemencoding())
|
||||
self.path = path
|
||||
|
@ -39,7 +39,7 @@ def cover_from_isbn(isbn, timeout=5.):
|
||||
_timeout = socket.getdefaulttimeout()
|
||||
socket.setdefaulttimeout(timeout)
|
||||
try:
|
||||
src = browser.open('http://www.librarything.com/isbn/'+isbn).read()
|
||||
src = browser.open('http://www.librarything.com/isbn/'+isbn).read().decode('utf-8', 'replace')
|
||||
s = BeautifulSoup(src)
|
||||
url = s.find('td', attrs={'class':'left'})
|
||||
if url is None:
|
||||
|
@ -1,7 +1,10 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<?python
|
||||
from uuid import uuid4
|
||||
?>
|
||||
<ncx version="2005-1"
|
||||
xml:lang="en"
|
||||
xmlns="http://www.daisy.org/z3986/2005/ncx/"
|
||||
encoding="UTF-8"
|
||||
xmlns:py="http://genshi.edgewall.org/"
|
||||
>
|
||||
<head>
|
||||
@ -14,7 +17,7 @@
|
||||
<docTitle><text>Table of Contents</text></docTitle>
|
||||
|
||||
<py:def function="navpoint(np, level)">
|
||||
${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
|
||||
${'%*s'%(4*level,'')}<navPoint id="${str(uuid4())}" playOrder="${str(np.play_order)}">
|
||||
${'%*s'%(4*level,'')}<navLabel>
|
||||
${'%*s'%(4*level,'')}<text>${np.text}</text>
|
||||
${'%*s'%(4*level,'')}</navLabel>
|
||||
|
@ -483,7 +483,7 @@ class OPFCreator(MetaInformation):
|
||||
Set the toc. You must call :method:`create_spine` before calling this
|
||||
method.
|
||||
|
||||
`toc`: A :class:`TOC` object
|
||||
:param toc: A :class:`TOC` object
|
||||
'''
|
||||
self.toc = toc
|
||||
|
||||
@ -491,12 +491,21 @@ class OPFCreator(MetaInformation):
|
||||
self.guide = Guide.from_opf_guide(guide_element, self.base_path)
|
||||
self.guide.set_basedir(self.base_path)
|
||||
|
||||
def render(self, opf_stream, ncx_stream=None):
|
||||
def render(self, opf_stream, ncx_stream=None, ncx_manifest_entry=None):
|
||||
from calibre.resources import opf_template
|
||||
from calibre.utils.genshi.template import MarkupTemplate
|
||||
template = MarkupTemplate(opf_template)
|
||||
if self.manifest:
|
||||
self.manifest.set_basedir(self.base_path)
|
||||
if ncx_manifest_entry is not None:
|
||||
if not os.path.isabs(ncx_manifest_entry):
|
||||
ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
|
||||
remove = [i for i in self.manifest if i.id == 'ncx']
|
||||
for item in remove:
|
||||
self.manifest.remove(item)
|
||||
self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path))
|
||||
self.manifest[-1].id = 'ncx'
|
||||
self.manifest[-1].mime_type = 'application/x-dtbncx+xml'
|
||||
if not self.guide:
|
||||
self.guide = Guide()
|
||||
if self.cover:
|
||||
|
@ -23,6 +23,12 @@
|
||||
</py:for>
|
||||
</metadata>
|
||||
|
||||
<manifest py:if="getattr(mi, 'manifest', None)">
|
||||
<py:for each="ref in mi.manifest">
|
||||
<item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" />
|
||||
</py:for>
|
||||
</manifest>
|
||||
|
||||
<guide py:if="getattr(mi, 'guide', None)">
|
||||
<py:for each="ref in mi.guide">
|
||||
<reference type="${ref.type}" href="${ref.href()}" py:with="attrs={'title': ref.title if ref.title else None}" py:attrs="attrs" />
|
||||
@ -36,10 +42,5 @@
|
||||
</py:for>
|
||||
</spine>
|
||||
|
||||
<manifest py:if="getattr(mi, 'manifest', None)">
|
||||
<py:for each="ref in mi.manifest">
|
||||
<item id="${ref.id}" href="${ref.href()}" media-type="${ref.mime_type}" />
|
||||
</py:for>
|
||||
</manifest>
|
||||
|
||||
</package>
|
||||
|
@ -29,7 +29,8 @@ class TOC(list):
|
||||
self.base_path = base_path
|
||||
self.play_order = play_order
|
||||
|
||||
def add_item(self, href, fragment, text):
|
||||
def add_item(self, href, fragment, text, play_order=None):
|
||||
if play_order is None:
|
||||
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
|
||||
self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
|
||||
base_path=self.base_path, play_order=play_order))
|
||||
@ -113,14 +114,16 @@ class TOC(list):
|
||||
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
|
||||
|
||||
def process_navpoint(np, dest):
|
||||
play_order = np.get('playOrder', 1)
|
||||
play_order = np.get('playOrder', None)
|
||||
if play_order is None:
|
||||
play_order = int(np.get('playorder', 1))
|
||||
href = fragment = text = None
|
||||
nl = np.find('navlabel')
|
||||
if nl is not None:
|
||||
text = u''
|
||||
for txt in nl.findAll('text'):
|
||||
text += ''.join([unicode(s) for s in txt.findAll(text=True)])
|
||||
content = elem.find('content')
|
||||
content = np.find('content')
|
||||
if content is None or not content.has_key('src') or not txt:
|
||||
return
|
||||
|
||||
|
@ -719,6 +719,8 @@ class BasicNewsRecipe(object, LoggingInterface):
|
||||
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
@ -726,7 +728,12 @@ class BasicNewsRecipe(object, LoggingInterface):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
entries.append('%sindex.html'%adir)
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
@ -752,7 +759,11 @@ class BasicNewsRecipe(object, LoggingInterface):
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po))
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
|
15
upload.py
15
upload.py
@ -206,11 +206,11 @@ def upload_user_manual():
|
||||
check_call('scp -r src/calibre/manual/.build/html/* divok:%s'%USER_MANUAL)
|
||||
|
||||
def build_src_tarball():
|
||||
check_call('bzr export dist/calibre-%s.tar.bz2'%__version__)
|
||||
check_call('bzr export dist/calibre-%s.tar.gz'%__version__)
|
||||
|
||||
def upload_src_tarball():
|
||||
check_call('ssh divok rm -f %s/calibre-\*.tar.bz2'%DOWNLOADS)
|
||||
check_call('scp dist/calibre-*.tar.bz2 divok:%s/'%DOWNLOADS)
|
||||
check_call('ssh divok rm -f %s/calibre-\*.tar.gz'%DOWNLOADS)
|
||||
check_call('scp dist/calibre-*.tar.gz divok:%s/'%DOWNLOADS)
|
||||
|
||||
def stage_one():
|
||||
check_call('sudo rm -rf build', shell=True)
|
||||
@ -226,16 +226,19 @@ def stage_one():
|
||||
def stage_two():
|
||||
subprocess.check_call('rm -rf dist/*', shell=True)
|
||||
build_installers()
|
||||
build_src_tarball()
|
||||
|
||||
def stage_three():
|
||||
print 'Uploading installers...'
|
||||
upload_installers()
|
||||
print 'Uploading to PyPI'
|
||||
upload_src_tarball()
|
||||
upload_docs()
|
||||
upload_user_manual()
|
||||
check_call('python setup.py register bdist_egg --exclude-source-files upload')
|
||||
check_call('rm -f dist/*')
|
||||
check_call('python setup.py register')
|
||||
check_call('python setup.py bdist_egg --exclude-source-files')
|
||||
build_src_tarball()
|
||||
upload_src_tarball()
|
||||
check_call('python setup.py upload')
|
||||
check_call('''rm -rf dist/* build/*''')
|
||||
check_call('''ssh divok bzr update /var/www/calibre.kovidgoyal.net/calibre/''')
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user