Implement #4825 (CHM format)

This commit is contained in:
Kovid Goyal 2010-03-05 10:49:28 -07:00
parent c7e8c889a4
commit eae90e2ef4
5 changed files with 172 additions and 7 deletions

View File

@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin):
description = _('Read metadata from %s files') % 'CHM' description = _('Read metadata from %s files') % 'CHM'
def get_metadata(self, stream, ftype): def get_metadata(self, stream, ftype):
from calibre.ebooks.metadata.chm import get_metadata from calibre.ebooks.chm.metadata import get_metadata
return get_metadata(stream) return get_metadata(stream)

View File

@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin):
rdr = CHMReader(chm_path, log) rdr = CHMReader(chm_path, log)
log.debug('Extracting CHM to %s' % output_dir) log.debug('Extracting CHM to %s' % output_dir)
rdr.extract_content(output_dir) rdr.extract_content(output_dir)
self._chm_reader = rdr
return rdr.hhc_path return rdr.hhc_path
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.metadata.chm import get_metadata_ from calibre.ebooks.chm.metadata import get_metadata_from_reader
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
log.debug('Processing CHM...') log.debug('Processing CHM...')
with TemporaryDirectory('chm2oeb') as tdir: with TemporaryDirectory('_chm2oeb') as tdir:
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')
for opt in html_input.options: for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value) setattr(options, opt.option.name, opt.recommended_value)
@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin):
log.debug('stream.name=%s' % stream.name) log.debug('stream.name=%s' % stream.name)
mainname = self._chmtohtml(tdir, chm_name, no_images, log) mainname = self._chmtohtml(tdir, chm_name, no_images, log)
mainpath = os.path.join(tdir, mainname) mainpath = os.path.join(tdir, mainname)
#raw_input()
metadata = get_metadata_(tdir) metadata = get_metadata_from_reader(self._chm_reader)
odi = options.debug_pipeline odi = options.debug_pipeline
options.debug_pipeline = None options.debug_pipeline = None
@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin):
if isinstance(node.tag, basestring): if isinstance(node.tag, basestring):
from calibre.ebooks.chm.reader import match_string from calibre.ebooks.chm.reader import match_string
chapter_path = None
if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'): if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
for child in node: for child in node:
if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'): if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):

View File

@ -0,0 +1,157 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import string_to_authors, MetaInformation
from calibre.utils.logging import default_log
from calibre.ptempfile import TemporaryFile
def _clean(s):
return s.replace(u'\u00a0', u' ')
def _detag(tag):
    """Flatten *tag* into a single unicode string without markup.

    Every item yielded by iterating over *tag* is inspected: items that
    expose a ``contents`` attribute are treated as nested elements and
    flattened recursively, all other items are treated as text and passed
    through ``_clean``.

    :param tag: an iterable of nested elements and text nodes.
    :return: the concatenated, cleaned text as a unicode string.
    """
    # Collect the pieces and join once; this also avoids naming a local
    # ``str``, which would shadow the builtin.
    pieces = []
    for elem in tag:
        if hasattr(elem, "contents"):
            pieces.append(_detag(elem))
        else:
            pieces.append(_clean(elem))
    return u"".join(pieces)
def _metadata_from_table(soup, searchfor):
    """Extract a metadata value that is laid out in a table.

    :param soup: the parsed home page.
    :param searchfor: regular expression fragment identifying the label
        text; it is matched case-insensitively.
    :return: the stripped value text, or ``None`` when the label cannot be
        found or when the label stands alone and no following ``<td>``
        holds a value.
    """
    label = soup.find('td', text=re.compile(searchfor, flags=re.I))
    if label is None:
        return None
    cell = label.parent
    # There appear to be multiple ways of structuring the metadata on the
    # home page, so two layouts are special-cased here.
    if re.match(r'^\s*'+searchfor+r'\s*$', cell.renderContents(), flags=re.I):
        # Layout 1: the element holds only the label, the value is expected
        # in the next <td> sibling.
        value_cell = cell.findNextSibling('td')
        if value_cell is None:
            # No sibling to read the value from; without this guard
            # _detag(None) raises TypeError.
            return None
        meta = _detag(value_cell)
        return re.sub('^:', '', meta).strip()
    # Layout 2: label and value share one element as "label: value"; drop
    # everything up to and including the first colon.
    meta = _detag(cell)
    return re.sub(r'^[^:]+:', '', meta).strip()
def _metadata_from_span(soup, searchfor):
    """Extract a metadata value stored in a ``<span>`` selected by class.

    :param soup: the parsed home page.
    :param searchfor: regular expression matched case-insensitively against
        the ``class`` attribute of ``<span>`` elements.
    :return: the stripped text of the first matching span with nested
        markup removed, or ``None`` when no span matches.
    """
    span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)})
    if span is None:
        return None
    # Hand the element itself to _detag (as _metadata_from_table does).
    # De-tagging the rendered string instead would walk it character by
    # character and leave any nested markup in the result.
    # this metadata might need some cleaning up still :/
    return _detag(span).strip()
def _get_authors(soup):
    """Return the list of authors found on the home page.

    The author string is taken from a span whose class matches ``author``
    or, failing that, from a table entry labelled ``by``. It is split with
    ``string_to_authors``. When neither lookup yields anything but ``None``
    the result is a one-element list holding the translated 'Unknown'.
    """
    # NOTE(review): ``_`` is not defined in the visible part of this module;
    # presumably it is the translation builtin installed by the application.
    unknown = [_('Unknown')]
    raw_authors = _metadata_from_span(soup, r'author')
    if not raw_authors:
        raw_authors = _metadata_from_table(soup, r'^\s*by\s*:?\s+')
    if raw_authors is None:
        return unknown
    return string_to_authors(raw_authors)
def _get_publisher(soup):
    """Return the publisher named on the home page.

    A span whose class matches ``imprint`` is preferred; if it yields
    nothing, the table entry labelled ``publisher`` is returned instead
    (which may itself be empty or ``None``).
    """
    from_span = _metadata_from_span(soup, 'imprint')
    if from_span:
        return from_span
    return _metadata_from_table(soup, 'publisher')
def _get_isbn(soup):
    """Return the ISBN named on the home page.

    A span whose class matches ``isbn`` is preferred; if it yields nothing,
    the table entry labelled ``isbn`` is returned instead (which may itself
    be empty or ``None``).
    """
    from_span = _metadata_from_span(soup, 'isbn')
    if from_span:
        return from_span
    return _metadata_from_table(soup, 'isbn')
def _get_comments(soup):
    """Build a short comment from the publication date and page count.

    :param soup: the parsed home page.
    :return: a string of the form ``u'Published <date>, <n> pages.'``, or
        ``None`` when the date or the page count cannot be determined.
    """
    date = (_metadata_from_span(soup, 'cwdate')
            or _metadata_from_table(soup, 'pub date'))
    pages = (_metadata_from_span(soup, 'pages')
             or _metadata_from_table(soup, 'pages'))
    try:
        # date span can have copyright symbols in it...
        date = date.replace(u'\u00a9', '').strip()
        # and pages often comes as '(\d+ pages)'
        pages = re.search(r'\d+', pages).group(0)
    except (AttributeError, TypeError):
        # AttributeError: date is None, or pages holds no digits so
        # re.search() returned None. TypeError: pages is None.
        # Anything else is a genuine error and is allowed to propagate
        # instead of being silently swallowed.
        return None
    return u'Published %s, %s pages.' % (date, pages)
def _get_cover(soup, rdr):
ans = None
try:
ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
except TypeError:
# meeehh, no handy alt-tag goodness, try some hackery
# the basic idea behind this is that in general, the cover image
# has a height:width ratio of ~1.25, whereas most of the nav
# buttons are decidedly less than that.
# what we do in this is work out that ratio, take 1.25 off it and
# save the absolute value when we sort by this value, the smallest
# one is most likely to be the cover image, hopefully.
r = {}
for img in soup('img'):
try:
r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src']
except KeyError:
# interestingly, occasionally the only image without height
# or width attrs is the cover...
r[0] = img['src']
l = r.keys()
l.sort()
ans = r[l[0]]
# this link comes from the internal html, which is in a subdir
if ans is not None:
try:
ans = rdr.GetFile(ans)
except:
ans = rdr.root + "/" + ans
try:
ans = rdr.GetFile(ans)
except:
ans = None
if ans is not None:
from PIL import Image
from cStringIO import StringIO
buf = StringIO()
try:
Image.open(StringIO(ans)).convert('RGB').save(buf, 'JPEG')
ans = buf.getvalue()
except:
ans = None
return ans
def get_metadata_from_reader(rdr):
    """Build a ``MetaInformation`` object from an opened CHM reader.

    The home page (``rdr.home``) is fetched through ``rdr.GetFile`` and
    parsed; the title comes from ``rdr.title``. Publisher, ISBN and comments
    are set only when a non-empty value was found, and cover data only when
    a cover could be produced.

    :param rdr: reader providing ``GetFile``, ``home`` and ``title``.
    :return: the populated ``MetaInformation`` instance.
    """
    home_raw = rdr.GetFile(rdr.home)
    home_text = xml_to_unicode(home_raw, strip_encoding_pats=True,
                               resolve_entities=True)[0]
    home = BeautifulSoup(home_text)

    mi = MetaInformation(rdr.title, _get_authors(home))

    optional_fields = (
        ('publisher', _get_publisher),
        ('isbn', _get_isbn),
        ('comments', _get_comments),
    )
    for field, getter in optional_fields:
        value = getter(home)
        if value:
            setattr(mi, field, value)

    cover = _get_cover(home, rdr)
    if cover is not None:
        mi.cover_data = ('jpg', cover)
    return mi
def get_metadata(stream):
    """Read metadata from a CHM file supplied as a readable stream.

    The whole stream is copied into a temporary file, which is then opened
    with ``CHMReader``. The reader is created and queried while the
    temporary file context is still active.

    :param stream: file-like object positioned at the CHM data.
    :return: whatever ``get_metadata_from_reader`` returns for the file.
    """
    with TemporaryFile('_chm_metadata.chm') as chm_path:
        with open(chm_path, 'wb') as out:
            out.write(stream.read())
        from calibre.ebooks.chm.reader import CHMReader
        reader = CHMReader(chm_path, default_log)
        return get_metadata_from_reader(reader)

View File

@ -135,8 +135,13 @@ class CHMReader(CHMFile):
if guess_mimetype(path)[0] == ('text/html'): if guess_mimetype(path)[0] == ('text/html'):
data = self._reformat(data) data = self._reformat(data)
f.write(data) f.write(data)
#subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
self._extracted = True self._extracted = True
files = os.listdir(output_dir)
if self.hhc_path not in files:
for f in files:
if f.lower() == self.hhc_path.lower():
self.hhc_path = f
break
def _reformat(self, data): def _reformat(self, data):
try: try:

View File

@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
|app| supports the conversion of many input formats to many output formats. |app| supports the conversion of many input formats to many output formats.
It can convert every input format in the following list, to every output format. It can convert every input format in the following list, to every output format.
*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT *Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT
@ -191,7 +191,7 @@ Library Management
What formats does |app| read metadata from? What formats does |app| read metadata from?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI |app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
Where are the book files stored? Where are the book files stored?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~