Bug fixes and auto import of metadata from .opf files

Kovid Goyal 2007-05-14 19:35:49 +00:00
parent dba36da690
commit 5961fe6fc7
2 changed files with 55 additions and 5 deletions
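
For context, the second file's diff below adds a try_opf() helper that pulls dc:title, dc:creator and dc:identifier entries out of an OPF package file with BeautifulStoneSoup. The sketch below is a minimal, self-contained illustration of the same idea using the standard library's ElementTree instead; the read_opf_metadata name, the directory argument and the returned dict are assumptions made for this example, not code from the commit.

# Illustrative sketch only: the commit itself parses the OPF file with
# BeautifulStoneSoup; the function name and returned dict are assumptions.
import glob, os
import xml.etree.ElementTree as ET

def read_opf_metadata(directory='.'):
    """Return title, authors and identifiers from the first .opf file in
    directory, or None if no OPF file is present."""
    candidates = glob.glob(os.path.join(directory, '*.opf'))
    if not candidates:
        return None
    root = ET.parse(candidates[0]).getroot()
    meta = {'title': None, 'authors': [], 'identifiers': []}
    for elem in root.iter():
        tag = elem.tag.rsplit('}', 1)[-1]   # strip any XML namespace prefix
        if tag == 'title' and meta['title'] is None:
            meta['title'] = (elem.text or '').strip()
        elif tag == 'creator':
            # OPF marks authors with role="aut" (opf:role="aut" when namespaced)
            role = elem.get('role') or elem.get('{http://www.idpf.org/2007/opf}role')
            if role in (None, 'aut'):
                meta['authors'].append((elem.text or '').strip())
        elif tag == 'identifier':
            scheme = elem.get('scheme') or elem.get('{http://www.idpf.org/2007/opf}scheme')
            meta['identifiers'].append((scheme, (elem.text or '').strip()))
    return meta

The try_opf() added in the commit instead writes the values straight into the options object and returns only the (scheme, identifier) pairs, which process_file then uses to look for a cover image.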

View File

@@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
 suit your distribution.
 """
-__version__ = "0.3.28"
+__version__ = "0.3.29"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

View File

@@ -14,13 +14,14 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+from libprs500.lrf.html.BeautifulSoup import BeautifulStoneSoup
 """
 Code to convert HTML ebooks into LRF ebooks.
 I am indebted to esperanc for the initial CSS->Xylog Style conversion routines
 and to Falstaff for pylrs.
 """
-import os, re, sys, shutil, traceback, copy, codecs
+import os, re, sys, shutil, traceback, copy, glob
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@@ -32,7 +33,7 @@ try:
 except ImportError:
     import Image as PILImage
-from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, \
+from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Comment, Tag, \
        NavigableString, Declaration, ProcessingInstruction
 from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
        ImageBlock, JumpButton, CharButton, \
@@ -997,9 +998,19 @@ class HTMLConverter(object):
 def process_file(path, options):
     cwd = os.getcwd()
     dirpath = None
+    default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
     try:
         dirpath, path = get_path(path)
         cpath, tpath = '', ''
+        isbn = try_opf(path, options)
+        if not options.cover and isbn:
+            for item in isbn:
+                matches = glob.glob(re.sub('-', '', item[1])+'.*')
+                for match in matches:
+                    if match.lower().endswith('.jpeg') or match.lower().endswith('.jpg') or \
+                       match.lower().endswith('.gif') or match.lower().endswith('.bmp'):
+                        options.cover = match
+                        break
         if options.cover:
             options.cover = os.path.abspath(os.path.expanduser(options.cover))
             cpath = options.cover
@@ -1021,6 +1032,10 @@ def process_file(path, options):
                 tpath = tf.name
             else:
                 raise ConversionError, 'Cannot read from: %s', (options.cover,)
+        if not options.title:
+            options.title = default_title
         title = (options.title, options.title_sort)
         author = (options.author, options.author_sort)
         args = dict(font_delta=options.font_delta, title=title, \
@@ -1051,7 +1066,7 @@ def process_file(path, options):
                    link_exclude=re.compile(le), page_break=pb,
                    hide_broken_links=not options.show_broken_links)
         conv.process_links()
         oname = options.output
         if not oname:
             suffix = '.lrs' if options.lrs else '.lrf'
             name = os.path.splitext(os.path.basename(path))[0] + suffix
@@ -1064,7 +1079,42 @@ def process_file(path, options):
         os.chdir(cwd)
         if dirpath:
             shutil.rmtree(dirpath, True)
+def try_opf(path, options):
+    try:
+        opf = glob.glob('*.opf')[0]
+    except IndexError:
+        return
+    soup = BeautifulStoneSoup(open(opf).read())
+    try:
+        title = soup.package.metadata.find('dc:title')
+        if title and not options.title:
+            options.title = title.string
+        creators = soup.package.metadata.findAll('dc:creator')
+        if options.author == 'Unknown':
+            for author in creators:
+                role = author.get('role')
+                if not role:
+                    role = author.get('opf:role')
+                if role == 'aut':
+                    options.author = author.string
+                    fa = author.get('file-as')
+                    if fa:
+                        options.author_sort = fa
+        isbn = []
+        for item in soup.package.metadata.findAll('dc:identifier'):
+            scheme = item.get('scheme')
+            if not scheme:
+                scheme = item.get('opf:scheme')
+            isbn.append((scheme, item.string))
+        return isbn
+    except Exception, err:
+        if options.verbose:
+            print >>sys.stderr, 'Failed to process opf file', err
+        pass
 def parse_options(argv=None, cli=True):
     """ CLI for html -> lrf conversions """
     if not argv:
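
For reference, the cover auto-detection added to process_file above globs for a file named after one of the OPF identifiers with the dashes stripped (typically an ISBN) and accepts it if the extension looks like an image. A standalone sketch of that matching step follows; find_cover and IMAGE_EXTS are names invented for this example, and identifiers mirrors the (scheme, value) pairs returned by try_opf.

# Sketch of the identifier -> cover lookup performed in process_file;
# find_cover and IMAGE_EXTS are illustrative names, not part of the commit.
import glob, re

IMAGE_EXTS = ('.jpeg', '.jpg', '.gif', '.bmp')

def find_cover(identifiers):
    """Return the first image file whose name matches an identifier value
    with dashes removed (e.g. an ISBN), or None when nothing matches."""
    for _scheme, value in identifiers or []:
        if not value:
            continue
        for match in glob.glob(re.sub('-', '', value) + '.*'):
            if match.lower().endswith(IMAGE_EXTS):
                return match
    return None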