Bug fixes and auto import of metadata from .opf files

2025-07-08 18:54:09 -04:00 · 2007-05-14 19:35:49 +00:00 · 2007-05-14 19:35:49 +00:00 · 5961fe6fc7
commit 5961fe6fc7
parent dba36da690
2 changed files with 55 additions and 5 deletions
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
 suit your distribution.
 """

-__version__   = "0.3.28"
+__version__   = "0.3.29"
 __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"

--- a/src/libprs500/lrf/html/convert_from.py
+++ b/src/libprs500/lrf/html/convert_from.py
@@ -14,13 +14,14 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+from libprs500.lrf.html.BeautifulSoup import BeautifulStoneSoup
 """ 
 Code to convert HTML ebooks into LRF ebooks.

 I am indebted to esperanc for the initial CSS->Xylog Style conversion routines
 and to Falstaff for pylrs.
 """
-import os, re, sys, shutil, traceback, copy, codecs
+import os, re, sys, shutil, traceback, copy, glob
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@@ -32,7 +33,7 @@ try:
 except ImportError:
    import Image as PILImage

-from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, \
+from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Comment, Tag, \
                                             NavigableString, Declaration, ProcessingInstruction
 from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
                                      ImageBlock, JumpButton, CharButton, \
@@ -997,9 +998,19 @@ class HTMLConverter(object):
 def process_file(path, options):
    cwd = os.getcwd()
    dirpath = None
+    default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
    try:
        dirpath, path = get_path(path)
        cpath, tpath = '', '' 
+        isbn = try_opf(path, options)
+        if not options.cover and isbn:
+            for item in isbn:
+                matches = glob.glob(re.sub('-', '', item[1])+'.*')
+                for match in matches:
+                    if match.lower().endswith('.jpeg') or match.lower().endswith('.jpg') or \
+                    match.lower().endswith('.gif') or match.lower().endswith('.bmp'):
+                        options.cover = match
+                        break
        if options.cover:
            options.cover = os.path.abspath(os.path.expanduser(options.cover))
            cpath = options.cover
@@ -1021,6 +1032,10 @@ def process_file(path, options):
                tpath = tf.name
            else:
                raise ConversionError, 'Cannot read from: %s', (options.cover,)
+        
+                    
+        if not options.title:
+            options.title = default_title
        title = (options.title, options.title_sort)
        author = (options.author, options.author_sort)
        args = dict(font_delta=options.font_delta, title=title, \
@@ -1051,7 +1066,7 @@ def process_file(path, options):
                             link_exclude=re.compile(le), page_break=pb,
                             hide_broken_links=not options.show_broken_links)
        conv.process_links()
-        oname = options.output
+        oname = options.output        
        if not oname:
            suffix = '.lrs' if options.lrs else '.lrf'
            name = os.path.splitext(os.path.basename(path))[0] + suffix
@@ -1064,7 +1079,42 @@ def process_file(path, options):
        os.chdir(cwd)
        if dirpath:
            shutil.rmtree(dirpath, True)
-        
+
+def try_opf(path, options):
+    """
+    Best-effort import of metadata from an OPF file in the current directory.
+
+    Fills in C{options.title}, C{options.author} and C{options.author_sort}
+    from the OPF file, but only where the user has not already supplied a
+    value (the author counts as unset when it equals C{'Unknown'}).
+
+    @param path: Path of the file being converted. Currently unused; kept so
+                 that existing callers keep working.
+    @param options: Parsed options object. Modified in place.
+    @return: A list of C{(scheme, value)} tuples, one for each
+             C{dc:identifier} element that has text, or C{None} when there
+             is no OPF file or it could not be processed.
+    """
+    # Sort so that the choice is deterministic when several OPF files exist.
+    opfs = sorted(glob.glob('*.opf'))
+    if not opfs:
+        return None
+    try:
+        # Reading happens inside the try block: an unreadable OPF file must
+        # not abort the conversion, since this metadata is optional.
+        f = open(opfs[0])
+        try:
+            soup = BeautifulStoneSoup(f.read())
+        finally:
+            f.close()
+        metadata = soup.package.metadata
+        title = metadata.find('dc:title')
+        if title and title.string and not options.title:
+            options.title = title.string
+        if options.author == 'Unknown':
+            # If several creators have the 'aut' role, the last one wins.
+            for author in metadata.findAll('dc:creator'):
+                role = author.get('role') or author.get('opf:role')
+                # Skip creators without text so the author is never set to None.
+                if role == 'aut' and author.string:
+                    options.author = author.string
+                    fa = author.get('file-as')
+                    if fa:
+                        options.author_sort = fa
+        isbn = []
+        for item in metadata.findAll('dc:identifier'):
+            # Identifiers without text are useless to callers, which treat
+            # the value as a string (e.g. to build a cover file name).
+            if not item.string:
+                continue
+            scheme = item.get('scheme') or item.get('opf:scheme')
+            isbn.append((scheme, item.string))
+        return isbn
+    except Exception, err:
+        # Deliberately broad: malformed OPF files are reported, not fatal.
+        if options.verbose:
+            print >>sys.stderr, 'Failed to process opf file', err
+        return None
+                
+            
+
 def parse_options(argv=None, cli=True):
    """ CLI for html -> lrf conversions """
    if not argv: