Implement #518

2025-11-18 12:33:03 -05:00 · 2008-02-12 20:34:35 +00:00 · 2008-02-12 20:34:35 +00:00 · ac0c54d15c
commit ac0c54d15c
parent 3df02e65e1
3 changed files with 141 additions and 17 deletions
--- a/src/libprs500/ebooks/metadata/init.py
+++ b/src/libprs500/ebooks/metadata/init.py
@ -43,11 +43,12 @@ class MetaInformation(object):
    @staticmethod
    def copy(mi):
        ans = MetaInformation(mi.title, mi.authors)
-        ans.author_sort = mi.author_sort
-        ans.title_sort = mi.title_sort
-        ans.comments = mi.comments
-        ans.category = mi.category
-        ans.publisher = mi.publisher
+        # Copy every optional field that mi actually defines, then hand back the new object.
+        for attr in ('author_sort', 'title_sort', 'comments', 'category', 'publisher',
+                     'series', 'series_index', 'rating', 'isbn', 'tags', 'cover_data'):
+            if hasattr(mi, attr):
+                setattr(ans, attr, getattr(mi, attr))
+        return ans
    
    def __init__(self, title, authors):
        '''
@ -77,6 +78,32 @@ class MetaInformation(object):
        self.cover_data   = (None, None)  if not mi else mi.cover_data #(extension, data)
         
    
+    def smart_update(self, mi):
+        '''
+        Merge the information in C{mi} into self. In case of conflicts, the information
+        in C{mi} takes precedence, unless the information in mi is NULL. Tags from both
+        objects are unioned (first-seen order); the cover is replaced only if C{mi} has one.
+        '''
+        if mi.title and mi.title.lower() != 'unknown':
+            self.title = mi.title
+
+        # authors may hold a None entry when a reader found no author; treat it as unknown.
+        if mi.authors and mi.authors[0] and mi.authors[0].lower() != 'unknown':
+            self.authors = mi.authors
+
+        for attr in ('author_sort', 'title_sort', 'comments', 'category',
+                     'publisher', 'series', 'series_index', 'rating',
+                     'isbn'):
+            val = getattr(mi, attr, None)
+            if val is not None:
+                setattr(self, attr, val)
+
+        # Rebind instead of += so a tags list shared with another object is never mutated.
+        merged = list(self.tags or []) + list(mi.tags or [])
+        self.tags = [t for i, t in enumerate(merged) if t not in merged[:i]]
+        if mi.cover_data[0] is not None:
+            self.cover_data = mi.cover_data
+            
    def __str__(self):
        ans = u''
        ans += u'Title    : ' + unicode(self.title) + u'\n'
--- a/src/libprs500/ebooks/metadata/html.py
+++ b/src/libprs500/ebooks/metadata/html.py
@ -0,0 +1,62 @@
+#!/usr/bin/env  python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Try to read metadata from an HTML file.
+'''
+
+import re
+
+from libprs500.ebooks.metadata import MetaInformation
+
+def get_metadata(stream):
+    '''
+    Read metadata embedded in HTML comments such as C{<!-- TITLE="..." -->}.
+
+    The keys looked for are TITLE, AUTHOR, PUBLISHER and ISBN; for each key the
+    first occurrence following a C{<!--} is used.
+
+    @param stream: file-like object whose C{read()} returns the HTML source
+    @return: L{MetaInformation}; publisher and isbn are assigned only when found
+    '''
+    src = stream.read()
+
+    def find(key):
+        # key is always one of the fixed literals below, so no escaping is needed.
+        pat = re.compile(r'<!--.*?%s=[\'"]([^"\']+)[\'"].*?-->' % key, re.DOTALL)
+        match = pat.search(src)
+        return match.group(1) if match else None
+
+    title = find('TITLE')
+    author = find('AUTHOR')
+    if author:
+        author = author.replace(',', ';')
+
+    # Pass None rather than [None] when there is no author: a truthy list holding
+    # None would defeat callers that test C{mi.authors} before using its entries.
+    mi = MetaInformation(title, [author] if author else None)
+
+    publisher = find('PUBLISHER')
+    if publisher:
+        mi.publisher = publisher
+
+    isbn = find('ISBN')
+    if isbn:
+        # Drop separators; only digits and the X check character are kept.
+        mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
+
+    return mi
--- a/src/libprs500/ebooks/metadata/meta.py
+++ b/src/libprs500/ebooks/metadata/meta.py
@ -13,11 +13,14 @@
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

+import os, re
+
 from libprs500.ebooks.metadata.rtf  import get_metadata as rtf_metadata
 from libprs500.ebooks.lrf.meta      import get_metadata as lrf_metadata
 from libprs500.ebooks.metadata.pdf  import get_metadata as pdf_metadata
 from libprs500.ebooks.metadata.lit  import get_metadata as lit_metadata
 from libprs500.ebooks.metadata.epub import get_metadata as epub_metadata
+from libprs500.ebooks.metadata.html import get_metadata as html_metadata
 from libprs500.ebooks.metadata.rtf  import set_metadata as set_rtf_metadata
 from libprs500.ebooks.lrf.meta      import set_metadata as set_lrf_metadata

@ -25,17 +28,23 @@ from libprs500.ebooks.metadata import MetaInformation

 def get_metadata(stream, stream_type='lrf'):
    if stream_type: stream_type = stream_type.lower()
-    if stream_type == 'rtf':
-        return MetaInformation(rtf_metadata(stream), None)
-    if stream_type == 'lrf':
-        return MetaInformation(lrf_metadata(stream), None)
-    if stream_type == 'pdf':
-        return MetaInformation(pdf_metadata(stream), None)
-    if stream_type == 'lit':
-        return MetaInformation(lit_metadata(stream), None)
-    if stream_type == 'epub':
-        return MetaInformation(epub_metadata(stream), None)
-    return MetaInformation(None, None)
+    if stream_type in ('htm', 'html', 'xhtml', 'xhtm'):
+        stream_type = 'html'
+    # Explicit dispatch table instead of eval(): stream_type is caller supplied and may
+    # be None or an arbitrary string, neither of which should be evaluated as code.
+    readers = {'rtf': rtf_metadata, 'lrf': lrf_metadata, 'pdf': pdf_metadata,
+               'lit': lit_metadata, 'epub': epub_metadata, 'html': html_metadata}
+    func = readers.get(stream_type)
+    mi = func(stream) if func else MetaInformation(None, None)
+    # Start from what the file name suggests, then let the embedded metadata override it.
+    name = os.path.basename(stream.name) if hasattr(stream, 'name') else ''
+    base = metadata_from_filename(name)
+    if not base.title:
+        base.title = name if name else 'Unknown'
+    if not base.authors:
+        base.authors = ['Unknown']
+    base.smart_update(mi)
+    return base

 def set_metadata(stream, mi, stream_type='lrf'):
    if stream_type: stream_type = stream_type.lower()
@ -44,3 +53,29 @@ def set_metadata(stream, mi, stream_type='lrf'):
    elif stream_type == 'rtf':
        set_rtf_metadata(stream, mi)

+# Expected file name shape: "<title> - <author>"; the author part stops at the first '_'.
+_filename_pat = re.compile(r'(?P<title>.+) - (?P<author>[^_]+)')
+
+def metadata_from_filename(name):
+    '''
+    Guess metadata from a file name using C{_filename_pat}.
+
+    The named groups C{title}, C{author} and C{authors} are each optional in the
+    pattern; C{authors} is split on ',' and '&' and overrides C{author}.
+
+    @param name: file name; its extension, if any, is ignored
+    @return: L{MetaInformation}; title/authors are assigned only where the pattern matched
+    '''
+    mi = MetaInformation(None, None)
+    match = _filename_pat.search(os.path.splitext(name)[0])
+    # groupdict() never raises: a group that did not take part in the match maps to
+    # None, and a group absent from the pattern is simply not a key.
+    groups = match.groupdict() if match else {}
+    if groups.get('title'):
+        mi.title = groups['title']
+    if groups.get('author'):
+        mi.authors = [groups['author']]
+    if groups.get('authors'):
+        parts = re.split(r'[,&]', groups['authors'])
+        mi.authors = [p.strip() for p in parts if p.strip()]
+    return mi