Added support for reading RTF metadata.

2025-06-23 15:30:45 -04:00 · 2007-01-29 08:52:47 +00:00 · 2007-01-29 08:52:47 +00:00 · 08aad56c04
commit 08aad56c04
parent 8327ce9c1e
5 changed files with 174 additions and 23 deletions
--- a/src/libprs500/gui/database.py
+++ b/src/libprs500/gui/database.py
@ -17,6 +17,7 @@ import os
 from zlib import compress, decompress
 from stat import ST_SIZE
 from libprs500.lrf.meta import LRFMetaFile, LRFException
+from libprs500.metadata.meta import get_metadata
 from cStringIO import StringIO as cStringIO

 class LibraryDatabase(object):
@ -54,28 +55,21 @@ class LibraryDatabase(object):
    
    def add_book(self, path):
        _file = os.path.abspath(path)
-        title, author, publisher, size, cover = os.path.basename(_file), \
-                                       None, None, os.stat(_file)[ST_SIZE], None
+        title, size, cover = os.path.basename(_file), \
+                                       os.stat(_file)[ST_SIZE], None
        ext = title[title.rfind(".")+1:].lower() if title.find(".") > -1 else None
-        comments, tags = None, None
-        if ext == "lrf":
-            lrf = LRFMetaFile(open(_file, "r+b"))
-            title, author, cover, publisher = lrf.title, lrf.author.strip(), \
-                                            lrf.thumbnail, lrf.publisher.strip()
-            if "unknown" in publisher.lower() or 'some publisher' in publisher.lower(): 
-                publisher = None
-            if "unknown" in author.lower(): 
-                author = None
-            comments = lrf.free_text
-            if not comments:
-                comments = None
-            classification, category = lrf.classification, lrf.category
-            if 'unknown' in classification.lower():
-                classification = ''
-            if 'unknown' in category.lower():
-                category = ''
-            if classification or category:
-                tags = ", ".join((classification, category))
+        mi = get_metadata(open(_file, "r+b"), ext)
+        tags = []
+        if not mi.title:
+            mi.title = title
+        if mi.category:
+            tags.append(mi.category)
+        if mi.classification:
+            tags.append(mi.classification)
+        if tags:
+            tags = ', '.join(tags)
+        else:
+            tags = None
        data = open(_file).read()
        usize = len(data)
        data = compress(data)
@ -86,7 +80,8 @@ class LibraryDatabase(object):
        self.con.execute("insert into books_meta (title, authors, publisher, "+\
                         "size, tags, comments, rating) values "+\
                         "(?,?,?,?,?,?,?)", \
-                         (title, author, publisher, size, tags, comments, None))
+                         (mi.title, mi.author, mi.publisher, size, tags, \
+                          mi.comments, None))
        _id =  self.con.execute("select max(id) from books_meta").next()[0]    
        self.con.execute("insert into books_data values (?,?,?,?)", \
                            (_id, ext, usize, sqlite.Binary(data)))
--- a/src/libprs500/lrf/meta.py
+++ b/src/libprs500/lrf/meta.py
@ -32,6 +32,7 @@ import xml.dom.minidom as dom
 from functools import wraps

 from libprs500.prstypes import field
+from libprs500.metadata import MetaInformation

 BYTE      = "<B"  #: Unsigned char little endian encoded in 1 byte 
 WORD      = "<H"  #: Unsigned short little endian encoded in 2 bytes 
@ -186,7 +187,29 @@ def insert_into_file(fileobj, data, start, end):
    return delta


-    
+def get_metadata(stream):
+    """
+    Return basic meta-data about the LRF file in C{stream} as a 
+    L{MetaInformation} object.
+
+    Every field is stripped of surrounding whitespace. A field that is 
+    empty, or that contains a known placeholder value, is reported as 
+    C{None} so that callers can test for missing data uniformly.
+
+    @param stream: A file like object that is passed to L{LRFMetaFile}.
+    @return: A L{MetaInformation} object.
+    """
+    def clean(value, placeholders=('unknown',)):
+        # Normalise one field: strip it and map empty/placeholder text to None
+        value = value.strip()
+        lowered = value.lower()
+        for placeholder in placeholders:
+            if placeholder in lowered:
+                return None
+        return value or None
+
+    lrf = LRFMetaFile(stream)
+    mi = MetaInformation(clean(lrf.title), clean(lrf.author))
+    # Comments are free text, so the word 'unknown' is legitimate there.
+    # Only an empty comment is turned into None.
+    mi.comments = clean(lrf.free_text, ())
+    mi.category = clean(lrf.category)
+    mi.classification = clean(lrf.classification)
+    mi.publisher = clean(lrf.publisher, ('unknown', 'some publisher'))
+    return mi

 class LRFMetaFile(object):
    """ Has properties to read and write all Meta information in a LRF file. """
--- a/src/libprs500/metadata/init.py
+++ b/src/libprs500/metadata/init.py
@ -18,3 +18,25 @@ the L{libprs500.lrf.meta} module.
 """
 __docformat__ = "epytext"
 __author__       = "Kovid Goyal <kovid@kovidgoyal.net>"
+
+
+class MetaInformation(object):
+    """
+    Simple container for the basic meta-data of a book.
+
+    C{title} and C{author} are set by the constructor. C{comments}, 
+    C{category}, C{classification} and C{publisher} start out as C{None} 
+    and are meant to be assigned by whoever fills in the object.
+    """
+    
+    def __init__(self, title, author):
+        """
+        @param title: Title of the book or C{None}.
+        @param author: Author of the book or C{None}.
+        """
+        self.title = title
+        self.author = author
+        self.comments = None
+        self.category = None
+        self.classification = None
+        self.publisher = None
+        
+    def __str__(self):
+        """ Return a multi-line, human readable summary of the meta-data. """
+        ans = ''
+        ans += 'Title   : ' + str(self.title) + '\n'
+        ans += 'Author  : ' + str(self.author) + '\n'
+        ans += 'Category: ' + str(self.category) + '\n'
+        ans += 'Comments: ' + str(self.comments) + '\n'
+        return ans.strip()
+    
+    def __nonzero__(self):
+        """
+        Return C{True} iff at least one of title, author, comments or 
+        category is set.
+        """
+        # __nonzero__ must return a bool or an int. Returning the raw string
+        # produced by the C{or} chain makes truth testing raise TypeError.
+        return bool(self.title or self.author or self.comments or \
+                    self.category)
--- a/src/libprs500/metadata/meta.py
+++ b/src/libprs500/metadata/meta.py
@ -0,0 +1,26 @@
+##    Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+from libprs500.metadata.rtf import get_metadata as rtf_metadata
+from libprs500.lrf.meta import get_metadata as lrf_metadata
+from libprs500.metadata import MetaInformation
+
+def get_metadata(stream, stream_type='lrf'):
+    """
+    Read meta-data from C{stream} using the reader for C{stream_type}.
+
+    @param stream: A file like object handed to the format specific reader.
+    @param stream_type: C{'rtf'} or C{'lrf'}. For any other value no reader 
+                        is called and an empty L{MetaInformation} is returned.
+    @return: Whatever the selected reader returns, or 
+             C{MetaInformation(None, None)} for an unrecognised type.
+    """
+    if stream_type == 'rtf':
+        reader = rtf_metadata
+    elif stream_type == 'lrf':
+        reader = lrf_metadata
+    else:
+        return MetaInformation(None, None)
+    return reader(stream)
+    
--- a/src/libprs500/metadata/rtf.py
+++ b/src/libprs500/metadata/rtf.py
@ -0,0 +1,85 @@
+##    Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+import re, cStringIO
+
+from libprs500.metadata import MetaInformation
+
+title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)\}', re.DOTALL)
+author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)\}', re.DOTALL)
+comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)\}', re.DOTALL)
+category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)\}', re.DOTALL)
+
+def get_document_info(stream):
+    """
+    Extract the C{{\\info ...}} group from the RTF data in C{stream}.
+
+    The stream is scanned from its start in fixed size chunks for the 
+    marker C{{\\info}. Once found, characters are copied until the braces 
+    opened by the group are balanced again.
+
+    @param stream: A seekable file like object.
+    @return: A tuple C{(block, pos)}. C{block} is the text of the info group 
+             and C{pos} the offset in C{stream} at which it starts. If the 
+             marker is not found C{(None, 0)} is returned. If the stream 
+             ends before the group is closed, the text read so far is 
+             returned.
+    """
+    block_size = 4096
+    marker = r'{\info'
+    stream.seek(0)
+    found, block = False, ""
+    while not found:
+        # Carry over the tail of the previous chunk so that a marker that
+        # straddles a chunk boundary is still detected.
+        prefix = block[-len(marker):]
+        chunk = stream.read(block_size)
+        if not chunk:
+            break
+        block = prefix + chunk
+        idx = block.find(marker)
+        if idx >= 0:
+            found = True
+            # block is exactly the len(block) characters that precede the
+            # current position, whatever the size of the last read was.
+            stream.seek(stream.tell() - len(block) + idx)
+    if not found:
+        return None, 0
+    pos = stream.tell()
+    data, count = [], 0
+    while True:
+        ch = stream.read(1)
+        if not ch:
+            # End of stream inside an unbalanced group: stop instead of
+            # spinning forever on empty reads.
+            break
+        if ch == '{':
+            count += 1
+        elif ch == '}':
+            count -= 1
+        data.append(ch)
+        if count == 0:
+            break
+    return ''.join(data), pos
+
+def get_metadata(stream):
+    """
+    Return the meta-data stored in the info group of the RTF file in 
+    C{stream} as a L{MetaInformation} object.
+
+    The title, author and category are taken from the C{\\title}, 
+    C{\\author} and C{\\category} entries. The C{\\subject} entry is 
+    reported as the comments. Entries that are absent are left as C{None}.
+
+    @param stream: A seekable file like object.
+    @return: A L{MetaInformation} object. It is empty if the file has no 
+             info group.
+    @raise Exception: If C{stream} does not start with C{{\\rtf}.
+    """
+    stream.seek(0)
+    if stream.read(5) != r'{\rtf':
+        raise Exception('Not a valid RTF file')
+    block = get_document_info(stream)[0]
+    if not block:
+        return MetaInformation(None, None)
+
+    def first_group(pattern):
+        # Stripped text captured by pattern, or None when it does not match
+        match = pattern.search(block)
+        if match:
+            return match.group(1).strip()
+        return None
+
+    mi = MetaInformation(first_group(title_pat), first_group(author_pat))
+    # The subject belongs in the comments, it must not overwrite the title
+    mi.comments = first_group(comment_pat)
+    mi.category = first_group(category_pat)
+    return mi
+    
+def main():
+    """
+    Print the meta-data of the RTF file named by the first command line 
+    argument.
+    """
+    import sys
+    # Binary mode keeps the seek/tell arithmetic of the parser exact on
+    # platforms that translate line endings in text mode.
+    stream = open(sys.argv[1], 'rb')
+    try:
+        print get_metadata(stream)
+    finally:
+        # Release the file handle even when parsing fails
+        stream.close()
+
+if __name__ == '__main__':
+    main()