mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
py3: Port RTF metadata module
This commit is contained in:
parent
dc15aff051
commit
8b280c03c3
@ -9,7 +9,7 @@ import unittest
|
|||||||
|
|
||||||
from setup import Command, islinux, isosx, iswindows, SRC
|
from setup import Command, islinux, isosx, iswindows, SRC
|
||||||
|
|
||||||
TEST_MODULES = frozenset('srv db polish opf css docx cfi matcher icu smartypants build misc dbcli'.split())
|
TEST_MODULES = frozenset('srv db polish opf css docx cfi matcher icu smartypants build misc dbcli ebooks'.split())
|
||||||
|
|
||||||
|
|
||||||
class TestImports(unittest.TestCase):
|
class TestImports(unittest.TestCase):
|
||||||
@ -103,6 +103,9 @@ def find_tests(which_tests=None):
|
|||||||
if ok('smartypants'):
|
if ok('smartypants'):
|
||||||
from calibre.utils.smartypants import run_tests
|
from calibre.utils.smartypants import run_tests
|
||||||
a(run_tests(return_tests=True))
|
a(run_tests(return_tests=True))
|
||||||
|
if ok('ebooks'):
|
||||||
|
from calibre.ebooks.metadata.rtf import find_tests
|
||||||
|
a(find_tests())
|
||||||
if ok('misc'):
|
if ok('misc'):
|
||||||
from calibre.ebooks.metadata.tag_mapper import find_tests
|
from calibre.ebooks.metadata.tag_mapper import find_tests
|
||||||
a(find_tests())
|
a(find_tests())
|
||||||
|
@ -1,20 +1,23 @@
|
|||||||
__license__ = 'GPL v3'
|
#!/usr/bin/env python2
|
||||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
# vim:fileencoding=utf-8
|
||||||
|
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Edit metadata in RTF files.
|
Edit metadata in RTF files.
|
||||||
"""
|
"""
|
||||||
import re, codecs
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
import codecs
|
||||||
|
import re
|
||||||
|
|
||||||
from calibre import force_unicode
|
from calibre import force_unicode
|
||||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
from polyglot.builtins import codepoint_to_chr, unicode_type, string_or_bytes
|
from polyglot.builtins import codepoint_to_chr, string_or_bytes, unicode_type, int_to_byte
|
||||||
|
|
||||||
title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
|
title_pat = re.compile(br'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
|
||||||
author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
|
author_pat = re.compile(br'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
|
||||||
comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
|
comment_pat = re.compile(br'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
|
||||||
tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
|
tags_pat = re.compile(br'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
|
||||||
publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
|
publisher_pat = re.compile(br'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
def get_document_info(stream):
|
def get_document_info(stream):
|
||||||
@ -61,36 +64,48 @@ def get_document_info(stream):
|
|||||||
|
|
||||||
|
|
||||||
def detect_codepage(stream):
    """
    Guess the codec of an RTF document from its ansicpg control word.

    Reads at most 512 bytes from the current position of ``stream`` (a
    binary file-like object) and searches them for ``ansicpg<number>``.

    Returns the codec name as a text string of the form ``cp<number>`` if
    Python knows such a codec, otherwise ``None`` (also when no ansicpg
    control word is found in the bytes that were read).
    """
    pat = re.compile(br'\\ansicpg(\d+)')
    match = pat.search(stream.read(512))
    if match is None:
        return None
    num = match.group(1)
    if num == b'0':
        # A code page number of 0 is treated as cp1252
        num = b'1252'
    # The pattern only matches ASCII digits, so this decode cannot fail
    codec = (b'cp' + num).decode('ascii')
    try:
        codecs.lookup(codec)
    except LookupError:
        # No codec with this name is available, report "unknown" instead of
        # handing callers a name they cannot decode with
        return None
    return codec
|
||||||
|
|
||||||
|
|
||||||
def encode(unistr):
    """
    Convert text into pure ASCII RTF by escaping every non-ASCII character.

    ``unistr`` is converted with force_unicode() first if it is not already
    unicode text. ASCII characters are passed through unchanged, everything
    else is written as ``\\uN?`` where N is the decimal code unit and ``?``
    is the fallback character for readers that do not understand ``\\u``.

    Returns a unicode string containing only ASCII characters.
    """
    if not isinstance(unistr, unicode_type):
        unistr = force_unicode(unistr)
    ans = []
    for c in unistr:
        num = ord(c)
        if num < 128:
            ans.append(c)
            continue
        if num > 0xFFFF:
            # \uN carries a 16-bit value, so a character outside the BMP has
            # to be written as a UTF-16 surrogate pair: high half first,
            # the low half is emitted by the common append below.
            num -= 0x10000
            ans.append('\\u{}?'.format(0xD800 + (num >> 10)))
            num = 0xDC00 + (num & 0x3FF)
        ans.append('\\u{}?'.format(num))
    return ''.join(ans)


def decode(raw, codec):
    """
    Convert the RTF character escapes in ``raw`` into the characters they
    stand for.

    ``raw`` may be bytes (decoded as ASCII, with undecodable bytes replaced)
    or unicode text. ``codec`` is the name of the codec used for the
    ``\\'hh`` escapes, if it is ``None`` those escapes are left untouched.
    Escapes that cannot be converted are replaced by ``?``.

    Returns unicode text.
    """
    # https://en.wikipedia.org/wiki/Rich_Text_Format#Character_encoding

    def codepage(match):
        # \'hh is a single byte in the document code page
        try:
            return int_to_byte(int(match.group(1), 16)).decode(codec)
        except ValueError:
            return '?'

    def uni(match):
        try:
            num = int(match.group(1))
            if num < 0:
                # RTF parameters are signed 16-bit numbers, so code units
                # above 32767 are commonly written as negative values
                num += 0x10000
            return codepoint_to_chr(num)
        except Exception:
            return '?'

    def join_surrogates(match):
        hi, lo = ord(match.group(1)), ord(match.group(2))
        return codepoint_to_chr(0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00))

    if isinstance(raw, bytes):
        raw = raw.decode('ascii', 'replace')

    if codec is not None:
        raw = re.sub(r"\\'([a-fA-F0-9]{2})", codepage, raw)

    # The character following the number is the fallback character and is
    # dropped. Up to seven digits are accepted so that out-of-range values
    # such as \u128512? (written for non-BMP characters by the previous
    # implementation of encode()) can still be read back.
    raw = re.sub(r'\\u(-?[0-9]{1,7}).', uni, raw)
    # Characters outside the BMP arrive as two consecutive \uN escapes
    # holding a UTF-16 surrogate pair, merge them into one character
    raw = re.sub('([\ud800-\udbff])([\udc00-\udfff])', join_surrogates, raw)
    return raw
|
||||||
|
|
||||||
|
|
||||||
@ -99,7 +114,7 @@ def get_metadata(stream):
|
|||||||
Return metadata as a L{MetaInfo} object
|
Return metadata as a L{MetaInfo} object
|
||||||
"""
|
"""
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
if stream.read(5) != r'{\rtf':
|
if stream.read(5) != br'{\rtf':
|
||||||
return MetaInformation(_('Unknown'))
|
return MetaInformation(_('Unknown'))
|
||||||
block = get_document_info(stream)[0]
|
block = get_document_info(stream)[0]
|
||||||
if not block:
|
if not block:
|
||||||
@ -121,7 +136,7 @@ def get_metadata(stream):
|
|||||||
author = None
|
author = None
|
||||||
mi = MetaInformation(title)
|
mi = MetaInformation(title)
|
||||||
if author:
|
if author:
|
||||||
mi.authors = string_to_authors(author)
|
mi.authors = [x.strip() for x in author.split(',')]
|
||||||
|
|
||||||
comment_match = comment_pat.search(block)
|
comment_match = comment_pat.search(block)
|
||||||
if comment_match is not None:
|
if comment_match is not None:
|
||||||
@ -147,7 +162,7 @@ def create_metadata(stream, options):
|
|||||||
if options.authors:
|
if options.authors:
|
||||||
au = options.authors
|
au = options.authors
|
||||||
if not isinstance(au, string_or_bytes):
|
if not isinstance(au, string_or_bytes):
|
||||||
au = u', '.join(au)
|
au = ', '.join(au)
|
||||||
author = encode(au)
|
author = encode(au)
|
||||||
md.append(r'{\author %s}'%(author,))
|
md.append(r'{\author %s}'%(author,))
|
||||||
comp = options.comment if hasattr(options, 'comment') else options.comments
|
comp = options.comment if hasattr(options, 'comment') else options.comments
|
||||||
@ -165,7 +180,7 @@ def create_metadata(stream, options):
|
|||||||
md.append('}')
|
md.append('}')
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
src = stream.read()
|
src = stream.read()
|
||||||
ans = src[:6] + u''.join(md) + src[6:]
|
ans = src[:6] + ''.join(md).encode('ascii') + src[6:]
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
stream.write(ans)
|
stream.write(ans)
|
||||||
|
|
||||||
@ -178,58 +193,59 @@ def set_metadata(stream, options):
|
|||||||
def add_metadata_item(src, name, val):
|
def add_metadata_item(src, name, val):
|
||||||
index = src.rindex('}')
|
index = src.rindex('}')
|
||||||
return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}'
|
return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}'
|
||||||
|
|
||||||
src, pos = get_document_info(stream)
|
src, pos = get_document_info(stream)
|
||||||
if src is None:
|
if src is None:
|
||||||
create_metadata(stream, options)
|
create_metadata(stream, options)
|
||||||
else:
|
else:
|
||||||
|
src = src.decode('ascii')
|
||||||
olen = len(src)
|
olen = len(src)
|
||||||
|
|
||||||
base_pat = r'\{\\name(.*?)(?<!\\)\}'
|
base_pat = r'\{\\name(.*?)(?<!\\)\}'
|
||||||
title = options.title
|
|
||||||
if title is not None:
|
def replace_or_create(src, name, val):
    """
    Set the RTF info entry ``name`` (for example 'title') to ``val``.

    ``val`` is passed through encode() and substituted for every existing
    ``{\\name ...}`` group found in ``src``. If there is no such group a
    new one is added with add_metadata_item().

    Returns the updated ``src``.
    """
    val = encode(val)
    pat = re.compile(base_pat.replace('name', name), re.DOTALL)
    replacement = '{\\' + name + ' ' + val + '}'
    # The replacement must be inserted verbatim. If it were passed as a
    # template string, re would process its backslash escapes: the \t of
    # {\title would become a TAB character and \s, \c, \m or the \uN
    # escapes produced by encode() would raise a "bad escape" error.
    src, num = pat.subn(lambda match: replacement, src)
    if num == 0:
        src = add_metadata_item(src, name, val)
    return src
|
||||||
comment = options.comments
|
|
||||||
if comment is not None:
|
if options.title is not None:
|
||||||
comment = encode(comment)
|
src = replace_or_create(src, 'title', options.title)
|
||||||
pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
|
if options.comments is not None:
|
||||||
if pat.search(src):
|
src = replace_or_create(src, 'subject', options.comments)
|
||||||
src = pat.sub(r'{\\subject ' + comment + r'}', src)
|
if options.authors is not None:
|
||||||
else:
|
src = replace_or_create(src, 'author', ', '.join(options.authors))
|
||||||
src = add_metadata_item(src, 'subject', comment)
|
if options.tags is not None:
|
||||||
author = options.authors
|
src = replace_or_create(src, 'category', ', '.join(options.tags))
|
||||||
if author is not None:
|
if options.publisher is not None:
|
||||||
author = ', '.join(author)
|
src = replace_or_create(src, 'manager', options.publisher)
|
||||||
author = encode(author)
|
|
||||||
pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
|
|
||||||
if pat.search(src):
|
|
||||||
src = pat.sub(r'{\\author ' + author + r'}', src)
|
|
||||||
else:
|
|
||||||
src = add_metadata_item(src, 'author', author)
|
|
||||||
tags = options.tags
|
|
||||||
if tags is not None:
|
|
||||||
tags = ', '.join(tags)
|
|
||||||
tags = encode(tags)
|
|
||||||
pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
|
|
||||||
if pat.search(src):
|
|
||||||
src = pat.sub(r'{\\category ' + tags + r'}', src)
|
|
||||||
else:
|
|
||||||
src = add_metadata_item(src, 'category', tags)
|
|
||||||
publisher = options.publisher
|
|
||||||
if publisher is not None:
|
|
||||||
publisher = encode(publisher)
|
|
||||||
pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
|
|
||||||
if pat.search(src):
|
|
||||||
src = pat.sub(r'{\\manager ' + publisher + r'}', src)
|
|
||||||
else:
|
|
||||||
src = add_metadata_item(src, 'manager', publisher)
|
|
||||||
stream.seek(pos + olen)
|
stream.seek(pos + olen)
|
||||||
after = stream.read()
|
after = stream.read()
|
||||||
stream.seek(pos)
|
stream.seek(pos)
|
||||||
stream.truncate()
|
stream.truncate()
|
||||||
stream.write(src)
|
stream.write(src.encode('ascii'))
|
||||||
stream.write(after)
|
stream.write(after)
|
||||||
|
|
||||||
|
|
||||||
|
def find_tests():
    """Return the unittest suite checking this module's metadata round trip."""
    import unittest
    from io import BytesIO
    from calibre.ebooks.metadata.book.base import Metadata

    class Test(unittest.TestCase):

        def test_rtf_metadata(self):
            # Write metadata with non-ASCII characters into a minimal RTF
            # document, then read it back and compare field by field
            rtf = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
            expected = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
            expected.tags = 'tag1 見tag2'.split()
            expected.comments = '<p>some ⊹comments</p>'
            expected.publisher = 'publiSher'
            set_metadata(rtf, expected)
            rtf.seek(0)
            actual = get_metadata(rtf)
            for field in 'title authors publisher comments tags'.split():
                self.assertEqual(getattr(expected, field), getattr(actual, field))

    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(Test)
|
||||||
|
@ -59,6 +59,9 @@ if is_py3:
|
|||||||
def cmp(a, b):
|
def cmp(a, b):
|
||||||
return (a > b) - (a < b)
|
return (a > b) - (a < b)
|
||||||
|
|
||||||
|
def int_to_byte(x):
|
||||||
|
return bytes((x,))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
exec("""def reraise(tp, value, tb=None):
|
exec("""def reraise(tp, value, tb=None):
|
||||||
try:
|
try:
|
||||||
@ -78,6 +81,7 @@ else:
|
|||||||
exec_path = execfile
|
exec_path = execfile
|
||||||
raw_input = builtins.raw_input
|
raw_input = builtins.raw_input
|
||||||
cmp = builtins.cmp
|
cmp = builtins.cmp
|
||||||
|
int_to_byte = chr
|
||||||
|
|
||||||
def iteritems(d):
|
def iteritems(d):
|
||||||
return d.iteritems()
|
return d.iteritems()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user