Add unit tests for HTML metadata imports

This commit is contained in:
Christopher Szucko 2019-08-11 09:30:35 -05:00
parent a81ff78c0a
commit b12c75c904

View File

@ -9,6 +9,7 @@ Try to read metadata from an HTML file.
''' '''
import re import re
import unittest
from collections import defaultdict from collections import defaultdict
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
@ -251,3 +252,184 @@ def get_metadata_(src, encoding=None):
mi.set_identifier(k, v[0]) mi.set_identifier(k, v[0])
return mi return mi
class MetadataHtmlTest(unittest.TestCase):
def compare_metadata(self, meta_a, meta_b):
for attr in ('title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series', 'series_index', 'rating', 'comments', 'tags', 'identifiers'):
self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))
def get_stream(self, test):
from io import BytesIO
raw = b'''\
<html>
<head>
'''
if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
raw += b'''\
}
<title>A Title Tag &amp;amp; Title &#x24B8;</title>
'''
if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
raw += b'''\
<meta name="dc:title" content="A Meta Tag &amp;amp; Title &#9400;" />
<meta name="dcterms.creator.aut" content="George Washington" />
<meta name="dc.publisher" content="Publisher A" />
<meta name="isbn" content="1234567890" />
<meta name="dc.language" content="English" />
<meta name="dc.date.published" content="2019-01-01" />
<meta name="dcterms.created" content="2018-01-01" />
<meta name="series" content="Meta Series" />
<meta name="seriesnumber" content="1" />
<meta name="rating" content="" />
<meta name="dc.description" content="" />
<meta name="tags" content="tag a, tag b" />
<meta name="dc.identifier.url" content="" />
<meta name="dc.identifier" scheme="" content="invalid" />
<meta name="dc.identifier." content="still invalid" />
<meta name="dc.identifier.conflicting" scheme="schemes" content="are also invalid" />
<meta name="dc.identifier.custom.subid" content="invalid too" />
'''
if test in {'meta_multi', 'comment_single', 'comment_multi'}:
raw += b'''\
<meta name="title" content="A Different Meta Tag &amp;amp; Title &#9400;" />
<meta name="author" content="John Adams with Thomas Jefferson" />
<meta name="publisher" content="Publisher B" />
<meta name="isbn" content="2345678901" />
<meta name="dcterms.language" content="Spanish" />
<meta name="date of publication" content="2017-01-01" />
<meta name="timestamp" content="2016-01-01" />
<meta name="series" content="Another Meta Series" />
<meta name="series.index" content="2" />
<meta name="rating" content="8" />
<meta name="comments" content="meta &quot;comments&quot; &#x2665; HTML &amp;amp;" />
<meta name="tags" content="tag c" />
<meta name="dc.identifier.url" content="http://google.com/search?q=calibre" />
'''
if test in {'comment_single', 'comment_multi'}:
raw += b'''\
<!-- TITLE="A Comment Tag &amp;amp; Title &#9400;" -->
<!-- AUTHOR="James Madison and James Monroe" -->
<!-- PUBLISHER="Publisher C" -->
<!-- ISBN="3456789012" -->
<!-- LANGUAGE="French" -->
<!-- PUBDATE="2015-01-01" -->
<!-- TIMESTAMP="2014-01-01" -->
<!-- SERIES="Comment Series" -->
<!-- SERIESNUMBER="3" -->
<!-- RATING="20" -->
<!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML too &amp;amp;" -->
<!-- TAGS="tag d" -->
'''
if test in {'comment_multi'}:
raw += b'''\
<!-- TITLE="Another Comment Tag &amp;amp; Title &#9400;" -->
<!-- AUTHOR="John Quincy Adams" -->
<!-- PUBLISHER="Publisher D" -->
<!-- ISBN="4567890123" -->
<!-- LANGUAGE="Japanese" -->
<!-- PUBDATE="2013-01-01" -->
<!-- TIMESTAMP="2012-01-01" -->
<!-- SERIES="Comment Series 2" -->
<!-- SERIESNUMBER="4" -->
<!-- RATING="1" -->
<!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML too &amp;amp; for sure" -->
<!-- TAGS="tag e, tag f" -->
'''
raw += b'''\
</head>
<body>
</body>
</html>
'''
return BytesIO(raw)
def test_input_title(self):
stream_meta = get_metadata(self.get_stream('title'))
canon_meta = Metadata('A Title Tag &amp; Title Ⓒ', [_('Unknown')])
self.compare_metadata(stream_meta, canon_meta)
def test_input_meta_single(self):
stream_meta = get_metadata(self.get_stream('meta_single'))
canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington'])
canon_meta.publisher = 'Publisher A'
canon_meta.languages = ['English']
canon_meta.pubdate = parse_date('2019-01-01')
canon_meta.timestamp = parse_date('2018-01-01')
canon_meta.series = 'Meta Series'
canon_meta.series_index = float(1)
# canon_meta.rating = float(0)
# canon_meta.comments = ''
canon_meta.tags = ['tag a', 'tag b']
canon_meta.set_identifiers({'isbn': '1234567890'})
self.compare_metadata(stream_meta, canon_meta)
def test_input_meta_multi(self):
stream_meta = get_metadata(self.get_stream('meta_multi'))
canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
canon_meta.publisher = 'Publisher A'
canon_meta.languages = ['English', 'Spanish']
canon_meta.pubdate = parse_date('2019-01-01')
canon_meta.timestamp = parse_date('2018-01-01')
canon_meta.series = 'Meta Series'
canon_meta.series_index = float(1)
canon_meta.rating = float(8)
canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
canon_meta.tags = ['tag a', 'tag b', 'tag c']
canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
def test_input_comment_single(self):
stream_meta = get_metadata(self.get_stream('comment_single'))
canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe'])
canon_meta.publisher = 'Publisher C'
canon_meta.languages = ['French']
canon_meta.pubdate = parse_date('2015-01-01')
canon_meta.timestamp = parse_date('2014-01-01')
canon_meta.series = 'Comment Series'
canon_meta.series_index = float(3)
canon_meta.rating = float(0)
canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML too &amp;amp;'
canon_meta.tags = ['tag d']
canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
def test_input_comment_multi(self):
stream_meta = get_metadata(self.get_stream('comment_multi'))
canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
canon_meta.publisher = 'Publisher C'
canon_meta.languages = ['French', 'Japanese']
canon_meta.pubdate = parse_date('2015-01-01')
canon_meta.timestamp = parse_date('2014-01-01')
canon_meta.series = 'Comment Series'
canon_meta.series_index = float(3)
canon_meta.rating = float(0)
canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML too &amp;amp;'
canon_meta.tags = ['tag d', 'tag e', 'tag f']
canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
def suite():
return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest)
def test():
unittest.TextTestRunner(verbosity=2).run(suite())
if __name__ == '__main__':
test()