Add unit tests for HTML metadata imports

This commit is contained in:
Christopher Szucko 2019-08-11 09:30:35 -05:00
parent a81ff78c0a
commit b12c75c904

View File

@ -9,6 +9,7 @@ Try to read metadata from an HTML file.
'''
import re
import unittest
from collections import defaultdict
from HTMLParser import HTMLParser
@ -251,3 +252,184 @@ def get_metadata_(src, encoding=None):
mi.set_identifier(k, v[0])
return mi
class MetadataHtmlTest(unittest.TestCase):
def compare_metadata(self, meta_a, meta_b):
for attr in ('title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series', 'series_index', 'rating', 'comments', 'tags', 'identifiers'):
self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))
def get_stream(self, test):
from io import BytesIO
raw = b'''\
<html>
<head>
'''
if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
raw += b'''\
}
<title>A Title Tag &amp;amp; Title &#x24B8;</title>
'''
if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
raw += b'''\
<meta name="dc:title" content="A Meta Tag &amp;amp; Title &#9400;" />
<meta name="dcterms.creator.aut" content="George Washington" />
<meta name="dc.publisher" content="Publisher A" />
<meta name="isbn" content="1234567890" />
<meta name="dc.language" content="English" />
<meta name="dc.date.published" content="2019-01-01" />
<meta name="dcterms.created" content="2018-01-01" />
<meta name="series" content="Meta Series" />
<meta name="seriesnumber" content="1" />
<meta name="rating" content="" />
<meta name="dc.description" content="" />
<meta name="tags" content="tag a, tag b" />
<meta name="dc.identifier.url" content="" />
<meta name="dc.identifier" scheme="" content="invalid" />
<meta name="dc.identifier." content="still invalid" />
<meta name="dc.identifier.conflicting" scheme="schemes" content="are also invalid" />
<meta name="dc.identifier.custom.subid" content="invalid too" />
'''
if test in {'meta_multi', 'comment_single', 'comment_multi'}:
raw += b'''\
<meta name="title" content="A Different Meta Tag &amp;amp; Title &#9400;" />
<meta name="author" content="John Adams with Thomas Jefferson" />
<meta name="publisher" content="Publisher B" />
<meta name="isbn" content="2345678901" />
<meta name="dcterms.language" content="Spanish" />
<meta name="date of publication" content="2017-01-01" />
<meta name="timestamp" content="2016-01-01" />
<meta name="series" content="Another Meta Series" />
<meta name="series.index" content="2" />
<meta name="rating" content="8" />
<meta name="comments" content="meta &quot;comments&quot; &#x2665; HTML &amp;amp;" />
<meta name="tags" content="tag c" />
<meta name="dc.identifier.url" content="http://google.com/search?q=calibre" />
'''
if test in {'comment_single', 'comment_multi'}:
raw += b'''\
<!-- TITLE="A Comment Tag &amp;amp; Title &#9400;" -->
<!-- AUTHOR="James Madison and James Monroe" -->
<!-- PUBLISHER="Publisher C" -->
<!-- ISBN="3456789012" -->
<!-- LANGUAGE="French" -->
<!-- PUBDATE="2015-01-01" -->
<!-- TIMESTAMP="2014-01-01" -->
<!-- SERIES="Comment Series" -->
<!-- SERIESNUMBER="3" -->
<!-- RATING="20" -->
<!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML too &amp;amp;" -->
<!-- TAGS="tag d" -->
'''
if test in {'comment_multi'}:
raw += b'''\
<!-- TITLE="Another Comment Tag &amp;amp; Title &#9400;" -->
<!-- AUTHOR="John Quincy Adams" -->
<!-- PUBLISHER="Publisher D" -->
<!-- ISBN="4567890123" -->
<!-- LANGUAGE="Japanese" -->
<!-- PUBDATE="2013-01-01" -->
<!-- TIMESTAMP="2012-01-01" -->
<!-- SERIES="Comment Series 2" -->
<!-- SERIESNUMBER="4" -->
<!-- RATING="1" -->
<!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML too &amp;amp; for sure" -->
<!-- TAGS="tag e, tag f" -->
'''
raw += b'''\
</head>
<body>
</body>
</html>
'''
return BytesIO(raw)
def test_input_title(self):
stream_meta = get_metadata(self.get_stream('title'))
canon_meta = Metadata('A Title Tag &amp; Title Ⓒ', [_('Unknown')])
self.compare_metadata(stream_meta, canon_meta)
def test_input_meta_single(self):
stream_meta = get_metadata(self.get_stream('meta_single'))
canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington'])
canon_meta.publisher = 'Publisher A'
canon_meta.languages = ['English']
canon_meta.pubdate = parse_date('2019-01-01')
canon_meta.timestamp = parse_date('2018-01-01')
canon_meta.series = 'Meta Series'
canon_meta.series_index = float(1)
# canon_meta.rating = float(0)
# canon_meta.comments = ''
canon_meta.tags = ['tag a', 'tag b']
canon_meta.set_identifiers({'isbn': '1234567890'})
self.compare_metadata(stream_meta, canon_meta)
def test_input_meta_multi(self):
stream_meta = get_metadata(self.get_stream('meta_multi'))
canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
canon_meta.publisher = 'Publisher A'
canon_meta.languages = ['English', 'Spanish']
canon_meta.pubdate = parse_date('2019-01-01')
canon_meta.timestamp = parse_date('2018-01-01')
canon_meta.series = 'Meta Series'
canon_meta.series_index = float(1)
canon_meta.rating = float(8)
canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
canon_meta.tags = ['tag a', 'tag b', 'tag c']
canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
def test_input_comment_single(self):
stream_meta = get_metadata(self.get_stream('comment_single'))
canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe'])
canon_meta.publisher = 'Publisher C'
canon_meta.languages = ['French']
canon_meta.pubdate = parse_date('2015-01-01')
canon_meta.timestamp = parse_date('2014-01-01')
canon_meta.series = 'Comment Series'
canon_meta.series_index = float(3)
canon_meta.rating = float(0)
canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML too &amp;amp;'
canon_meta.tags = ['tag d']
canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
def test_input_comment_multi(self):
stream_meta = get_metadata(self.get_stream('comment_multi'))
canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
canon_meta.publisher = 'Publisher C'
canon_meta.languages = ['French', 'Japanese']
canon_meta.pubdate = parse_date('2015-01-01')
canon_meta.timestamp = parse_date('2014-01-01')
canon_meta.series = 'Comment Series'
canon_meta.series_index = float(3)
canon_meta.rating = float(0)
canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML too &amp;amp;'
canon_meta.tags = ['tag d', 'tag e', 'tag f']
canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
self.compare_metadata(stream_meta, canon_meta)
def suite():
    """Return a test suite loaded from the MetadataHtmlTest test case."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(MetadataHtmlTest)
def test():
    """Run the suite built by suite() through a verbose text test runner."""
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(suite())
# Run the tests when this file is executed directly as a script.
if __name__ == '__main__':
    test()