mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Refactor the container class to make it easy to create mocked containers for testing
This commit is contained in:
parent
fdfed12489
commit
0dd8f53b4e
@ -100,8 +100,94 @@ def href_to_name(href, root, base=None):
|
|||||||
fullpath = os.path.join(base, *href.split('/'))
|
fullpath = os.path.join(base, *href.split('/'))
|
||||||
return abspath_to_name(fullpath, root)
|
return abspath_to_name(fullpath, root)
|
||||||
|
|
||||||
|
class ContainerBase(object): # {{{
|
||||||
|
'''
|
||||||
|
A base class that implements just the parsing methods. Useful to create
|
||||||
|
virtual containers for testing.
|
||||||
|
'''
|
||||||
|
|
||||||
class Container(object): # {{{
|
#: The mode used to parse HTML and CSS (polishing uses tweak_mode=False and the editor uses tweak_mode=True)
|
||||||
|
tweak_mode = False
|
||||||
|
|
||||||
|
def __init__(self, log):
|
||||||
|
self.log = log
|
||||||
|
self.parsed_cache = {}
|
||||||
|
self.mime_map = {}
|
||||||
|
self.encoding_map = {}
|
||||||
|
self.html_preprocessor = HTMLPreProcessor()
|
||||||
|
self.css_preprocessor = CSSPreProcessor()
|
||||||
|
|
||||||
|
def guess_type(self, name):
|
||||||
|
' Return the expected mimetype for the specified file name based on its extension. '
|
||||||
|
# epubcheck complains if the mimetype for text documents is set to
|
||||||
|
# text/html in EPUB 2 books. Sigh.
|
||||||
|
ans = guess_type(name)
|
||||||
|
if ans == 'text/html':
|
||||||
|
ans = 'application/xhtml+xml'
|
||||||
|
return ans
|
||||||
|
|
||||||
|
def decode(self, data, normalize_to_nfc=True):
|
||||||
|
"""
|
||||||
|
Automatically decode ``data`` into a ``unicode`` object.
|
||||||
|
|
||||||
|
:param normalize_to_nfc: Normalize returned unicode to the NFC normal form as is required by both the EPUB and AZW3 formats.
|
||||||
|
"""
|
||||||
|
def fix_data(d):
|
||||||
|
return d.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
return fix_data(data)
|
||||||
|
bom_enc = None
|
||||||
|
if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
|
||||||
|
bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
|
||||||
|
b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
|
||||||
|
data = data[4:]
|
||||||
|
elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
|
||||||
|
bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
|
||||||
|
data = data[2:]
|
||||||
|
elif data[:3] == b'\xef\xbb\xbf':
|
||||||
|
bom_enc = 'utf-8'
|
||||||
|
data = data[3:]
|
||||||
|
if bom_enc is not None:
|
||||||
|
try:
|
||||||
|
self.used_encoding = bom_enc
|
||||||
|
return fix_data(data.decode(bom_enc))
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
self.used_encoding = 'utf-8'
|
||||||
|
return fix_data(data.decode('utf-8'))
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
pass
|
||||||
|
data, self.used_encoding = xml_to_unicode(data)
|
||||||
|
if normalize_to_nfc:
|
||||||
|
data = unicodedata.normalize('NFC', data)
|
||||||
|
return fix_data(data)
|
||||||
|
|
||||||
|
def parse_xml(self, data):
|
||||||
|
data, self.used_encoding = xml_to_unicode(
|
||||||
|
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
|
||||||
|
data = unicodedata.normalize('NFC', data)
|
||||||
|
return etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
|
|
||||||
|
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
|
||||||
|
if self.tweak_mode:
|
||||||
|
return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
return parse_html(
|
||||||
|
data, log=self.log, decoder=self.decode,
|
||||||
|
preprocessor=self.html_preprocessor, filename=fname,
|
||||||
|
non_html_file_tags={'ncx'})
|
||||||
|
except NotHTML:
|
||||||
|
return self.parse_xml(data)
|
||||||
|
|
||||||
|
def parse_css(self, data, fname='<string>', is_declaration=False):
|
||||||
|
return parse_css(data, fname=fname, is_declaration=is_declaration, decode=self.decode, log_level=logging.WARNING,
|
||||||
|
css_preprocessor=(None if self.tweak_mode else self.css_preprocessor))
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
|
class Container(ContainerBase): # {{{
|
||||||
|
|
||||||
'''
|
'''
|
||||||
A container represents an Open EBook as a directory full of files and an
|
A container represents an Open EBook as a directory full of files and an
|
||||||
@ -129,23 +215,16 @@ class Container(object): # {{{
|
|||||||
book_type = 'oeb'
|
book_type = 'oeb'
|
||||||
#: If this container represents an unzipped book (a directory)
|
#: If this container represents an unzipped book (a directory)
|
||||||
is_dir = False
|
is_dir = False
|
||||||
#: The mode used to parse HTML and CSS (polishing uses tweak_mode=False and the editor uses tweak_mode=True)
|
|
||||||
tweak_mode = False
|
|
||||||
|
|
||||||
SUPPORTS_TITLEPAGES = True
|
SUPPORTS_TITLEPAGES = True
|
||||||
SUPPORTS_FILENAMES = True
|
SUPPORTS_FILENAMES = True
|
||||||
|
|
||||||
def __init__(self, rootpath, opfpath, log, clone_data=None):
|
def __init__(self, rootpath, opfpath, log, clone_data=None):
|
||||||
|
ContainerBase.__init__(self, log)
|
||||||
self.root = clone_data['root'] if clone_data is not None else os.path.abspath(rootpath)
|
self.root = clone_data['root'] if clone_data is not None else os.path.abspath(rootpath)
|
||||||
self.log = log
|
|
||||||
self.html_preprocessor = HTMLPreProcessor()
|
|
||||||
self.css_preprocessor = CSSPreProcessor()
|
|
||||||
|
|
||||||
self.parsed_cache = {}
|
|
||||||
self.mime_map = {}
|
|
||||||
self.name_path_map = {}
|
self.name_path_map = {}
|
||||||
self.dirtied = set()
|
self.dirtied = set()
|
||||||
self.encoding_map = {}
|
|
||||||
self.pretty_print = set()
|
self.pretty_print = set()
|
||||||
self.cloned = False
|
self.cloned = False
|
||||||
self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')
|
self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')
|
||||||
@ -202,15 +281,6 @@ class Container(object): # {{{
|
|||||||
for name, path in self.name_path_map.iteritems()}
|
for name, path in self.name_path_map.iteritems()}
|
||||||
}
|
}
|
||||||
|
|
||||||
def guess_type(self, name):
|
|
||||||
' Return the expected mimetype for the specified file name based on its extension. '
|
|
||||||
# epubcheck complains if the mimetype for text documents is set to
|
|
||||||
# text/html in EPUB 2 books. Sigh.
|
|
||||||
ans = guess_type(name)
|
|
||||||
if ans == 'text/html':
|
|
||||||
ans = 'application/xhtml+xml'
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def add_name_to_manifest(self, name):
|
def add_name_to_manifest(self, name):
|
||||||
' Add an entry to the manifest for a file with the specified name. Returns the manifest id. '
|
' Add an entry to the manifest for a file with the specified name. Returns the manifest id. '
|
||||||
all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
|
all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
|
||||||
@ -417,43 +487,6 @@ class Container(object): # {{{
|
|||||||
:meth:`abspath_to_name` for that.'''
|
:meth:`abspath_to_name` for that.'''
|
||||||
return relpath(path, base or self.root)
|
return relpath(path, base or self.root)
|
||||||
|
|
||||||
def decode(self, data, normalize_to_nfc=True):
|
|
||||||
"""
|
|
||||||
Automatically decode ``data`` into a ``unicode`` object.
|
|
||||||
|
|
||||||
:param normalize_to_nfc: Normalize returned unicode to the NFC normal form as is required by both the EPUB and AZW3 formats.
|
|
||||||
"""
|
|
||||||
def fix_data(d):
|
|
||||||
return d.replace('\r\n', '\n').replace('\r', '\n')
|
|
||||||
if isinstance(data, unicode):
|
|
||||||
return fix_data(data)
|
|
||||||
bom_enc = None
|
|
||||||
if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
|
|
||||||
bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
|
|
||||||
b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
|
|
||||||
data = data[4:]
|
|
||||||
elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
|
|
||||||
bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
|
|
||||||
data = data[2:]
|
|
||||||
elif data[:3] == b'\xef\xbb\xbf':
|
|
||||||
bom_enc = 'utf-8'
|
|
||||||
data = data[3:]
|
|
||||||
if bom_enc is not None:
|
|
||||||
try:
|
|
||||||
self.used_encoding = bom_enc
|
|
||||||
return fix_data(data.decode(bom_enc))
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
self.used_encoding = 'utf-8'
|
|
||||||
return fix_data(data.decode('utf-8'))
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
data, self.used_encoding = xml_to_unicode(data)
|
|
||||||
if normalize_to_nfc:
|
|
||||||
data = unicodedata.normalize('NFC', data)
|
|
||||||
return fix_data(data)
|
|
||||||
|
|
||||||
def ok_to_be_unmanifested(self, name):
|
def ok_to_be_unmanifested(self, name):
|
||||||
return name in self.names_that_need_not_be_manifested
|
return name in self.names_that_need_not_be_manifested
|
||||||
|
|
||||||
@ -472,24 +505,6 @@ class Container(object): # {{{
|
|||||||
' Set of names that must never be renamed. Depends on the ebook file format. '
|
' Set of names that must never be renamed. Depends on the ebook file format. '
|
||||||
return set()
|
return set()
|
||||||
|
|
||||||
def parse_xml(self, data):
|
|
||||||
data, self.used_encoding = xml_to_unicode(
|
|
||||||
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
|
|
||||||
data = unicodedata.normalize('NFC', data)
|
|
||||||
return etree.fromstring(data, parser=RECOVER_PARSER)
|
|
||||||
|
|
||||||
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
|
|
||||||
if self.tweak_mode:
|
|
||||||
return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
return parse_html(
|
|
||||||
data, log=self.log, decoder=self.decode,
|
|
||||||
preprocessor=self.html_preprocessor, filename=fname,
|
|
||||||
non_html_file_tags={'ncx'})
|
|
||||||
except NotHTML:
|
|
||||||
return self.parse_xml(data)
|
|
||||||
|
|
||||||
def parse(self, path, mime):
|
def parse(self, path, mime):
|
||||||
with open(path, 'rb') as src:
|
with open(path, 'rb') as src:
|
||||||
data = src.read()
|
data = src.read()
|
||||||
@ -514,10 +529,6 @@ class Container(object): # {{{
|
|||||||
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
|
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def parse_css(self, data, fname='<string>', is_declaration=False):
|
|
||||||
return parse_css(data, fname=fname, is_declaration=is_declaration, decode=self.decode, log_level=logging.WARNING,
|
|
||||||
css_preprocessor=(None if self.tweak_mode else self.css_preprocessor))
|
|
||||||
|
|
||||||
def parsed(self, name):
|
def parsed(self, name):
|
||||||
''' Return a parsed representation of the file specified by name. For
|
''' Return a parsed representation of the file specified by name. For
|
||||||
HTML and XML files an lxml tree is returned. For CSS files a cssutils
|
HTML and XML files an lxml tree is returned. For CSS files a cssutils
|
||||||
|
Loading…
x
Reference in New Issue
Block a user