From 0dd8f53b4e8b6c8e65603b70aec1523d1bf5e8cb Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 11 Apr 2016 16:31:07 +0530
Subject: [PATCH] Refactor the container class to make it easy to create mocked
 containers for testing

---
 src/calibre/ebooks/oeb/polish/container.py | 165 +++++++++++----------
 1 file changed, 88 insertions(+), 77 deletions(-)
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index e4ef417a7b..83ad3fc8dd 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -100,8 +100,94 @@ def href_to_name(href, root, base=None):
     fullpath = os.path.join(base, *href.split('/'))
     return abspath_to_name(fullpath, root)
 
+class ContainerBase(object):  # {{{
+    '''
+    A base class that implements just the decoding, mimetype guessing and parsing methods.
+    It needs only a log object to be constructed, which makes it useful to create virtual containers for testing.
+    '''
 
-class Container(object):  # {{{
+    #: The mode used to parse HTML and CSS (polishing uses tweak_mode=False and the editor uses tweak_mode=True)
+    tweak_mode = False  # when True, parse_xhtml() uses parse_html_tweak() and parse_css() is given no CSS preprocessor
+
+    def __init__(self, log):  # log is stored and later passed to the HTML parsers by parse_xhtml()
+        self.log = log
+        self.parsed_cache = {}  # this map and the next two start out empty; nothing in this base class fills them
+        self.mime_map = {}
+        self.encoding_map = {}
+        self.html_preprocessor = HTMLPreProcessor()  # used by parse_xhtml() when tweak_mode is False
+        self.css_preprocessor = CSSPreProcessor()  # used by parse_css() when tweak_mode is False
+
+    def guess_type(self, name):
+        ' Return the expected mimetype for the specified file name based on its extension. text/html is reported as application/xhtml+xml. '
+        # epubcheck complains if the mimetype for text documents is set to
+        # text/html in EPUB 2 books. Sigh.
+        ans = guess_type(name)  # resolves to the module-level guess_type(), not to this method
+        if ans == 'text/html':
+            ans = 'application/xhtml+xml'
+        return ans
+
+    def decode(self, data, normalize_to_nfc=True):
+        """
+        Automatically decode ``data`` into a ``unicode`` object with CRLF and CR line endings converted to LF. For byte input the encoding used is stored in ``self.used_encoding``.
+
+        :param normalize_to_nfc: Normalize returned unicode to the NFC normal form as is required by both the EPUB and AZW3 formats. Only applied on the final ``xml_to_unicode`` fallback path.
+        """
+        def fix_data(d):
+            return d.replace('\r\n', '\n').replace('\r', '\n')
+        if isinstance(data, unicode):  # already text: only the line endings are fixed
+            return fix_data(data)
+        bom_enc = None
+        if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:  # UTF-32 is tested first as its LE BOM starts with the UTF-16-LE BOM
+            bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
+                       b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
+            data = data[4:]
+        elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
+            bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
+            data = data[2:]
+        elif data[:3] == b'\xef\xbb\xbf':
+            bom_enc = 'utf-8'
+            data = data[3:]
+        if bom_enc is not None:
+            try:
+                self.used_encoding = bom_enc  # set before decoding; overwritten below if the decode fails
+                return fix_data(data.decode(bom_enc))
+            except UnicodeDecodeError:
+                pass  # the BOM was misleading; carry on with the BOM already stripped from data
+        try:
+            self.used_encoding = 'utf-8'
+            return fix_data(data.decode('utf-8'))
+        except UnicodeDecodeError:
+            pass
+        data, self.used_encoding = xml_to_unicode(data)  # last resort, also supplies the value for used_encoding
+        if normalize_to_nfc:  # NOTE: none of the early returns above perform NFC normalization
+            data = unicodedata.normalize('NFC', data)
+        return fix_data(data)
+
+    def parse_xml(self, data):  # decode data, normalize it to NFC and parse it as XML using RECOVER_PARSER
+        data, self.used_encoding = xml_to_unicode(  # side effect: overwrites self.used_encoding
+            data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
+        data = unicodedata.normalize('NFC', data)
+        return etree.fromstring(data, parser=RECOVER_PARSER)
+
+    def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):  # fname is used only if tweak_mode is False, force_html5_parse only if it is True
+        if self.tweak_mode:
+            return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
+        else:
+            try:
+                return parse_html(
+                    data, log=self.log, decoder=self.decode,
+                    preprocessor=self.html_preprocessor, filename=fname,
+                    non_html_file_tags={'ncx'})
+            except NotHTML:  # NotHTML raised by parse_html(): fall back to parsing the data as XML
+                return self.parse_xml(data)
+
+    def parse_css(self, data, fname='<string>', is_declaration=False):  # thin wrapper around the module-level parse_css()
+        return parse_css(data, fname=fname, is_declaration=is_declaration, decode=self.decode, log_level=logging.WARNING,
+                         css_preprocessor=(None if self.tweak_mode else self.css_preprocessor))  # no preprocessor in tweak mode
+# }}}
+
+
+class Container(ContainerBase):  # {{{
 
     '''
     A container represents an Open EBook as a directory full of files and an
@@ -129,23 +215,16 @@ class Container(object):  # {{{
     book_type = 'oeb'
     #: If this container represents an unzipped book (a directory)
     is_dir = False
-    #: The mode used to parse HTML and CSS (polishing uses tweak_mode=False and the editor uses tweak_mode=True)
-    tweak_mode = False
 
     SUPPORTS_TITLEPAGES = True
     SUPPORTS_FILENAMES = True
 
     def __init__(self, rootpath, opfpath, log, clone_data=None):
+        ContainerBase.__init__(self, log)
         self.root = clone_data['root'] if clone_data is not None else os.path.abspath(rootpath)
-        self.log = log
-        self.html_preprocessor = HTMLPreProcessor()
-        self.css_preprocessor = CSSPreProcessor()
 
-        self.parsed_cache = {}
-        self.mime_map = {}
         self.name_path_map = {}
         self.dirtied = set()
-        self.encoding_map = {}
         self.pretty_print = set()
         self.cloned = False
         self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')
@@ -202,15 +281,6 @@ class Container(object):  # {{{
                 for name, path in self.name_path_map.iteritems()}
         }
 
-    def guess_type(self, name):
-        ' Return the expected mimetype for the specified file name based on its extension. '
-        # epubcheck complains if the mimetype for text documents is set to
-        # text/html in EPUB 2 books. Sigh.
-        ans = guess_type(name)
-        if ans == 'text/html':
-            ans = 'application/xhtml+xml'
-        return ans
-
     def add_name_to_manifest(self, name):
         ' Add an entry to the manifest for a file with the specified name. Returns the manifest id. '
         all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
@@ -417,43 +487,6 @@ class Container(object):  # {{{
         :meth:`abspath_to_name` for that.'''
         return relpath(path, base or self.root)
 
-    def decode(self, data, normalize_to_nfc=True):
-        """
-        Automatically decode ``data`` into a ``unicode`` object.
-
-        :param normalize_to_nfc: Normalize returned unicode to the NFC normal form as is required by both the EPUB and AZW3 formats.
-        """
-        def fix_data(d):
-            return d.replace('\r\n', '\n').replace('\r', '\n')
-        if isinstance(data, unicode):
-            return fix_data(data)
-        bom_enc = None
-        if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
-            bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
-                       b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
-            data = data[4:]
-        elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
-            bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
-            data = data[2:]
-        elif data[:3] == b'\xef\xbb\xbf':
-            bom_enc = 'utf-8'
-            data = data[3:]
-        if bom_enc is not None:
-            try:
-                self.used_encoding = bom_enc
-                return fix_data(data.decode(bom_enc))
-            except UnicodeDecodeError:
-                pass
-        try:
-            self.used_encoding = 'utf-8'
-            return fix_data(data.decode('utf-8'))
-        except UnicodeDecodeError:
-            pass
-        data, self.used_encoding = xml_to_unicode(data)
-        if normalize_to_nfc:
-            data = unicodedata.normalize('NFC', data)
-        return fix_data(data)
-
     def ok_to_be_unmanifested(self, name):
         return name in self.names_that_need_not_be_manifested
 
@@ -472,24 +505,6 @@ class Container(object):  # {{{
         ' Set of names that must never be renamed. Depends on the ebook file format. '
         return set()
 
-    def parse_xml(self, data):
-        data, self.used_encoding = xml_to_unicode(
-            data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
-        data = unicodedata.normalize('NFC', data)
-        return etree.fromstring(data, parser=RECOVER_PARSER)
-
-    def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
-        if self.tweak_mode:
-            return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
-        else:
-            try:
-                return parse_html(
-                    data, log=self.log, decoder=self.decode,
-                    preprocessor=self.html_preprocessor, filename=fname,
-                    non_html_file_tags={'ncx'})
-            except NotHTML:
-                return self.parse_xml(data)
-
     def parse(self, path, mime):
         with open(path, 'rb') as src:
             data = src.read()
@@ -514,10 +529,6 @@ class Container(object):  # {{{
             ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
         return ans
 
-    def parse_css(self, data, fname='<string>', is_declaration=False):
-        return parse_css(data, fname=fname, is_declaration=is_declaration, decode=self.decode, log_level=logging.WARNING,
-                         css_preprocessor=(None if self.tweak_mode else self.css_preprocessor))
-
     def parsed(self, name):
         ''' Return a parsed representation of the file specified by name. For
         HTML and XML files an lxml tree is returned. For CSS files a cssutils