From f277f2b870b77fa611a72094f330d74655e68fc6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Sep 2008 09:46:55 -0700 Subject: [PATCH] IGN:Various regression fixes and an incomplete epub split implementation --- src/calibre/ebooks/epub/__init__.py | 7 +- src/calibre/ebooks/epub/from_html.py | 29 +- src/calibre/ebooks/epub/split.py | 175 +++++++ src/calibre/ebooks/html.py | 11 +- src/calibre/ebooks/metadata/opf2.py | 9 + src/calibre/gui2/main.py | 12 +- src/calibre/linux.py | 4 + src/calibre/translations/nb.po | 2 - src/calibre/web/fetch/simple.py | 1 - src/encutils/__init__.py | 655 +++++++++++++++++++++++++++ 10 files changed, 872 insertions(+), 33 deletions(-) create mode 100644 src/calibre/ebooks/epub/split.py create mode 100644 src/encutils/__init__.py diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 0dcc4b24d1..7c9d1197a9 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en' Conversion to EPUB. ''' import sys, textwrap +from lxml import html from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED -from calibre.ebooks.html import config as common_config +from calibre.ebooks.html import config as common_config, tostring class DefaultProfile(object): @@ -42,7 +43,6 @@ def initialize_container(path_to_container, opf_name='metadata.opf'): zf.writestr('META-INF/', '', 0700) zf.writestr('META-INF/container.xml', CONTAINER) return zf - def config(defaults=None): desc = _('Options to control the conversion to EPUB') @@ -59,7 +59,8 @@ def config(defaults=None): help=_('The output EPUB file. If not specified, it is derived from the input file name.')) c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()), help=_('Profile of the target device this EPUB is meant for. Set to None to create a device independent EPUB. The profile is used for device specific restrictions on the EPUB. Choices are: ')+str(list(PROFILES.keys()))) - + c.add_opt('override_css', ['--override-css'], default=None, + help=_('Either the path to a CSS stylesheet or raw CSS. 
This CSS will override any existing CSS declarations in the source files.')) structure = c.add_group('structure detection', _('Control auto-detection of document structure.')) structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]", help=_('''\ diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index a4ffa224e4..a94a68c76b 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -18,6 +18,7 @@ from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.epub import initialize_container, PROFILES +from calibre.ebooks.epub.split import split class HTMLProcessor(Processor): @@ -34,18 +35,8 @@ class HTMLProcessor(Processor): if opts.verbose > 2: self.debug_tree('nocss') - self.collect_font_statistics() + #self.collect_font_statistics() - self.split() - - def save(self): - file = Processor.save(self) - with open(file, 'rb') as f: - f.seek(0, 2) - size = f.tell() - if size > self.opts.profile.flow_size: - self.split() - def collect_font_statistics(self): ''' @@ -58,12 +49,6 @@ class HTMLProcessor(Processor): #TODO: Use cssutils on self.raw_css to figure out the font size # of this piece of text and update statistics accordingly - def split(self): - ''' Split into individual flows to accommodate Adobe's incompetence ''' - # TODO: Only split file larger than 300K (as specified in profile) - # Split on page breaks first and then on tags and then on - #
<div> and finally on <p>
. - pass def config(defaults=None): @@ -88,6 +73,7 @@ def parse_content(filelist, opts, tdir): resource_map, filelist) hp.populate_toc(toc) hp.save() + return resource_map, hp.htmlfile_map, toc def convert(htmlfile, opts, notification=None): @@ -96,6 +82,11 @@ def convert(htmlfile, opts, notification=None): opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub' opts.profile = PROFILES[opts.profile] opts.output = os.path.abspath(opts.output) + if opts.override_css is not None: + try: + opts.override_css = open(opts.override_css, 'rb').read().decode('utf-8', 'replace') + except: + opts.override_css = opts.override_css.decode('utf-8', 'replace') if htmlfile.lower().endswith('.opf'): opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) @@ -153,7 +144,8 @@ def convert(htmlfile, opts, notification=None): for item in mi.manifest: if getattr(item, 'mime_type', None) == 'text/html': item.mime_type = 'application/xhtml+xml' - with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f: + opf_path = os.path.join(tdir, 'metadata.opf') + with open(opf_path, 'wb') as f: mi.render(f, buf, 'toc.ncx') if opts.show_opf: print open(os.path.join(tdir, 'metadata.opf')).read() @@ -163,6 +155,7 @@ def convert(htmlfile, opts, notification=None): f.write(toc) if opts.show_ncx: print toc + split(opf_path, opts) epub = initialize_container(opts.output) epub.add_dir(tdir) print 'Output written to', opts.output diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py new file mode 100644 index 0000000000..da8d6487f4 --- /dev/null +++ b/src/calibre/ebooks/epub/split.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Split the flows in an epub file to conform to size limitations. 
+'''
+
+import sys, os, math, copy
+
+from lxml.etree import parse, XMLParser
+from lxml.cssselect import CSSSelector
+
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre.ebooks.epub import tostring
+
+PARSER = XMLParser(recover=True)
+
+# Namespace map needed for the EXSLT regular expression extensions
+# used in the fallback XPath expressions in find_split_point()
+XPNSMAP = {'re': 'http://exslt.org/regular-expressions'}
+
+class SplitError(ValueError):
+
+    def __init__(self, path):
+        ValueError.__init__(self, _('Could not find reasonable point at which to split: ')+os.path.basename(path))
+
+def split_tree(tree, split_point, before, opts, filepath):
+    trees = []
+    tree2 = copy.deepcopy(tree)
+    path = tree.getpath(split_point)
+    root, root2 = tree.getroot(), tree2.getroot()
+    body, body2 = root.xpath('//body')[0], root2.xpath('//body')[0]
+    split_point2 = root2.xpath(path)[0]
+
+    # Tree 1: keep content before the split point, blank out the rest
+    hit_split_point = False
+    for elem in body.iterdescendants():
+        if elem is split_point:
+            hit_split_point = True
+            if before:
+                elem.text = u''
+                elem.tail = u''
+                elem.set('calibre_split', '1')
+            continue
+        if hit_split_point:
+            elem.text = u''
+            elem.tail = u''
+            elem.set('calibre_split', '1')
+
+    # Tree 2: keep content after the split point, blank out the rest
+    hit_split_point = False
+    for elem in body2.iterdescendants():
+        if elem is split_point2:
+            hit_split_point = True
+            if not before:
+                elem.text = u''
+                elem.tail = u''
+                elem.set('calibre_split', '1')
+            continue
+        if not hit_split_point:
+            elem.text = u''
+            elem.tail = u''
+            elem.set('calibre_split', '1')
+
+    # Recurse on any half that is still over the size limit
+    for t, r in [(tree, root), (tree2, root2)]:
+        if len(tostring(r)) < opts.profile.flow_size:
+            trees.append(t)
+        else:
+            new_split_point, before = find_split_point(t)
+            if new_split_point is None:
+                raise SplitError(filepath)
+            trees.extend(split_tree(t, new_split_point, before, opts, filepath))
+
+    return trees
+
+
+def find_split_point(tree):
+    root = tree.getroot()
+
+    def pick_elem(elems):
+        # Pick the middle element that has not already served as a split point
+        if elems:
+            elems = [i for i in elems if i.get('calibre_split', '0') != '1']
+            if elems:
+                i = int(math.floor(len(elems)/2.))
+                return elems[i]
+
+    # Prefer split points that the stylesheet marks as explicit page breaks
+    css = root.xpath('//style[@type="text/css"]')
+    if css:
+
+        def selector_element(rule):
+            try:
+                selector = CSSSelector(rule.selectorText)
+                return pick_elem(selector(root))
+            except:
+                return None
+
+        css = css[0].text
+        from cssutils import CSSParser
+        stylesheet = CSSParser().parseString(css)
+        for rule in stylesheet:
+            if rule.type != rule.STYLE_RULE:
+                continue
+            before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
+            if before and before != 'avoid':
+                elem = selector_element(rule)
+                if elem is not None:
+                    return elem, True
+            after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
+            if after and after != 'avoid':
+                elem = selector_element(rule)
+                if elem is not None:
+                    return elem, False
+
+    # Fall back to headings, then <div>s directly under <body>, then paragraphs
+    for path in ('//*[re:match(name(), "h[1-6]", "i")]', '//body/div', '//p'):
+        elems = root.xpath(path, namespaces=XPNSMAP)
+        elem = pick_elem(elems)
+        if elem is not None:
+            return elem, True
+
+    return None, True
+
+def do_split(path, opts):
+    tree = parse(path, parser=PARSER)
+    split_point, before = find_split_point(tree)
+    if split_point is None:
+        raise SplitError(path)
+    trees = split_tree(tree, split_point, before, opts, path)
+    base = os.path.splitext(os.path.basename(path))[0] + '_split_%d.html'
+    anchor_map = {None:base%0}
+    files = []
+    for i, tree in enumerate(trees):
+        root = tree.getroot()
+        files.append(base%i)
+        for elem in root.xpath('//*[@id and @calibre_split = "1"]'):
+            anchor_map[elem.get('id')] = files[-1]
+            elem.attrib.pop('calibre_split')
+        for elem in root.xpath('//*[@calibre_split]'):
elem.attrib.pop('calibre_split') + open(os.path.join(os.path.dirname(path), files[-1]), 'wb').write(tostring(root, pretty_print=opts.pretty_print)) + os.remove(path) + return path, files, anchor_map + +def fix_opf(opf, orig_file, files, anchor_map): + orig = None + for item in opf.manifest: + if os.path.samefile(orig_file, item.path): + orig = item + break + opf.manifest.remove(orig) + ids = [] + for f in files: + ids.append(opf.manifest.add_item(f)) + + +def split(pathtoopf, opts): + return + pathtoopf = os.path.abspath(pathtoopf) + opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) + html_files = [] + for item in opf.manifest: + if 'html' in item.mime_type.lower(): + html_files.append(item.path) + changes = [] + for f in html_files: + if os.stat(f).st_size > opts.profile.flow_size: + fix_opf(opf, *do_split(f, opts)) + if changes: + pass + + + + + +def main(args=sys.argv): + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index a01f426e38..7532d43cf8 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -27,6 +27,11 @@ from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.zipfile import ZipFile +def tostring(root, pretty_print=False): + return html.tostring(root, encoding='utf-8', method='xml', + pretty_print=pretty_print, + include_meta_content_type=True) + class Link(object): ''' @@ -332,9 +337,7 @@ class Parser(PreProcessor, LoggingInterface): Should be called after all HTML processing is finished. ''' with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f: - ans = html.tostring(self.root, encoding='utf-8', method='xml', - pretty_print=self.opts.pretty_print, - include_meta_content_type=True) + ans = tostring(self.root, pretty_print=self.opts.pretty_print) ans = re.compile(r'', re.IGNORECASE).sub('', ans) ans = re.compile(r']*?>', re.IGNORECASE).sub('\n\n', ans) f.write(ans) @@ -551,6 +554,8 @@ class Processor(Parser): self.raw_css = '\n\n'.join(css) self.css = unicode(self.raw_css) + if self.opts.override_css: + self.css += '\n\n'+self.opts.override_css self.do_layout() # TODO: Figure out what to do about CSS imports from linked stylesheets diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 009d5cfef8..91255efbf5 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -88,6 +88,15 @@ class Manifest(ResourceCollection): m.append(mi) return m + def add_item(self, path, mime_type=None): + mi = ManifestItem(path, is_path=True) + if mime_type: + mi.mime_type = mime_type + mi.id = 'id%d'%self.next_id + self.next_id += 1 + self.append(mi) + return mi.id + def __init__(self): ResourceCollection.__init__(self) self.next_id = 1 diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 7154b2949d..191969023b 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -229,12 +229,6 @@ class Main(MainWindow, Ui_MainWindow): db = LibraryDatabase2(self.library_path) self.library_view.set_database(db) if self.olddb is not None: - QMessageBox.information(self, 'Database format changed', - '''\ -

calibre's book storage format has changed. Instead of storing book files in a database, the -files are now stored in a folder on your filesystem. You will now be asked to choose the folder -in which you want to store your books files. Any existing books will be automatically migrated. - ''') from PyQt4.QtGui import QProgressDialog pd = QProgressDialog('', '', 0, 100, self) pd.setWindowModality(Qt.ApplicationModal) @@ -1278,6 +1272,12 @@ in which you want to store your books files. Any existing books will be automati self.library_path = prefs['library_path'] self.olddb = None if self.library_path is None: # Need to migrate to new database layout + QMessageBox.information(self, 'Database format changed', + '''\ +

calibre's book storage format has changed. Instead of storing book files in a database, the +files are now stored in a folder on your filesystem. You will now be asked to choose the folder +in which you want to store your books files. Any existing books will be automatically migrated. + ''') self.database_path = prefs['database_path'] if not os.access(os.path.dirname(self.database_path), os.W_OK): error_dialog(self, _('Database does not exist'), diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 9e31cb9fc6..6c7e390306 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -440,6 +440,10 @@ def post_install(): if opts.save_manifest_to: open(opts.save_manifest_to, 'wb').write('\n'.join(manifest)+'\n') + + from calibre.utils.config import config_dir + if os.path.exists(config_dir): + shutil.rmtree(config_dir) VIEWER = '''\ diff --git a/src/calibre/translations/nb.po b/src/calibre/translations/nb.po index 51b5bd5037..d0797ecd18 100644 --- a/src/calibre/translations/nb.po +++ b/src/calibre/translations/nb.po @@ -368,9 +368,7 @@ msgid "" "device. Default: %s Supported profiles: " msgstr "" "Profilen til lagringsenheten som denne LRF filen blir generert for. Profilen " -" \n" "angir innstillinger som oppløsning og skjerm størrelsen til lagringsenheten. " -" \n" "Standard: %s Støttede profiler: " #: /home/kovid/work/calibre/src/calibre/ebooks/lrf/__init__.py:134 diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 9b8f666c4d..05b3b9a87b 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -258,7 +258,6 @@ class RecursiveFetcher(object, LoggingInterface): except Exception, err: self.log_warning('Could not fetch image %s', iurl) self.log_debug('Error: %s', str(err), exc_info=True) - if hasattr(f, 'close'): f.close() continue c += 1 fname = sanitize_file_name('img'+str(c)+ext) diff --git a/src/encutils/__init__.py b/src/encutils/__init__.py new file mode 100644 index 0000000000..f2eb4eefb3 --- /dev/null +++ b/src/encutils/__init__.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python +"""encutils - encoding detection collection for Python + +encutils +======== +:Author: Christof Hoeke, see http://cthedot.de/encutils/ +:Copyright: 2005-2008: Christof Hoeke +:License: encutils has a dual-license, please choose whatever you prefer: + + * encutils is published under the `LGPL 3 or later `__ + * encutils is published under the + `Creative Commons License `__. + + This file is part of encutils. + + encutils is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + encutils is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with encutils. If not, see . + + +A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string. 
+ +``getEncodingInfo`` is probably the main function of interest which uses +other supplied functions itself and gathers all information together and +supplies an ``EncodingInfo`` object with the following properties: + +- ``encoding``: The guessed encoding + Encoding is the explicit or implicit encoding or None and + always lowercase. + +- from HTTP response + * ``http_encoding`` + * ``http_media_type`` + +- from HTML element + * ``meta_encoding`` + * ``meta_media_type`` + +- from XML declaration + * ``xml_encoding`` + +example:: + + >>> import encutils + >>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/') + + >>> print info # = str(info) + utf-8 + + >>> info # = repr(info) + + + >>> print info.logtext + HTTP media_type: text/html + HTTP encoding: utf-8 + HTML META media_type: text/html + HTML META encoding: utf-8 + Encoding (probably): utf-8 (Mismatch: False) + + +references +========== +XML + RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt) + + easier explained in + - http://feedparser.org/docs/advanced.html + - http://www.xml.com/pub/a/2004/07/21/dive.html + +HTML + http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2 + +TODO +==== +- parse @charset of HTML elements? +- check for more texttypes if only text given + +""" +__all__ = ['buildlog', + 'encodingByMediaType', + 'getHTTPInfo', + 'getMetaInfo', + 'detectXMLEncoding', + 'getEncodingInfo', + 'tryEncodings', + 'EncodingInfo'] +__docformat__ = 'restructuredtext' +__author__ = 'Christof Hoeke' +__version__ = '0.8.3 $Id: __init__.py 1138 2008-03-15 18:24:46Z cthedot $' + +import cgi +import HTMLParser +import httplib +import re +import StringIO +import sys +import types +import urllib + +class _MetaHTMLParser(HTMLParser.HTMLParser): + """parses given data for """ + content_type = None + + def handle_starttag(self, tag, attrs): + if tag == 'meta' and not self.content_type: + atts = dict([(a.lower(), v.lower()) for a, v in attrs]) + if atts.get('http-equiv', u'').strip() == u'content-type': + self.content_type = atts.get('content') + + +# application/xml, application/xml-dtd, application/xml-external-parsed-entity, or a subtype like application/rss+xml. +_XML_APPLICATION_TYPE = 0 + +# text/xml, text/xml-external-parsed-entity, or a subtype like text/AnythingAtAll+xml +_XML_TEXT_TYPE = 1 + +# text/html +_HTML_TEXT_TYPE = 2 + +# any other of text/* like text/plain, ... +_TEXT_TYPE = 3 + +# any text/* like which defaults to UTF-8 encoding, for now only text/css +_TEXT_UTF8 = 5 + +# types not fitting in above types +_OTHER_TYPE = 4 + +class EncodingInfo(object): + """ + All encoding related information, returned by ``getEncodingInfo`` + + - ``encoding``: The guessed encoding + Encoding is the explicit or implicit encoding or None and + always lowercase. + + - from HTTP response + * ``http_encoding`` + * ``http_media_type`` + + - from HTML element + * ``meta_encoding`` + * ``meta_media_type`` + + - from XML declaration + * ``xml_encoding`` + + - ``mismatch``: True if mismatch between XML declaration and HTTP header + Mismatch is True if any mismatches between HTTP header, XML + declaration or textcontent (meta) are found. More detailed mismatch + reports are written to the optional log or ``logtext`` + + Mismatches are not necessarily errors as preferences are defined. + For details see the specifications. 
+
+    - ``logtext``: if no log was given log reports are given here
+
+    """
+    def __init__(self):
+        """
+        initializes all possible properties to ``None``, see class
+        description
+        """
+        self.encoding = self.mismatch = self.logtext =\
+            self.http_encoding = self.http_media_type =\
+            self.meta_encoding = self.meta_media_type =\
+            self.xml_encoding =\
+            None
+
+    def __str__(self):
+        """
+        ``str(EncodingInfo())`` outputs the guessed encoding itself or the empty string
+        """
+        if self.encoding:
+            return self.encoding
+        else:
+            return u''
+
+    def __repr__(self):
+        return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % (
+                self.__class__.__module__, self.__class__.__name__,
+                self.encoding, self.mismatch, id(self))
+
+
+def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
+            filename=None, filemode="w",
+            format='%(levelname)s\t%(message)s'):
+    """
+    helper to build a basic log
+
+    - if ``filename`` is given returns a log logging to ``filename`` with
+      mode ``filemode``
+    - else uses a log streaming to ``stream`` which defaults to
+      ``sys.stderr``
+    - ``level`` defines the level of the log
+    - ``format`` defines the formatter format of the log
+
+    returns a log with the name ``logname``
+    """
+    import logging
+
+    log = logging.getLogger(logname)
+
+    if filename:
+        hdlr = logging.FileHandler(filename, filemode)
+    else:
+        hdlr = logging.StreamHandler(stream)
+
+    formatter = logging.Formatter(format)
+    hdlr.setFormatter(formatter)
+
+    log.addHandler(hdlr)
+    log.setLevel(logging.__dict__.get(level, logging.INFO))
+
+    return log
+
+def _getTextTypeByMediaType(media_type, log=None):
+    """
+    returns type as defined by constants above
+    """
+    if not media_type:
+        return _OTHER_TYPE
+
+    xml_application_types = [
+        ur'application/.*?\+xml',
+        u'application/xml',
+        u'application/xml-dtd',
+        u'application/xml-external-parsed-entity']
+    xml_text_types = [
+        ur'text\/.*?\+xml',
+        u'text/xml',
+        u'text/xml-external-parsed-entity']
+
+    media_type = media_type.strip().lower()
+
+    if media_type in xml_application_types or\
+       re.match(xml_application_types[0], media_type, re.I|re.S|re.X):
+        return _XML_APPLICATION_TYPE
+    elif media_type in xml_text_types or\
+         re.match(xml_text_types[0], media_type, re.I|re.S|re.X):
+        return _XML_TEXT_TYPE
+    elif media_type == u'text/html':
+        return _HTML_TEXT_TYPE
+    elif media_type == u'text/css':
+        return _TEXT_UTF8
+    elif media_type.startswith(u'text/'):
+        return _TEXT_TYPE
+    else:
+        return _OTHER_TYPE
+
+def _getTextType(text, log=None):
+    """
+    checks if given text is XML (**naive test!**)
+    used if no content-type given
+    """
+    if text[:30].find(u'<?xml version=') != -1:
+        return _XML_APPLICATION_TYPE
+    elif text[:30].find(u'<html') != -1:
+        return _HTML_TEXT_TYPE
+    else:
+        return _OTHER_TYPE
+
+def encodingByMediaType(media_type, log=None):
+    """
+    Returns a default encoding for the given media_type.
+    For example ``'utf-8'`` for ``media_type='application/xml'``.
+
+    If no default encoding is available returns ``None``.
+
+    Refers to RFC 3023 and HTTP MIME specification.
+    """
+    defaultencodings = {
+        _XML_APPLICATION_TYPE: u'utf-8',
+        _XML_TEXT_TYPE: u'ascii',
+        _HTML_TEXT_TYPE: None,
+        _TEXT_TYPE: u'iso-8859-1',
+        _TEXT_UTF8: u'utf-8',
+        _OTHER_TYPE: None}
+
+    encoding = defaultencodings.get(
+        _getTextTypeByMediaType(media_type, log), None)
+
+    if log:
+        if not encoding:
+            log.debug(u'"%s" Media-Type has no default encoding',
+                media_type)
+        else:
+            log.debug(u'Default encoding for Media Type "%s": %s',
+                media_type, encoding)
+    return encoding
+
+def getHTTPInfo(response, log=None):
+    """
+    Returns ``(media_type, encoding)`` information from the response
+    Content-Type HTTP header. (Case of headers is ignored.)
+    May be ``(None, None)`` e.g. if no Content-Type header is
+    available.
+    """
+    info = response.info()
+    media_type, encoding = None, None
+    if info.has_key('content-type'):
+        media_type, params = cgi.parse_header(info['content-type'])
+        encoding = params.get('charset') # defaults to None
+        if encoding:
+            encoding = encoding.lower()
+    if log:
+        log.info(u'HTTP media_type: %s', media_type)
+        log.info(u'HTTP encoding: %s', encoding)
+    return media_type, encoding
+
+def getMetaInfo(text, log=None):
+    """
+    Returns (media_type, encoding) information from the first
+    X/HTML Content-Type ``<meta>`` element if available.
+
+    Normally in X/HTML:
+    ``<meta http-equiv="Content-Type" content="media_type;charset=encoding"/>``
+    """
+    p = _MetaHTMLParser()
+    p.feed(text)
+    if p.content_type:
+        media_type, params = cgi.parse_header(p.content_type)
+        encoding = params.get('charset') # defaults to None
+        if encoding:
+            encoding = encoding.lower()
+        if log:
+            log.info(u'HTML META media_type: %s', media_type)
+            log.info(u'HTML META encoding: %s', encoding)
+    else:
+        media_type = encoding = None
+
+    return media_type, encoding
+
+def detectXMLEncoding(fp, log=None, includeDefault=True):
+    """
+    Attempts to detect the character encoding of the xml file
+    given by a file object fp. fp must not be a codec wrapped file
+    object!
+    fp may also be a string or unicode string
+
+    The return value can be:
+    - if detection of the BOM succeeds, the codec name of the
+      corresponding unicode charset is returned
+
+    - if BOM detection fails, the xml declaration is searched for
+      the encoding attribute and its value returned. the "<"
+      character has to be the very first in the file then (it's xml
+      standard after all).
+
+    - if BOM and xml declaration fail, utf-8 is returned according
+      to XML 1.0.
+
+    Based on a recipe by Lars Tiede:
+        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
+    which itself is based on Paul Prescod's recipe:
+        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
+    """
+    if type(fp) in types.StringTypes:
+        fp = StringIO.StringIO(fp)
+
+    ### detection using BOM
+
+    ## the BOMs we know, by their pattern
+    bomDict={ # bytepattern: name
+              (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
+              (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
+              (0xFE, 0xFF, None, None) : "utf_16_be",
+              (0xFF, 0xFE, None, None) : "utf_16_le",
+              (0xEF, 0xBB, 0xBF, None) : "utf-8",
+            }
+
+    ## go to beginning of file and get the first 4 bytes
+    oldFP = fp.tell()
+    fp.seek(0)
+    (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
+
+    ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
+    bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
+    if not bomDetection:
+        bomDetection = bomDict.get((byte1, byte2, byte3, None))
+        if not bomDetection:
+            bomDetection = bomDict.get((byte1, byte2, None, None))
+
+    ## if BOM detected, we're done :-)
+    if bomDetection:
+        if log:
+            log.info(u'XML BOM encoding: %s' % bomDetection)
+        fp.seek(oldFP)
+        return bomDetection
+
+    ## still here? BOM detection failed.
+    ## now that BOM detection has failed we assume one byte character
+    ## encoding behaving ASCII
+
+    ### search xml declaration for encoding attribute
+
+    ## assume xml declaration fits into the first 2 KB (*cough*)
+    fp.seek(0)
+    buffer = fp.read(2048)
+
+    ## set up regular expression
+    xmlDeclPattern = r"""
+    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
+    .+?                 # some chars (version info), matched minimal
+    encoding=           # encoding attribute begins
+    ["']                # attribute start delimiter
+    (?P<encstr>         # what's matched in the brackets will be named encstr
+    [^"']+              # every character not delimiter (not overly exact!)
+    )                   # closes the brackets pair for the named group
+    ["']                # attribute end delimiter
+    .*?                 # some chars optionally (standalone decl or whitespace)
+    \?>                 # xmldecl end
+    """
+    xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)
+
+    ## search and extract encoding string
+    match = xmlDeclRE.search(buffer)
+    fp.seek(oldFP)
+    if match:
+        enc = match.group("encstr").lower()
+        if log:
+            log.info(u'XML encoding="%s"' % enc)
+        return enc
+    else:
+        if includeDefault:
+            if log:
+                log.info(u'XML encoding default utf-8')
+            return u'utf-8'
+        else:
+            return None
+
+def tryEncodings(text, log=None):
+    """
+    If installed uses chardet http://chardet.feedparser.org/ to detect
+    encoding, else tries different encodings on text and returns the one
+    that does not raise an exception which is not very advanced or may
+    be totally wrong.
+
+    Returns working encoding or None if no encoding does work at all.
+
+    The returned encoding might nevertheless be not the one intended by the
+    author as it is only checked if the text might be encoded in that
+    encoding. Some texts might be working in "iso-8859-1" *and*
+    "windows-1252" *and* "ascii" *and* "utf-8" and ...
+    """
+    try:
+        import chardet
+        encoding = chardet.detect(text)["encoding"]
+
+    except ImportError:
+        msg = 'Using simplified encoding detection, you might want to install chardet.'
+ if log: + log.warn(msg) + else: + print msg + + encodings = ( + 'ascii', + 'iso-8859-1', + 'windows-1252', + 'utf-8' + ) + encoding = None + for e in encodings: + try: + text.encode(e) + except (UnicodeEncodeError, UnicodeDecodeError): + pass + else: + encoding = e + break + + return encoding + +def getEncodingInfo(response=None, text=u'', log=None, url=None): + """ + Finds all encoding related information in given ``text``. + Uses information in headers of supplied HTTPResponse, possible XML + declaration and X/HTML ```` elements. + ``text`` will mostly be HTML or XML. + + Parameters + - ``response``: HTTP response object, + e.g. ``urllib.urlopen('url')`` + - ``text``: to guess encoding for, might include XML + prolog with encoding pseudo attribute or HTML meta element + - ``log``: an optional logging logger to which messages may go, if + no log given all log messages are available from resulting + ``EncodingInfo`` + + May also simply be called with ``getEncodingInfo(url='URL')`` which fetches + the url and all needed information. + + Returns instance of ``EncodingInfo``. + + How the resulting encoding is retrieved + ======================================= + XML + --- + RFC 3023 states if media type given in the Content-Type HTTP header is + application/xml, application/xml-dtd, + application/xml-external-parsed-entity, or any one of the subtypes of + application/xml such as application/atom+xml or application/rss+xml + etc then the character encoding is determined in this order: + + 1. the encoding given in the charset parameter of the Content-Type HTTP + header, or + 2. the encoding given in the encoding attribute of the XML declaration + within the document, or + 3. utf-8. + + Mismatch possibilities: + - HTTP + XMLdecla + - HTTP + HTMLmeta + + application/xhtml+xml ? + XMLdecla + HTMLmeta + + If the media type given in the Content-Type HTTP header is text/xml, + text/xml-external-parsed-entity, or a subtype like text/Anything+xml, + the encoding attribute of the XML declaration is ignored completely + and the character encoding is determined in the order: + 1. the encoding given in the charset parameter of the Content-Type HTTP + header, or + 2. ascii. + + Mismatch possibilities: + - HTTP + XMLdecla + - HTTP + HTMLmeta + + text/xhtml+xml + XMLdecla + HTMLmeta + + HTML + ---- + For HTML served as text/html: + http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2 + + 1. An HTTP "charset" parameter in a "Content-Type" field. + (maybe defaults to ISO-8859-1, but should not assume this) + 2. A META declaration with "http-equiv" set to "Content-Type" and a + value set for "charset". + 3. The charset attribute set on an element that designates an external + resource. (NOT IMPLEMENTED HERE YET) + + Mismatch possibilities: + - HTTP + HTMLmeta + + TEXT + ---- + For most text/* types the encoding will be reported as iso-8859-1. + Exceptions are XML formats send as text/* mime type (see above) and + text/css which has a default encoding of UTF-8. 
+ """ + if url: + try: + response = urllib.urlopen(url) + text = response.read() + except IOError, e: + print IOError(e) + sys.exit(1) + + encinfo = EncodingInfo() + + logstream = StringIO.StringIO() + if not log: + log = buildlog(stream=logstream, format='%(message)s') + + # HTTP + if response: + encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo( + response, log) + texttype = _getTextTypeByMediaType(encinfo.http_media_type, log) + else: + # check if maybe XML or (TODO:) HTML + texttype = _getTextType(text, log) + + # XML (also XHTML served as text/html) + if texttype == _XML_APPLICATION_TYPE or texttype == _XML_TEXT_TYPE: + encinfo.xml_encoding = detectXMLEncoding(text, log) + + # XML (also XHTML served as text/html) + if texttype == _HTML_TEXT_TYPE: + encinfo.xml_encoding = detectXMLEncoding(text, log, includeDefault=False) + + # HTML + if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE: + encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo( + text, log) + + # guess + # 1. HTTP charset? + encinfo.encoding = encinfo.http_encoding + encinfo.mismatch = False + + # 2. media_type? + # XML application/... + if texttype == _XML_APPLICATION_TYPE: + if not encinfo.encoding: + encinfo.encoding = encinfo.xml_encoding + # xml_encoding has default of utf-8 + + # text/html + elif texttype == _HTML_TEXT_TYPE: + if not encinfo.encoding: + encinfo.encoding = encinfo.meta_encoding + if not encinfo.encoding: + encinfo.encoding = encodingByMediaType(encinfo.http_media_type) + if not encinfo.encoding: + encinfo.encoding = tryEncodings(text) + + # text/... + xml or text/* + elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE: + if not encinfo.encoding: + encinfo.encoding = encodingByMediaType(encinfo.http_media_type) + + # possible mismatches, checks if present at all and then if equal + # HTTP + XML + if encinfo.http_encoding and encinfo.xml_encoding and\ + encinfo.http_encoding <> encinfo.xml_encoding: + encinfo.mismatch = True + log.warn(u'"%s" (HTTP) <> "%s" (XML) encoding mismatch' % + (encinfo.http_encoding, encinfo.xml_encoding)) + # HTTP + Meta + if encinfo.http_encoding and encinfo.meta_encoding and\ + encinfo.http_encoding <> encinfo.meta_encoding: + encinfo.mismatch = True + log.warn(u'"%s" (HTTP) <> "%s" (HTML ) encoding mismatch' % + (encinfo.http_encoding, encinfo.meta_encoding)) + # XML + Meta + if encinfo.xml_encoding and encinfo.meta_encoding and\ + encinfo.xml_encoding <> encinfo.meta_encoding: + encinfo.mismatch = True + log.warn(u'"%s" (XML) <> "%s" (HTML ) encoding mismatch' % + (encinfo.xml_encoding, encinfo.meta_encoding)) + + log.info(u'Encoding (probably): %s (Mismatch: %s)', + encinfo.encoding, encinfo.mismatch) + + encinfo.logtext = logstream.getvalue() + return encinfo + + +if __name__ == '__main__': + import pydoc + pydoc.help(__name__) \ No newline at end of file