From f277f2b870b77fa611a72094f330d74655e68fc6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Sep 2008 09:46:55 -0700 Subject: [PATCH] IGN:Various regression fixes and an incomplete epub split implementation --- src/calibre/ebooks/epub/__init__.py | 7 +- src/calibre/ebooks/epub/from_html.py | 29 +- src/calibre/ebooks/epub/split.py | 175 +++++++ src/calibre/ebooks/html.py | 11 +- src/calibre/ebooks/metadata/opf2.py | 9 + src/calibre/gui2/main.py | 12 +- src/calibre/linux.py | 4 + src/calibre/translations/nb.po | 2 - src/calibre/web/fetch/simple.py | 1 - src/encutils/__init__.py | 655 +++++++++++++++++++++++++++ 10 files changed, 872 insertions(+), 33 deletions(-) create mode 100644 src/calibre/ebooks/epub/split.py create mode 100644 src/encutils/__init__.py diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index 0dcc4b24d1..7c9d1197a9 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en' Conversion to EPUB. ''' import sys, textwrap +from lxml import html from calibre.utils.config import Config, StringConfig from calibre.utils.zipfile import ZipFile, ZIP_STORED -from calibre.ebooks.html import config as common_config +from calibre.ebooks.html import config as common_config, tostring class DefaultProfile(object): @@ -42,7 +43,6 @@ def initialize_container(path_to_container, opf_name='metadata.opf'): zf.writestr('META-INF/', '', 0700) zf.writestr('META-INF/container.xml', CONTAINER) return zf - def config(defaults=None): desc = _('Options to control the conversion to EPUB') @@ -59,7 +59,8 @@ def config(defaults=None): help=_('The output EPUB file. If not specified, it is derived from the input file name.')) c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()), help=_('Profile of the target device this EPUB is meant for. Set to None to create a device independent EPUB. The profile is used for device specific restrictions on the EPUB. Choices are: ')+str(list(PROFILES.keys()))) - + c.add_opt('override_css', ['--override-css'], default=None, + help=_('Either the path to a CSS stylesheet or raw CSS. 
This CSS will override any existing CSS declarations in the source files.')) structure = c.add_group('structure detection', _('Control auto-detection of document structure.')) structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]", help=_('''\ diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index a4ffa224e4..a94a68c76b 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -18,6 +18,7 @@ from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.epub import initialize_container, PROFILES +from calibre.ebooks.epub.split import split class HTMLProcessor(Processor): @@ -34,18 +35,8 @@ class HTMLProcessor(Processor): if opts.verbose > 2: self.debug_tree('nocss') - self.collect_font_statistics() + #self.collect_font_statistics() - self.split() - - def save(self): - file = Processor.save(self) - with open(file, 'rb') as f: - f.seek(0, 2) - size = f.tell() - if size > self.opts.profile.flow_size: - self.split() - def collect_font_statistics(self): ''' @@ -58,12 +49,6 @@ class HTMLProcessor(Processor): #TODO: Use cssutils on self.raw_css to figure out the font size # of this piece of text and update statistics accordingly - def split(self): - ''' Split into individual flows to accommodate Adobe's incompetence ''' - # TODO: Only split file larger than 300K (as specified in profile) - # Split on page breaks first and then on tags and then on - #
<div> and finally on <p>
. - pass def config(defaults=None): @@ -88,6 +73,7 @@ def parse_content(filelist, opts, tdir): resource_map, filelist) hp.populate_toc(toc) hp.save() + return resource_map, hp.htmlfile_map, toc def convert(htmlfile, opts, notification=None): @@ -96,6 +82,11 @@ def convert(htmlfile, opts, notification=None): opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub' opts.profile = PROFILES[opts.profile] opts.output = os.path.abspath(opts.output) + if opts.override_css is not None: + try: + opts.override_css = open(opts.override_css, 'rb').read().decode('utf-8', 'replace') + except: + opts.override_css = opts.override_css.decode('utf-8', 'replace') if htmlfile.lower().endswith('.opf'): opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile))) filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding) @@ -153,7 +144,8 @@ def convert(htmlfile, opts, notification=None): for item in mi.manifest: if getattr(item, 'mime_type', None) == 'text/html': item.mime_type = 'application/xhtml+xml' - with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f: + opf_path = os.path.join(tdir, 'metadata.opf') + with open(opf_path, 'wb') as f: mi.render(f, buf, 'toc.ncx') if opts.show_opf: print open(os.path.join(tdir, 'metadata.opf')).read() @@ -163,6 +155,7 @@ def convert(htmlfile, opts, notification=None): f.write(toc) if opts.show_ncx: print toc + split(opf_path, opts) epub = initialize_container(opts.output) epub.add_dir(tdir) print 'Output written to', opts.output diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py new file mode 100644 index 0000000000..da8d6487f4 --- /dev/null +++ b/src/calibre/ebooks/epub/split.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Split the flows in an epub file to conform to size limitations. 
+'''
+
+import sys, os, math, copy
+
+from lxml.etree import parse, XMLParser
+from lxml.cssselect import CSSSelector
+
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre.ebooks.epub import tostring
+
+PARSER = XMLParser(recover=True)
+
+# Namespace map needed for the EXSLT regular expression extensions
+# used in the fallback XPath expressions in find_split_point()
+XPNSMAP = {'re': 'http://exslt.org/regular-expressions'}
+
+class SplitError(ValueError):
+
+    def __init__(self, path):
+        ValueError.__init__(self, _('Could not find reasonable point at which to split: ')+os.path.basename(path))
+
+def split_tree(tree, split_point, before, opts, filepath):
+    trees = []
+    tree2 = copy.deepcopy(tree)
+    path = tree.getpath(split_point)
+    root, root2 = tree.getroot(), tree2.getroot()
+    body, body2 = root.xpath('//body')[0], root2.xpath('//body')[0]
+    split_point2 = root2.xpath(path)[0]
+
+    # Tree 1: keep content before the split point, blank out the rest
+    hit_split_point = False
+    for elem in body.iterdescendants():
+        if elem is split_point:
+            hit_split_point = True
+            if before:
+                elem.text = u''
+                elem.tail = u''
+                elem.set('calibre_split', '1')
+            continue
+        if hit_split_point:
+            elem.text = u''
+            elem.tail = u''
+            elem.set('calibre_split', '1')
+
+    # Tree 2: keep content after the split point, blank out the rest
+    hit_split_point = False
+    for elem in body2.iterdescendants():
+        if elem is split_point2:
+            hit_split_point = True
+            if not before:
+                elem.text = u''
+                elem.tail = u''
+                elem.set('calibre_split', '1')
+            continue
+        if not hit_split_point:
+            elem.text = u''
+            elem.tail = u''
+            elem.set('calibre_split', '1')
+
+    # Recurse on any half that is still over the size limit
+    for t, r in [(tree, root), (tree2, root2)]:
+        if len(tostring(r)) < opts.profile.flow_size:
+            trees.append(t)
+        else:
+            new_split_point, before = find_split_point(t)
+            if new_split_point is None:
+                raise SplitError(filepath)
+            trees.extend(split_tree(t, new_split_point, before, opts, filepath))
+
+    return trees
+
+
+def find_split_point(tree):
+    root = tree.getroot()
+
+    def pick_elem(elems):
+        # Pick the middle element that has not already served as a split point
+        if elems:
+            elems = [i for i in elems if i.get('calibre_split', '0') != '1']
+            if elems:
+                i = int(math.floor(len(elems)/2.))
+                return elems[i]
+
+    # Prefer split points that the stylesheet marks as explicit page breaks
+    css = root.xpath('//style[@type="text/css"]')
+    if css:
+
+        def selector_element(rule):
+            try:
+                selector = CSSSelector(rule.selectorText)
+                return pick_elem(selector(root))
+            except:
+                return None
+
+        css = css[0].text
+        from cssutils import CSSParser
+        stylesheet = CSSParser().parseString(css)
+        for rule in stylesheet:
+            if rule.type != rule.STYLE_RULE:
+                continue
+            before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
+            if before and before != 'avoid':
+                elem = selector_element(rule)
+                if elem is not None:
+                    return elem, True
+            after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
+            if after and after != 'avoid':
+                elem = selector_element(rule)
+                if elem is not None:
+                    return elem, False
+
+    # Fall back to headings, then <div>s directly under <body>, then paragraphs
+    for path in ('//*[re:match(name(), "h[1-6]", "i")]', '//body/div', '//p'):
+        elems = root.xpath(path, namespaces=XPNSMAP)
+        elem = pick_elem(elems)
+        if elem is not None:
+            return elem, True
+
+    return None, True
+
+def do_split(path, opts):
+    tree = parse(path, parser=PARSER)
+    split_point, before = find_split_point(tree)
+    if split_point is None:
+        raise SplitError(path)
+    trees = split_tree(tree, split_point, before, opts, path)
+    base = os.path.splitext(os.path.basename(path))[0] + '_split_%d.html'
+    anchor_map = {None:base%0}
+    files = []
+    for i, tree in enumerate(trees):
+        root = tree.getroot()
+        files.append(base%i)
+        for elem in root.xpath('//*[@id and @calibre_split = "1"]'):
+            anchor_map[elem.get('id')] = files[-1]
+            elem.attrib.pop('calibre_split')
+        for elem in root.xpath('//*[@calibre_split]'):
elem.attrib.pop('calibre_split') + open(os.path.join(os.path.dirname(path), files[-1]), 'wb').write(tostring(root, pretty_print=opts.pretty_print)) + os.remove(path) + return path, files, anchor_map + +def fix_opf(opf, orig_file, files, anchor_map): + orig = None + for item in opf.manifest: + if os.path.samefile(orig_file, item.path): + orig = item + break + opf.manifest.remove(orig) + ids = [] + for f in files: + ids.append(opf.manifest.add_item(f)) + + +def split(pathtoopf, opts): + return + pathtoopf = os.path.abspath(pathtoopf) + opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) + html_files = [] + for item in opf.manifest: + if 'html' in item.mime_type.lower(): + html_files.append(item.path) + changes = [] + for f in html_files: + if os.stat(f).st_size > opts.profile.flow_size: + fix_opf(opf, *do_split(f, opts)) + if changes: + pass + + + + + +def main(args=sys.argv): + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index a01f426e38..7532d43cf8 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -27,6 +27,11 @@ from calibre.ebooks.metadata.opf2 import OPF from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.zipfile import ZipFile +def tostring(root, pretty_print=False): + return html.tostring(root, encoding='utf-8', method='xml', + pretty_print=pretty_print, + include_meta_content_type=True) + class Link(object): ''' @@ -332,9 +337,7 @@ class Parser(PreProcessor, LoggingInterface): Should be called after all HTML processing is finished. ''' with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f: - ans = html.tostring(self.root, encoding='utf-8', method='xml', - pretty_print=self.opts.pretty_print, - include_meta_content_type=True) + ans = tostring(self.root, pretty_print=self.opts.pretty_print) ans = re.compile(r'', re.IGNORECASE).sub('', ans) ans = re.compile(r']*?>', re.IGNORECASE).sub('\n\n', ans) f.write(ans) @@ -551,6 +554,8 @@ class Processor(Parser): self.raw_css = '\n\n'.join(css) self.css = unicode(self.raw_css) + if self.opts.override_css: + self.css += '\n\n'+self.opts.override_css self.do_layout() # TODO: Figure out what to do about CSS imports from linked stylesheets diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 009d5cfef8..91255efbf5 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -88,6 +88,15 @@ class Manifest(ResourceCollection): m.append(mi) return m + def add_item(self, path, mime_type=None): + mi = ManifestItem(path, is_path=True) + if mime_type: + mi.mime_type = mime_type + mi.id = 'id%d'%self.next_id + self.next_id += 1 + self.append(mi) + return mi.id + def __init__(self): ResourceCollection.__init__(self) self.next_id = 1 diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 7154b2949d..191969023b 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -229,12 +229,6 @@ class Main(MainWindow, Ui_MainWindow): db = LibraryDatabase2(self.library_path) self.library_view.set_database(db) if self.olddb is not None: - QMessageBox.information(self, 'Database format changed', - '''\ -

calibre's book storage format has changed. Instead of storing book files in a database, the -files are now stored in a folder on your filesystem. You will now be asked to choose the folder -in which you want to store your books files. Any existing books will be automatically migrated. - ''') from PyQt4.QtGui import QProgressDialog pd = QProgressDialog('', '', 0, 100, self) pd.setWindowModality(Qt.ApplicationModal) @@ -1278,6 +1272,12 @@ in which you want to store your books files. Any existing books will be automati self.library_path = prefs['library_path'] self.olddb = None if self.library_path is None: # Need to migrate to new database layout + QMessageBox.information(self, 'Database format changed', + '''\ +

calibre's book storage format has changed. Instead of storing book files in a database, the +files are now stored in a folder on your filesystem. You will now be asked to choose the folder +in which you want to store your books files. Any existing books will be automatically migrated. + ''') self.database_path = prefs['database_path'] if not os.access(os.path.dirname(self.database_path), os.W_OK): error_dialog(self, _('Database does not exist'), diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 9e31cb9fc6..6c7e390306 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -440,6 +440,10 @@ def post_install(): if opts.save_manifest_to: open(opts.save_manifest_to, 'wb').write('\n'.join(manifest)+'\n') + + from calibre.utils.config import config_dir + if os.path.exists(config_dir): + shutil.rmtree(config_dir) VIEWER = '''\ diff --git a/src/calibre/translations/nb.po b/src/calibre/translations/nb.po index 51b5bd5037..d0797ecd18 100644 --- a/src/calibre/translations/nb.po +++ b/src/calibre/translations/nb.po @@ -368,9 +368,7 @@ msgid "" "device. Default: %s Supported profiles: " msgstr "" "Profilen til lagringsenheten som denne LRF filen blir generert for. Profilen " -" \n" "angir innstillinger som oppløsning og skjerm størrelsen til lagringsenheten. " -" \n" "Standard: %s Støttede profiler: " #: /home/kovid/work/calibre/src/calibre/ebooks/lrf/__init__.py:134 diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 9b8f666c4d..05b3b9a87b 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -258,7 +258,6 @@ class RecursiveFetcher(object, LoggingInterface): except Exception, err: self.log_warning('Could not fetch image %s', iurl) self.log_debug('Error: %s', str(err), exc_info=True) - if hasattr(f, 'close'): f.close() continue c += 1 fname = sanitize_file_name('img'+str(c)+ext) diff --git a/src/encutils/__init__.py b/src/encutils/__init__.py new file mode 100644 index 0000000000..f2eb4eefb3 --- /dev/null +++ b/src/encutils/__init__.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python +"""encutils - encoding detection collection for Python + +encutils +======== +:Author: Christof Hoeke, see http://cthedot.de/encutils/ +:Copyright: 2005-2008: Christof Hoeke +:License: encutils has a dual-license, please choose whatever you prefer: + + * encutils is published under the `LGPL 3 or later `__ + * encutils is published under the + `Creative Commons License `__. + + This file is part of encutils. + + encutils is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + encutils is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with encutils. If not, see . + + +A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string. 
+ +``getEncodingInfo`` is probably the main function of interest which uses +other supplied functions itself and gathers all information together and +supplies an ``EncodingInfo`` object with the following properties: + +- ``encoding``: The guessed encoding + Encoding is the explicit or implicit encoding or None and + always lowercase. + +- from HTTP response + * ``http_encoding`` + * ``http_media_type`` + +- from HTML element + * ``meta_encoding`` + * ``meta_media_type`` + +- from XML declaration + * ``xml_encoding`` + +example:: + + >>> import encutils + >>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/') + + >>> print info # = str(info) + utf-8 + + >>> info # = repr(info) + + + >>> print info.logtext + HTTP media_type: text/html + HTTP encoding: utf-8 + HTML META media_type: text/html + HTML META encoding: utf-8 + Encoding (probably): utf-8 (Mismatch: False) + + +references +========== +XML + RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt) + + easier explained in + - http://feedparser.org/docs/advanced.html + - http://www.xml.com/pub/a/2004/07/21/dive.html + +HTML + http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2 + +TODO +==== +- parse @charset of HTML elements? +- check for more texttypes if only text given + +""" +__all__ = ['buildlog', + 'encodingByMediaType', + 'getHTTPInfo', + 'getMetaInfo', + 'detectXMLEncoding', + 'getEncodingInfo', + 'tryEncodings', + 'EncodingInfo'] +__docformat__ = 'restructuredtext' +__author__ = 'Christof Hoeke' +__version__ = '0.8.3 $Id: __init__.py 1138 2008-03-15 18:24:46Z cthedot $' + +import cgi +import HTMLParser +import httplib +import re +import StringIO +import sys +import types +import urllib + +class _MetaHTMLParser(HTMLParser.HTMLParser): + """parses given data for """ + content_type = None + + def handle_starttag(self, tag, attrs): + if tag == 'meta' and not self.content_type: + atts = dict([(a.lower(), v.lower()) for a, v in attrs]) + if atts.get('http-equiv', u'').strip() == u'content-type': + self.content_type = atts.get('content') + + +# application/xml, application/xml-dtd, application/xml-external-parsed-entity, or a subtype like application/rss+xml. +_XML_APPLICATION_TYPE = 0 + +# text/xml, text/xml-external-parsed-entity, or a subtype like text/AnythingAtAll+xml +_XML_TEXT_TYPE = 1 + +# text/html +_HTML_TEXT_TYPE = 2 + +# any other of text/* like text/plain, ... +_TEXT_TYPE = 3 + +# any text/* like which defaults to UTF-8 encoding, for now only text/css +_TEXT_UTF8 = 5 + +# types not fitting in above types +_OTHER_TYPE = 4 + +class EncodingInfo(object): + """ + All encoding related information, returned by ``getEncodingInfo`` + + - ``encoding``: The guessed encoding + Encoding is the explicit or implicit encoding or None and + always lowercase. + + - from HTTP response + * ``http_encoding`` + * ``http_media_type`` + + - from HTML element + * ``meta_encoding`` + * ``meta_media_type`` + + - from XML declaration + * ``xml_encoding`` + + - ``mismatch``: True if mismatch between XML declaration and HTTP header + Mismatch is True if any mismatches between HTTP header, XML + declaration or textcontent (meta) are found. More detailed mismatch + reports are written to the optional log or ``logtext`` + + Mismatches are not necessarily errors as preferences are defined. + For details see the specifications. 
+
+    - ``logtext``: if no log was given log reports are given here
+
+    """
+    def __init__(self):
+        """
+        initializes all possible properties to ``None``, see class
+        description
+        """
+        self.encoding = self.mismatch = self.logtext =\
+            self.http_encoding = self.http_media_type =\
+            self.meta_encoding = self.meta_media_type =\
+            self.xml_encoding =\
+            None
+
+    def __str__(self):
+        """
+        ``str(EncodingInfo())`` outputs the guessed encoding itself or the empty string
+        """
+        if self.encoding:
+            return self.encoding
+        else:
+            return u''
+
+    def __repr__(self):
+        return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % (
+                self.__class__.__module__, self.__class__.__name__,
+                self.encoding, self.mismatch, id(self))
+
+
+def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
+            filename=None, filemode="w",
+            format='%(levelname)s\t%(message)s'):
+    """
+    helper to build a basic log
+
+    - if ``filename`` is given returns a log logging to ``filename`` with
+      mode ``filemode``
+    - else uses a log streaming to ``stream`` which defaults to
+      ``sys.stderr``
+    - ``level`` defines the level of the log
+    - ``format`` defines the formatter format of the log
+
+    returns a log with the name ``logname``
+    """
+    import logging
+
+    log = logging.getLogger(logname)
+
+    if filename:
+        hdlr = logging.FileHandler(filename, filemode)
+    else:
+        hdlr = logging.StreamHandler(stream)
+
+    formatter = logging.Formatter(format)
+    hdlr.setFormatter(formatter)
+
+    log.addHandler(hdlr)
+    log.setLevel(logging.__dict__.get(level, logging.INFO))
+
+    return log
+
+def _getTextTypeByMediaType(media_type, log=None):
+    """
+    returns type as defined by constants above
+    """
+    if not media_type:
+        return _OTHER_TYPE
+
+    xml_application_types = [
+        ur'application/.*?\+xml',
+        u'application/xml',
+        u'application/xml-dtd',
+        u'application/xml-external-parsed-entity']
+    xml_text_types = [
+        ur'text\/.*?\+xml',
+        u'text/xml',
+        u'text/xml-external-parsed-entity']
+
+    media_type = media_type.strip().lower()
+
+    if media_type in xml_application_types or\
+       re.match(xml_application_types[0], media_type, re.I|re.S|re.X):
+        return _XML_APPLICATION_TYPE
+    elif media_type in xml_text_types or\
+         re.match(xml_text_types[0], media_type, re.I|re.S|re.X):
+        return _XML_TEXT_TYPE
+    elif media_type == u'text/html':
+        return _HTML_TEXT_TYPE
+    elif media_type == u'text/css':
+        return _TEXT_UTF8
+    elif media_type.startswith(u'text/'):
+        return _TEXT_TYPE
+    else:
+        return _OTHER_TYPE
+
+def _getTextType(text, log=None):
+    """
+    checks if given text is XML (**naive test!**)
+    used if no content-type given
+    """
+    if text[:30].find(u'<?xml version=') != -1:
+        return _XML_APPLICATION_TYPE
+    elif text[:30].find(u'<html') != -1:
+        return _HTML_TEXT_TYPE
+    else:
+        return _OTHER_TYPE
+
+def encodingByMediaType(media_type, log=None):
+    """
+    Returns a default encoding for the given media_type.
+    For example ``'utf-8'`` for ``media_type='application/xml'``.
+
+    If no default encoding is available returns ``None``.
+
+    Refers to RFC 3023 and HTTP MIME specification.
+    """
+    defaultencodings = {
+        _XML_APPLICATION_TYPE: u'utf-8',
+        _XML_TEXT_TYPE: u'ascii',
+        _HTML_TEXT_TYPE: None,
+        _TEXT_TYPE: u'iso-8859-1',
+        _TEXT_UTF8: u'utf-8',
+        _OTHER_TYPE: None}
+
+    encoding = defaultencodings.get(
+        _getTextTypeByMediaType(media_type, log), None)
+
+    if log:
+        if not encoding:
+            log.debug(u'"%s" Media-Type has no default encoding',
+                media_type)
+        else:
+            log.debug(u'Default encoding for Media Type "%s": %s',
+                media_type, encoding)
+    return encoding
+
+def getHTTPInfo(response, log=None):
+    """
+    Returns ``(media_type, encoding)`` information from the response
+    Content-Type HTTP header. (Case of headers is ignored.)
+    May be ``(None, None)`` e.g. if no Content-Type header is
+    available.
+    """
+    info = response.info()
+    media_type, encoding = None, None
+    if info.has_key('content-type'):
+        media_type, params = cgi.parse_header(info['content-type'])
+        encoding = params.get('charset') # defaults to None
+        if encoding:
+            encoding = encoding.lower()
+    if log:
+        log.info(u'HTTP media_type: %s', media_type)
+        log.info(u'HTTP encoding: %s', encoding)
+    return media_type, encoding
+
+def getMetaInfo(text, log=None):
+    """
+    Returns (media_type, encoding) information from the first
+    X/HTML Content-Type ``<meta>`` element if available.
+
+    Normally in X/HTML:
+    ``<meta http-equiv="Content-Type" content="media_type;charset=encoding"/>``
+    """
+    p = _MetaHTMLParser()
+    p.feed(text)
+    if p.content_type:
+        media_type, params = cgi.parse_header(p.content_type)
+        encoding = params.get('charset') # defaults to None
+        if encoding:
+            encoding = encoding.lower()
+        if log:
+            log.info(u'HTML META media_type: %s', media_type)
+            log.info(u'HTML META encoding: %s', encoding)
+    else:
+        media_type = encoding = None
+
+    return media_type, encoding
+
+def detectXMLEncoding(fp, log=None, includeDefault=True):
+    """
+    Attempts to detect the character encoding of the xml file
+    given by a file object fp. fp must not be a codec wrapped file
+    object!
+    fp may also be a string or unicode string
+
+    The return value can be:
+    - if detection of the BOM succeeds, the codec name of the
+      corresponding unicode charset is returned
+
+    - if BOM detection fails, the xml declaration is searched for
+      the encoding attribute and its value returned. the "<"
+      character has to be the very first in the file then (it's xml
+      standard after all).
+
+    - if BOM and xml declaration fail, utf-8 is returned according
+      to XML 1.0.
+
+    Based on a recipe by Lars Tiede:
+        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
+    which itself is based on Paul Prescod's recipe:
+        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
+    """
+    if type(fp) in types.StringTypes:
+        fp = StringIO.StringIO(fp)
+
+    ### detection using BOM
+
+    ## the BOMs we know, by their pattern
+    bomDict={ # bytepattern: name
+              (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
+              (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
+              (0xFE, 0xFF, None, None) : "utf_16_be",
+              (0xFF, 0xFE, None, None) : "utf_16_le",
+              (0xEF, 0xBB, 0xBF, None) : "utf-8",
+            }
+
+    ## go to beginning of file and get the first 4 bytes
+    oldFP = fp.tell()
+    fp.seek(0)
+    (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
+
+    ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
+    bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
+    if not bomDetection:
+        bomDetection = bomDict.get((byte1, byte2, byte3, None))
+        if not bomDetection:
+            bomDetection = bomDict.get((byte1, byte2, None, None))
+
+    ## if BOM detected, we're done :-)
+    if bomDetection:
+        if log:
+            log.info(u'XML BOM encoding: %s' % bomDetection)
+        fp.seek(oldFP)
+        return bomDetection
+
+    ## still here? BOM detection failed.
+    ## now that BOM detection has failed we assume one byte character
+    ## encoding behaving ASCII
+
+    ### search xml declaration for encoding attribute
+
+    ## assume xml declaration fits into the first 2 KB (*cough*)
+    fp.seek(0)
+    buffer = fp.read(2048)
+
+    ## set up regular expression
+    xmlDeclPattern = r"""
+    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
+    .+?                 # some chars (version info), matched minimal
+    encoding=           # encoding attribute begins
+    ["']                # attribute start delimiter
+    (?P<encstr>         # what's matched in the brackets will be named encstr
+    [^"']+              # every character not delimiter (not overly exact!)
+    )                   # closes the brackets pair for the named group
+    ["']                # attribute end delimiter
+    .*?                 # some chars optionally (standalone decl or whitespace)
+    \?>                 # xmldecl end
+    """
+    xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)
+
+    ## search and extract encoding string
+    match = xmlDeclRE.search(buffer)
+    fp.seek(oldFP)
+    if match:
+        enc = match.group("encstr").lower()
+        if log:
+            log.info(u'XML encoding="%s"' % enc)
+        return enc
+    else:
+        if includeDefault:
+            if log:
+                log.info(u'XML encoding default utf-8')
+            return u'utf-8'
+        else:
+            return None
+
+def tryEncodings(text, log=None):
+    """
+    If installed uses chardet http://chardet.feedparser.org/ to detect
+    encoding, else tries different encodings on text and returns the one
+    that does not raise an exception which is not very advanced or may
+    be totally wrong.
+
+    Returns working encoding or None if no encoding does work at all.
+
+    The returned encoding might nevertheless be not the one intended by the
+    author as it is only checked if the text might be encoded in that
+    encoding. Some texts might be working in "iso-8859-1" *and*
+    "windows-1252" *and* "ascii" *and* "utf-8" and ...
+    """
+    try:
+        import chardet
+        encoding = chardet.detect(text)["encoding"]
+
+    except ImportError:
+        msg = 'Using simplified encoding detection, you might want to install chardet.'
+ if log: + log.warn(msg) + else: + print msg + + encodings = ( + 'ascii', + 'iso-8859-1', + 'windows-1252', + 'utf-8' + ) + encoding = None + for e in encodings: + try: + text.encode(e) + except (UnicodeEncodeError, UnicodeDecodeError): + pass + else: + encoding = e + break + + return encoding + +def getEncodingInfo(response=None, text=u'', log=None, url=None): + """ + Finds all encoding related information in given ``text``. + Uses information in headers of supplied HTTPResponse, possible XML + declaration and X/HTML ```` elements. + ``text`` will mostly be HTML or XML. + + Parameters + - ``response``: HTTP response object, + e.g. ``urllib.urlopen('url')`` + - ``text``: to guess encoding for, might include XML + prolog with encoding pseudo attribute or HTML meta element + - ``log``: an optional logging logger to which messages may go, if + no log given all log messages are available from resulting + ``EncodingInfo`` + + May also simply be called with ``getEncodingInfo(url='URL')`` which fetches + the url and all needed information. + + Returns instance of ``EncodingInfo``. + + How the resulting encoding is retrieved + ======================================= + XML + --- + RFC 3023 states if media type given in the Content-Type HTTP header is + application/xml, application/xml-dtd, + application/xml-external-parsed-entity, or any one of the subtypes of + application/xml such as application/atom+xml or application/rss+xml + etc then the character encoding is determined in this order: + + 1. the encoding given in the charset parameter of the Content-Type HTTP + header, or + 2. the encoding given in the encoding attribute of the XML declaration + within the document, or + 3. utf-8. + + Mismatch possibilities: + - HTTP + XMLdecla + - HTTP + HTMLmeta + + application/xhtml+xml ? + XMLdecla + HTMLmeta + + If the media type given in the Content-Type HTTP header is text/xml, + text/xml-external-parsed-entity, or a subtype like text/Anything+xml, + the encoding attribute of the XML declaration is ignored completely + and the character encoding is determined in the order: + 1. the encoding given in the charset parameter of the Content-Type HTTP + header, or + 2. ascii. + + Mismatch possibilities: + - HTTP + XMLdecla + - HTTP + HTMLmeta + + text/xhtml+xml + XMLdecla + HTMLmeta + + HTML + ---- + For HTML served as text/html: + http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2 + + 1. An HTTP "charset" parameter in a "Content-Type" field. + (maybe defaults to ISO-8859-1, but should not assume this) + 2. A META declaration with "http-equiv" set to "Content-Type" and a + value set for "charset". + 3. The charset attribute set on an element that designates an external + resource. (NOT IMPLEMENTED HERE YET) + + Mismatch possibilities: + - HTTP + HTMLmeta + + TEXT + ---- + For most text/* types the encoding will be reported as iso-8859-1. + Exceptions are XML formats send as text/* mime type (see above) and + text/css which has a default encoding of UTF-8. 
+ """ + if url: + try: + response = urllib.urlopen(url) + text = response.read() + except IOError, e: + print IOError(e) + sys.exit(1) + + encinfo = EncodingInfo() + + logstream = StringIO.StringIO() + if not log: + log = buildlog(stream=logstream, format='%(message)s') + + # HTTP + if response: + encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo( + response, log) + texttype = _getTextTypeByMediaType(encinfo.http_media_type, log) + else: + # check if maybe XML or (TODO:) HTML + texttype = _getTextType(text, log) + + # XML (also XHTML served as text/html) + if texttype == _XML_APPLICATION_TYPE or texttype == _XML_TEXT_TYPE: + encinfo.xml_encoding = detectXMLEncoding(text, log) + + # XML (also XHTML served as text/html) + if texttype == _HTML_TEXT_TYPE: + encinfo.xml_encoding = detectXMLEncoding(text, log, includeDefault=False) + + # HTML + if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE: + encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo( + text, log) + + # guess + # 1. HTTP charset? + encinfo.encoding = encinfo.http_encoding + encinfo.mismatch = False + + # 2. media_type? + # XML application/... + if texttype == _XML_APPLICATION_TYPE: + if not encinfo.encoding: + encinfo.encoding = encinfo.xml_encoding + # xml_encoding has default of utf-8 + + # text/html + elif texttype == _HTML_TEXT_TYPE: + if not encinfo.encoding: + encinfo.encoding = encinfo.meta_encoding + if not encinfo.encoding: + encinfo.encoding = encodingByMediaType(encinfo.http_media_type) + if not encinfo.encoding: + encinfo.encoding = tryEncodings(text) + + # text/... + xml or text/* + elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE: + if not encinfo.encoding: + encinfo.encoding = encodingByMediaType(encinfo.http_media_type) + + # possible mismatches, checks if present at all and then if equal + # HTTP + XML + if encinfo.http_encoding and encinfo.xml_encoding and\ + encinfo.http_encoding <> encinfo.xml_encoding: + encinfo.mismatch = True + log.warn(u'"%s" (HTTP) <> "%s" (XML) encoding mismatch' % + (encinfo.http_encoding, encinfo.xml_encoding)) + # HTTP + Meta + if encinfo.http_encoding and encinfo.meta_encoding and\ + encinfo.http_encoding <> encinfo.meta_encoding: + encinfo.mismatch = True + log.warn(u'"%s" (HTTP) <> "%s" (HTML ) encoding mismatch' % + (encinfo.http_encoding, encinfo.meta_encoding)) + # XML + Meta + if encinfo.xml_encoding and encinfo.meta_encoding and\ + encinfo.xml_encoding <> encinfo.meta_encoding: + encinfo.mismatch = True + log.warn(u'"%s" (XML) <> "%s" (HTML ) encoding mismatch' % + (encinfo.xml_encoding, encinfo.meta_encoding)) + + log.info(u'Encoding (probably): %s (Mismatch: %s)', + encinfo.encoding, encinfo.mismatch) + + encinfo.logtext = logstream.getvalue() + return encinfo + + +if __name__ == '__main__': + import pydoc + pydoc.help(__name__) \ No newline at end of file