IGN:Various regression fixes and an incomplete epub split implementation

Kovid Goyal 2008-09-20 09:46:55 -07:00
parent 5b78f8430f
commit f277f2b870
10 changed files with 872 additions and 33 deletions

View File

@@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en'
 Conversion to EPUB.
 '''
 import sys, textwrap
+from lxml import html
 from calibre.utils.config import Config, StringConfig
 from calibre.utils.zipfile import ZipFile, ZIP_STORED
-from calibre.ebooks.html import config as common_config
+from calibre.ebooks.html import config as common_config, tostring

 class DefaultProfile(object):
@@ -42,7 +43,6 @@ def initialize_container(path_to_container, opf_name='metadata.opf'):
     zf.writestr('META-INF/', '', 0700)
     zf.writestr('META-INF/container.xml', CONTAINER)
     return zf

 def config(defaults=None):
     desc = _('Options to control the conversion to EPUB')
@@ -59,7 +59,8 @@ def config(defaults=None):
               help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
     c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()),
               help=_('Profile of the target device this EPUB is meant for. Set to None to create a device independent EPUB. The profile is used for device specific restrictions on the EPUB. Choices are: ')+str(list(PROFILES.keys())))
+    c.add_opt('override_css', ['--override-css'], default=None,
+              help=_('Either the path to a CSS stylesheet or raw CSS. This CSS will override any existing CSS declarations in the source files.'))
     structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
     structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
               help=_('''\

View File

@@ -18,6 +18,7 @@ from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.epub import initialize_container, PROFILES
+from calibre.ebooks.epub.split import split

 class HTMLProcessor(Processor):
@@ -34,18 +35,8 @@ class HTMLProcessor(Processor):
         if opts.verbose > 2:
             self.debug_tree('nocss')
-        self.collect_font_statistics()
-        self.split()
+        #self.collect_font_statistics()

-    def save(self):
-        file = Processor.save(self)
-        with open(file, 'rb') as f:
-            f.seek(0, 2)
-            size = f.tell()
-        if size > self.opts.profile.flow_size:
-            self.split()

     def collect_font_statistics(self):
         '''
@@ -58,12 +49,6 @@ class HTMLProcessor(Processor):
         #TODO: Use cssutils on self.raw_css to figure out the font size
         # of this piece of text and update statistics accordingly

-    def split(self):
-        ''' Split into individual flows to accommodate Adobe's incompetence '''
-        # TODO: Only split files larger than 300K (as specified in profile)
-        # Split on page breaks first, then on <h1-6> tags, then on
-        # <div> and finally on <p>.
-        pass

 def config(defaults=None):
@@ -88,6 +73,7 @@ def parse_content(filelist, opts, tdir):
                             resource_map, filelist)
         hp.populate_toc(toc)
         hp.save()

     return resource_map, hp.htmlfile_map, toc

 def convert(htmlfile, opts, notification=None):
@@ -96,6 +82,11 @@ def convert(htmlfile, opts, notification=None):
         opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
     opts.profile = PROFILES[opts.profile]
     opts.output = os.path.abspath(opts.output)
+    if opts.override_css is not None:
+        try:
+            opts.override_css = open(opts.override_css, 'rb').read().decode('utf-8', 'replace')
+        except:
+            opts.override_css = opts.override_css.decode('utf-8', 'replace')
     if htmlfile.lower().endswith('.opf'):
         opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile)))
         filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
@@ -153,7 +144,8 @@ def convert(htmlfile, opts, notification=None):
     for item in mi.manifest:
         if getattr(item, 'mime_type', None) == 'text/html':
             item.mime_type = 'application/xhtml+xml'
-    with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
+    opf_path = os.path.join(tdir, 'metadata.opf')
+    with open(opf_path, 'wb') as f:
         mi.render(f, buf, 'toc.ncx')
     if opts.show_opf:
         print open(os.path.join(tdir, 'metadata.opf')).read()
@@ -163,6 +155,7 @@ def convert(htmlfile, opts, notification=None):
         f.write(toc)
     if opts.show_ncx:
         print toc
+    split(opf_path, opts)
     epub = initialize_container(opts.output)
     epub.add_dir(tdir)
     print 'Output written to', opts.output

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
Split the flows in an epub file to conform to size limitations.
'''

import sys, os, math, copy

from lxml.etree import parse, XMLParser
from lxml.cssselect import CSSSelector

from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import tostring

PARSER = XMLParser(recover=True)
class SplitError(ValueError):

    def __init__(self, path):
        ValueError.__init__(self, _('Could not find reasonable point at which to split: ')+os.path.basename(path))
def split_tree(tree, split_point, before, opts, filepath):
    trees = []
    tree2 = copy.deepcopy(tree)
    path = tree.getpath(split_point)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = root.xpath('//body')[0], root2.xpath('//body')[0]
    split_point2 = root2.xpath(path)[0]

    # Tree 1: keep content before the split point, blank out the rest
    hit_split_point = False
    for elem in body.iterdescendants():
        if elem is split_point:
            hit_split_point = True
            if before:
                elem.text = u''
                elem.tail = u''
            elem.set('calibre_split', '1')
            continue
        if hit_split_point:
            elem.text = u''
            elem.tail = u''
        elem.set('calibre_split', '1' if hit_split_point else '0')

    # Tree 2: keep content from the split point onwards, blank out the rest
    hit_split_point = False
    for elem in body2.iterdescendants():
        if elem is split_point2:
            hit_split_point = True
            if not before:
                elem.text = u''
                elem.tail = u''
            elem.set('calibre_split', '1')
            continue
        if not hit_split_point:
            elem.text = u''
            elem.tail = u''
        elem.set('calibre_split', '0' if hit_split_point else '1')

    # Recurse on any half that is still too large
    for t, r in [(tree, root), (tree2, root2)]:
        if len(tostring(r)) < opts.profile.flow_size:
            trees.append(t)
        else:
            new_split_point, before = find_split_point(t)
            if new_split_point is None:
                raise SplitError(filepath)
            trees.extend(split_tree(t, new_split_point, before, opts, filepath))

    return trees
def find_split_point(tree):
    root = tree.getroot()

    def pick_elem(elems):
        # Pick the middle element that has not already been used as a split point
        if elems:
            elems = [i for i in elems if i.get('calibre_split', '0') != '1']
            if elems:
                i = int(math.floor(len(elems)/2.))
                return elems[i]

    def selector_element(rule):
        try:
            selector = CSSSelector(rule.selectorText)
            return pick_elem(selector(root))
        except:
            return None

    # Prefer CSS-declared page breaks as split points
    css = root.xpath('//style[@type="text/css"]')
    if css:
        css = css[0].text
        from cssutils import CSSParser
        stylesheet = CSSParser().parseString(css)
        for rule in stylesheet:
            if rule.type != rule.STYLE_RULE:
                continue
            before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
            if before and before != 'avoid':
                elem = selector_element(rule)
                if elem is not None:
                    return elem, True
            after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
            if after and after != 'avoid':
                elem = selector_element(rule)
                if elem is not None:
                    return elem, False

    # Fall back to headings, then top-level divs, then paragraphs
    for path in ('//*[re:match(name(), "h[1-6]", "i")]', '//body/div', '//p'):
        elems = root.xpath(path, namespaces={'re':'http://exslt.org/regular-expressions'})
        elem = pick_elem(elems)
        if elem is not None:
            return elem, True

    return None, True
def do_split(path, opts):
    tree = parse(path, parser=PARSER)
    split_point, before = find_split_point(tree)
    if split_point is None:
        raise SplitError(path)
    trees = split_tree(tree, split_point, before, opts, path)
    base = os.path.splitext(os.path.basename(path))[0] + '_split_%d.html'
    anchor_map = {None:base%0}
    files = []
    for i, tree in enumerate(trees):
        root = tree.getroot()
        files.append(base%i)
        for elem in root.xpath('//*[@id and @calibre_split = "1"]'):
            anchor_map[elem.get('id')] = files[-1]
            elem.attrib.pop('calibre_split')
        for elem in root.xpath('//*[@calibre_split]'):
            elem.attrib.pop('calibre_split')
        open(os.path.join(os.path.dirname(path), files[-1]), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
    os.remove(path)
    return path, files, anchor_map
def fix_opf(opf, orig_file, files, anchor_map):
    orig = None
    for item in opf.manifest:
        if os.path.samefile(orig_file, item.path):
            orig = item
            break
    opf.manifest.remove(orig)
    ids = []
    for f in files:
        ids.append(opf.manifest.add_item(f))
def split(pathtoopf, opts):
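    # NOTE: splitting is not wired up yet; the early return below keeps this
    # incomplete implementation out of the conversion pipeline.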
    return
    pathtoopf = os.path.abspath(pathtoopf)
    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
    html_files = []
    for item in opf.manifest:
        if 'html' in item.mime_type.lower():
            html_files.append(item.path)
    changes = []
    for f in html_files:
        if os.stat(f).st_size > opts.profile.flow_size:
            fix_opf(opf, *do_split(f, opts))
    if changes:
        pass
def main(args=sys.argv):
    return 0

if __name__ == '__main__':
    sys.exit(main())
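
The following is a minimal standalone sketch (editorial, not part of the commit) of the deepcopy-and-prune idea that split_tree() implements: copy the tree, locate the split element in the copy via its tree path, then prune one side in each copy. It is simplified to sibling-level pruning, needs only lxml, and all names are illustrative.

    import copy
    from lxml import etree

    doc = etree.fromstring('<html><body><p>one</p><h1>two</h1><p>three</p></body></html>')
    tree = etree.ElementTree(doc)
    split_point = doc.find('.//h1')
    path = tree.getpath(split_point)              # stable path into the copy

    tree2 = copy.deepcopy(tree)
    split_point2 = tree2.getroot().xpath(path)[0]

    # First flow: drop the split point and everything after it
    for elem in list(split_point.itersiblings()):
        elem.getparent().remove(elem)
    split_point.getparent().remove(split_point)

    # Second flow: drop everything before the split point
    for elem in list(split_point2.itersiblings(preceding=True)):
        elem.getparent().remove(elem)

    print etree.tostring(tree)    # <html><body><p>one</p></body></html>
    print etree.tostring(tree2)   # <html><body><h1>two</h1><p>three</p></body></html>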

View File

@@ -27,6 +27,11 @@ from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
 from calibre.utils.zipfile import ZipFile

+def tostring(root, pretty_print=False):
+    return html.tostring(root, encoding='utf-8', method='xml',
+                         pretty_print=pretty_print,
+                         include_meta_content_type=True)

 class Link(object):
     '''
@@ -332,9 +337,7 @@ class Parser(PreProcessor, LoggingInterface):
         Should be called after all HTML processing is finished.
         '''
         with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
-            ans = html.tostring(self.root, encoding='utf-8', method='xml',
-                                pretty_print=self.opts.pretty_print,
-                                include_meta_content_type=True)
+            ans = tostring(self.root, pretty_print=self.opts.pretty_print)
             ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
             ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
             f.write(ans)
@@ -551,6 +554,8 @@ class Processor(Parser):
         self.raw_css = '\n\n'.join(css)
         self.css = unicode(self.raw_css)
+        if self.opts.override_css:
+            self.css += '\n\n'+self.opts.override_css
         self.do_layout()
         # TODO: Figure out what to do about CSS imports from linked stylesheets

View File

@@ -88,6 +88,15 @@ class Manifest(ResourceCollection):
             m.append(mi)
         return m

+    def add_item(self, path, mime_type=None):
+        mi = ManifestItem(path, is_path=True)
+        if mime_type:
+            mi.mime_type = mime_type
+        mi.id = 'id%d'%self.next_id
+        self.next_id += 1
+        self.append(mi)
+        return mi.id

     def __init__(self):
         ResourceCollection.__init__(self)
         self.next_id = 1

View File

@@ -229,12 +229,6 @@ class Main(MainWindow, Ui_MainWindow):
         db = LibraryDatabase2(self.library_path)
         self.library_view.set_database(db)
         if self.olddb is not None:
-            QMessageBox.information(self, 'Database format changed',
-                    '''\
-<p>calibre's book storage format has changed. Instead of storing book files in a database, the
-files are now stored in a folder on your filesystem. You will now be asked to choose the folder
-in which you want to store your books files. Any existing books will be automatically migrated.
-''')
             from PyQt4.QtGui import QProgressDialog
             pd = QProgressDialog('', '', 0, 100, self)
             pd.setWindowModality(Qt.ApplicationModal)
@@ -1278,6 +1272,12 @@ in which you want to store your books files. Any existing books will be automati
         self.library_path = prefs['library_path']
         self.olddb = None
         if self.library_path is None: # Need to migrate to new database layout
+            QMessageBox.information(self, 'Database format changed',
+                    '''\
+<p>calibre's book storage format has changed. Instead of storing book files in a database, the
+files are now stored in a folder on your filesystem. You will now be asked to choose the folder
+in which you want to store your books files. Any existing books will be automatically migrated.
+''')
             self.database_path = prefs['database_path']
             if not os.access(os.path.dirname(self.database_path), os.W_OK):
                 error_dialog(self, _('Database does not exist'),

View File

@@ -440,6 +440,10 @@ def post_install():
     if opts.save_manifest_to:
         open(opts.save_manifest_to, 'wb').write('\n'.join(manifest)+'\n')

+    from calibre.utils.config import config_dir
+    if os.path.exists(config_dir):
+        shutil.rmtree(config_dir)

 VIEWER = '''\

View File

@@ -368,9 +368,7 @@ msgid ""
 "device. Default: %s Supported profiles: "
 msgstr ""
 "Profilen til lagringsenheten som denne LRF filen blir generert for. Profilen "
-" \n"
 "angir innstillinger som oppløsning og skjerm størrelsen til lagringsenheten. "
-" \n"
 "Standard: %s Støttede profiler: "

 #: /home/kovid/work/calibre/src/calibre/ebooks/lrf/__init__.py:134

View File

@@ -258,7 +258,6 @@ class RecursiveFetcher(object, LoggingInterface):
             except Exception, err:
                 self.log_warning('Could not fetch image %s', iurl)
                 self.log_debug('Error: %s', str(err), exc_info=True)
-                if hasattr(f, 'close'): f.close()
                 continue
             c += 1
             fname = sanitize_file_name('img'+str(c)+ext)

src/encutils/__init__.py (new file, 655 lines added)

View File

@@ -0,0 +1,655 @@
#!/usr/bin/env python
"""encutils - encoding detection collection for Python
encutils
========
:Author: Christof Hoeke, see http://cthedot.de/encutils/
:Copyright: 2005-2008: Christof Hoeke
:License: encutils has a dual-license, please choose whatever you prefer:
* encutils is published under the `LGPL 3 or later <http://cthedot.de/encutils/license/>`__
* encutils is published under the
`Creative Commons License <http://creativecommons.org/licenses/by/3.0/>`__.
This file is part of encutils.
encutils is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
encutils is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with encutils. If not, see <http://www.gnu.org/licenses/>.
A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string.
``getEncodingInfo`` is probably the main function of interest which uses
other supplied functions itself and gathers all information together and
supplies an ``EncodingInfo`` object with the following properties:
- ``encoding``: The guessed encoding
Encoding is the explicit or implicit encoding or None and
always lowercase.
- from HTTP response
* ``http_encoding``
* ``http_media_type``
- from HTML <meta> element
* ``meta_encoding``
* ``meta_media_type``
- from XML declaration
* ``xml_encoding``
example::
>>> import encutils
>>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/')
>>> print info # = str(info)
utf-8
>>> info # = repr(info)
<encutils.EncodingInfo object encoding='utf-8' mismatch=False at 0xb86d30>
>>> print info.logtext
HTTP media_type: text/html
HTTP encoding: utf-8
HTML META media_type: text/html
HTML META encoding: utf-8
Encoding (probably): utf-8 (Mismatch: False)
references
==========
XML
RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt)
easier explained in
- http://feedparser.org/docs/advanced.html
- http://www.xml.com/pub/a/2004/07/21/dive.html
HTML
http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
TODO
====
- parse @charset of HTML elements?
- check for more texttypes if only text given
"""
__all__ = ['buildlog',
'encodingByMediaType',
'getHTTPInfo',
'getMetaInfo',
'detectXMLEncoding',
'getEncodingInfo',
'tryEncodings',
'EncodingInfo']
__docformat__ = 'restructuredtext'
__author__ = 'Christof Hoeke'
__version__ = '0.8.3 $Id: __init__.py 1138 2008-03-15 18:24:46Z cthedot $'
import cgi
import HTMLParser
import httplib
import re
import StringIO
import sys
import types
import urllib
class _MetaHTMLParser(HTMLParser.HTMLParser):
"""parses given data for <meta http-equiv="content-type">"""
content_type = None
def handle_starttag(self, tag, attrs):
if tag == 'meta' and not self.content_type:
atts = dict([(a.lower(), v.lower()) for a, v in attrs])
if atts.get('http-equiv', u'').strip() == u'content-type':
self.content_type = atts.get('content')
# application/xml, application/xml-dtd, application/xml-external-parsed-entity, or a subtype like application/rss+xml.
_XML_APPLICATION_TYPE = 0
# text/xml, text/xml-external-parsed-entity, or a subtype like text/AnythingAtAll+xml
_XML_TEXT_TYPE = 1
# text/html
_HTML_TEXT_TYPE = 2
# any other of text/* like text/plain, ...
_TEXT_TYPE = 3
# any text/* type which defaults to UTF-8 encoding, for now only text/css
_TEXT_UTF8 = 5
# types not fitting in above types
_OTHER_TYPE = 4
class EncodingInfo(object):
"""
All encoding related information, returned by ``getEncodingInfo``
- ``encoding``: The guessed encoding
Encoding is the explicit or implicit encoding or None and
always lowercase.
- from HTTP response
* ``http_encoding``
* ``http_media_type``
- from HTML <meta> element
* ``meta_encoding``
* ``meta_media_type``
- from XML declaration
* ``xml_encoding``
- ``mismatch``: True if mismatch between XML declaration and HTTP header
Mismatch is True if any mismatches between HTTP header, XML
declaration or textcontent (meta) are found. More detailed mismatch
reports are written to the optional log or ``logtext``
Mismatches are not necessarily errors as preferences are defined.
For details see the specifications.
- ``logtext``: if no log was given log reports are given here
"""
def __init__(self):
"""
initializes all possible properties to ``None``, see class
description
"""
self.encoding = self.mismatch = self.logtext =\
self.http_encoding = self.http_media_type =\
self.meta_encoding = self.meta_media_type =\
self.xml_encoding =\
None
def __str__(self):
"""
``str(EncodingInfo())`` outputs the guessed encoding itself or the empty string
"""
if self.encoding:
return self.encoding
else:
return u''
def __repr__(self):
return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % (
self.__class__.__module__, self.__class__.__name__,
self.encoding, self.mismatch, id(self))
def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
filename=None, filemode="w",
format='%(levelname)s\t%(message)s'):
"""
helper to build a basic log
- if ``filename`` is given returns a log logging to ``filename`` with
mode ``filemode``
- else uses a log streaming to ``stream`` which defaults to
``sys.stderr``
- ``level`` defines the level of the log
- ``format`` defines the formatter format of the log
returns a log with the name ``logname``
"""
import logging
log = logging.getLogger(logname)
if filename:
hdlr = logging.FileHandler(filename, filemode)
else:
hdlr = logging.StreamHandler(stream)
formatter = logging.Formatter(format)
hdlr.setFormatter(formatter)
log.addHandler(hdlr)
log.setLevel(logging.__dict__.get(level, logging.INFO))
return log
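# Illustrative use of buildlog (an assumed example, not from the library's
# shipped docs): build a DEBUG-level log on stderr in the module's own
# doctest style.
#
#   >>> log = buildlog(level='DEBUG', format='%(levelname)s %(message)s')
#   >>> log.debug(u'messages go to sys.stderr by default')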
def _getTextTypeByMediaType(media_type, log=None):
"""
returns type as defined by constants above
"""
if not media_type:
return _OTHER_TYPE
xml_application_types = [
ur'application/.*?\+xml',
u'application/xml',
u'application/xml-dtd',
u'application/xml-external-parsed-entity']
xml_text_types = [
ur'text\/.*?\+xml',
u'text/xml',
u'text/xml-external-parsed-entity']
media_type = media_type.strip().lower()
if media_type in xml_application_types or\
re.match(xml_application_types[0], media_type, re.I|re.S|re.X):
return _XML_APPLICATION_TYPE
elif media_type in xml_text_types or\
re.match(xml_text_types[0], media_type, re.I|re.S|re.X):
return _XML_TEXT_TYPE
elif media_type == u'text/html':
return _HTML_TEXT_TYPE
elif media_type == u'text/css':
return _TEXT_UTF8
elif media_type.startswith(u'text/'):
return _TEXT_TYPE
else:
return _OTHER_TYPE
def _getTextType(text, log=None):
"""
checks if given text is XML (**naive test!**)
used if no content-type given
"""
if text[:30].find(u'<?xml version=') != -1:
return _XML_APPLICATION_TYPE
else:
return _OTHER_TYPE
def encodingByMediaType(media_type, log=None):
"""
Returns a default encoding for the given media_type.
For example ``'utf-8'`` for ``media_type='application/xml'``.
Refers to RFC 3023 and HTTP MIME specification.
If no default encoding is available returns ``None``.
"""
defaultencodings = {
_XML_APPLICATION_TYPE: u'utf-8',
_XML_TEXT_TYPE: u'ascii',
_HTML_TEXT_TYPE: u'iso-8859-1', # should be None?
_TEXT_TYPE: u'iso-8859-1', # should be None?
_TEXT_UTF8: u'utf-8',
_OTHER_TYPE: None}
texttype = _getTextTypeByMediaType(media_type)
encoding = defaultencodings.get(texttype, None)
if log:
if not encoding:
log.debug(u'"%s" Media-Type has no default encoding',
media_type)
else:
log.debug(
u'Default encoding for Media Type "%s": %s',
media_type, encoding)
return encoding
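# Quick illustration (editorial example) of the defaults defined above:
#
#   >>> encodingByMediaType('application/xml')
#   u'utf-8'
#   >>> encodingByMediaType('text/xml')
#   u'ascii'
#   >>> encodingByMediaType('text/plain')
#   u'iso-8859-1'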
def getHTTPInfo(response, log=None):
"""
Returns ``(media_type, encoding)`` information from the response's
Content-Type HTTP header. (Case of headers is ignored.)
May be ``(None, None)`` e.g. if no Content-Type header is
available.
"""
info = response.info()
media_type = info.gettype()
encoding = info.getparam('charset')
if encoding:
encoding = encoding.lower()
if log:
log.info(u'HTTP media_type: %s', media_type)
log.info(u'HTTP encoding: %s', encoding)
return media_type, encoding
def getMetaInfo(text, log=None):
"""
Returns (media_type, encoding) information from (first)
X/HTML Content-Type ``<meta>`` element if available.
Normally in X/HTML:
``<meta http-equiv="Content-Type" content="media_type;
charset=encoding"/>``
"""
p = _MetaHTMLParser()
p.feed(text)
if p.content_type:
media_type, params = cgi.parse_header(p.content_type)
encoding = params.get('charset') # defaults to None
if encoding:
encoding = encoding.lower()
if log:
log.info(u'HTML META media_type: %s', media_type)
log.info(u'HTML META encoding: %s', encoding)
else:
media_type = encoding = None
return media_type, encoding
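# Editorial example of getMetaInfo on a typical HTML head:
#
#   >>> getMetaInfo('<html><head><meta http-equiv="Content-Type"'
#   ...             ' content="text/html;charset=utf-8"/></head></html>')
#   ('text/html', 'utf-8')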
def detectXMLEncoding(fp, log=None, includeDefault=True):
"""
Attempts to detect the character encoding of the xml file
given by a file object fp. fp must not be a codec wrapped file
object! fp may also be a string or unicode string
The return value can be:
- if detection of the BOM succeeds, the codec name of the
corresponding unicode charset is returned
- if BOM detection fails, the xml declaration is searched for
the encoding attribute and its value returned. the "<"
character has to be the very first in the file then (it's xml
standard after all).
- if BOM and xml declaration fail, utf-8 is returned according
to XML 1.0.
Based on a recipe by Lars Tiede:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
which itself is based on Paul Prescott's recipe:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
"""
if type(fp) in types.StringTypes:
fp = StringIO.StringIO(fp)
### detection using BOM
## the BOMs we know, by their pattern
bomDict={ # bytepattern: name
(0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
(0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
(0xFE, 0xFF, None, None) : "utf_16_be",
(0xFF, 0xFE, None, None) : "utf_16_le",
(0xEF, 0xBB, 0xBF, None) : "utf-8",
}
## go to beginning of file and get the first 4 bytes
oldFP = fp.tell()
fp.seek(0)
(byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
## try bom detection using 4 bytes, 3 bytes, or 2 bytes
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, byte3, None))
if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, None, None))
## if BOM detected, we're done :-)
if bomDetection:
if log:
log.info(u'XML BOM encoding: %s' % bomDetection)
fp.seek(oldFP)
return bomDetection
## still here? BOM detection failed.
## now that BOM detection has failed we assume one byte character
## encoding behaving like ASCII
### search xml declaration for encoding attribute
## assume xml declaration fits into the first 2 KB (*cough*)
fp.seek(0)
buffer = fp.read(2048)
## set up regular expression
xmlDeclPattern = r"""
^<\?xml # w/o BOM, xmldecl starts with <?xml at the first byte
.+? # some chars (version info), matched minimal
encoding= # encoding attribute begins
["'] # attribute start delimiter
(?P<encstr> # what's matched in the brackets will be named encstr
[^"']+ # every character not delimiter (not overly exact!)
) # closes the brackets pair for the named group
["'] # attribute end delimiter
.*? # some chars optionally (standalone decl or whitespace)
\?> # xmldecl end
"""
xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)
## search and extract encoding string
match = xmlDeclRE.search(buffer)
fp.seek(oldFP)
if match:
enc = match.group("encstr").lower()
if log:
log.info(u'XML encoding="%s"' % enc)
return enc
else:
if includeDefault:
if log:
log.info(u'XML encoding default utf-8')
return u'utf-8'
else:
return None
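# Editorial example of the three detection outcomes described above:
#
#   >>> detectXMLEncoding('\xef\xbb\xbf<x/>')    # BOM takes precedence
#   'utf-8'
#   >>> detectXMLEncoding('<?xml version="1.0" encoding="iso-8859-1"?><x/>')
#   'iso-8859-1'
#   >>> detectXMLEncoding('<x/>')                # falls back to the XML default
#   u'utf-8'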
def tryEncodings(text, log=None):
"""
If installed uses chardet http://chardet.feedparser.org/ to detect
encoding, else tries different encodings on text and returns the one
that does not raise an exception which is not very advanced or may
be totally wrong.
Returns working encoding or None if no encoding does work at all.
The returned encoding might nevertheless be not the one intended by the
author as it is only checked if the text might be encoded in that
encoding. Some texts might be working in "iso-8859-1" *and*
"windows-1252" *and* "ascii" *and* "utf-8" and ...
"""
try:
import chardet
encoding = chardet.detect(text)["encoding"]
except ImportError:
msg = 'Using simplified encoding detection, you might want to install chardet.'
if log:
log.warn(msg)
else:
print msg
encodings = (
'ascii',
'iso-8859-1',
'windows-1252',
'utf-8'
)
encoding = None
for e in encodings:
try:
text.encode(e)
except (UnicodeEncodeError, UnicodeDecodeError):
pass
else:
encoding = e
break
return encoding
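# Editorial example, assuming chardet is *not* installed (the simplified
# fallback path; a warning is printed first):
#
#   >>> tryEncodings(u'gr\xfcn')
#   'iso-8859-1'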
def getEncodingInfo(response=None, text=u'', log=None, url=None):
"""
Finds all encoding related information in given ``text``.
Uses information in headers of supplied HTTPResponse, possible XML
declaration and X/HTML ``<meta>`` elements.
``text`` will mostly be HTML or XML.
Parameters
- ``response``: HTTP response object,
e.g. ``urllib.urlopen('url')``
- ``text``: to guess encoding for, might include XML
prolog with encoding pseudo attribute or HTML meta element
- ``log``: an optional logging logger to which messages may go, if
no log given all log messages are available from resulting
``EncodingInfo``
May also simply be called with ``getEncodingInfo(url='URL')`` which fetches
the url and all needed information.
Returns instance of ``EncodingInfo``.
How the resulting encoding is retrieved
=======================================
XML
---
RFC 3023 states if media type given in the Content-Type HTTP header is
application/xml, application/xml-dtd,
application/xml-external-parsed-entity, or any one of the subtypes of
application/xml such as application/atom+xml or application/rss+xml
etc then the character encoding is determined in this order:
1. the encoding given in the charset parameter of the Content-Type HTTP
header, or
2. the encoding given in the encoding attribute of the XML declaration
within the document, or
3. utf-8.
Mismatch possibilities:
- HTTP + XMLdecla
- HTTP + HTMLmeta
application/xhtml+xml ?
XMLdecla + HTMLmeta
If the media type given in the Content-Type HTTP header is text/xml,
text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
the encoding attribute of the XML declaration is ignored completely
and the character encoding is determined in the order:
1. the encoding given in the charset parameter of the Content-Type HTTP
header, or
2. ascii.
Mismatch possibilities:
- HTTP + XMLdecla
- HTTP + HTMLmeta
text/xhtml+xml
XMLdecla + HTMLmeta
HTML
----
For HTML served as text/html:
http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
1. An HTTP "charset" parameter in a "Content-Type" field.
(maybe defaults to ISO-8859-1, but should not assume this)
2. A META declaration with "http-equiv" set to "Content-Type" and a
value set for "charset".
3. The charset attribute set on an element that designates an external
resource. (NOT IMPLEMENTED HERE YET)
Mismatch possibilities:
- HTTP + HTMLmeta
TEXT
----
For most text/* types the encoding will be reported as iso-8859-1.
Exceptions are XML formats sent as text/* mime type (see above) and
text/css which has a default encoding of UTF-8.
"""
if url:
try:
response = urllib.urlopen(url)
text = response.read()
except IOError, e:
print IOError(e)
sys.exit(1)
encinfo = EncodingInfo()
logstream = StringIO.StringIO()
if not log:
log = buildlog(stream=logstream, format='%(message)s')
# HTTP
if response:
encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
response, log)
texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
else:
# check if maybe XML or (TODO:) HTML
texttype = _getTextType(text, log)
# XML (also XHTML served as text/html)
if texttype == _XML_APPLICATION_TYPE or texttype == _XML_TEXT_TYPE:
encinfo.xml_encoding = detectXMLEncoding(text, log)
# XHTML served as text/html: look for an XML declaration, without the utf-8 default
if texttype == _HTML_TEXT_TYPE:
encinfo.xml_encoding = detectXMLEncoding(text, log, includeDefault=False)
# HTML
if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
text, log)
# guess
# 1. HTTP charset?
encinfo.encoding = encinfo.http_encoding
encinfo.mismatch = False
# 2. media_type?
# XML application/...
if texttype == _XML_APPLICATION_TYPE:
if not encinfo.encoding:
encinfo.encoding = encinfo.xml_encoding
# xml_encoding has default of utf-8
# text/html
elif texttype == _HTML_TEXT_TYPE:
if not encinfo.encoding:
encinfo.encoding = encinfo.meta_encoding
if not encinfo.encoding:
encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
if not encinfo.encoding:
encinfo.encoding = tryEncodings(text)
# text/... + xml or text/*
elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE:
if not encinfo.encoding:
encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
# possible mismatches, checks if present at all and then if equal
# HTTP + XML
if encinfo.http_encoding and encinfo.xml_encoding and\
encinfo.http_encoding <> encinfo.xml_encoding:
encinfo.mismatch = True
log.warn(u'"%s" (HTTP) <> "%s" (XML) encoding mismatch' %
(encinfo.http_encoding, encinfo.xml_encoding))
# HTTP + Meta
if encinfo.http_encoding and encinfo.meta_encoding and\
encinfo.http_encoding <> encinfo.meta_encoding:
encinfo.mismatch = True
log.warn(u'"%s" (HTTP) <> "%s" (HTML <meta>) encoding mismatch' %
(encinfo.http_encoding, encinfo.meta_encoding))
# XML + Meta
if encinfo.xml_encoding and encinfo.meta_encoding and\
encinfo.xml_encoding <> encinfo.meta_encoding:
encinfo.mismatch = True
log.warn(u'"%s" (XML) <> "%s" (HTML <meta>) encoding mismatch' %
(encinfo.xml_encoding, encinfo.meta_encoding))
log.info(u'Encoding (probably): %s (Mismatch: %s)',
encinfo.encoding, encinfo.mismatch)
encinfo.logtext = logstream.getvalue()
return encinfo
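# Editorial example with text only (no HTTP response): the XML declaration
# decides, per the XML rules described above.
#
#   >>> info = getEncodingInfo(text='<?xml version="1.0" encoding="utf-8"?><x/>')
#   >>> info.encoding, info.mismatch
#   ('utf-8', False)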
if __name__ == '__main__':
import pydoc
pydoc.help(__name__)