IGN:Various regression fixes and an incomplete epub split implementation

parent 5b78f8430f
commit f277f2b870
@@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en'
Conversion to EPUB.
'''
import sys, textwrap
from lxml import html
from calibre.utils.config import Config, StringConfig
from calibre.utils.zipfile import ZipFile, ZIP_STORED
from calibre.ebooks.html import config as common_config
from calibre.ebooks.html import config as common_config, tostring

class DefaultProfile(object):
@@ -43,7 +44,6 @@ def initialize_container(path_to_container, opf_name='metadata.opf'):
    zf.writestr('META-INF/container.xml', CONTAINER)
    return zf

def config(defaults=None):
    desc = _('Options to control the conversion to EPUB')
    if defaults is None:
@@ -59,7 +59,8 @@ def config(defaults=None):
              help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
    c.add_opt('profile', ['--profile'], default='PRS505', choices=list(PROFILES.keys()),
              help=_('Profile of the target device this EPUB is meant for. Set to None to create a device independent EPUB. The profile is used for device specific restrictions on the EPUB. Choices are: ')+str(list(PROFILES.keys())))

    c.add_opt('override_css', ['--override-css'], default=None,
              help=_('Either the path to a CSS stylesheet or raw CSS. This CSS will override any existing CSS declarations in the source files.'))
    structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
    structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
              help=_('''\
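The default --chapter expression relies on the EXSLT regular-expressions extension, so evaluating it directly with lxml requires registering that namespace. A minimal sketch; the sample markup is hypothetical:

    from lxml import html

    RE_NS = {'re': 'http://exslt.org/regular-expressions'}
    CHAPTER_XPATH = ("//*[re:match(name(), 'h[1-2]') and "
                     "re:test(., 'chapter|book|section', 'i')]")

    root = html.fromstring('<html><body><h1>Chapter One</h1>'
                           '<h3>Notes</h3></body></html>')
    # Only the <h1> matches: its tag is h1/h2 and its text mentions 'chapter'.
    print [e.tag for e in root.xpath(CHAPTER_XPATH, namespaces=RE_NS)]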
@@ -18,6 +18,7 @@ from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.epub import initialize_container, PROFILES
from calibre.ebooks.epub.split import split


class HTMLProcessor(Processor):
@@ -34,17 +35,7 @@ class HTMLProcessor(Processor):
        if opts.verbose > 2:
            self.debug_tree('nocss')

        self.collect_font_statistics()

        self.split()

    def save(self):
        file = Processor.save(self)
        with open(file, 'rb') as f:
            f.seek(0, 2)
            size = f.tell()
        if size > self.opts.profile.flow_size:
            self.split()
        #self.collect_font_statistics()


    def collect_font_statistics(self):
@@ -58,12 +49,6 @@ class HTMLProcessor(Processor):
                #TODO: Use cssutils on self.raw_css to figure out the font size
                # of this piece of text and update statistics accordingly

    def split(self):
        ''' Split into individual flows to accommodate Adobe's incompetence '''
        # TODO: Only split files larger than 300K (as specified in the profile)
        # Split on page breaks first, then on <h1-6> tags, then on
        # <div> and finally on <p>.
        pass


def config(defaults=None):
@@ -88,6 +73,7 @@ def parse_content(filelist, opts, tdir):
                           resource_map, filelist)
        hp.populate_toc(toc)
        hp.save()

    return resource_map, hp.htmlfile_map, toc

def convert(htmlfile, opts, notification=None):
@@ -96,6 +82,11 @@ def convert(htmlfile, opts, notification=None):
        opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
    opts.profile = PROFILES[opts.profile]
    opts.output = os.path.abspath(opts.output)
    if opts.override_css is not None:
        try:
            opts.override_css = open(opts.override_css, 'rb').read().decode('utf-8', 'replace')
        except:
            opts.override_css = opts.override_css.decode('utf-8', 'replace')
    if htmlfile.lower().endswith('.opf'):
        opf = OPFReader(htmlfile, os.path.dirname(os.path.abspath(htmlfile)))
        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
@@ -153,7 +144,8 @@ def convert(htmlfile, opts, notification=None):
    for item in mi.manifest:
        if getattr(item, 'mime_type', None) == 'text/html':
            item.mime_type = 'application/xhtml+xml'
    with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
    opf_path = os.path.join(tdir, 'metadata.opf')
    with open(opf_path, 'wb') as f:
        mi.render(f, buf, 'toc.ncx')
    if opts.show_opf:
        print open(os.path.join(tdir, 'metadata.opf')).read()
@@ -163,6 +155,7 @@ def convert(htmlfile, opts, notification=None):
        f.write(toc)
    if opts.show_ncx:
        print toc
    split(opf_path, opts)
    epub = initialize_container(opts.output)
    epub.add_dir(tdir)
    print 'Output written to', opts.output
src/calibre/ebooks/epub/split.py (new file, 175 lines)
@@ -0,0 +1,175 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
Split the flows in an epub file to conform to size limitations.
'''

import sys, os, math, copy

from lxml.etree import parse, XMLParser
from lxml.cssselect import CSSSelector

from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import tostring

PARSER = XMLParser(recover=True)

class SplitError(ValueError):

    def __init__(self, path):
        ValueError.__init__(self, _('Could not find reasonable point at which to split: ')+os.path.basename(path))

def split_tree(tree, split_point, before, opts, filepath):
    trees = []
    tree2 = copy.deepcopy(tree)
    path = tree.getpath(split_point)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = root.xpath('//body')[0], root2.xpath('//body')[0]
    split_point2 = root2.xpath(path)[0]

    # Tree 1
    hit_split_point = False
    for elem in body.iterdescendants():
        if elem is split_point:
            hit_split_point = True
            if before:
                elem.text = u''
                elem.tail = u''
                elem.set('calibre_split', '1')
            continue
        if hit_split_point:
            elem.text = u''
            elem.tail = u''
        elem.set('calibre_split', '1' if hit_split_point else '0')

    # Tree 2
    hit_split_point = False
    for elem in body2.iterdescendants():
        if elem is split_point2:
            hit_split_point = True
            if not before:
                elem.text = u''
                elem.tail = u''
                elem.set('calibre_split', '1')
            continue
        if not hit_split_point:
            elem.text = u''
            elem.tail = u''
        elem.set('calibre_split', '0' if hit_split_point else '1')

    for t, r in [(tree, root), (tree2, root2)]:
        if len(tostring(r)) < opts.profile.flow_size:
            trees.append(t)
        else:
            new_split_point, before = find_split_point(t)
            if new_split_point is None:
                raise SplitError(filepath)
            trees.extend(split_tree(t, new_split_point, before, opts, filepath))

    return trees

def find_split_point(tree):
    root = tree.getroot()

    def pick_elem(elems):
        # Pick the middle element that has not already served as a split point
        if elems:
            elems = [i for i in elems if i.get('calibre_split', '0') != '1']
            if elems:
                i = int(math.floor(len(elems)/2.))
                return elems[i]

    def selector_element(rule):
        try:
            selector = CSSSelector(rule.selectorText)
            return pick_elem(selector(root))
        except:
            return None

    css = root.xpath('//style[@type="text/css"]')
    if css:
        css = css[0].text
        from cssutils import CSSParser
        stylesheet = CSSParser().parseString(css)
        for rule in stylesheet:
            if rule.type != rule.STYLE_RULE:
                continue
            before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
            if before and before != 'avoid':
                elem = selector_element(rule)
                if elem is not None:
                    return elem, True
            after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
            if after and after != 'avoid':
                elem = selector_element(rule)
                if elem is not None:
                    return elem, False

    # The re: functions need the EXSLT regular-expressions namespace
    for path in ('//*[re:match(name(), "h[1-6]", "i")]', '//body/div', '//p'):
        elems = root.xpath(path, namespaces={'re':'http://exslt.org/regular-expressions'})
        elem = pick_elem(elems)
        if elem is not None:
            return elem, True

    return None, True

def do_split(path, opts):
    tree = parse(path, parser=PARSER)
    split_point, before = find_split_point(tree)
    if split_point is None:
        raise SplitError(path)
    trees = split_tree(tree, split_point, before, opts, path)
    base = os.path.splitext(os.path.basename(path))[0] + '_split_%d.html'
    anchor_map = {None:base%0}
    files = []
    for i, tree in enumerate(trees):
        root = tree.getroot()
        files.append(base%i)
        for elem in root.xpath('//*[@id and @calibre_split = "1"]'):
            anchor_map[elem.get('id')] = files[-1]
            elem.attrib.pop('calibre_split')
        for elem in root.xpath('//*[@calibre_split]'):
            elem.attrib.pop('calibre_split')
        open(os.path.join(os.path.dirname(path), files[-1]), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
    os.remove(path)
    return path, files, anchor_map

def fix_opf(opf, orig_file, files, anchor_map):
    orig = None
    for item in opf.manifest:
        if os.path.samefile(orig_file, item.path):
            orig = item
            break
    opf.manifest.remove(orig)
    ids = []
    for f in files:
        ids.append(opf.manifest.add_item(f))


def split(pathtoopf, opts):
    return # disabled: the split implementation is still incomplete
    pathtoopf = os.path.abspath(pathtoopf)
    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
    html_files = []
    for item in opf.manifest:
        if 'html' in item.mime_type.lower():
            html_files.append(item.path)
    changes = []
    for f in html_files:
        if os.stat(f).st_size > opts.profile.flow_size:
            fix_opf(opf, *do_split(f, opts))
    if changes:
        pass


def main(args=sys.argv):
    return 0

if __name__ == '__main__':
    sys.exit(main())
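The heart of split_tree() above is finding the chosen split element again inside a deep copy of the tree, via the unambiguous XPath that getpath() returns; one copy is then blanked out before the split point and the other after it, with the calibre_split attribute recording which elements were emptied so that anchors can be remapped. A standalone sketch of just the relocation mechanism, with hypothetical sample markup:

    import copy
    from lxml import etree

    doc = etree.fromstring(
        '<html><body><p>one</p><h1>Chapter 2</h1><p>two</p></body></html>')
    tree = etree.ElementTree(doc)
    split_point = doc.xpath('//h1')[0]

    # getpath() yields an index-qualified XPath, so evaluating it against
    # the deep copy locates the node corresponding to split_point there.
    tree2 = copy.deepcopy(tree)
    split_point2 = tree2.xpath(tree.getpath(split_point))[0]

    assert split_point2.tag == 'h1' and split_point2 is not split_point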
@@ -27,6 +27,11 @@ from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.zipfile import ZipFile

def tostring(root, pretty_print=False):
    return html.tostring(root, encoding='utf-8', method='xml',
                         pretty_print=pretty_print,
                         include_meta_content_type=True)


class Link(object):
    '''
@@ -332,9 +337,7 @@ class Parser(PreProcessor, LoggingInterface):
        Should be called after all HTML processing is finished.
        '''
        with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
            ans = html.tostring(self.root, encoding='utf-8', method='xml',
                                pretty_print=self.opts.pretty_print,
                                include_meta_content_type=True)
            ans = tostring(self.root, pretty_print=self.opts.pretty_print)
            ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
            ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
            f.write(ans)
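The two substitutions turn the serialized tree into namespaced XHTML with an explicit charset declaration. An isolated illustration of their effect, on made-up input:

    import re

    ans = '<html><head profile="x"><title>t</title></head></html>'
    ans = re.compile(r'<html>', re.IGNORECASE).sub(
        '<html xmlns="http://www.w3.org/1999/xhtml">', ans)
    ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub(
        '<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
    print ans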
@@ -551,6 +554,8 @@ class Processor(Parser):

        self.raw_css = '\n\n'.join(css)
        self.css = unicode(self.raw_css)
        if self.opts.override_css:
            self.css += '\n\n'+self.opts.override_css
        self.do_layout()
        # TODO: Figure out what to do about CSS imports from linked stylesheets

@@ -88,6 +88,15 @@ class Manifest(ResourceCollection):
            m.append(mi)
        return m

    def add_item(self, path, mime_type=None):
        mi = ManifestItem(path, is_path=True)
        if mime_type:
            mi.mime_type = mime_type
        mi.id = 'id%d'%self.next_id
        self.next_id += 1
        self.append(mi)
        return mi.id

    def __init__(self):
        ResourceCollection.__init__(self)
        self.next_id = 1

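fix_opf() in split.py uses the new add_item() to register the replacement flow files; ids are handed out sequentially. A usage sketch, assuming opf is a parsed calibre.ebooks.metadata.opf2.OPF instance and the file names are hypothetical:

    # Each call appends a ManifestItem for the path and returns its new id.
    ids = [opf.manifest.add_item(f)
           for f in ('x_split_0.html', 'x_split_1.html')]
    # e.g. ids == ['id1', 'id2']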
@@ -229,12 +229,6 @@ class Main(MainWindow, Ui_MainWindow):
        db = LibraryDatabase2(self.library_path)
        self.library_view.set_database(db)
        if self.olddb is not None:
            QMessageBox.information(self, 'Database format changed',
                                    '''\
<p>calibre's book storage format has changed. Instead of storing book files in a database, the
files are now stored in a folder on your filesystem. You will now be asked to choose the folder
in which you want to store your books files. Any existing books will be automatically migrated.
''')
            from PyQt4.QtGui import QProgressDialog
            pd = QProgressDialog('', '', 0, 100, self)
            pd.setWindowModality(Qt.ApplicationModal)
@@ -1278,6 +1272,12 @@ in which you want to store your books files. Any existing books will be automati
        self.library_path = prefs['library_path']
        self.olddb = None
        if self.library_path is None: # Need to migrate to new database layout
            QMessageBox.information(self, 'Database format changed',
                                    '''\
<p>calibre's book storage format has changed. Instead of storing book files in a database, the
files are now stored in a folder on your filesystem. You will now be asked to choose the folder
in which you want to store your books files. Any existing books will be automatically migrated.
''')
            self.database_path = prefs['database_path']
            if not os.access(os.path.dirname(self.database_path), os.W_OK):
                error_dialog(self, _('Database does not exist'),
@@ -441,6 +441,10 @@ def post_install():
    if opts.save_manifest_to:
        open(opts.save_manifest_to, 'wb').write('\n'.join(manifest)+'\n')

    from calibre.utils.config import config_dir
    if os.path.exists(config_dir):
        shutil.rmtree(config_dir)


VIEWER = '''\
[Desktop Entry]
@@ -368,9 +368,7 @@ msgid ""
"device. Default: %s Supported profiles: "
msgstr ""
"Profilen til lagringsenheten som denne LRF filen blir generert for. Profilen "
"\n"
"angir innstillinger som oppløsning og skjerm størrelsen til lagringsenheten. "
"\n"
"Standard: %s Støttede profiler: "

#: /home/kovid/work/calibre/src/calibre/ebooks/lrf/__init__.py:134
@@ -258,7 +258,6 @@ class RecursiveFetcher(object, LoggingInterface):
            except Exception, err:
                self.log_warning('Could not fetch image %s', iurl)
                self.log_debug('Error: %s', str(err), exc_info=True)
                if hasattr(f, 'close'): f.close()
                continue
            c += 1
            fname = sanitize_file_name('img'+str(c)+ext)
src/encutils/__init__.py (new file, 655 lines)
@@ -0,0 +1,655 @@
#!/usr/bin/env python
"""encutils - encoding detection collection for Python

encutils
========
:Author: Christof Hoeke, see http://cthedot.de/encutils/
:Copyright: 2005-2008: Christof Hoeke
:License: encutils has a dual-license, please choose whatever you prefer:

    * encutils is published under the `LGPL 3 or later <http://cthedot.de/encutils/license/>`__
    * encutils is published under the
      `Creative Commons License <http://creativecommons.org/licenses/by/3.0/>`__.

This file is part of encutils.

encutils is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

encutils is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with encutils. If not, see <http://www.gnu.org/licenses/>.


A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string.

``getEncodingInfo`` is probably the main function of interest which uses
other supplied functions itself and gathers all information together and
supplies an ``EncodingInfo`` object with the following properties:

- ``encoding``: The guessed encoding
    Encoding is the explicit or implicit encoding or None and
    always lowercase.

- from HTTP response
    * ``http_encoding``
    * ``http_media_type``

- from HTML <meta> element
    * ``meta_encoding``
    * ``meta_media_type``

- from XML declaration
    * ``xml_encoding``

example::

    >>> import encutils
    >>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/')

    >>> print info # = str(info)
    utf-8

    >>> info # = repr(info)
    <encutils.EncodingInfo object encoding='utf-8' mismatch=False at 0xb86d30>

    >>> print info.logtext
    HTTP media_type: text/html
    HTTP encoding: utf-8
    HTML META media_type: text/html
    HTML META encoding: utf-8
    Encoding (probably): utf-8 (Mismatch: False)


references
==========
XML
    RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt)

    easier explained in
    - http://feedparser.org/docs/advanced.html
    - http://www.xml.com/pub/a/2004/07/21/dive.html

HTML
    http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2

TODO
====
- parse @charset of HTML elements?
- check for more texttypes if only text given

"""
__all__ = ['buildlog',
           'encodingByMediaType',
           'getHTTPInfo',
           'getMetaInfo',
           'detectXMLEncoding',
           'getEncodingInfo',
           'tryEncodings',
           'EncodingInfo']
__docformat__ = 'restructuredtext'
__author__ = 'Christof Hoeke'
__version__ = '0.8.3 $Id: __init__.py 1138 2008-03-15 18:24:46Z cthedot $'

import cgi
import HTMLParser
import httplib
import re
import StringIO
import sys
import types
import urllib

class _MetaHTMLParser(HTMLParser.HTMLParser):
    """parses given data for <meta http-equiv="content-type">"""
    content_type = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta' and not self.content_type:
            atts = dict([(a.lower(), v.lower()) for a, v in attrs])
            if atts.get('http-equiv', u'').strip() == u'content-type':
                self.content_type = atts.get('content')


# application/xml, application/xml-dtd, application/xml-external-parsed-entity, or a subtype like application/rss+xml.
_XML_APPLICATION_TYPE = 0

# text/xml, text/xml-external-parsed-entity, or a subtype like text/AnythingAtAll+xml
_XML_TEXT_TYPE = 1

# text/html
_HTML_TEXT_TYPE = 2

# any other of text/* like text/plain, ...
_TEXT_TYPE = 3

# any text/* like which defaults to UTF-8 encoding, for now only text/css
_TEXT_UTF8 = 5

# types not fitting in above types
_OTHER_TYPE = 4

class EncodingInfo(object):
    """
    All encoding related information, returned by ``getEncodingInfo``

    - ``encoding``: The guessed encoding
        Encoding is the explicit or implicit encoding or None and
        always lowercase.

    - from HTTP response
        * ``http_encoding``
        * ``http_media_type``

    - from HTML <meta> element
        * ``meta_encoding``
        * ``meta_media_type``

    - from XML declaration
        * ``xml_encoding``

    - ``mismatch``: True if mismatch between XML declaration and HTTP header
        Mismatch is True if any mismatches between HTTP header, XML
        declaration or textcontent (meta) are found. More detailed mismatch
        reports are written to the optional log or ``logtext``

        Mismatches are not necessarily errors as preferences are defined.
        For details see the specifications.

    - ``logtext``: if no log was given log reports are given here

    """
    def __init__(self):
        """
        initializes all possible properties to ``None``, see class
        description
        """
        self.encoding = self.mismatch = self.logtext =\
            self.http_encoding = self.http_media_type =\
            self.meta_encoding = self.meta_media_type =\
            self.xml_encoding =\
            None

    def __str__(self):
        """
        ``str(EncodingInfo())`` outputs the guessed encoding itself or the empty string
        """
        if self.encoding:
            return self.encoding
        else:
            return u''

    def __repr__(self):
        return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % (
                self.__class__.__module__, self.__class__.__name__,
                self.encoding, self.mismatch, id(self))

def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
             filename=None, filemode="w",
             format='%(levelname)s\t%(message)s'):
    """
    helper to build a basic log

    - if ``filename`` is given returns a log logging to ``filename`` with
      mode ``filemode``
    - else uses a log streaming to ``stream`` which defaults to
      ``sys.stderr``
    - ``level`` defines the level of the log
    - ``format`` defines the formatter format of the log

    returns a log with the name ``logname``
    """
    import logging

    log = logging.getLogger(logname)

    if filename:
        hdlr = logging.FileHandler(filename, filemode)
    else:
        hdlr = logging.StreamHandler(stream)

    formatter = logging.Formatter(format)
    hdlr.setFormatter(formatter)

    log.addHandler(hdlr)
    log.setLevel(logging.__dict__.get(level, logging.INFO))

    return log

def _getTextTypeByMediaType(media_type, log=None):
    """
    returns type as defined by constants above
    """
    if not media_type:
        return _OTHER_TYPE

    xml_application_types = [
        ur'application/.*?\+xml',
        u'application/xml',
        u'application/xml-dtd',
        u'application/xml-external-parsed-entity']
    xml_text_types = [
        ur'text\/.*?\+xml',
        u'text/xml',
        u'text/xml-external-parsed-entity']

    media_type = media_type.strip().lower()

    if media_type in xml_application_types or\
       re.match(xml_application_types[0], media_type, re.I|re.S|re.X):
        return _XML_APPLICATION_TYPE
    elif media_type in xml_text_types or\
         re.match(xml_text_types[0], media_type, re.I|re.S|re.X):
        return _XML_TEXT_TYPE
    elif media_type == u'text/html':
        return _HTML_TEXT_TYPE
    elif media_type == u'text/css':
        return _TEXT_UTF8
    elif media_type.startswith(u'text/'):
        return _TEXT_TYPE
    else:
        return _OTHER_TYPE

def _getTextType(text, log=None):
    """
    checks if given text is XML (**naive test!**)
    used if no content-type given
    """
    if text[:30].find(u'<?xml version=') != -1:
        return _XML_APPLICATION_TYPE
    else:
        return _OTHER_TYPE

def encodingByMediaType(media_type, log=None):
    """
    Returns a default encoding for the given media_type.
    For example ``'utf-8'`` for ``media_type='application/xml'``.

    Refers to RFC 3023 and HTTP MIME specification.

    If no default encoding is available returns ``None``.
    """
    defaultencodings = {
        _XML_APPLICATION_TYPE: u'utf-8',
        _XML_TEXT_TYPE: u'ascii',
        _HTML_TEXT_TYPE: u'iso-8859-1', # should be None?
        _TEXT_TYPE: u'iso-8859-1', # should be None?
        _TEXT_UTF8: u'utf-8',
        _OTHER_TYPE: None}

    texttype = _getTextTypeByMediaType(media_type)
    encoding = defaultencodings.get(texttype, None)

    if log:
        if not encoding:
            log.debug(u'"%s" Media-Type has no default encoding',
                      media_type)
        else:
            log.debug(
                u'Default encoding for Media Type "%s": %s',
                media_type, encoding)
    return encoding

def getHTTPInfo(response, log=None):
    """
    Returns ``(media_type, encoding)`` information from the response's
    Content-Type HTTP header. (Case of headers is ignored.)
    May be ``(None, None)`` e.g. if no Content-Type header is
    available.
    """
    info = response.info()
    media_type = info.gettype()
    encoding = info.getparam('charset')

    if encoding:
        encoding = encoding.lower()

    if log:
        log.info(u'HTTP media_type: %s', media_type)
        log.info(u'HTTP encoding: %s', encoding)

    return media_type, encoding

def getMetaInfo(text, log=None):
    """
    Returns (media_type, encoding) information from (first)
    X/HTML Content-Type ``<meta>`` element if available.

    Normally in X/HTML:
        ``<meta http-equiv="Content-Type" content="media_type;
        charset=encoding"/>``
    """
    p = _MetaHTMLParser()
    p.feed(text)
    if p.content_type:
        media_type, params = cgi.parse_header(p.content_type)
        encoding = params.get('charset') # defaults to None
        if encoding:
            encoding = encoding.lower()
        if log:
            log.info(u'HTML META media_type: %s', media_type)
            log.info(u'HTML META encoding: %s', encoding)
    else:
        media_type = encoding = None

    return media_type, encoding

def detectXMLEncoding(fp, log=None, includeDefault=True):
    """
    Attempts to detect the character encoding of the xml file
    given by a file object fp. fp must not be a codec wrapped file
    object! fp may also be a string or unicode string

    The return value can be:
        - if detection of the BOM succeeds, the codec name of the
          corresponding unicode charset is returned

        - if BOM detection fails, the xml declaration is searched for
          the encoding attribute and its value returned. the "<"
          character has to be the very first in the file then (it's xml
          standard after all).

        - if BOM and xml declaration fail, utf-8 is returned according
          to XML 1.0.

    Based on a recipe by Lars Tiede:
        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
    which itself is based on Paul Prescotts recipe:
        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257
    """
    if type(fp) in types.StringTypes:
        fp = StringIO.StringIO(fp)

    ### detection using BOM

    ## the BOMs we know, by their pattern
    bomDict = { # bytepattern: name
        (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
        (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
        (0xFE, 0xFF, None, None): "utf_16_be",
        (0xFF, 0xFE, None, None): "utf_16_le",
        (0xEF, 0xBB, 0xBF, None): "utf-8",
        }

    ## go to beginning of file and get the first 4 bytes
    oldFP = fp.tell()
    fp.seek(0)
    (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))

    ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
    bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
    if not bomDetection:
        bomDetection = bomDict.get((byte1, byte2, byte3, None))
        if not bomDetection:
            bomDetection = bomDict.get((byte1, byte2, None, None))

    ## if BOM detected, we're done :-)
    if bomDetection:
        if log:
            log.info(u'XML BOM encoding: %s' % bomDetection)
        fp.seek(oldFP)
        return bomDetection

    ## still here? BOM detection failed.
    ## now that BOM detection has failed we assume one byte character
    ## encoding behaving ASCII

    ### search xml declaration for encoding attribute

    ## assume xml declaration fits into the first 2 KB (*cough*)
    fp.seek(0)
    buffer = fp.read(2048)

    ## set up regular expression
    xmlDeclPattern = r"""
    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
    .+?                 # some chars (version info), matched minimal
    encoding=           # encoding attribute begins
    ["']                # attribute start delimiter
    (?P<encstr>         # what's matched in the brackets will be named encstr
    [^"']+              # every character not delimiter (not overly exact!)
    )                   # closes the brackets pair for the named group
    ["']                # attribute end delimiter
    .*?                 # some chars optionally (standalone decl or whitespace)
    \?>                 # xmldecl end
    """
    xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)

    ## search and extract encoding string
    match = xmlDeclRE.search(buffer)
    fp.seek(oldFP)
    if match:
        enc = match.group("encstr").lower()
        if log:
            log.info(u'XML encoding="%s"' % enc)
        return enc
    else:
        if includeDefault:
            if log:
                log.info(u'XML encoding default utf-8')
            return u'utf-8'
        else:
            return None

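A quick check of the fallback behaviour, in the doctest style the module already uses; illustrative calls only:

    >>> detectXMLEncoding("<?xml version='1.0' encoding='ISO-8859-1'?><r/>")
    'iso-8859-1'
    >>> detectXMLEncoding('<r/>')  # no BOM, no declaration: XML 1.0 default
    u'utf-8'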
def tryEncodings(text, log=None):
    """
    If installed uses chardet http://chardet.feedparser.org/ to detect
    encoding, else tries different encodings on text and returns the one
    that does not raise an exception which is not very advanced or may
    be totally wrong.

    Returns working encoding or None if no encoding does work at all.

    The returned encoding might nevertheless be not the one intended by the
    author as it is only checked if the text might be encoded in that
    encoding. Some texts might be working in "iso-8859-1" *and*
    "windows-1252" *and* "ascii" *and* "utf-8" and ...
    """
    try:
        import chardet
        encoding = chardet.detect(text)["encoding"]

    except ImportError:
        msg = 'Using simplified encoding detection, you might want to install chardet.'
        if log:
            log.warn(msg)
        else:
            print msg

        encodings = (
            'ascii',
            'iso-8859-1',
            'windows-1252',
            'utf-8'
            )
        encoding = None
        for e in encodings:
            try:
                text.encode(e)
            except (UnicodeEncodeError, UnicodeDecodeError):
                pass
            else:
                encoding = e
                break

    return encoding

def getEncodingInfo(response=None, text=u'', log=None, url=None):
    """
    Finds all encoding related information in given ``text``.
    Uses information in headers of supplied HTTPResponse, possible XML
    declaration and X/HTML ``<meta>`` elements.
    ``text`` will mostly be HTML or XML.

    Parameters
        - ``response``: HTTP response object,
          e.g. ``urllib.urlopen('url')``
        - ``text``: to guess encoding for, might include XML
          prolog with encoding pseudo attribute or HTML meta element
        - ``log``: an optional logging logger to which messages may go, if
          no log given all log messages are available from resulting
          ``EncodingInfo``

    May also simply be called with ``getEncodingInfo(url='URL')`` which fetches
    the url and all needed information.

    Returns instance of ``EncodingInfo``.

    How the resulting encoding is retrieved
    =======================================
    XML
    ---
    RFC 3023 states if media type given in the Content-Type HTTP header is
    application/xml, application/xml-dtd,
    application/xml-external-parsed-entity, or any one of the subtypes of
    application/xml such as application/atom+xml or application/rss+xml
    etc then the character encoding is determined in this order:

    1. the encoding given in the charset parameter of the Content-Type HTTP
       header, or
    2. the encoding given in the encoding attribute of the XML declaration
       within the document, or
    3. utf-8.

    Mismatch possibilities:
        - HTTP + XMLdecla
        - HTTP + HTMLmeta

        application/xhtml+xml ?
            XMLdecla + HTMLmeta

    If the media type given in the Content-Type HTTP header is text/xml,
    text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
    the encoding attribute of the XML declaration is ignored completely
    and the character encoding is determined in the order:
    1. the encoding given in the charset parameter of the Content-Type HTTP
       header, or
    2. ascii.

    Mismatch possibilities:
        - HTTP + XMLdecla
        - HTTP + HTMLmeta

        text/xhtml+xml
            XMLdecla + HTMLmeta

    HTML
    ----
    For HTML served as text/html:
        http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2

    1. An HTTP "charset" parameter in a "Content-Type" field.
       (maybe defaults to ISO-8859-1, but should not assume this)
    2. A META declaration with "http-equiv" set to "Content-Type" and a
       value set for "charset".
    3. The charset attribute set on an element that designates an external
       resource. (NOT IMPLEMENTED HERE YET)

    Mismatch possibilities:
        - HTTP + HTMLmeta

    TEXT
    ----
    For most text/* types the encoding will be reported as iso-8859-1.
    Exceptions are XML formats sent as text/* mime type (see above) and
    text/css which has a default encoding of UTF-8.
    """
    if url:
        try:
            response = urllib.urlopen(url)
            text = response.read()
        except IOError, e:
            print IOError(e)
            sys.exit(1)

    encinfo = EncodingInfo()

    logstream = StringIO.StringIO()
    if not log:
        log = buildlog(stream=logstream, format='%(message)s')

    # HTTP
    if response:
        encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
            response, log)
        texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
    else:
        # check if maybe XML or (TODO:) HTML
        texttype = _getTextType(text, log)

    # XML (also XHTML served as application/xml or text/xml)
    if texttype == _XML_APPLICATION_TYPE or texttype == _XML_TEXT_TYPE:
        encinfo.xml_encoding = detectXMLEncoding(text, log)

    # XML (also XHTML served as text/html)
    if texttype == _HTML_TEXT_TYPE:
        encinfo.xml_encoding = detectXMLEncoding(text, log, includeDefault=False)

    # HTML
    if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
        encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
            text, log)

    # guess
    # 1. HTTP charset?
    encinfo.encoding = encinfo.http_encoding
    encinfo.mismatch = False

    # 2. media_type?
    # XML application/...
    if texttype == _XML_APPLICATION_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encinfo.xml_encoding
            # xml_encoding has default of utf-8

    # text/html
    elif texttype == _HTML_TEXT_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encinfo.meta_encoding
        if not encinfo.encoding:
            encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
        if not encinfo.encoding:
            encinfo.encoding = tryEncodings(text)

    # text/... + xml or text/*
    elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encodingByMediaType(encinfo.http_media_type)

    # possible mismatches, checks if present at all and then if equal
    # HTTP + XML
    if encinfo.http_encoding and encinfo.xml_encoding and\
       encinfo.http_encoding <> encinfo.xml_encoding:
        encinfo.mismatch = True
        log.warn(u'"%s" (HTTP) <> "%s" (XML) encoding mismatch' %
                 (encinfo.http_encoding, encinfo.xml_encoding))
    # HTTP + Meta
    if encinfo.http_encoding and encinfo.meta_encoding and\
       encinfo.http_encoding <> encinfo.meta_encoding:
        encinfo.mismatch = True
        log.warn(u'"%s" (HTTP) <> "%s" (HTML <meta>) encoding mismatch' %
                 (encinfo.http_encoding, encinfo.meta_encoding))
    # XML + Meta
    if encinfo.xml_encoding and encinfo.meta_encoding and\
       encinfo.xml_encoding <> encinfo.meta_encoding:
        encinfo.mismatch = True
        log.warn(u'"%s" (XML) <> "%s" (HTML <meta>) encoding mismatch' %
                 (encinfo.xml_encoding, encinfo.meta_encoding))

    log.info(u'Encoding (probably): %s (Mismatch: %s)',
             encinfo.encoding, encinfo.mismatch)

    encinfo.logtext = logstream.getvalue()
    return encinfo


if __name__ == '__main__':
    import pydoc
    pydoc.help(__name__)
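Beyond the URL-based doctest in the module docstring, the string-only path (no HTTP response at all) can be exercised directly; the markup here is made up:

    import encutils

    # With no response object the text itself is sniffed; the XML prolog wins.
    info = encutils.getEncodingInfo(
        text=u"<?xml version='1.0' encoding='iso-8859-1'?><root/>")
    print info           # -> iso-8859-1
    print info.mismatch  # -> False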