Improve encoding detection.

This commit is contained in:
Kovid Goyal 2008-02-25 17:11:02 +00:00
parent 1f710c1fd2
commit f94102bc87
5 changed files with 39 additions and 10 deletions

View File

@ -26,6 +26,7 @@ from optparse import OptionParser as _OptionParser
from ttfquery import findsystem, describe
from libprs500.translations.msgfmt import make
from libprs500.ebooks.chardet import detect
iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
isosx = 'darwin' in sys.platform.lower()

View File

@ -1074,11 +1074,8 @@ class BeautifulStoneSoup(Tag, SGMLParser):
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
dammit = UnicodeDammit\
(markup, [self.fromEncoding, inDocumentEncoding],
smartQuotesTo=self.smartQuotesTo)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
# Changed detection by Kovid
markup, self.originalEncoding = chardet.xml_to_unicode(markup)
if markup:
if self.markupMassage:
if not isList(self.markupMassage):

View File

@ -17,6 +17,8 @@
__version__ = "1.0"
import re
def detect(aBuf):
import universaldetector
u = universaldetector.UniversalDetector()
@ -24,3 +26,31 @@ def detect(aBuf):
u.feed(aBuf)
u.close()
return u.result
# Added by Kovid
def xml_to_unicode(raw):
    '''
    Force conversion of byte string to unicode. Tries to look for an XML/HTML
    encoding declaration first; if none is found, or the declared encoding is
    not one Python has a codec for, uses the chardet library and prints a
    warning if detection confidence is < 100%. If no usable encoding can be
    determined at all, utf-8 is assumed.

    Bytes that are invalid in the chosen encoding are silently dropped
    (the decode uses the 'ignore' error handler).

    @param raw: The markup, as a byte string. A unicode object is returned
                unchanged.
    @return: (unicode, encoding used). The encoding is None when raw was
             already unicode.
    '''
    import codecs

    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    def usable(name):
        # Lower-case the name and map the aliases Python does not know under
        # their declared spelling. Returns None when there is no codec for
        # the result, so the caller can fall back instead of crashing in
        # raw.decode().
        name = name.lower()
        name = CHARSET_ALIASES.get(name, name)
        try:
            codecs.lookup(name)
        except LookupError:
            return None
        return name

    if isinstance(raw, unicode):
        return raw, None

    encoding = None
    # An XML declaration is only valid at the very start of the document,
    # hence match(); the HTML <meta> charset may appear anywhere, hence
    # search().
    match = re.compile(r'^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>',
                       re.IGNORECASE).match(raw)
    if match is None:
        match = re.compile(
            r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"]',
            re.IGNORECASE).search(raw)
    if match is not None:
        encoding = usable(match.group(1))

    if encoding is None:
        detected = detect(raw)
        # The detector reports None as the encoding when it cannot make a
        # guess (for example on empty input).
        if detected['encoding']:
            encoding = usable(detected['encoding'])
        if detected['confidence'] < 1:
            print('WARNING: Encoding detection confidence %d%%' % (detected['confidence']*100))

    if encoding is None:
        # Neither a declaration nor detection produced a usable codec.
        encoding = 'utf-8'

    return raw.decode(encoding, 'ignore'), encoding

View File

@ -33,8 +33,7 @@ except ImportError:
import Image as PILImage
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
NavigableString, Declaration, ProcessingInstruction, \
UnicodeDammit
NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
TextBlock, ImageBlock, JumpButton, CharButton, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
@ -49,6 +48,7 @@ from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.metadata.opf import OPFReader
from libprs500.devices.interface import Device
from libprs500.ebooks.lrf.html.color_map import lrs_color
from libprs500.ebooks.chardet import xml_to_unicode
def update_css(ncss, ocss):
for key in ncss.keys():
@ -360,7 +360,7 @@ class HTMLConverter(object):
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
raw = raw.decode('utf-8', 'ignore')
else:
raw = UnicodeDammit(raw).unicode
raw = xml_to_unicode(raw)[0]
f.close()
soup = self.preprocess(raw)
self.logger.info('\tConverting to BBeB...')

View File

@ -20,12 +20,13 @@ import sys, os, logging
from libprs500 import __author__, __appname__, __version__, setup_cli_handlers
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
CData, Tag, UnicodeDammit
CData, Tag
from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
DropCaps, Footer, RuledLine
from libprs500.ebooks.chardet import xml_to_unicode
class LrsParser(object):
@ -38,7 +39,7 @@ class LrsParser(object):
def __init__(self, stream, logger):
self.logger = logger
src = stream.read()
self.soup = BeautifulStoneSoup(UnicodeDammit(src).unicode,
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
selfClosingTags=self.SELF_CLOSING_TAGS)
self.objects = {}
for obj in self.soup.findAll(objid=True):