mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Improve encoding detection.
This commit is contained in:
parent
1f710c1fd2
commit
f94102bc87
@ -26,6 +26,7 @@ from optparse import OptionParser as _OptionParser
|
||||
from ttfquery import findsystem, describe
|
||||
|
||||
from libprs500.translations.msgfmt import make
|
||||
from libprs500.ebooks.chardet import detect
|
||||
|
||||
iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
|
||||
isosx = 'darwin' in sys.platform.lower()
|
||||
|
@ -1074,11 +1074,8 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
||||
if not hasattr(self, 'originalEncoding'):
|
||||
self.originalEncoding = None
|
||||
else:
|
||||
dammit = UnicodeDammit\
|
||||
(markup, [self.fromEncoding, inDocumentEncoding],
|
||||
smartQuotesTo=self.smartQuotesTo)
|
||||
markup = dammit.unicode
|
||||
self.originalEncoding = dammit.originalEncoding
|
||||
# Changed detection by Kovid
|
||||
markup, self.originalEncoding = chardet.xml_to_unicode(markup)
|
||||
if markup:
|
||||
if self.markupMassage:
|
||||
if not isList(self.markupMassage):
|
||||
|
@ -17,6 +17,8 @@
|
||||
|
||||
__version__ = "1.0"
|
||||
|
||||
import re
|
||||
|
||||
def detect(aBuf):
|
||||
import universaldetector
|
||||
u = universaldetector.UniversalDetector()
|
||||
@ -24,3 +26,31 @@ def detect(aBuf):
|
||||
u.feed(aBuf)
|
||||
u.close()
|
||||
return u.result
|
||||
|
||||
# Added by Kovid
|
||||
def xml_to_unicode(raw):
    '''
    Force conversion of a byte string to unicode.

    Tries to look for an XML declaration or an HTML <meta> charset first.
    If no usable encoding is declared, uses the chardet library and prints a
    warning if detection confidence is < 100%. If no encoding can be
    determined at all, utf-8 is assumed. Undecodable bytes are dropped.

    @param raw: byte string to decode; a unicode object is returned untouched
    @return: (unicode, encoding used). The encoding is None when raw was
             already unicode.
    '''
    # Local import so this function does not depend on the module's import list.
    import codecs

    if isinstance(raw, unicode):
        return raw, None

    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis"    : "shift-jis" }

    def canonical(name):
        # Lower-case the name and map known non-Python aliases; return None
        # when Python has no codec for it, so the caller can fall back.
        name = name.strip().lower()
        name = CHARSET_ALIASES.get(name, name)
        try:
            codecs.lookup(name)
        except LookupError:
            return None
        return name

    # Prefer an explicit declaration: XML prolog first, then <meta ... charset=>.
    match = re.compile(r'^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)
    if match is None:
        match = re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"]', re.IGNORECASE).search(raw)

    encoding = None
    if match is not None and match.group(1):
        # An empty or unknown declared charset is treated as "not declared".
        encoding = canonical(match.group(1))

    if encoding is None:
        guess = detect(raw)
        if guess['confidence'] < 1:
            print('WARNING: Encoding detection confidence %d%%'%(guess['confidence']*100))
        # The detector reports None when it cannot identify the encoding.
        if guess['encoding']:
            encoding = canonical(guess['encoding'])

    if encoding is None:
        encoding = 'utf-8'

    return raw.decode(encoding, 'ignore'), encoding
|
||||
|
@ -33,8 +33,7 @@ except ImportError:
|
||||
import Image as PILImage
|
||||
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
|
||||
NavigableString, Declaration, ProcessingInstruction, \
|
||||
UnicodeDammit
|
||||
NavigableString, Declaration, ProcessingInstruction
|
||||
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
|
||||
TextBlock, ImageBlock, JumpButton, CharButton, \
|
||||
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
|
||||
@ -49,6 +48,7 @@ from libprs500.ptempfile import PersistentTemporaryFile
|
||||
from libprs500.ebooks.metadata.opf import OPFReader
|
||||
from libprs500.devices.interface import Device
|
||||
from libprs500.ebooks.lrf.html.color_map import lrs_color
|
||||
from libprs500.ebooks.chardet import xml_to_unicode
|
||||
|
||||
def update_css(ncss, ocss):
|
||||
for key in ncss.keys():
|
||||
@ -360,7 +360,7 @@ class HTMLConverter(object):
|
||||
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
|
||||
raw = raw.decode('utf-8', 'ignore')
|
||||
else:
|
||||
raw = UnicodeDammit(raw).unicode
|
||||
raw = xml_to_unicode(raw)[0]
|
||||
f.close()
|
||||
soup = self.preprocess(raw)
|
||||
self.logger.info('\tConverting to BBeB...')
|
||||
|
@ -20,12 +20,13 @@ import sys, os, logging
|
||||
|
||||
from libprs500 import __author__, __appname__, __version__, setup_cli_handlers
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
|
||||
CData, Tag, UnicodeDammit
|
||||
CData, Tag
|
||||
from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
|
||||
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
|
||||
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
|
||||
Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
|
||||
DropCaps, Footer, RuledLine
|
||||
from libprs500.ebooks.chardet import xml_to_unicode
|
||||
|
||||
class LrsParser(object):
|
||||
|
||||
@ -38,7 +39,7 @@ class LrsParser(object):
|
||||
def __init__(self, stream, logger):
|
||||
self.logger = logger
|
||||
src = stream.read()
|
||||
self.soup = BeautifulStoneSoup(UnicodeDammit(src).unicode,
|
||||
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
|
||||
selfClosingTags=self.SELF_CLOSING_TAGS)
|
||||
self.objects = {}
|
||||
for obj in self.soup.findAll(objid=True):
|
||||
|
Loading…
x
Reference in New Issue
Block a user