mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Improve encoding detection.
This commit is contained in:
parent
1f710c1fd2
commit
f94102bc87
@ -26,6 +26,7 @@ from optparse import OptionParser as _OptionParser
|
|||||||
from ttfquery import findsystem, describe
|
from ttfquery import findsystem, describe
|
||||||
|
|
||||||
from libprs500.translations.msgfmt import make
|
from libprs500.translations.msgfmt import make
|
||||||
|
from libprs500.ebooks.chardet import detect
|
||||||
|
|
||||||
iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
|
iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
|
||||||
isosx = 'darwin' in sys.platform.lower()
|
isosx = 'darwin' in sys.platform.lower()
|
||||||
|
@ -1074,11 +1074,8 @@ class BeautifulStoneSoup(Tag, SGMLParser):
|
|||||||
if not hasattr(self, 'originalEncoding'):
|
if not hasattr(self, 'originalEncoding'):
|
||||||
self.originalEncoding = None
|
self.originalEncoding = None
|
||||||
else:
|
else:
|
||||||
dammit = UnicodeDammit\
|
# Changed detection by Kovid
|
||||||
(markup, [self.fromEncoding, inDocumentEncoding],
|
markup, self.originalEncoding = chardet.xml_to_unicode(markup)
|
||||||
smartQuotesTo=self.smartQuotesTo)
|
|
||||||
markup = dammit.unicode
|
|
||||||
self.originalEncoding = dammit.originalEncoding
|
|
||||||
if markup:
|
if markup:
|
||||||
if self.markupMassage:
|
if self.markupMassage:
|
||||||
if not isList(self.markupMassage):
|
if not isList(self.markupMassage):
|
||||||
|
@ -17,6 +17,8 @@
|
|||||||
|
|
||||||
__version__ = "1.0"
|
__version__ = "1.0"
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
def detect(aBuf):
|
def detect(aBuf):
|
||||||
import universaldetector
|
import universaldetector
|
||||||
u = universaldetector.UniversalDetector()
|
u = universaldetector.UniversalDetector()
|
||||||
@ -24,3 +26,31 @@ def detect(aBuf):
|
|||||||
u.feed(aBuf)
|
u.feed(aBuf)
|
||||||
u.close()
|
u.close()
|
||||||
return u.result
|
return u.result
|
||||||
|
|
||||||
|
# Added by Kovid
|
||||||
|
# Matches an encoding pseudo-attribute inside a leading <?...?> declaration.
# [^>] keeps the match inside the declaration itself, so an unrelated
# encoding="..." attribute later on the same line cannot be picked up.
_XML_ENCODING_RE = re.compile(
    r'^\s*<\?[^>]*?encoding=[\'"]([^\'"]+)[\'"][^>]*\?>', re.IGNORECASE)

# Matches charset=... inside the content attribute of an HTML <meta> tag.
_META_CHARSET_RE = re.compile(
    r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"]', re.IGNORECASE)

# Charset names seen in documents that Python knows under a different name.
_CHARSET_ALIASES = {"macintosh": "mac-roman",
                    "x-sjis": "shift-jis"}

# Used when nothing usable is declared and detection fails.
_FALLBACK_ENCODING = 'utf-8'


def xml_to_unicode(raw):
    '''
    Force conversion of a byte string to unicode.

    Tries to look for an XML/HTML encoding declaration first. If none is
    found (or the declared name is not a codec Python knows), uses the
    chardet library and prints a warning if the detection confidence
    is < 100%. If detection yields no usable encoding either, falls back
    to UTF-8. Undecodable bytes are silently dropped.

    @param raw: byte string to convert; a unicode object is returned as is
    @return: (unicode, encoding used). The encoding is None when C{raw}
             was already unicode.
    '''
    if isinstance(raw, unicode):
        return raw, None

    match = _XML_ENCODING_RE.match(raw)
    if match is None:
        match = _META_CHARSET_RE.search(raw)
    if match is not None:
        encoding = match.group(1).lower()
        encoding = _CHARSET_ALIASES.get(encoding, encoding)
        try:
            return raw.decode(encoding, 'ignore'), encoding
        except LookupError:
            # The document declares an encoding Python has no codec for;
            # fall through to statistical detection instead of crashing.
            pass

    result = detect(raw)
    if result['confidence'] < 1:
        print('WARNING: Encoding detection confidence %d%%'%(result['confidence']*100))

    # detect() reports an encoding of None when it cannot decide
    # (for example on empty input).
    encoding = (result['encoding'] or _FALLBACK_ENCODING).lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    try:
        return raw.decode(encoding, 'ignore'), encoding
    except LookupError:
        # The detector can name charsets that Python ships no codec for.
        return raw.decode(_FALLBACK_ENCODING, 'ignore'), _FALLBACK_ENCODING
|
||||||
|
@ -33,8 +33,7 @@ except ImportError:
|
|||||||
import Image as PILImage
|
import Image as PILImage
|
||||||
|
|
||||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
|
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
|
||||||
NavigableString, Declaration, ProcessingInstruction, \
|
NavigableString, Declaration, ProcessingInstruction
|
||||||
UnicodeDammit
|
|
||||||
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
|
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
|
||||||
TextBlock, ImageBlock, JumpButton, CharButton, \
|
TextBlock, ImageBlock, JumpButton, CharButton, \
|
||||||
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
|
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
|
||||||
@ -49,6 +48,7 @@ from libprs500.ptempfile import PersistentTemporaryFile
|
|||||||
from libprs500.ebooks.metadata.opf import OPFReader
|
from libprs500.ebooks.metadata.opf import OPFReader
|
||||||
from libprs500.devices.interface import Device
|
from libprs500.devices.interface import Device
|
||||||
from libprs500.ebooks.lrf.html.color_map import lrs_color
|
from libprs500.ebooks.lrf.html.color_map import lrs_color
|
||||||
|
from libprs500.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
def update_css(ncss, ocss):
|
def update_css(ncss, ocss):
|
||||||
for key in ncss.keys():
|
for key in ncss.keys():
|
||||||
@ -360,7 +360,7 @@ class HTMLConverter(object):
|
|||||||
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
|
if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
|
||||||
raw = raw.decode('utf-8', 'ignore')
|
raw = raw.decode('utf-8', 'ignore')
|
||||||
else:
|
else:
|
||||||
raw = UnicodeDammit(raw).unicode
|
raw = xml_to_unicode(raw)[0]
|
||||||
f.close()
|
f.close()
|
||||||
soup = self.preprocess(raw)
|
soup = self.preprocess(raw)
|
||||||
self.logger.info('\tConverting to BBeB...')
|
self.logger.info('\tConverting to BBeB...')
|
||||||
|
@ -20,12 +20,13 @@ import sys, os, logging
|
|||||||
|
|
||||||
from libprs500 import __author__, __appname__, __version__, setup_cli_handlers
|
from libprs500 import __author__, __appname__, __version__, setup_cli_handlers
|
||||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
|
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \
|
||||||
CData, Tag, UnicodeDammit
|
CData, Tag
|
||||||
from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
|
from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
|
||||||
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
|
BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \
|
||||||
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
|
Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \
|
||||||
Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
|
Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \
|
||||||
DropCaps, Footer, RuledLine
|
DropCaps, Footer, RuledLine
|
||||||
|
from libprs500.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
class LrsParser(object):
|
class LrsParser(object):
|
||||||
|
|
||||||
@ -38,7 +39,7 @@ class LrsParser(object):
|
|||||||
def __init__(self, stream, logger):
|
def __init__(self, stream, logger):
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
src = stream.read()
|
src = stream.read()
|
||||||
self.soup = BeautifulStoneSoup(UnicodeDammit(src).unicode,
|
self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
|
||||||
selfClosingTags=self.SELF_CLOSING_TAGS)
|
selfClosingTags=self.SELF_CLOSING_TAGS)
|
||||||
self.objects = {}
|
self.objects = {}
|
||||||
for obj in self.soup.findAll(objid=True):
|
for obj in self.soup.findAll(objid=True):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user