mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Improve encoding detection.
This commit is contained in:
parent
d53460c1c4
commit
28622f5e58
@ -30,14 +30,14 @@ def detect(aBuf):
|
|||||||
# Added by Kovid
|
# Added by Kovid
|
||||||
def xml_to_unicode(raw, verbose=False):
|
def xml_to_unicode(raw, verbose=False):
|
||||||
'''
|
'''
|
||||||
Force conversion of byte string to unicode. Tries to llok for XML/HTML
|
Force conversion of byte string to unicode. Tries to look for XML/HTML
|
||||||
encoding declaration first, if not found uses the chardet library and
|
encoding declaration first, if not found uses the chardet library and
|
||||||
prints a warning if detection confidence is < 100%
|
prints a warning if detection confidence is < 100%
|
||||||
@return: (unicode, encoding used)
|
@return: (unicode, encoding used)
|
||||||
'''
|
'''
|
||||||
if not raw:
|
|
||||||
return u'', None
|
|
||||||
encoding = None
|
encoding = None
|
||||||
|
if not raw:
|
||||||
|
return u'', encoding
|
||||||
if isinstance(raw, unicode):
|
if isinstance(raw, unicode):
|
||||||
return raw, encoding
|
return raw, encoding
|
||||||
match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)
|
match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)
|
||||||
|
@ -313,20 +313,26 @@ class DefaultProfile(object):
|
|||||||
soup = BeautifulStoneSoup(src)
|
soup = BeautifulStoneSoup(src)
|
||||||
for item in soup.findAll('item'):
|
for item in soup.findAll('item'):
|
||||||
try:
|
try:
|
||||||
|
atitle = item.find('title')
|
||||||
|
if not atitle:
|
||||||
|
continue
|
||||||
|
|
||||||
|
atitle = self.tag_to_string(atitle)
|
||||||
if self.use_pubdate:
|
if self.use_pubdate:
|
||||||
pubdate = item.find('pubdate')
|
pubdate = item.find('pubdate')
|
||||||
if not pubdate:
|
if not pubdate:
|
||||||
pubdate = item.find('dc:date')
|
pubdate = item.find('dc:date')
|
||||||
if not pubdate or not pubdate.string:
|
if not pubdate or not pubdate.string:
|
||||||
self.logger.debug('Skipping article as it does not have publication date')
|
self.logger.debug('Skipping article %s as it does not have publication date'%atitle)
|
||||||
continue
|
continue
|
||||||
pubdate = self.tag_to_string(pubdate)
|
pubdate = self.tag_to_string(pubdate)
|
||||||
pubdate = pubdate.replace('+0000', 'GMT')
|
pubdate = pubdate.replace('+0000', 'GMT')
|
||||||
|
|
||||||
|
|
||||||
url = self.get_article_url(item)
|
url = self.get_article_url(item)
|
||||||
url = self.tag_to_string(url)
|
url = self.tag_to_string(url)
|
||||||
if require_url and not url:
|
if require_url and not url:
|
||||||
self.logger.debug('Skipping article as it does not have a link url')
|
self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
|
||||||
continue
|
continue
|
||||||
purl = url
|
purl = url
|
||||||
try:
|
try:
|
||||||
@ -344,7 +350,7 @@ class DefaultProfile(object):
|
|||||||
content = ''
|
content = ''
|
||||||
|
|
||||||
d = {
|
d = {
|
||||||
'title' : self.tag_to_string(item.find('title')),
|
'title' : atitle,
|
||||||
'url' : purl,
|
'url' : purl,
|
||||||
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
||||||
'date' : pubdate if self.use_pubdate else time.ctime(),
|
'date' : pubdate if self.use_pubdate else time.ctime(),
|
||||||
|
@ -270,7 +270,7 @@ class RecursiveFetcher(object):
|
|||||||
if self.encoding is not None:
|
if self.encoding is not None:
|
||||||
dsrc = dsrc.decode(self.encoding, 'ignore')
|
dsrc = dsrc.decode(self.encoding, 'ignore')
|
||||||
else:
|
else:
|
||||||
dsrc = xml_to_unicode(dsrc)
|
dsrc = xml_to_unicode(dsrc, self.verbose)[0]
|
||||||
|
|
||||||
soup = self.get_soup(dsrc)
|
soup = self.get_soup(dsrc)
|
||||||
self.logger.debug('Processing images...')
|
self.logger.debug('Processing images...')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user