Improve encoding detection.

This commit is contained in:
Kovid Goyal 2008-02-29 17:11:29 +00:00
parent d53460c1c4
commit 28622f5e58
3 changed files with 13 additions and 7 deletions

View File

@@ -30,14 +30,14 @@ def detect(aBuf):
# Added by Kovid
def xml_to_unicode(raw, verbose=False):
'''
Force conversion of byte string to unicode. Tries to llok for XML/HTML
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
prints a warning if detection confidence is < 100%
@return: (unicode, encoding used)
'''
if not raw:
return u'', None
encoding = None
if not raw:
return u'', encoding
if isinstance(raw, unicode):
return raw, encoding
match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)

View File

@@ -313,20 +313,26 @@ class DefaultProfile(object):
soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'):
try:
atitle = item.find('title')
if not atitle:
continue
atitle = self.tag_to_string(atitle)
if self.use_pubdate:
pubdate = item.find('pubdate')
if not pubdate:
pubdate = item.find('dc:date')
if not pubdate or not pubdate.string:
self.logger.debug('Skipping article as it does not have publication date')
self.logger.debug('Skipping article %s as it does not have publication date'%atitle)
continue
pubdate = self.tag_to_string(pubdate)
pubdate = pubdate.replace('+0000', 'GMT')
url = self.get_article_url(item)
url = self.tag_to_string(url)
if require_url and not url:
self.logger.debug('Skipping article as it does not have a link url')
self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
continue
purl = url
try:
@@ -344,7 +350,7 @@ class DefaultProfile(object):
content = ''
d = {
'title' : self.tag_to_string(item.find('title')),
'title' : atitle,
'url' : purl,
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
'date' : pubdate if self.use_pubdate else time.ctime(),

View File

@@ -270,7 +270,7 @@ class RecursiveFetcher(object):
if self.encoding is not None:
dsrc = dsrc.decode(self.encoding, 'ignore')
else:
dsrc = xml_to_unicode(dsrc)
dsrc = xml_to_unicode(dsrc, self.verbose)[0]
soup = self.get_soup(dsrc)
self.logger.debug('Processing images...')