Improve encoding detection.

Kovid Goyal 2008-02-29 17:11:29 +00:00
parent d53460c1c4
commit 28622f5e58
3 changed files with 13 additions and 7 deletions


@@ -30,14 +30,14 @@ def detect(aBuf):
 # Added by Kovid
 def xml_to_unicode(raw, verbose=False):
     '''
-    Force conversion of byte string to unicode. Tries to llok for XML/HTML
+    Force conversion of byte string to unicode. Tries to look for XML/HTML
     encoding declaration first, if not found uses the chardet library and
     prints a warning if detection confidence is < 100%
     @return: (unicode, encoding used)
     '''
-    if not raw:
-        return u'', None
     encoding = None
+    if not raw:
+        return u'', encoding
     if isinstance(raw, unicode):
         return raw, encoding
     match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)
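
The reordering above makes the empty-input early return reuse the same encoding = None value as every other path, so callers always get a (text, encoding) pair. A minimal sketch of the detection flow described by the docstring, assuming Python 2 and a chardet-style detect() as provided by the chardet package (the real calibre function may differ in details such as the decode error mode):

# Sketch only, not the actual calibre source.
import re
from chardet import detect   # the module patched above defines its own detect(aBuf)

def xml_to_unicode_sketch(raw, verbose=False):
    '''Return (unicode text, encoding used); encoding stays None when the
    input is empty or already unicode.'''
    encoding = None
    if not raw:
        return u'', encoding
    if isinstance(raw, unicode):
        return raw, encoding
    # Prefer an explicit declaration such as <?xml version="1.0" encoding="utf-8"?>
    match = re.match(r'^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', raw, re.IGNORECASE)
    if match is not None:
        encoding = match.group(1)
    else:
        # Fall back to statistical detection and warn when it is not certain.
        result = detect(raw)
        encoding = result['encoding'] or 'utf-8'   # assumed fallback
        if verbose and result['confidence'] < 1.0:
            print 'WARNING: detected encoding %s with %d%% confidence' % (
                encoding, int(result['confidence'] * 100))
    return raw.decode(encoding, 'replace'), encoding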


@@ -313,20 +313,26 @@ class DefaultProfile(object):
         soup = BeautifulStoneSoup(src)
         for item in soup.findAll('item'):
             try:
+                atitle = item.find('title')
+                if not atitle:
+                    continue
+                atitle = self.tag_to_string(atitle)
                 if self.use_pubdate:
                     pubdate = item.find('pubdate')
                     if not pubdate:
                         pubdate = item.find('dc:date')
                     if not pubdate or not pubdate.string:
-                        self.logger.debug('Skipping article as it does not have publication date')
+                        self.logger.debug('Skipping article %s as it does not have publication date'%atitle)
                         continue
                     pubdate = self.tag_to_string(pubdate)
                     pubdate = pubdate.replace('+0000', 'GMT')
                 url = self.get_article_url(item)
                 url = self.tag_to_string(url)
                 if require_url and not url:
-                    self.logger.debug('Skipping article %s as it does not have a link url')
+                    self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
                     continue
                 purl = url
                 try:
@@ -344,7 +350,7 @@ class DefaultProfile(object):
                     content = ''
                 d = {
-                    'title'    : self.tag_to_string(item.find('title')),
+                    'title'    : atitle,
                     'url'      : purl,
                     'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
                     'date'     : pubdate if self.use_pubdate else time.ctime(),
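
The change above looks the article title up once, skips items that have no title at all, and then reuses that title both in the skip messages and in the article dictionary instead of calling item.find('title') again at the end. A minimal sketch of the resulting loop shape, assuming Python 2 with BeautifulSoup 3, and using simplified, hypothetical stand-ins for the profile's tag_to_string and get_article_url helpers:

# Sketch only: the real DefaultProfile also handles pubdate, content and
# per-item exceptions.
from BeautifulSoup import BeautifulStoneSoup   # BeautifulSoup 3.x

def tag_to_string(tag):
    # Stand-in for the profile helper: flatten a tag (or plain string) to text.
    return tag if isinstance(tag, basestring) else tag.string or u''

def parse_items(src, logger, require_url=True):
    articles = []
    soup = BeautifulStoneSoup(src)
    for item in soup.findAll('item'):
        # Resolve the title first so later log messages can name the article.
        atitle = item.find('title')
        if not atitle:
            continue
        atitle = tag_to_string(atitle)
        # Hypothetical URL lookup; the real code delegates to get_article_url().
        url = tag_to_string(item.find('guid') or item.find('link') or u'')
        if require_url and not url:
            logger.debug('Skipping article %s as it does not have a link url' % atitle)
            continue
        articles.append({'title': atitle, 'url': url})
    return articles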


@@ -270,7 +270,7 @@ class RecursiveFetcher(object):
             if self.encoding is not None:
                 dsrc = dsrc.decode(self.encoding, 'ignore')
             else:
-                dsrc = xml_to_unicode(dsrc)
+                dsrc = xml_to_unicode(dsrc, self.verbose)[0]
             soup = self.get_soup(dsrc)
             self.logger.debug('Processing images...')
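
This last fix matters because xml_to_unicode returns a (text, encoding) pair: the old call handed that whole tuple to get_soup, while the new one keeps only the decoded text and also forwards the fetcher's verbose flag. A minimal sketch of the decode step, with xml_to_unicode passed in explicitly since its import path is not shown in this diff:

# Sketch only: how fetched bytes become text before soup construction.
def decode_source(dsrc, xml_to_unicode, encoding=None, verbose=False):
    if encoding is not None:
        # An explicit encoding from the fetcher wins; bad bytes are dropped.
        return dsrc.decode(encoding, 'ignore')
    # xml_to_unicode returns (text, encoding); keep only the text.
    return xml_to_unicode(dsrc, verbose)[0]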