Improve encoding detection.

This commit is contained in:
Kovid Goyal 2008-02-29 17:11:29 +00:00
parent d53460c1c4
commit 28622f5e58
3 changed files with 13 additions and 7 deletions

View File

@@ -30,14 +30,14 @@ def detect(aBuf):
# Added by Kovid
def xml_to_unicode(raw, verbose=False):
'''
Force conversion of byte string to unicode. Tries to llok for XML/HTML
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
prints a warning if detection confidence is < 100%
@return: (unicode, encoding used)
'''
if not raw:
return u'', None
encoding = None
if not raw:
return u'', encoding
if isinstance(raw, unicode):
return raw, encoding
match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)

View File

@@ -313,20 +313,26 @@ class DefaultProfile(object):
soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'):
try:
atitle = item.find('title')
if not atitle:
continue
atitle = self.tag_to_string(atitle)
if self.use_pubdate:
pubdate = item.find('pubdate')
if not pubdate:
pubdate = item.find('dc:date')
if not pubdate or not pubdate.string:
self.logger.debug('Skipping article as it does not have publication date')
self.logger.debug('Skipping article %s as it does not have publication date'%atitle)
continue
pubdate = self.tag_to_string(pubdate)
pubdate = pubdate.replace('+0000', 'GMT')
url = self.get_article_url(item)
url = self.tag_to_string(url)
if require_url and not url:
self.logger.debug('Skipping article as it does not have a link url')
self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
continue
purl = url
try:
@@ -344,7 +350,7 @@ class DefaultProfile(object):
content = ''
d = {
'title' : self.tag_to_string(item.find('title')),
'title' : atitle,
'url' : purl,
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
'date' : pubdate if self.use_pubdate else time.ctime(),

View File

@@ -270,7 +270,7 @@ class RecursiveFetcher(object):
if self.encoding is not None:
dsrc = dsrc.decode(self.encoding, 'ignore')
else:
dsrc = xml_to_unicode(dsrc)
dsrc = xml_to_unicode(dsrc, self.verbose)[0]
soup = self.get_soup(dsrc)
self.logger.debug('Processing images...')