diff --git a/src/libprs500/ebooks/chardet/__init__.py b/src/libprs500/ebooks/chardet/__init__.py
index 83763bca1c..c54c945708 100644
--- a/src/libprs500/ebooks/chardet/__init__.py
+++ b/src/libprs500/ebooks/chardet/__init__.py
@@ -30,14 +30,14 @@ def detect(aBuf):
 # Added by Kovid
 def xml_to_unicode(raw, verbose=False):
     '''
-    Force conversion of byte string to unicode. Tries to llok for XML/HTML
+    Force conversion of byte string to unicode. Tries to look for XML/HTML
     encoding declaration first, if not found uses the chardet library and
     prints a warning if detection confidence is < 100%
     @return: (unicode, encoding used)
     '''
-    if not raw:
-        return u'', None
     encoding = None
+    if not raw:
+        return u'', encoding
     if isinstance(raw, unicode):
         return raw, encoding
     match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)
diff --git a/src/libprs500/ebooks/lrf/web/profiles/__init__.py b/src/libprs500/ebooks/lrf/web/profiles/__init__.py
index 768ce9e45b..74d486692c 100644
--- a/src/libprs500/ebooks/lrf/web/profiles/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/__init__.py
@@ -313,20 +313,26 @@ class DefaultProfile(object):
             soup = BeautifulStoneSoup(src)
             for item in soup.findAll('item'):
                 try:
+                    atitle = item.find('title')
+                    if not atitle:
+                        continue
+
+                    atitle = self.tag_to_string(atitle)
                     if self.use_pubdate:
                         pubdate = item.find('pubdate')
                         if not pubdate:
                             pubdate = item.find('dc:date')
                         if not pubdate or not pubdate.string:
-                            self.logger.debug('Skipping article as it does not have publication date')
+                            self.logger.debug('Skipping article %s as it does not have publication date'%atitle)
                            continue
                         pubdate = self.tag_to_string(pubdate)
                         pubdate = pubdate.replace('+0000', 'GMT')
+
                     url = self.get_article_url(item)
                     url = self.tag_to_string(url)
                     if require_url and not url:
-                        self.logger.debug('Skipping article as it does not have a link url')
+                        self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
                         continue
                     purl = url
                     try:
@@ -344,7 +350,7 @@ class DefaultProfile(object):
                         content = ''

                     d = {
-                        'title'    : self.tag_to_string(item.find('title')),
+                        'title'    : atitle,
                         'url'      : purl,
                         'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
                         'date'     : pubdate if self.use_pubdate else time.ctime(),
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index 50cb53038a..6fccaead74 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -270,7 +270,7 @@ class RecursiveFetcher(object):
             if self.encoding is not None:
                 dsrc = dsrc.decode(self.encoding, 'ignore')
             else:
-                dsrc = xml_to_unicode(dsrc)
+                dsrc = xml_to_unicode(dsrc, self.verbose)[0]
             soup = self.get_soup(dsrc)
             self.logger.debug('Processing images...')

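
Note (not part of the patch itself): per its @return docstring, xml_to_unicode returns a (unicode, encoding used) tuple, so callers that only want the decoded text must index the result; that is exactly why the simple.py hunk switches to [0]. A minimal usage sketch, assuming the module path from the diff and a made-up input string:

    from libprs500.ebooks.chardet import xml_to_unicode

    raw = '<?xml version="1.0" encoding="utf-8"?><rss></rss>'  # hypothetical byte string
    # Unpack both values when the detected encoding matters...
    text, used_encoding = xml_to_unicode(raw, verbose=True)
    # ...or keep only the decoded text, as RecursiveFetcher now does.
    text = xml_to_unicode(raw, True)[0]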