mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Improve encoding detection.
This commit is contained in:
parent
d53460c1c4
commit
28622f5e58
@ -30,14 +30,14 @@ def detect(aBuf):
|
||||
# Added by Kovid
|
||||
def xml_to_unicode(raw, verbose=False):
|
||||
'''
|
||||
Force conversion of byte string to unicode. Tries to llok for XML/HTML
|
||||
Force conversion of byte string to unicode. Tries to look for XML/HTML
|
||||
encoding declaration first, if not found uses the chardet library and
|
||||
prints a warning if detection confidence is < 100%
|
||||
@return: (unicode, encoding used)
|
||||
'''
|
||||
if not raw:
|
||||
return u'', None
|
||||
encoding = None
|
||||
if not raw:
|
||||
return u'', encoding
|
||||
if isinstance(raw, unicode):
|
||||
return raw, encoding
|
||||
match = re.compile('^\s*<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE).match(raw)
|
||||
|
@ -313,20 +313,26 @@ class DefaultProfile(object):
|
||||
soup = BeautifulStoneSoup(src)
|
||||
for item in soup.findAll('item'):
|
||||
try:
|
||||
atitle = item.find('title')
|
||||
if not atitle:
|
||||
continue
|
||||
|
||||
atitle = self.tag_to_string(atitle)
|
||||
if self.use_pubdate:
|
||||
pubdate = item.find('pubdate')
|
||||
if not pubdate:
|
||||
pubdate = item.find('dc:date')
|
||||
if not pubdate or not pubdate.string:
|
||||
self.logger.debug('Skipping article as it does not have publication date')
|
||||
self.logger.debug('Skipping article %s as it does not have publication date'%atitle)
|
||||
continue
|
||||
pubdate = self.tag_to_string(pubdate)
|
||||
pubdate = pubdate.replace('+0000', 'GMT')
|
||||
|
||||
|
||||
url = self.get_article_url(item)
|
||||
url = self.tag_to_string(url)
|
||||
if require_url and not url:
|
||||
self.logger.debug('Skipping article as it does not have a link url')
|
||||
self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
|
||||
continue
|
||||
purl = url
|
||||
try:
|
||||
@ -344,7 +350,7 @@ class DefaultProfile(object):
|
||||
content = ''
|
||||
|
||||
d = {
|
||||
'title' : self.tag_to_string(item.find('title')),
|
||||
'title' : atitle,
|
||||
'url' : purl,
|
||||
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
||||
'date' : pubdate if self.use_pubdate else time.ctime(),
|
||||
|
@ -270,7 +270,7 @@ class RecursiveFetcher(object):
|
||||
if self.encoding is not None:
|
||||
dsrc = dsrc.decode(self.encoding, 'ignore')
|
||||
else:
|
||||
dsrc = xml_to_unicode(dsrc)
|
||||
dsrc = xml_to_unicode(dsrc, self.verbose)[0]
|
||||
|
||||
soup = self.get_soup(dsrc)
|
||||
self.logger.debug('Processing images...')
|
||||
|
Loading…
x
Reference in New Issue
Block a user