Fix some news downloads in East Asian languages failing because truncating article descriptions could leave a dangling UTF-16 half-surrogate in the string, which is invalid and is rejected by lxml

This commit is contained in:
Kovid Goyal 2016-09-26 23:30:39 +05:30
parent a3eee6a22a
commit d75e5323a9
3 changed files with 12 additions and 6 deletions

View File

@ -274,7 +274,10 @@ class TOC(list):
desc = getattr(np, 'description', None) desc = getattr(np, 'description', None)
if desc: if desc:
desc = re.sub(r'\s+', ' ', desc) desc = re.sub(r'\s+', ' ', desc)
elem.append(C.meta(desc, name='description')) try:
elem.append(C.meta(desc, name='description'))
except ValueError:
elem.append(C.meta(clean_xml_chars(desc), name='description'))
idx = getattr(np, 'toc_thumbnail', None) idx = getattr(np, 'toc_thumbnail', None)
if idx: if idx:
elem.append(C.meta(idx, name='toc_thumbnail')) elem.append(C.meta(idx, name='toc_thumbnail'))

View File

@ -1039,12 +1039,13 @@ class BasicNewsRecipe(Recipe):
def description_limiter(cls, src): def description_limiter(cls, src):
if not src: if not src:
return '' return ''
src = force_unicode(src, 'utf-8')
pos = cls.summary_length pos = cls.summary_length
fuzz = 50 fuzz = 50
si = src.find(';', pos) si = src.find(u';', pos)
if si > 0 and si-pos > fuzz: if si > 0 and si-pos > fuzz:
si = -1 si = -1
gi = src.find('>', pos) gi = src.find(u'>', pos)
if gi > 0 and gi-pos > fuzz: if gi > 0 and gi-pos > fuzz:
gi = -1 gi = -1
npos = max(si, gi) npos = max(si, gi)
@ -1052,8 +1053,9 @@ class BasicNewsRecipe(Recipe):
npos = pos npos = pos
ans = src[:npos+1] ans = src[:npos+1]
if len(ans) < len(src): if len(ans) < len(src):
return (ans+u'\u2026') if isinstance(ans, unicode) else (ans + from calibre.utils.cleantext import clean_xml_chars
'...') # Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it
ans = clean_xml_chars(ans) + u'\u2026'
return ans return ans
def feed2index(self, f, feeds): def feed2index(self, f, feeds):

View File

@ -135,6 +135,7 @@ class FeedTemplate(Template):
return navbar return navbar
def _generate(self, f, feeds, cutoff, extra_css=None, style=None): def _generate(self, f, feeds, cutoff, extra_css=None, style=None):
from calibre.utils.cleantext import clean_xml_chars
feed = feeds[f] feed = feeds[f]
head = HEAD(TITLE(feed.title)) head = HEAD(TITLE(feed.title))
if style: if style:
@ -173,7 +174,7 @@ class FeedTemplate(Template):
style='padding-bottom:0.5em') style='padding-bottom:0.5em')
) )
if article.summary: if article.summary:
li.append(DIV(cutoff(article.text_summary), li.append(DIV(clean_xml_chars(cutoff(article.text_summary)),
CLASS('article_description', 'calibre_rescale_70'))) CLASS('article_description', 'calibre_rescale_70')))
ul.append(li) ul.append(li)
div.append(ul) div.append(ul)