From ef0af86b19f4477602ebefee16fc195fa27f7286 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 11:08:16 -0600
Subject: [PATCH 1/8] Update Newsweek recipe for new site

---
 resources/recipes/newsweek.recipe | 229 ++++++++----------------------
 1 file changed, 58 insertions(+), 171 deletions(-)

diff --git a/resources/recipes/newsweek.recipe b/resources/recipes/newsweek.recipe
index 7a53c23e45..73837c1872 100644
--- a/resources/recipes/newsweek.recipe
+++ b/resources/recipes/newsweek.recipe
@@ -1,189 +1,76 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal '
-
-import re
-from calibre import strftime
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+import string
 from calibre.web.feeds.news import BasicNewsRecipe

 class Newsweek(BasicNewsRecipe):

-    title = 'Newsweek'
-    __author__ = 'Kovid Goyal and Sujata Raman'
+    __author__ = 'Kovid Goyal'
     description = 'Weekly news and current affairs in the US'
+    language = 'en'
+    encoding = 'utf-8'
     no_stylesheets = True
-    extra_css = '''
-            h1{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#383733;}
-            .deck{font-family:Georgia,sans-serif; color:#383733;}
-            .bylineDate{font-family:georgia ; color:#58544A; font-size:x-small;}
-            .authorInfo{font-family:arial,helvetica,sans-serif; color:#0066CC; font-size:x-small;}
-            .articleUpdated{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
-            .issueDate{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small; font-style:italic;}
-            h5{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
-            h6{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
-            .story{font-family:georgia,sans-serif ;color:black;}
-            .photoCredit{color:#999999; font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
-            .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
-            .fwArticle{font-family:Arial,Helvetica,sans-serif;font-size:x-small;font-weight:bold;}
-            '''
+    BASE_URL = 'http://www.newsweek.com'
+    INDEX = BASE_URL+'/topics.html'
-    encoding = 'utf-8'
-    language = 'en'
+    keep_only_tags = dict(name='article', attrs={'class':'article-text'})
+    remove_tags = [dict(attrs={'data-dartad':True})]
+    remove_attributes = ['property']
-    remove_tags = [
-            {'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
-                'inline-social-links-wrapper', 'email-article','ToolBox',
-                'inline-promo-link', 'sponsorship',
-                'inlineComponentRight',
-                'comments-and-social-links-wrapper', 'EmailArticleBlock']},
-            {'id' : ['footer', 'ticker-data', 'topTenVertical',
-                'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing',
-                'ToolBox', 'EmailMain']},
-            {'class': re.compile('related-cloud')},
-            dict(name='li', attrs={'id':['slug_bigbox']})
-            ]
+    def postprocess_html(self, soup, first):
+        for tag in soup.findAll(name=['article', 'header']):
+            tag.name = 'div'
+        return soup
+
+    def newsweek_sections(self):
+        soup = self.index_to_soup(self.INDEX)
+        for a in soup.findAll('a', title='Primary tag', href=True):
+            yield (string.capitalize(self.tag_to_string(a)),
+                    self.BASE_URL+a['href'])
-    keep_only_tags = [{'class':['article HorizontalHeader',
-        'articlecontent','photoBox', 'article columnist first']}, ]
-    recursions = 1
-    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
-
-    def find_title(self, section):
-        d = {'scope':'Scope', 'thetake':'The Take',
-                'features':'Features',
-                None:'Departments', 'culture':'Culture'}
-        ans = None
-        a = section.find('a', attrs={'name':True})
-        if a is not None:
-            ans = a['name']
-        return d.get(ans, ans)
-
-
-    def find_articles(self, section):
-        ans = []
-        for x in section.findAll('h5'):
-            title = ' '.join(x.findAll(text=True)).strip()
-            a = x.find('a')
-            if not a: continue
-            href = a['href']
-            ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
-        if not ans:
-            for x in section.findAll('div', attrs={'class':'hdlItem'}):
-                a = x.find('a', href=True)
-                if not a : continue
-                title = ' '.join(a.findAll(text=True)).strip()
-                href = a['href']
-                if 'http://xtra.newsweek.com' in href: continue
-                ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
-
-        #for x in ans:
-        #    x['url'] += '/output/print'
-        return ans
+    def newsweek_parse_section_page(self, soup):
+        for article in soup.findAll('article', about=True,
+                attrs={'class':'stream-item'}):
+            title = article.find(attrs={'property': 'dc:title'})
+            if title is None: continue
+            title = self.tag_to_string(title)
+            url = self.BASE_URL + article['about']
+            desc = ''
+            author = article.find(attrs={'property':'dc:creator'})
+            if author:
+                desc = u'by %s. '%self.tag_to_string(author)
+            p = article.find(attrs={'property':'dc:abstract'})
+            if p is not None:
+                for a in p.findAll('a'): a.extract()
+                desc += self.tag_to_string(p)
+            t = article.find('time', attrs={'property':'dc:created'})
+            date = ''
+            if t is not None:
+                date = u' [%s]'%self.tag_to_string(t)
+            self.log('\tFound article:', title, 'at', url)
+            self.log('\t\t', desc)
+            yield {'title':title, 'url':url, 'description':desc, 'date':date}

     def parse_index(self):
-        soup = self.get_current_issue()
-        if not soup:
-            raise RuntimeError('Unable to connect to newsweek.com. Try again later.')
-        sections = soup.findAll('div', attrs={'class':'featurewell'})
-        titles = map(self.find_title, sections)
-        articles = map(self.find_articles, sections)
-        ans = list(zip(titles, articles))
-        def fcmp(x, y):
-            tx, ty = x[0], y[0]
-            if tx == "Features": return cmp(1, 2)
-            if ty == "Features": return cmp(2, 1)
-            return cmp(tx, ty)
-        return sorted(ans, cmp=fcmp)
-
-    def ensure_html(self, soup):
-        root = soup.find(name=True)
-        if root.name == 'html': return soup
-        nsoup = BeautifulSoup('<html><head></head><body></body></html>')
-        nroot = nsoup.find(name='body')
-        for x in soup.contents:
-            if getattr(x, 'name', False):
-                x.extract()
-                nroot.insert(len(nroot), x)
-        return nsoup
-
-    def postprocess_html(self, soup, first_fetch):
-        if not first_fetch:
-            h1 = soup.find(id='headline')
-            if h1:
-                h1.extract()
-            div = soup.find(attrs={'class':'articleInfo'})
-            if div:
-                div.extract()
-        divs = list(soup.findAll('div', 'pagination'))
-        if not divs:
-            return self.ensure_html(soup)
-        for div in divs[1:]: div.extract()
-        all_a = divs[0].findAll('a', href=True)
-        divs[0]['style']="display:none"
-        if len(all_a) > 1:
-            all_a[-1].extract()
-        test = re.compile(self.match_regexps[0])
-        for a in soup.findAll('a', href=test):
-            if a not in all_a:
-                del a['href']
-        return self.ensure_html(soup)
-
-    def get_current_issue(self):
-        soup = self.index_to_soup('http://www.newsweek.com')
-        div = soup.find('div', attrs={'class':re.compile('more-from-mag')})
-        if div is None: return None
-        a = div.find('a')
-        if a is not None:
-            href = a['href'].split('#')[0]
-            return self.index_to_soup(href)
-
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup('http://www.newsweek.com')
-        link_item = soup.find('div',attrs={'class':'cover-image'})
-        if link_item and link_item.a and link_item.a.img:
-            cover_url = link_item.a.img['src']
-        return cover_url
+        sections = []
+        for section, shref in self.newsweek_sections():
+            self.log('Processing section', section, shref)
+            articles = []
+            soups = [self.index_to_soup(shref)]
+            na = soups[0].find('a', rel='next')
+            if na:
+                soups.append(self.index_to_soup(self.BASE_URL+na['href']))
+            for soup in soups:
+                articles.extend(self.newsweek_parse_section_page(soup))
+                if self.test and len(articles) > 1:
+                    break
+            if articles:
+                sections.append((section, articles))
+            if self.test and len(sections) > 1:
+                break
+        return sections

-    def postprocess_book(self, oeb, opts, log) :
-
-        def extractByline(href) :
-            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
-            byline = soup.find(True,attrs={'class':'authorInfo'})
-            byline = self.tag_to_string(byline) if byline is not None else ''
-            issueDate = soup.find(True,attrs={'class':'issueDate'})
-            issueDate = self.tag_to_string(issueDate) if issueDate is not None else ''
-            issueDate = re.sub(',','', issueDate)
-            if byline > '' and issueDate > '' :
-                return byline + ' | ' + issueDate
-            else :
-                return byline + issueDate
-
-        def extractDescription(href) :
-            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
-            description = soup.find(True,attrs={'name':'description'})
-            if description is not None and description.has_key('content'):
-                description = description['content']
-                if description.startswith('Newsweek magazine online plus') :
-                    description = soup.find(True, attrs={'class':'story'})
-                    firstPara = soup.find('p')
-                    description = self.tag_to_string(firstPara)
-            else :
-                description = soup.find(True, attrs={'class':'story'})
-                firstPara = soup.find('p')
-                description = self.tag_to_string(firstPara)
-            return description
-
-        for section in oeb.toc :
-            for article in section :
-                if article.author is None :
-                    article.author = extractByline(article.href)
-                if article.description is None :
-                    article.description = extractDescription(article.href)
-        return

From 721b48038e75a6992b849379b5f685458caa45b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 16:07:40 -0600
Subject: [PATCH 2/8] Fix minor multiple location ondevice bug

---
 src/calibre/gui2/device.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index d3c2e4f10f..181d0c784b 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -1123,12 +1123,12 @@ class DeviceGUI(object):
             if cache:
                 if id in cache['db_ids']:
                     loc[i] = True
-                    break
+                    continue
                 if mi.authors and \
                         re.sub('(?u)\W|[_]', '', authors_to_string(mi.authors).lower()) \
                         in cache['authors']:
                     loc[i] = True
-                    break
+                    continue
         return loc

     def set_books_in_library(self, booklists, reset=False):

From a529cb0303f22329214012e280d1ff026a8942a7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 18:21:39 -0600
Subject: [PATCH 3/8] Fix #5662 (&#60; hexa entity problem)

---
 src/calibre/ebooks/conversion/preprocess.py | 10 ++++++++--
 src/calibre/ebooks/oeb/base.py              |  1 -
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 40c67453b2..7a7f362169 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -14,8 +14,14 @@ XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS   = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'

-convert_entities = functools.partial(entity_to_unicode, exceptions=['quot',
-    'apos', 'lt', 'gt', 'amp', '#60', '#62'])
+convert_entities = functools.partial(entity_to_unicode,
+        result_exceptions = {
+            u'<' : '&lt;',
+            u'>' : '&gt;',
+            u"'" : '&apos;',
+            u'"' : '&quot;',
+            u'&' : '&amp;',
+        })
 _span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

 LIGATURES = {
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 79f9f15248..76e2cef3bb 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -787,7 +787,6 @@ class Manifest(object):

         data = self.oeb.decode(data)
         data = self.oeb.html_preprocessor(data)
-
         # Remove DOCTYPE declaration as it messes up parsing
         # In particular, it causes tostring to insert xmlns
         # declarations, which messes up the coercing logic

From 900ff7204b12eb15d65df68b43a427ab38962d95 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 18:25:17 -0600
Subject: [PATCH 4/8] Fix #5654 (No Default Cover causes conversion error)

---
 src/calibre/ebooks/oeb/transforms/cover.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/ebooks/oeb/transforms/cover.py b/src/calibre/ebooks/oeb/transforms/cover.py
index ecdc1294ad..4d41ab14b4 100644
--- a/src/calibre/ebooks/oeb/transforms/cover.py
+++ b/src/calibre/ebooks/oeb/transforms/cover.py
@@ -136,6 +136,8 @@ class CoverManager(object):
             href = g['cover'].href
         else:
             href = self.default_cover()
+        if href is None:
+            return
         width, height = self.inspect_cover(href)
         if width is None or height is None:
             self.log.warning('Failed to read cover dimensions')

From 7213c1e4b61cb13ca40d01040461a08915be7573 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 18:40:12 -0600
Subject: [PATCH 5/8] Regex builder: Convert entities so people don't use them in building their regexes.
 Fixes #5549 (Not removing header/footer)

---
 src/calibre/gui2/convert/regex_builder.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py
index 58e1d1ae45..6fa0fa5fe4 100644
--- a/src/calibre/gui2/convert/regex_builder.py
+++ b/src/calibre/gui2/convert/regex_builder.py
@@ -14,6 +14,7 @@ from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder
 from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit
 from calibre.gui2 import error_dialog, choose_files
 from calibre.ebooks.oeb.iterator import EbookIterator
+from calibre.ebooks.conversion.preprocess import convert_entities
 from calibre.gui2.dialogs.choose_format import ChooseFormatDialog

 class RegexBuilder(QDialog, Ui_RegexBuilder):
@@ -87,8 +88,10 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
         self.iterator = EbookIterator(pathtoebook)
         self.iterator.__enter__(only_input_plugin=True)
         text = [u'']
+        ent_pat = re.compile(r'&(\S+?);')
         for path in self.iterator.spine:
             html = open(path, 'rb').read().decode('utf-8', 'replace')
+            html = ent_pat.sub(convert_entities, html)
             text.append(html)
         self.preview.setPlainText('\n---\n'.join(text))

From 7a737aa3a1d2829ac28df6cb4825f000cb9b8433 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 19:01:25 -0600
Subject: [PATCH 6/8] Fix warnings when compiling user manual

---
 src/calibre/customize/__init__.py |  6 +-----
 src/calibre/manual/conversion.rst |  6 +++---
 src/calibre/manual/faq.rst        |  2 +-
 src/calibre/web/feeds/news.py     |  6 +++---
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/calibre/customize/__init__.py b/src/calibre/customize/__init__.py
index 4eaaf3b90a..9a018231ef 100644
--- a/src/calibre/customize/__init__.py
+++ b/src/calibre/customize/__init__.py
@@ -29,7 +29,7 @@ class Plugin(object):
     '''

     #: List of platforms this plugin works on
-    #: For example: ``['windows', 'osx', 'linux']
+    #: For example: ``['windows', 'osx', 'linux']``
     supported_platforms = []

     #: The name of this plugin. You must set it something other
@@ -214,10 +214,8 @@ class MetadataReaderPlugin(Plugin):
         Return metadata for the file represented by stream (a file like
         object that supports reading). Raise an exception when there is an
         error with the input data.
-
         :param type: The type of file. Guaranteed to be one of the entries
                      in :attr:`file_types`.
-
         :return: A :class:`calibre.ebooks.metadata.MetaInformation` object
         '''
         return None
@@ -245,11 +243,9 @@ class MetadataWriterPlugin(Plugin):
         Set metadata for the file represented by stream (a file like
         object that supports reading). Raise an exception when there is an
         error with the input data.
-
         :param type: The type of file. Guaranteed to be one of the entries
                      in :attr:`file_types`.
         :param mi: A :class:`calibre.ebooks.metadata.MetaInformation` object
-
         '''
         pass

diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index ee148c79c7..cd8abd0493 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -453,7 +453,7 @@ as HTML and then convert the resulting HTML file with |app|. When saving as HTML
 There is a Word macro package that can automate the conversion of Word documents using |app|. It also makes
 generating the Table of Contents much simpler. It is called BookCreator and is available for free
-`here `_.
+at `mobileread `_.
 Convert TXT documents
 ~~~~~~~~~~~~~~~~~~~~~~

@@ -493,7 +493,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec
     allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
     lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath
     detection expression to "//h:h1" is the easiest way to have a proper table of contents generated from
     a TXT document.
-    You can learn more about the markdown syntax `here `_.
+    You can learn more about the markdown syntax at `daringfireball `_.

 Convert PDF documents
@@ -540,7 +540,7 @@ EPUB advanced formatting demo
 Various advanced formatting for EPUB files is demonstrated in this `demo file `_.
 The file was created from hand coded HTML using calibre and is meant to be used as a template for your own
 EPUB creation efforts.
-The source HTML it was created from is available `here `_. The settings used to create the
+The source HTML it was created from is available `demo.zip `_. The settings used to create the
 EPUB from the ZIP file are::

     ebook-convert demo.zip .epub -vv --authors "Kovid Goyal" --language en --level1-toc '//*[@class="title"]' --disable-font-rescaling --page-breaks-before / --no-default-epub-cover

diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index f7329fb54d..e606505194 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -133,7 +133,7 @@ Can I use the collections feature of the SONY reader?
 turned into a collection on the reader. Note that the PRS-500 does not support
 collections for books stored on the SD card. The PRS-505 does.

 How do I use |app| with my iPad/iPhone/iTouch?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 You can access your calibre library on a iPad/iPhone/iTouch over the air using the calibre content server.

diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 26b3ad0593..9faabb2615 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -146,7 +146,7 @@ class BasicNewsRecipe(Recipe):
     #: If True empty feeds are removed from the output.
     #: This option has no effect if parse_index is overriden in
     #: the sub class. It is meant only for recipes that return a list
-    #: of feeds using :member:`feeds` or :method:`get_feeds`.
+    #: of feeds using `feeds` or :method:`get_feeds`.
     remove_empty_feeds = False

     #: List of regular expressions that determines which links to follow
@@ -256,7 +256,7 @@ class BasicNewsRecipe(Recipe):
     #: The CSS that is used to styles the templates, i.e., the navigation bars and
     #: the Tables of Contents. Rather than overriding this variable, you should
-    #: use :member:`extra_css` in your recipe to customize look and feel.
+    #: use `extra_css` in your recipe to customize look and feel.
     template_css = u'''
             .article_date {
                 color: gray; font-family: monospace;
@@ -506,7 +506,7 @@ class BasicNewsRecipe(Recipe):
     def get_obfuscated_article(self, url):
         '''
-        If you set :member:`articles_are_obfuscated` this method is called with
+        If you set `articles_are_obfuscated` this method is called with
         every article URL. It should return the path to a file on the
         filesystem that contains the article HTML. That file is processed
         by the recursive HTML fetching engine, so it can contain links to
         pages/images on the web.
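Patches 3 and 5 share one mechanism: a regex finds anything shaped like an entity, entity_to_unicode() decodes it, and the result_exceptions table maps any decoded character that is XML-reserved back to a named entity, so the numeric spellings &#60; and &#x3c; normalize to &lt; exactly like the named form. The sketch below is a minimal, self-contained illustration of that idea in modern Python; it is not calibre's entity_to_unicode, and convert_entity is a hypothetical name used only here:

    import re
    from html.entities import name2codepoint

    # Decoded characters that must stay escaped in XML/HTML source.
    RESULT_EXCEPTIONS = {'<': '&lt;', '>': '&gt;', "'": '&apos;',
                         '"': '&quot;', '&': '&amp;'}

    def convert_entity(match):
        ent = match.group(1)
        if ent.startswith('#'):               # numeric: decimal or hex
            num = ent[1:]
            code = int(num[1:], 16) if num[:1] in 'xX' else int(num)
            result = chr(code)
        else:                                 # named entity
            cp = name2codepoint.get(ent)
            if cp is None:
                return match.group(0)         # unknown: leave untouched
            result = chr(cp)
        # Map XML-reserved results back to a named entity, whatever the
        # original spelling (&#60;, &#x3C; and &lt; all end up as &lt;).
        return RESULT_EXCEPTIONS.get(result, result)

    ent_pat = re.compile(r'&(\S+?);')         # same pattern as patch 5
    print(ent_pat.sub(convert_entity, 'a &#60; b &amp; c &#x3E; d &eacute;'))
    # -> a &lt; b &amp; c &gt; d é

Running the regex-builder preview through such a pass (patch 5) means users write their header/footer regexes against the same normalized text the conversion pipeline actually sees.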
From c2f655ad7188582a2709f035e7e46cb7ff82ad4b Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 19:24:01 -0600
Subject: [PATCH 7/8] When listing series, sort ignoring leading English
 prepositions. Fixes #5090 (Series Sort)

---
 src/calibre/library/caches.py         | 5 +++--
 src/calibre/library/database2.py      | 3 +++
 src/calibre/library/server/content.py | 4 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 93891ee92b..83c56c5395 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -17,7 +17,7 @@ from calibre.utils.config import tweaks
 from calibre.utils.date import parse_date, now, UNDEFINED_DATE
 from calibre.utils.search_query_parser import SearchQueryParser
 from calibre.utils.pyparsing import ParseException
-# from calibre.library.field_metadata import FieldMetadata
+from calibre.ebooks.metadata import title_sort

 class CoverCache(QThread):
@@ -564,7 +564,8 @@ class ResultCache(SearchQueryParser):
     def seriescmp(self, x, y):
         sidx = self.FIELD_MAP['series']
         try:
-            ans = cmp(self._data[x][sidx].lower(), self._data[y][sidx].lower())
+            ans = cmp(title_sort(self._data[x][sidx].lower()),
+                      title_sort(self._data[y][sidx].lower()))
         except AttributeError: # Some entries may be None
             ans = cmp(self._data[x][sidx], self._data[y][sidx])
         if ans != 0: return ans
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 4107d327ce..f27a42beee 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -725,6 +725,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
             categories[category] = [Tag(formatter(r[1]), count=r[2], id=r[0],
                                         icon=icon, tooltip = tooltip)
                                     for r in data if item_not_zero_func(r)]
+            if category == 'series':
+                categories[category].sort(cmp=lambda x,y:cmp(title_sort(x.name),
+                                                             title_sort(y.name)))

         # We delayed computing the standard formats category because it does not
         # use a view, but is computed dynamically
diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index 8638035c88..12bd786322 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -16,7 +16,7 @@ except ImportError:

 from calibre import fit_image, guess_type
 from calibre.utils.date import fromtimestamp
-
+from calibre.ebooks.metadata import title_sort

 class ContentServer(object):
@@ -67,7 +67,7 @@ class ContentServer(object):
     def seriescmp(self, x, y):
         si = self.db.FIELD_MAP['series']
         try:
-            ans = cmp(x[si].lower(), y[si].lower())
+            ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
         except AttributeError: # Some entries may be None
             ans = cmp(x[si], y[si])
         if ans != 0: return ans

From f4bbf10ee348fb7f998c20301073c7fadf0dac99 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Jun 2010 19:49:53 -0600
Subject: [PATCH 8/8] LRF Input: Handle ampersands and other XML reserved
 characters correctly when converting LRF documents.
 Fixes #4923 (Ampersands in input text get lost in output)

---
 src/calibre/ebooks/lrf/objects.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/lrf/objects.py b/src/calibre/ebooks/lrf/objects.py
index 0045e679a3..8f69e94013 100644
--- a/src/calibre/ebooks/lrf/objects.py
+++ b/src/calibre/ebooks/lrf/objects.py
@@ -3,7 +3,7 @@ __copyright__ = '2008, Kovid Goyal '
 import struct, array, zlib, cStringIO, collections, re

 from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE
-from calibre import entity_to_unicode
+from calibre import entity_to_unicode, prepare_string_for_xml
 from calibre.ebooks.lrf.tags import Tag

 ruby_tags = {
@@ -870,7 +870,7 @@ class Text(LRFStream):
         open_containers = collections.deque()
         for c in self.content:
             if isinstance(c, basestring):
-                s += c
+                s += prepare_string_for_xml(c)
             elif c is None:
                 if open_containers:
                     p = open_containers.pop()
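Before this last fix, raw text runs from the LRF stream were concatenated straight into the generated XML, so a bare '&' or '<' produced ill-formed markup and the character was dropped or mangled by downstream parsers, which is exactly bug #4923. A rough standalone equivalent of the escaping that calibre's prepare_string_for_xml helper performs (illustrative only; escape_for_xml is a made-up name, not the calibre function):

    def escape_for_xml(raw):
        # Escape '&' first, otherwise the entities produced by the other
        # two substitutions would themselves be double-escaped.
        return raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    print(escape_for_xml('Dombey & Son <abridged>'))
    # -> Dombey &amp; Son &lt;abridged&gt;

Like patch 7's reuse of title_sort(), the fix reuses an existing helper rather than adding new escaping logic to the LRF code path.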