From 2f93152b13c7cf230a97829cb8cb18b1b8396520 Mon Sep 17 00:00:00 2001 From: mythtv Date: Sun, 28 Jun 2009 14:42:02 -0700 Subject: [PATCH 01/20] Include pubdate when importing --- src/calibre/library/database2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 730ca364d5..d37afa18e5 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1048,6 +1048,8 @@ class LibraryDatabase2(LibraryDatabase): self.set_isbn(id, mi.isbn, notify=False) if mi.series_index: self.set_series_index(id, mi.series_index, notify=False) + if mi.pubdate: + self.set_pubdate(id, mi.pubdate, notify=False) if getattr(mi, 'timestamp', None) is not None: self.set_timestamp(id, mi.timestamp, notify=False) self.set_path(id, True) From 3bb5d82da6985f9f98841efc2f86b4801d809947 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 6 Jul 2009 17:16:10 -0600 Subject: [PATCH 02/20] New Honduran recipes by Darko Miletic --- src/calibre/gui2/images/news/eltiempo_hn.png | Bin 0 -> 3315 bytes src/calibre/gui2/images/news/laprensa_hn.png | Bin 0 -> 306 bytes src/calibre/gui2/images/news/latribuna.png | Bin 0 -> 553 bytes src/calibre/web/feeds/recipes/__init__.py | 4 +- .../web/feeds/recipes/recipe_eltiempo_hn.py | 52 ++++++++++++++ .../web/feeds/recipes/recipe_laprensa_hn.py | 54 +++++++++++++++ .../web/feeds/recipes/recipe_latribuna.py | 65 ++++++++++++++++++ 7 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 src/calibre/gui2/images/news/eltiempo_hn.png create mode 100644 src/calibre/gui2/images/news/laprensa_hn.png create mode 100644 src/calibre/gui2/images/news/latribuna.png create mode 100644 src/calibre/web/feeds/recipes/recipe_eltiempo_hn.py create mode 100644 src/calibre/web/feeds/recipes/recipe_laprensa_hn.py create mode 100644 src/calibre/web/feeds/recipes/recipe_latribuna.py diff --git a/src/calibre/gui2/images/news/eltiempo_hn.png b/src/calibre/gui2/images/news/eltiempo_hn.png new file mode 100644 index 0000000000000000000000000000000000000000..56bba04b0a5b0b0b249cd667f29b2b717711b467 GIT binary patch literal 3315 zcmV004&%004{+008|`004nN004b?008NW002DY000@xb3BE2000U( zX+uL$P-t&-Z*ypGa3D!TLm+T+Z)Rz1WdHz3$DNjUR8-d%htIutdZEoQ0#b(FyTAa_ zdy`&8VVD_UC<6{NG_fI~0ue<-nj%P0#DLLIBvwSR5EN9f2P6n6F&ITuEN@2Ei>|D^ z_ww@lRz|vC zuzLs)$;-`!o*{AqUjza0dRV*yaMRE;fKCVhpQKsoe1Yhg01=zBIT!&C1$=TK@rP|Ibo3vKKm@PqnO#LJhq6%Ij6Hz*<$V$@wQAMN5qJ)hzm2h zoGcOF60t^#FqJFfH{#e-4l@G)6iI9sa9D{VHW4w29}?su;^hF~NC{tY+*d5%WDCTX za!E_i;d2ub1#}&jF5T4HnnCyEWTkKf0>c0%E1Ah>(_PY1)0w;+02c53Su*0<(nUqK zG_|(0G&D0Z{i;y^b@OjZ+}lNZ8Th$p5Uu}MTtq^NHl*T1?CO*}7&0ztZsv2j*bmJyf3G7=Z`5B*PvzoDiKdLpOAxi2$L0#SX*@cY z_n(^h55xYX#km%V()bZjV~l{*bt*u9?FT3d5g^g~#a;iSZ@&02Abxq_DwB(I|L-^b zXThc7C4-yrInE_0gw7K3GZ**7&k~>k0Z0NWkO#^@9q0fwx1%qj zZ=)yBuQ3=54Wo^*!gyjLF-e%Um=erBOdIALW)L%unZshS@>qSW9o8Sq#0s#5*edK% z>{;v(b^`kbN5rY%%y90wC>#%$kE_5P!JWYk;U;klcqzOl-UjcFXXA75rT9jCH~u<) z0>40zCTJ7v2qAyk54cquI@7b&LHdZ`+zlTss6bJ7%PQ)z$cROu4wBhpu-r)01) zS~6}jY?%U?gEALn#wiFzo#H}aQ8rT=DHkadR18&{>P1bW7E`~Y4p3)hWn`DhhRJ5j z*2tcg9i<^OEt(fCg;q*CP8+7ZTcWhYX$fb^_9d-LhL+6BEtPYWVlfKTBusSTASKKb%HuWJzl+By+?gkLq)?+BTu761 zjmyXF)a;mc^>(B7bo*HQ1NNg1st!zt28YLv>W*y3CdWx9U8f|cqfXDAO`Q48?auQq zHZJR2&bcD49Ip>EY~kKEPV6Wm+eXFV)D)_R=tM0@&p?(!V*Qu1PXHG9o^ zTY0bZ?)4%01p8F`JoeS|<@=<@RE7GY07EYX@lwd>4oW|Yi!o+Su@M`;WuSK z8LKk71XR(_RKHM1xJ5XYX`fk>`6eqY>qNG6HZQwBM=xi4&Sb88?zd}EYguc1@>KIS z<&CX#T35dwS|7K*XM_5Nf(;WJJvJWRMA($P>8E^?{IdL4o5MGE7bq2MEEwP7v8AO@ zqL5!WvekBL-8R%V?zVyL=G&{be=K4bT`e{#t|)$A!YaA?jp;X)-+bB;zhj`(vULAW z%ue3U;av{94wp%n<(7@__S@Z2PA@Mif3+uO&y|X06?J#oSi8M;ejj_^(0<4Lt#wLu#dYrva1Y$6_o(k^&}yhSh&h;f@JVA>W8b%o zZ=0JGnu?n~9O4}sJsfnnx7n(>`H13?(iXTy*fM=I`sj`CT)*pTHEgYKqqP+u1IL8N zo_-(u{qS+0<2@%BCt82d{Gqm;(q7a7b>wu+b|!X?c13m#p7cK1({0<`{-e>4hfb-U zsyQuty7Ua;Ou?B?XLHZaol8GAb3Wnxcu!2v{R_`T4=x`(GvqLI{-*2AOSimk zUAw*F_TX^n@STz9kDQ z$NC=!KfXWC8h`dn#xL(D3Z9UkR7|Q&Hcy#Notk!^zVUSB(}`#4&lYA1f0h2V_PNgU zAAWQEt$#LRcH#y9#i!p(Udq2b^lI6wp1FXzN3T;~FU%Lck$-deE#qz9yYP3D3t8{6 z?<+s(e(3(_^YOu_)K8!O1p}D#{JO;G(*OVhIY~r8R5(wSlFM%raS+GXyZ?nJTM}#K zV4^)}JsAyJAJO#EgEdV(*qE567Z0|Y#>&GIs`4yVLa`?0Wo;2S*xCvf+8|Jt-QO;- z%cI}|3lvc)_(Kp9)Ps}BB=gJnGn3!U9O#$=$bT6G+MFu{*Od5#9YC2S0BNdEOyffuAX!4+yGXB(B*O`sN<9;sjOML}rUt;_{{|+P=}(?WEY3t@Uy8+2 zxx8PkR#|pWBGju@_Awn+ught6SV8j@C}~+OW3zdgXmR%G(&#)6rCofyOMoNYLgJW+ zP;X|TqEe~pV60SX9kQ9+J=xlArEA0x9x_CG?=15sHjwM$&!+}ny|&@Mq%DW($~JXw z4)9)pGn0-ZS2c%v#N1&MGAQ#M>{UEcIqN_z?pMp@8v529J2}45J#6mmo9nnRd-$yU z=)eMUNlo3iCIM4|>e9C|lP>HPf3UeyHdCadnKA9oEe+cL62Gho_1v{{ZoX-Zb&V*% z*-$j2Tlm0az9y2nSR#i8ac3@N=bp{xRlN^7r z7{HS#mJEWVdpiU>$fcI^60{wFIafsVE5V4w;qdJSu!J9tyWAuM6jBRs=5Ii=#_?(5 xhK7{e!q#Mz-+M z;HOVQMeKx^#!0ZTZFn%*h2KDhq1)7L51RoH{P?MRKxmw literal 0 HcmV?d00001 diff --git a/src/calibre/gui2/images/news/latribuna.png b/src/calibre/gui2/images/news/latribuna.png new file mode 100644 index 0000000000000000000000000000000000000000..06c9ebeef151ebe18bc556d5eb62c1b7d196ec46 GIT binary patch literal 553 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!60wlNoGJgf6n3BBRT^Rni_n+Ah2>S z4={E+nQaFWEGuwK2hw1@3^B*n9%f)*4D)nx4AD5>8o1ZnsZhlB{qyS!p0U5;&a6mV zV4A@h=zMVk-&T_yU*%m)wtMV$R#|Jts>i46C>SEp$QkBh>*%S)b=IV~s>Al}wv3;G zCDW^SJ}-FpFy_JhM|JK(JKXCgAN;>T`sow9XI;nVCbKX_YD&CpKKaO_`I)AVBa^RK zT&vlloYKg&%Mphy-dMNW=eF?aNtf1` zIb1YJP5b$%?W>j6KaB~??94LaufAGvkTeb#!m8Um&DO4-{`xz7THE$ndYhm&86R)p*00y^eiEBhja#3nxNvduNkYF$}FtF4$ zFxNFO3o$geGBmI<1hK6Q4BlC6>4Ry=%}>cptHjh`YGrC}Wn=--@LN0ssGfnr)78&q Iol`;+0Q^$dUjP6A literal 0 HcmV?d00001 diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index 44fb9bd46e..51f0000605 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -51,9 +51,11 @@ recipe_modules = ['recipe_' + r for r in ( 'theeconomictimes_india', '7dias', 'buenosaireseconomico', 'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres', 'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate', - 'fastcompany', 'accountancyage', + 'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna', + 'eltiempo_hn', )] + import re, imp, inspect, time, os from calibre.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe, AutomaticNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup diff --git a/src/calibre/web/feeds/recipes/recipe_eltiempo_hn.py b/src/calibre/web/feeds/recipes/recipe_eltiempo_hn.py new file mode 100644 index 0000000000..e7fd23b797 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_eltiempo_hn.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +www.tiempo.hn +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag + +class ElTiempoHn(BasicNewsRecipe): + title = 'El Tiempo - Honduras' + __author__ = 'Darko Miletic' + description = 'Noticias de Honduras y mundo' + publisher = 'El Tiempo' + category = 'news, politics, Honduras' + oldest_article = 2 + max_articles_per_feed = 100 + use_embedded_content = False + no_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + language = _('Spanish') + lang = 'es-HN' + direction = 'ltr' + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + , '--ignore-tables' + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} img {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em}"' + + remove_tags = [dict(name=['form','object','embed','base'])] + + keep_only_tags = [dict(name='td' , attrs={'id':'mainbodycont'})] + + feeds = [(u'Noticias', u'http://www.tiempo.hn/index.php?format=feed&type=rss')] + + def preprocess_html(self, soup): + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + for item in soup.findAll(style=True): + del item['style'] + return self.adeify_images(soup) diff --git a/src/calibre/web/feeds/recipes/recipe_laprensa_hn.py b/src/calibre/web/feeds/recipes/recipe_laprensa_hn.py new file mode 100644 index 0000000000..b34f158400 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_laprensa_hn.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +www.laprensahn.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag + +class LaPrensaHn(BasicNewsRecipe): + title = 'La Prensa - Honduras' + __author__ = 'Darko Miletic' + description = 'Noticias de Honduras y mundo' + publisher = 'La Prensa' + category = 'news, politics, Honduras' + oldest_article = 2 + max_articles_per_feed = 100 + use_embedded_content = False + no_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + language = _('Spanish') + lang = 'es-HN' + direction = 'ltr' + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' + + remove_tags = [dict(name=['form','object','embed'])] + + keep_only_tags = [ + dict(name='h1' , attrs={'class':'titulo1'}) + ,dict(name='div', attrs={'class':['sumario11','hora','texto']}) + ] + + feeds = [(u'Noticias', u'http://feeds.feedburner.com/laprensa_titulares')] + + def preprocess_html(self, soup): + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/src/calibre/web/feeds/recipes/recipe_latribuna.py b/src/calibre/web/feeds/recipes/recipe_latribuna.py new file mode 100644 index 0000000000..d3a9a333cb --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_latribuna.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +www.latribuna.hn +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag + +class LaTribuna(BasicNewsRecipe): + title = 'La Tribuna - Honduras' + __author__ = 'Darko Miletic' + description = 'Noticias de Honduras y mundo' + publisher = 'La Tribuna' + category = 'news, politics, Honduras' + oldest_article = 2 + max_articles_per_feed = 100 + use_embedded_content = False + no_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + language = _('Spanish') + lang = 'es-HN' + direction = 'ltr' + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' + + remove_tags = [dict(name=['form','object','embed'])] + + keep_only_tags = [ + dict(name='p', attrs={'id':['BlogTitle','BlogDate']}) + ,dict(name='div', attrs={'id':'BlogContent'}) + ] + + feeds = [(u'Noticias', u'http://www.latribuna.hn/web2.0/?feed=rss')] + + def print_version(self, url): + return url + '&print=1' + + def preprocess_html(self, soup): + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + for item in soup.findAll(style=True): + del item['style'] + return soup + + def get_cover_url(self): + cover_url = None + soup = self.index_to_soup('http://www.latribuna.hn/web2.0/') + cover_item = soup.find('div',attrs={'class':'portada_impresa'}) + if cover_item: + cover_url = cover_item.a.img['src'] + return cover_url From 4158655b62dd20936b0d479a81781a33c74fd69a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 6 Jul 2009 18:28:54 -0600 Subject: [PATCH 03/20] ... --- src/calibre/ebooks/mobi/writer.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index b8b5c8f796..f7e1c17939 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1917,18 +1917,18 @@ class MobiWriter(object): self._ctoc_map.append(ctoc_name_map) def _generate_ctoc(self): - # Generate the compiled TOC strings - # Each node has 1-4 CTOC entries: - # Periodical (0xDF) - # title, class - # Section (0xFF) - # title, class - # Article (0x3F) - # title, class, description, author - # Chapter (0x0F) - # title, class - # nb: Chapters don't actually have @class, so we synthesize it - # in reader._toc_from_navpoint + # Generate the compiled TOC strings + # Each node has 1-4 CTOC entries: + # Periodical (0xDF) + # title, class + # Section (0xFF) + # title, class + # Article (0x3F) + # title, class, description, author + # Chapter (0x0F) + # title, class + # nb: Chapters don't actually have @class, so we synthesize it + # in reader._toc_from_navpoint toc = self._oeb.toc reduced_toc = [] From 01a3782676ed75215463cd355d37f2023f9dcb9b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 6 Jul 2009 19:39:00 -0600 Subject: [PATCH 04/20] MOBI Output: Have the href of the periodical class TOC node point to the top of the first spine element --- src/calibre/ebooks/mobi/output.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py index c8fe87a161..048185b170 100644 --- a/src/calibre/ebooks/mobi/output.py +++ b/src/calibre/ebooks/mobi/output.py @@ -76,6 +76,7 @@ class MOBIOutput(OutputFormatPlugin): from calibre.ebooks.oeb.base import TOC toc = self.oeb.toc if toc and toc[0].klass != 'periodical': + start_href = self.oeb.spine[0].href self.log('Converting TOC for MOBI periodical indexing...') articles = {} if toc.depth < 3: @@ -92,7 +93,7 @@ class MOBIOutput(OutputFormatPlugin): a.klass = 'article' articles[id(sec)].append(a) sec.nodes.remove(a) - root = TOC(klass='periodical', + root = TOC(klass='periodical', href=start_href, title=unicode(self.oeb.metadata.title[0])) for s in sections: if articles[id(s)]: From 5c5d37ee1ad0509cd5ce7b1a4195c1cc1c481a57 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 6 Jul 2009 19:55:15 -0600 Subject: [PATCH 05/20] Fix #2781 (Failed to generate index) --- src/calibre/ebooks/mobi/writer.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index f7e1c17939..77437a7224 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1953,6 +1953,8 @@ class MobiWriter(object): first = False else : self._oeb.logger.info('Generating flat CTOC ...') + previousOffset = -1 + currentOffset = 0 for (i, child) in enumerate(toc.iter()): # Only add chapters or articles at depth==1 # no class defaults to 'chapter' @@ -1961,8 +1963,20 @@ class MobiWriter(object): if self.opts.verbose > 2 : self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \ (child.klass, child.depth(), child) ) - self._add_flat_ctoc_node(child, ctoc) - reduced_toc.append(child) + + # Test to see if this child's offset is the same as the previous child's + # offset, skip it + h = child.href + currentOffset = self._id_offsets[h] + # print "_generate_ctoc: child offset: 0x%X" % currentOffset + + if currentOffset != previousOffset : + self._add_flat_ctoc_node(child, ctoc) + reduced_toc.append(child) + previousOffset = currentOffset + else : + self._oeb.logger.warn("ignoring redundant href: %s in '%s'" % (h, child.title)) + first = False else : if self.opts.verbose > 2 : From 50f6de082abce0c06b7432a28c42d7b438b40551 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 7 Jul 2009 09:30:48 -0600 Subject: [PATCH 06/20] Remove inline navars from downloaded recipes when converting to indexed MOBI. Also fix various typos. --- src/calibre/customize/profiles.py | 5 +++ src/calibre/ebooks/conversion/plumber.py | 2 ++ src/calibre/ebooks/mobi/output.py | 9 ++--- src/calibre/ebooks/mobi/writer.py | 44 ++++++++++-------------- src/calibre/ebooks/oeb/base.py | 4 ++- src/calibre/ebooks/oeb/stylizer.py | 3 +- src/calibre/web/feeds/news.py | 18 +++++----- 7 files changed, 45 insertions(+), 40 deletions(-) diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 45026fcb5c..af2694ccff 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -143,6 +143,9 @@ class OutputProfile(Plugin): # The image size for comics comic_screen_size = (584, 754) + # If True the MOBI renderer on the device supports MOBI indexing + supports_mobi_indexing = False + @classmethod def tags_to_string(cls, tags): return ', '.join(tags) @@ -230,6 +233,7 @@ class KindleOutput(OutputProfile): dpi = 168.451 fbase = 16 fsizes = [12, 12, 14, 16, 18, 20, 22, 24] + supports_mobi_indexing = True @classmethod def tags_to_string(cls, tags): @@ -245,6 +249,7 @@ class KindleDXOutput(OutputProfile): screen_size = (744, 1022) dpi = 150.0 comic_screen_size = (741, 1022) + supports_mobi_indexing = True @classmethod def tags_to_string(cls, tags): diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 11975094e3..b2e2958ec0 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -550,6 +550,8 @@ OptionRecommendation(name='list_recipes', break self.read_user_metadata() + self.opts.no_inline_navbars = self.opts.output_profile.supports_mobi_indexing \ + and self.output_fmt == 'mobi' def flush(self): try: diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py index 048185b170..bab86390b0 100644 --- a/src/calibre/ebooks/mobi/output.py +++ b/src/calibre/ebooks/mobi/output.py @@ -79,10 +79,11 @@ class MOBIOutput(OutputFormatPlugin): start_href = self.oeb.spine[0].href self.log('Converting TOC for MOBI periodical indexing...') articles = {} - if toc.depth < 3: - sections = [TOC(klass='section')] + if toc.depth() < 3: + sections = [TOC(klass='section', title=_('All articles'), + href=start_href)] for x in toc: - sections[0].append(x) + sections[0].nodes.append(x) else: sections = list(toc) for x in sections: @@ -99,7 +100,7 @@ class MOBIOutput(OutputFormatPlugin): if articles[id(s)]: for a in articles[id(s)]: s.nodes.append(a) - root.nodes.append(s) + root.nodes.append(s) for x in list(toc.nodes): toc.nodes.remove(x) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 77437a7224..f7121cb2c3 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -379,7 +379,7 @@ class MobiWriter(object): try: self._generate_index() except: - self.oeb.log.exception('Failed to generate index') + self._oeb.log.exception('Failed to generate index') self._generate_images() @@ -1178,40 +1178,32 @@ class MobiWriter(object): ''' toc = self._oeb.toc nodes = list(toc.iter())[1:] + toc_conforms = True for (i, child) in enumerate(nodes) : - if self.opts.verbose > 3 : - self._oeb.logger.info(" : %-25.25s \tklass=%-15.15s \tdepth:%d playOrder=%03d" % \ - (child.title, child.klass, child.depth(), child.play_order) ) + if child.klass == "periodical" and child.depth() != 3 or \ + child.klass == "section" and child.depth() != 2 or \ + child.klass == "article" and child.depth() != 1 : - if child.klass == "periodical" and child.depth() != 3 : - self._oeb.logger.info('<navPoint class="periodical"> found at depth %d, nonconforming TOC' % \ - child.depth() ) - return False - - if child.klass == "section" and child.depth() != 2 : - self._oeb.logger.info('<navPoint class="section"> found at depth %d, nonconforming TOC' % \ - child.depth() ) - return False - - if child.klass == "article" and child.depth() != 1 : - self._oeb.logger.info('<navPoint class="article"> found at depth %d, nonconforming TOC' % \ - child.depth() ) - return False + self._oeb.logger.warn('Nonconforming TOC entry: "%s" found at depth %d' % \ + (child.klass, child.depth()) ) + self._oeb.logger.warn(" <title>: '%-25.25s...' \t\tklass=%-15.15s \tdepth:%d \tplayOrder=%03d" % \ + (child.title, child.klass, child.depth(), child.play_order) ) + toc_conforms = False # We also need to know that we have a pubdate or timestamp in the metadata, which the Kindle needs if self._oeb.metadata['date'] == [] and self._oeb.metadata['timestamp'] == [] : - self._oeb.logger.info('metadata missing timestamp needed for periodical') - return False + self._oeb.logger.info('metadata missing date/timestamp') + toc_conforms = False # Periodicals also need a mastheadImage in the manifest has_mastheadImage = 'masthead' in self._oeb.guide if not has_mastheadImage : - self._oeb.logger.info('mastheadImage missing from manifest, aborting periodical indexing') - return False + self._oeb.logger.info('mastheadImage missing from manifest') + toc_conforms = False - self._oeb.logger.info('TOC structure and pubdate verified') - return True + self._oeb.logger.info("%s" % "TOC structure conforms" if toc_conforms else "TOC structure non-conforming") + return toc_conforms def _generate_text(self): @@ -1236,7 +1228,7 @@ class MobiWriter(object): # Evaluate toc for conformance if self.opts.mobi_periodical : - self._oeb.logger.info('--mobi-periodical specified, evaluating TOC for periodical conformance ...') + self._oeb.logger.info('MOBI periodical specified, evaluating TOC for periodical conformance ...') self._conforming_periodical_toc = self._evaluate_periodical_toc() # This routine decides whether to build flat or structured based on self._conforming_periodical_toc @@ -1545,7 +1537,7 @@ class MobiWriter(object): exth.write(data) nrecs += 1 if term == 'rights' : - rights = unicode(oeb.metadata.rights[0]) + rights = unicode(oeb.metadata.rights[0]).encode('utf-8') exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8)) exth.write(rights) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index acf95df502..ba4ebbc598 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1468,7 +1468,9 @@ class TOC(object): node.to_opf1(tour) return tour - def to_ncx(self, parent): + def to_ncx(self, parent=None): + if parent is None: + parent = etree.Element(NCX('navMap')) for node in self.nodes: id = node.id or unicode(uuid.uuid4()) attrib = {'id': id, 'playOrder': str(node.play_order)} diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 5fcc7e3fac..18c16e44d5 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -169,7 +169,8 @@ class Stylizer(object): if not matches and class_sel_pat.match(text): found = False for x in tree.xpath('//*[@class]'): - if text.lower().endswith('.'+x.get('class').lower()): + if text.lower().endswith('.'+x.get('class').lower()) and \ + text.lower() != text: matches.append(x) found = True if found: diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 17bff315d4..9f74b6263f 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -469,6 +469,7 @@ class BasicNewsRecipe(Recipe): self.username = options.username self.password = options.password self.lrf = options.lrf + self.include_navbars = not options.no_inline_navbars self.output_dir = os.path.abspath(self.output_dir) if options.test: @@ -539,7 +540,7 @@ class BasicNewsRecipe(Recipe): if first_fetch and job_info: url, f, a, feed_len = job_info body = soup.find('body') - if body is not None: + if body is not None and self.include_navbars: templ = self.navbar.generate(False, f, a, feed_len, not self.has_single_feed, url, __appname__, @@ -907,12 +908,13 @@ class BasicNewsRecipe(Recipe): body = soup.find('body') if body is not None: prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) + if self.include_navbars: + templ = self.navbar.generate(True, num, j, len(f), + not self.has_single_feed, + a.orig_url, __appname__, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + body.insert(len(body.contents), elem) with open(last, 'wb') as fi: fi.write(unicode(soup).encode('utf-8')) @@ -923,7 +925,7 @@ class BasicNewsRecipe(Recipe): if po is None: self.play_order_counter += 1 po = self.play_order_counter - desc = f.description + desc = getattr(f, 'description', None) if not desc: desc = None feed_index(i, toc.add_item('feed_%d/index.html'%i, None, From 151ffab821d698d1582d52b6dec0d04804bc4e5c Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 10:05:22 -0600 Subject: [PATCH 07/20] Support Want-OPDS-Catalog HTTP header in content server --- src/calibre/library/server.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/library/server.py b/src/calibre/library/server.py index eab159bc95..0a13800f75 100644 --- a/src/calibre/library/server.py +++ b/src/calibre/library/server.py @@ -366,10 +366,9 @@ class LibraryServer(object): @expose def index(self, **kwargs): 'The / URL' - stanza = cherrypy.request.headers.get('Stanza-Device-Name', 919) - if stanza == 919: - return self.static('index.html') - return self.stanza() + want_opds = cherrypy.request.headers.get('Stanza-Device-Name', 919) != \ + 919 or cherrypy.request.headers.get('Want-OPDS-Catalog', 919) != 919 + return self.stanza() if want_opds else self.static('index.html') @expose From ed716130f8f818491fd417659e34ee30a45ad530 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 10:12:41 -0600 Subject: [PATCH 08/20] ... --- src/calibre/ebooks/mobi/writer.py | 33 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index f7121cb2c3..aa7aa998c4 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -461,7 +461,7 @@ class MobiWriter(object): h = child.href if h not in self._id_offsets: - self._oeb.log.warning('Could not find TOC entry "%s", aborting indexing ...'% child.title) + self._oeb.log.warning(' Could not find TOC entry "%s", aborting indexing ...'% child.title) return False offset = self._id_offsets[h] @@ -573,7 +573,7 @@ class MobiWriter(object): # Entries continues with a stream of section+articles, section+articles ... h = child.href if h not in self._id_offsets: - self._oeb.log.warning('Could not find TOC entry "%s", aborting indexing ...'% child.title) + self._oeb.log.warning(' Could not find TOC entry "%s", aborting indexing ...'% child.title) return False offset = self._id_offsets[h] @@ -1184,10 +1184,11 @@ class MobiWriter(object): child.klass == "section" and child.depth() != 2 or \ child.klass == "article" and child.depth() != 1 : - self._oeb.logger.warn('Nonconforming TOC entry: "%s" found at depth %d' % \ + self._oeb.logger.info('Nonconforming TOC entry: "%s" found at depth %d' % \ (child.klass, child.depth()) ) - self._oeb.logger.warn(" <title>: '%-25.25s...' \t\tklass=%-15.15s \tdepth:%d \tplayOrder=%03d" % \ + self._oeb.logger.info(" <title>: '%-25.25s...' \t\tklass=%-15.15s \tdepth:%d \tplayOrder=%03d" % \ (child.title, child.klass, child.depth(), child.play_order) ) + self._oeb.logger.info("(Conforming: periodical at depth:3, section at depth:2, articles at depth:1)") toc_conforms = False # We also need to know that we have a pubdate or timestamp in the metadata, which the Kindle needs @@ -1195,14 +1196,11 @@ class MobiWriter(object): self._oeb.logger.info('metadata missing date/timestamp') toc_conforms = False - # Periodicals also need a mastheadImage in the manifest - has_mastheadImage = 'masthead' in self._oeb.guide - - if not has_mastheadImage : + if not 'masthead' in self._oeb.guide : self._oeb.logger.info('mastheadImage missing from manifest') toc_conforms = False - self._oeb.logger.info("%s" % "TOC structure conforms" if toc_conforms else "TOC structure non-conforming") + self._oeb.logger.info("%s" % " TOC structure conforms" if toc_conforms else " TOC structure non-conforming") return toc_conforms @@ -1223,12 +1221,12 @@ class MobiWriter(object): offset = 0 if self._compression != UNCOMPRESSED: - self._oeb.logger.info('Compressing markup content...') + self._oeb.logger.info(' Compressing markup content...') data, overlap = self._read_text_record(text) # Evaluate toc for conformance if self.opts.mobi_periodical : - self._oeb.logger.info('MOBI periodical specified, evaluating TOC for periodical conformance ...') + self._oeb.logger.info(' evaluating TOC for periodical conformance ...') self._conforming_periodical_toc = self._evaluate_periodical_toc() # This routine decides whether to build flat or structured based on self._conforming_periodical_toc @@ -1241,11 +1239,11 @@ class MobiWriter(object): if len(entries) : self._indexable = self._generate_indexed_navpoints() else : - self._oeb.logger.info('No entries found in TOC ...') + self._oeb.logger.info(' No entries found in TOC ...') self._indexable = False if not self._indexable : - self._oeb.logger.info('Writing unindexed mobi ...') + self._oeb.logger.info(' Writing unindexed mobi ...') while len(data) > 0: if self._compression == PALMDOC: @@ -1263,7 +1261,8 @@ class MobiWriter(object): while breaks and (breaks[0] - offset) < RECORD_SIZE: # .pop returns item, removes it from list pbreak = (breaks.pop(0) - running) >> 3 - self._oeb.logger.info('pbreak = 0x%X at 0x%X' % (pbreak, record.tell()) ) + if self.opts.verbose > 2 : + self._oeb.logger.info('pbreak = 0x%X at 0x%X' % (pbreak, record.tell()) ) encoded = decint(pbreak, DECINT_FORWARD) record.write(encoded) running += pbreak << 3 @@ -1376,7 +1375,7 @@ class MobiWriter(object): # 0x002 MOBI book (chapter - chapter navigation) # 0x101 News - Hierarchical navigation with sections and articles # 0x102 News feed - Flat navigation - # 0x103 News magazine - same as 1x101 + # 0x103 News magazine - same as 0x101 # 0xC - 0xF : Text encoding (65001 is utf-8) # 0x10 - 0x13 : UID # 0x14 - 0x17 : Generator version @@ -1606,7 +1605,7 @@ class MobiWriter(object): self._write(record) def _generate_index(self): - self._oeb.log('Generating primary index ...') + self._oeb.log('Generating INDX ...') self._primary_index_record = None # Build the NCXEntries and INDX @@ -1967,7 +1966,7 @@ class MobiWriter(object): reduced_toc.append(child) previousOffset = currentOffset else : - self._oeb.logger.warn("ignoring redundant href: %s in '%s'" % (h, child.title)) + self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title)) first = False else : From 3725e7eb793ee796305738b70c01d044aa9a91f3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 10:18:10 -0600 Subject: [PATCH 09/20] ... --- src/calibre/ebooks/mobi/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index aa7aa998c4..36fa4befe0 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -2050,7 +2050,7 @@ class MobiWriter(object): indices.write(pack('>H', pos)) # Save the offset for IDXTIndices name = "%04X"%count indxt.write(chr(len(name)) + name) # Write the name - indxt.write(INDXT['article']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] + indxt.write(INDXT['article']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] hasAuthor = True if self._ctoc_map[index]['authorOffset'] else False hasDescription = True if self._ctoc_map[index]['descriptionOffset'] else False From d57e1d98c39f7e86d17fee426941fe0bfb58404b Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 11:13:49 -0600 Subject: [PATCH 10/20] LRF Output: Base font size rescaling now uses both OEB and LRF algorithms. --- src/calibre/ebooks/lrf/output.py | 2 +- src/calibre/ebooks/mobi/reader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/lrf/output.py b/src/calibre/ebooks/lrf/output.py index 6ca27ba9a4..e88317c402 100644 --- a/src/calibre/ebooks/lrf/output.py +++ b/src/calibre/ebooks/lrf/output.py @@ -29,7 +29,7 @@ class LRFOptions(object): self.use_metadata_cover = True self.output = output self.ignore_tables = opts.linearize_tables - self.base_font_size = 0 + self.base_font_size = opts.base_font_size self.blank_after_para = opts.insert_blank_line self.use_spine = True self.font_delta = 0 diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 8dc8d31150..051395343d 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -301,7 +301,7 @@ class MobiReader(object): root = html.fromstring(self.processed_html) if root.xpath('descendant::p/descendant::p'): from lxml.html import soupparser - self.log.warning('Malformed markup, parsing using BeatifulSoup') + self.log.warning('Malformed markup, parsing using BeautifulSoup') root = soupparser.fromstring(self.processed_html) if root.tag != 'html': From e0d515874b638008464a7588cb861746c675196b Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 12:01:20 -0600 Subject: [PATCH 11/20] Force pretty printing of output when outputting to OEB or EPUB. Fix #2777 (ToC incorrect in MS Reader to ePub) --- src/calibre/ebooks/epub/output.py | 2 ++ src/calibre/ebooks/metadata/__init__.py | 2 +- src/calibre/ebooks/oeb/output.py | 5 +++++ src/calibre/ebooks/oeb/transforms/split.py | 22 ++++++++++++++++++++-- 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 160676137e..9c542d7b2f 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -80,6 +80,8 @@ class EPUBOutput(OutputFormatPlugin): ]) + recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)]) + TITLEPAGE_COVER = '''\ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 5f575eb2a9..e5619bee63 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -367,7 +367,7 @@ class MetaInformation(object): if self.pubdate is not None: ans += [(_('Published'), unicode(self.pubdate.isoformat(' ')))] if self.rights is not None: - ans += [(_('Rights'), unicode(self.rights.isoformat(' ')))] + ans += [(_('Rights'), unicode(self.rights))] for i, x in enumerate(ans): ans[i] = u'<tr><td><b>%s</b></td><td>%s</td></tr>'%x return u'<table>%s</table>'%u'\n'.join(ans) diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index 18c20f334d..4df8c0f679 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -9,6 +9,8 @@ from lxml import etree from calibre.customize.conversion import OutputFormatPlugin from calibre import CurrentDir +from calibre.customize.conversion import OptionRecommendation + from urllib import unquote class OEBOutput(OutputFormatPlugin): @@ -17,6 +19,9 @@ class OEBOutput(OutputFormatPlugin): author = 'Kovid Goyal' file_type = 'oeb' + recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)]) + + def convert(self, oeb_book, output_path, input_plugin, opts, log): self.log, self.opts = log, opts if not os.path.exists(output_path): diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 1fba7ffa64..d4b60e3a59 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -16,7 +16,7 @@ from lxml import etree from lxml.cssselect import CSSSelector from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ - urldefrag, rewrite_links, urlunquote, barename + urldefrag, rewrite_links, urlunquote, barename, XHTML from calibre.ebooks.epub import rules XPath = functools.partial(_XPath, namespaces=NAMESPACES) @@ -216,7 +216,25 @@ class FlowSplitter(object): self.trees.append(before) tree = after self.trees.append(tree) - self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())] + trees, ids = [], set([]) + for tree in self.trees: + root = tree.getroot() + if self.is_page_empty(root): + discarded_ids = root.xpath('//*[@id]') + for x in discarded_ids: + x = x.get('id') + if not x.startswith('calibre_'): + ids.add(x) + else: + if ids: + body = self.get_body(root) + if body is not None: + for x in ids: + body.insert(0, body.makeelement(XHTML('div'), + id=x, style='height:0pt')) + ids = set([]) + trees.append(tree) + self.trees = trees def get_body(self, root): body = root.xpath('//h:body', namespaces=NAMESPACES) From 21194d5d7508d1012be23fed16df7b746fb8fe6e Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 12:56:08 -0600 Subject: [PATCH 12/20] ... --- src/calibre/ebooks/mobi/writer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 36fa4befe0..a5b13e1437 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1184,11 +1184,10 @@ class MobiWriter(object): child.klass == "section" and child.depth() != 2 or \ child.klass == "article" and child.depth() != 1 : - self._oeb.logger.info('Nonconforming TOC entry: "%s" found at depth %d' % \ + self._oeb.logger.warn('Nonconforming TOC entry: "%s" found at depth %d' % \ (child.klass, child.depth()) ) - self._oeb.logger.info(" <title>: '%-25.25s...' \t\tklass=%-15.15s \tdepth:%d \tplayOrder=%03d" % \ + self._oeb.logger.warn(" <title>: '%-25.25s...' \t\tklass=%-15.15s \tdepth:%d \tplayOrder=%03d" % \ (child.title, child.klass, child.depth(), child.play_order) ) - self._oeb.logger.info("(Conforming: periodical at depth:3, section at depth:2, articles at depth:1)") toc_conforms = False # We also need to know that we have a pubdate or timestamp in the metadata, which the Kindle needs @@ -1226,7 +1225,7 @@ class MobiWriter(object): # Evaluate toc for conformance if self.opts.mobi_periodical : - self._oeb.logger.info(' evaluating TOC for periodical conformance ...') + self._oeb.logger.info(' MOBI periodical specified, evaluating TOC for periodical conformance ...') self._conforming_periodical_toc = self._evaluate_periodical_toc() # This routine decides whether to build flat or structured based on self._conforming_periodical_toc From 173aa421e3d69a9669fa508495a9f51525d5197c Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 13:06:17 -0600 Subject: [PATCH 13/20] Fix #2748 ("calibredb add -1" fails) --- src/calibre/ebooks/metadata/meta.py | 8 ++------ src/calibre/ebooks/mobi/writer.py | 2 +- src/calibre/library/database2.py | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index a239933710..e74ce5757d 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -31,7 +31,7 @@ def metadata_from_formats(formats): try: return _metadata_from_formats(formats) except: - mi = metadata_from_filename(formats[0]) + mi = metadata_from_filename(list(formats)[0]) if not mi.authors: mi.authors = [_('Unknown')] @@ -126,14 +126,10 @@ def metadata_from_filename(name, pat=None): mi.title = match.group('title') except IndexError: pass - try: - mi.authors = [match.group('author')] - except IndexError: - pass try: au = match.group('authors') aus = string_to_authors(au) - mi.authors = authors + mi.authors = aus except IndexError: pass try: diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index a5b13e1437..8e8cff2aff 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -2031,7 +2031,7 @@ class MobiWriter(object): indices.write(pack('>H', pos)) # Save the offset for IDXTIndices name = "%04X"%count indxt.write(chr(len(name)) + name) # Write the name - indxt.write(INDXT['section']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] + indxt.write(INDXT['section']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] indxt.write(chr(0)) # subType 0 indxt.write(decint(offset, DECINT_FORWARD)) # offset indxt.write(decint(length, DECINT_FORWARD)) # length diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 3550253ffa..fccd131761 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1708,7 +1708,7 @@ books_series_link feeds formats = self.find_books_in_directory(dirpath, True) if not formats: return - + formats = list(formats) mi = metadata_from_formats(formats) if mi.title is None: return From 3850f64c060fa45d8078974456ee7d9c4595b80e Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 13:46:03 -0600 Subject: [PATCH 14/20] EPUB Output: Replace self closed pre elements as they cause problems with WebKit based renderers --- src/calibre/ebooks/epub/output.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 9c542d7b2f..be096eece3 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -136,6 +136,21 @@ class EPUBOutput(OutputFormatPlugin): </body> </html> ''' + def workaround_webkit_quirks(self): + from calibre.ebooks.oeb.base import XPath + for x in self.oeb.spine: + root = x.data + body = XPath('//h:body')(root) + if body: + body = body[0] + + if not hasattr(body, 'xpath'): + continue + + for pre in XPath('//h:pre')(body): + if not pre.text and len(pre) == 0: + pre.tag = 'div' + def convert(self, oeb, output_path, input_plugin, opts, log): self.log, self.opts, self.oeb = log, opts, oeb @@ -148,6 +163,7 @@ class EPUBOutput(OutputFormatPlugin): self.workaround_ade_quirks() + self.workaround_webkit_quirks() from calibre.ebooks.oeb.transforms.rescale import RescaleImages RescaleImages()(oeb, opts) From a625a9f6ea2c332ed77abf1b7662894067563b91 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 13:46:23 -0600 Subject: [PATCH 15/20] Fix #2737 (Metadata retrieval error: "year out of range") --- src/calibre/__init__.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index d7c5f0b223..1c2d780412 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -308,14 +308,25 @@ def walk(dir): yield os.path.join(record[0], f) def strftime(fmt, t=None): - ''' A version of strtime that returns unicode strings. ''' + ''' A version of strtime that returns unicode strings and tries to handle dates + before 1900 ''' if t is None: t = time.localtime() + early_year = t[0] < 1900 + if early_year: + fmt = fmt.replace('%Y', '_early year hack##') + t = list(t) + orig_year = t[0] + t[0] = 1900 + ans = None if iswindows: if isinstance(fmt, unicode): fmt = fmt.encode('mbcs') - return plugins['winutil'][0].strftime(fmt, t) - return time.strftime(fmt, t).decode(preferred_encoding, 'replace') + ans = plugins['winutil'][0].strftime(fmt, t) + ans = time.strftime(fmt, t).decode(preferred_encoding, 'replace') + if early_year: + ans = ans.replace('_early year hack##', str(orig_year)) + return ans def my_unichr(num): try: From d850f6e2d65acb3c09deb370b583e1f03e53b83b Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 14:20:21 -0600 Subject: [PATCH 16/20] MOBI Input: Assume unit less numbers are in pixels --- src/calibre/ebooks/mobi/reader.py | 13 +++++++++---- src/calibre/ebooks/oeb/transforms/guide.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 051395343d..ac7619cbb6 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -439,7 +439,12 @@ class MobiReader(object): self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>' self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('> <', '>\n<') - self.processed_html = re.sub('\x14|\x15', '', self.processed_html) + self.processed_html = re.sub('\x14|\x15|\x1c|\x1d', '', self.processed_html) + + def ensure_unit(self, raw, unit='px'): + if re.search(r'\d+$', raw) is not None: + raw += unit + return raw def upshift_markup(self, root): self.log.debug('Converting style information to CSS...') @@ -469,13 +474,13 @@ class MobiReader(object): if attrib.has_key('height'): height = attrib.pop('height').strip() if height: - styles.append('margin-top: %s' % height) + styles.append('margin-top: %s' % self.ensure_unit(height)) if attrib.has_key('width'): width = attrib.pop('width').strip() if width: - styles.append('text-indent: %s' % width) + styles.append('text-indent: %s' % self.ensure_unit(width)) if width.startswith('-'): - styles.append('margin-left: %s' % (width[1:])) + styles.append('margin-left: %s' % self.ensure_unit(width[1:])) if attrib.has_key('align'): align = attrib.pop('align').strip() if align: diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index aaeba67d80..82bcaca72a 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -34,7 +34,7 @@ class Clean(object): for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] - if x.lower() not in ('cover', 'titlepage'): + if x.lower() not in ('cover', 'titlepage', 'masthead'): self.oeb.guide.remove(x) From 8e9590373f5e69fdae7f814deace22adb10f1557 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 15:24:24 -0600 Subject: [PATCH 17/20] Fix handling of user specified masthead image --- src/calibre/ebooks/mobi/output.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py index bab86390b0..4de346c0af 100644 --- a/src/calibre/ebooks/mobi/output.py +++ b/src/calibre/ebooks/mobi/output.py @@ -48,11 +48,7 @@ class MOBIOutput(OutputFormatPlugin): self.opts.mobi_periodical = False def check_for_masthead(self): - found = False - for typ in self.oeb.guide: - if type == 'masthead': - found = True - break + found = 'masthead' in self.oeb.guide if not found: self.oeb.log.debug('No masthead found, generating default one...') from calibre.resources import server_resources From 6697b9bbb3ad0cd02ef3b2e33bec1da4a87aba4c Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 15:51:00 -0600 Subject: [PATCH 18/20] Pass trough guide entries used by MOBI output --- src/calibre/ebooks/oeb/transforms/guide.py | 3 +- .../web/feeds/recipes/recipe_nytimes_sub.py | 30 +++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index 82bcaca72a..c1f0dd6669 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -34,7 +34,8 @@ class Clean(object): for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] - if x.lower() not in ('cover', 'titlepage', 'masthead'): + if x.lower() not in ('cover', 'titlepage', 'masthead', 'toc', + 'title-page', 'copyright-page'): self.oeb.guide.remove(x) diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py b/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py index 5d91dbae38..4449ba1aa2 100644 --- a/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py +++ b/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py @@ -11,7 +11,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup class NYTimes(BasicNewsRecipe): - + title = 'The New York Times (subscription)' __author__ = 'Kovid Goyal' language = _('English') @@ -20,13 +20,13 @@ class NYTimes(BasicNewsRecipe): needs_subscription = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), - dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), + dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), dict(name=['script', 'noscript', 'style'])] encoding = 'cp1252' no_stylesheets = True extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}' - + def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: @@ -36,24 +36,24 @@ class NYTimes(BasicNewsRecipe): br['PASSWORD'] = self.password br.submit() return br - + def parse_index(self): soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - + def feed_title(div): return ''.join(div.findAll(text=True, recursive=False)).strip() - + articles = {} key = None ans = [] - for div in soup.findAll(True, + for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline']}): - + if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) articles[key] = [] ans.append(key) - + elif div['class'] in ['story', 'story headline']: a = div.find('a', href=True) if not a: @@ -66,21 +66,21 @@ class NYTimes(BasicNewsRecipe): summary = div.find(True, attrs={'class':'summary'}) if summary: description = self.tag_to_string(summary, use_alt=False) - + feed = key if key is not None else 'Uncategorized' if not articles.has_key(feed): articles[feed] = [] if not 'podcasts' in url: articles[feed].append( - dict(title=title, url=url, date=pubdate, + dict(title=title, url=url, date=pubdate, description=description, content='')) - ans = self.sort_index_by(ans, {'The Front Page':-1, - 'Dining In, Dining Out':1, + ans = self.sort_index_by(ans, {'The Front Page':-1, + 'Dining In, Dining Out':1, 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans - + def preprocess_html(self, soup): refresh = soup.find('meta', {'http-equiv':'refresh'}) if refresh is None: From e0e6ddb6cf73757d6ced82f7ee8f6cfc8e9ba283 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 17:13:30 -0600 Subject: [PATCH 19/20] Strip trailling periods from filenames calculated by the SQLite backend --- src/calibre/ebooks/metadata/opf2.py | 5 ++++- src/calibre/gui2/add.py | 2 -- src/calibre/library/database2.py | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index c147c2b748..4571ac1d6f 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -452,9 +452,12 @@ class OPF(object): def __init__(self, stream, basedir=os.getcwdu(), unquote_urls=True): if not hasattr(stream, 'read'): stream = open(stream, 'rb') + raw = stream.read() + if not raw: + raise ValueError('Empty file: '+getattr(stream, 'name', 'stream')) self.basedir = self.base_dir = basedir self.path_to_html_toc = self.html_toc_fragment = None - raw, self.encoding = xml_to_unicode(stream.read(), strip_encoding_pats=True, resolve_entities=True) + raw, self.encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True) raw = raw[raw.find('<'):] self.root = etree.fromstring(raw, self.PARSER) self.metadata = self.metadata_path(self.root) diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py index ec253e5ae0..b5572e34d6 100644 --- a/src/calibre/gui2/add.py +++ b/src/calibre/gui2/add.py @@ -107,8 +107,6 @@ class Adder(QObject): self.callback(self.paths, self.names, self.infos) self.callback_called = True - - def update(self): if not self.ids: self.timer.stop() diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index a2d3d70edd..f7780aa2a6 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -631,6 +631,8 @@ class LibraryDatabase2(LibraryDatabase): author = sanitize_file_name(authors.split(',')[0][:self.PATH_LIMIT]).decode(filesystem_encoding, 'replace') title = sanitize_file_name(self.title(id, index_is_id=True)[:self.PATH_LIMIT]).decode(filesystem_encoding, 'replace') name = title + ' - ' + author + while name.endswith('.'): + name = name[:-1] return name def rmtree(self, path): From 5faeaba6c27863afa35fbd140a1dffffaa596dca Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Tue, 7 Jul 2009 23:58:18 -0600 Subject: [PATCH 20/20] Updated Wall Street Journal recipe --- src/calibre/web/feeds/recipes/recipe_wsj.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/web/feeds/recipes/recipe_wsj.py b/src/calibre/web/feeds/recipes/recipe_wsj.py index 67211d75dc..962f7cb30b 100644 --- a/src/calibre/web/feeds/recipes/recipe_wsj.py +++ b/src/calibre/web/feeds/recipes/recipe_wsj.py @@ -53,6 +53,10 @@ class WallStreetJournal(BasicNewsRecipe): def postprocess_html(self, soup, first): for tag in soup.findAll(name=['table', 'tr', 'td']): tag.name = 'div' + + for tag in soup.findAll('div', dict(id=["articleImage_1", "articleImage_2", "articleImage_3", "articleImage_4", "articleImage_5", "articleImage_6", "articleImage_7"])): + tag.extract() + return soup def get_article_url(self, article): @@ -70,7 +74,7 @@ class WallStreetJournal(BasicNewsRecipe): #('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'), (' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'), (' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'), - # ('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'), + #('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'), ('Today\'s Newspaper - Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'), ('Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'), ('Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),