diff --git a/resources/catalog/stylesheet.css b/resources/catalog/stylesheet.css index b5770599e6..12063b121f 100644 --- a/resources/catalog/stylesheet.css +++ b/resources/catalog/stylesheet.css @@ -27,7 +27,7 @@ p.tags { p.description { text-align:left; - font-style:italic; + font-style:normal; margin-top: 0em; } @@ -55,6 +55,14 @@ p.author_index { text-indent: 0em; } +p.series { + text-align: left; + margin-top:0px; + margin-bottom:0px; + margin-left:2em; + text-indent:-2em; + } + p.read_book { text-align:left; margin-top:0px; diff --git a/resources/images/news/digitalspy_uk.png b/resources/images/news/digitalspy_uk.png new file mode 100644 index 0000000000..28c865713d Binary files /dev/null and b/resources/images/news/digitalspy_uk.png differ diff --git a/resources/images/news/elcomercio.png b/resources/images/news/elcomercio.png new file mode 100644 index 0000000000..df484860dd Binary files /dev/null and b/resources/images/news/elcomercio.png differ diff --git a/resources/images/news/gizmodo.png b/resources/images/news/gizmodo.png new file mode 100644 index 0000000000..8f2e6f002b Binary files /dev/null and b/resources/images/news/gizmodo.png differ diff --git a/resources/images/news/newsstraitstimes.png b/resources/images/news/newsstraitstimes.png new file mode 100644 index 0000000000..075e2cc001 Binary files /dev/null and b/resources/images/news/newsstraitstimes.png differ diff --git a/resources/images/news/readitlater.png b/resources/images/news/readitlater.png new file mode 100644 index 0000000000..439a690cd8 Binary files /dev/null and b/resources/images/news/readitlater.png differ diff --git a/resources/images/news/tidbits.png b/resources/images/news/tidbits.png new file mode 100644 index 0000000000..e64d71ec68 Binary files /dev/null and b/resources/images/news/tidbits.png differ diff --git a/resources/recipes/ZIVE.sk.recipe b/resources/recipes/ZIVE.sk.recipe new file mode 100644 index 0000000000..e5bfd56cef --- /dev/null +++ b/resources/recipes/ZIVE.sk.recipe @@ -0,0 +1,45 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + + + +class ZiveRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'Abelturd' + language = 'sk' + version = 1 + + title = u'ZIVE.sk' + publisher = u'' + category = u'News, Newspaper' + description = u'Naj\u010d\xedtanej\u0161\xed denn\xedk opo\u010d\xedta\u010doch, IT a internete. ' + encoding = 'UTF-8' + + oldest_article = 7 + max_articles_per_feed = 100 + use_embedded_content = False + remove_empty_feeds = True + + no_stylesheets = True + remove_javascript = True + cover_url = 'http://www.zive.sk/Client.Images/Logos/logo-zive-sk.gif' + + feeds = [] + feeds.append((u'V\u0161etky \u010dl\xe1nky', u'http://www.zive.sk/rss/sc-47/default.aspx')) + + preprocess_regexps = [ + (re.compile(r'

Pokra.*ie

', re.DOTALL|re.IGNORECASE), + lambda match: ''), + + ] + + + remove_tags = [] + + keep_only_tags = [dict(name='h1'), dict(name='span', attrs={'class':'arlist-data-info-author'}), dict(name='div', attrs={'class':'bbtext font-resizer-area'}),] + extra_css = ''' + h1 {font-size:140%;font-family:georgia,serif; font-weight:bold} + h3 {font-size:115%;font-family:georgia,serif; font-weight:bold} + ''' + + diff --git a/resources/recipes/digitalspy_uk.recipe b/resources/recipes/digitalspy_uk.recipe new file mode 100644 index 0000000000..ac54c3790d --- /dev/null +++ b/resources/recipes/digitalspy_uk.recipe @@ -0,0 +1,43 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +www.digitalspy.co.uk +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class DigitalSpyUK(BasicNewsRecipe): + title = 'Digital Spy - UK Edition' + __author__ = 'Darko Miletic' + description = 'Entertainment news about the biggest TV shows, films and celebrities, updated around the clock.' + publisher = 'Digital Spy Limited.' + category = 'news, showbiz, big brother, x factor, torchwood, doctor who, tv, media, sky, freeview, cable' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + language = 'en_GB' + remove_empty_feeds = True + extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .info{font-size: small} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [dict(name=['link'])] + remove_attributes = ['height','width'] + keep_only_tags = [dict(name='div',attrs={'id':'article'})] + + feeds = [ + (u'News' , u'http://www.digitalspy.co.uk/rss/zones/gb/all.xml' ) + ,(u'Big Brother' , u'http://www.digitalspy.co.uk/rss/zones/gb/bigbrother.xml' ) + ,(u'Entertainment' , u'http://www.digitalspy.co.uk/rss/zones/gb/entertainment.xml') + ,(u'General' , u'http://www.digitalspy.co.uk/rss/zones/gb/general.xml' ) + ,(u'Media' , u'http://www.digitalspy.co.uk/rss/zones/gb/media.xml' ) + ] + diff --git a/resources/recipes/elcomercio.recipe b/resources/recipes/elcomercio.recipe new file mode 100644 index 0000000000..37733bda8b --- /dev/null +++ b/resources/recipes/elcomercio.recipe @@ -0,0 +1,38 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +elcomercio.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ElComercio(BasicNewsRecipe): + title = 'El Comercio ' + __author__ = 'Darko Miletic' + description = "Gizmodo, the gadget guide. So much in love with shiny new toys, it's unnatural." + publisher = 'GRUPO EL COMERCIO C.A.' 
+ category = 'news, Ecuador, politics' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = True + language = 'es' + masthead_url = 'http://ww1.elcomercio.com/nv_images/headers/EC/logo_new_08.gif' + extra_css = ' body{font-family: Arial,Verdana,sans-serif} img{margin-bottom: 1em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes = ['width','height'] + + feeds = [(u'Articles', u'http://ww1.elcomercio.com/rss/titulares1.xml')] + + def preprocess_html(self, soup): + return self.adeify_images(soup) + diff --git a/resources/recipes/gizmodo.recipe b/resources/recipes/gizmodo.recipe new file mode 100644 index 0000000000..6f6e6ae0cf --- /dev/null +++ b/resources/recipes/gizmodo.recipe @@ -0,0 +1,40 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +gizmodo.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Gizmodo(BasicNewsRecipe): + title = 'Gizmodo' + __author__ = 'Darko Miletic' + description = "Gizmodo, the gadget guide. So much in love with shiny new toys, it's unnatural." + publisher = 'gizmodo.com' + category = 'news, IT, Internet, gadgets' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = True + language = 'en' + masthead_url = 'http://cache.gawkerassets.com/assets/gizmodo.com/img/logo.png' + extra_css = ' body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif} img{margin-bottom: 1em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes = ['width','height'] + remove_tags = [dict(name='div',attrs={'class':'feedflare'})] + remove_tags_after = dict(name='div',attrs={'class':'feedflare'}) + + feeds = [(u'Articles', u'http://feeds.gawker.com/gizmodo/full')] + + def preprocess_html(self, soup): + return self.adeify_images(soup) + diff --git a/resources/recipes/hbr.recipe b/resources/recipes/hbr.recipe index b84062af8c..3d1e8ccfac 100644 --- a/resources/recipes/hbr.recipe +++ b/resources/recipes/hbr.recipe @@ -18,7 +18,8 @@ class HBR(BasicNewsRecipe): remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline', 'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR', - 'mailingListTout', 'partnerCenter', 'pageFooter']), + 'mailingListTout', 'partnerCenter', 'pageFooter', + 'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']), dict(name='iframe')] extra_css = ''' a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; } diff --git a/resources/recipes/iliteratura_cz.recipe b/resources/recipes/iliteratura_cz.recipe new file mode 100644 index 0000000000..7d603f0cec --- /dev/null +++ b/resources/recipes/iliteratura_cz.recipe @@ -0,0 +1,47 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class SmeRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'Abelturd' + language = 'cz' + version = 1 + + title = u'iLiteratura.cz' + publisher = u'' + category = u'News, Newspaper' + description = u'O LITERATU\u0158E V CEL\xc9M SV\u011aT\u011a A DOMA' + cover_url = 'http://www.iliteratura.cz/1_vzhled/1/iliteratura.gif' + + oldest_article = 7 + max_articles_per_feed = 100 + use_embedded_content = False + remove_empty_feeds = True + + no_stylesheets = True + remove_javascript = True 
+ + + feeds = [] + feeds.append((u'\u010cl\xe1nky', u'http://www.iliteratura.cz/rss.asp')) + + + keep_only_tags = [] + + remove_tags = [dict(name='table'),dict(name='h3')] + + + preprocess_regexps = [ + (re.compile(r'

Souvisej.*', re.DOTALL|re.IGNORECASE), + lambda match: ''), + ] + + def print_version(self, url): + m = re.search('(?<=ID=)[0-9]*', url) + + return u'http://www.iliteratura.cz/clanek.asp?polozkaID=' + str(m.group(0)) + '&c=tisk' + + extra_css = ''' + h1 {font-size:140%;font-family:georgia,serif; font-weight:bold} + h3 {font-size:115%;font-family:georgia,serif; font-weight:bold} + ''' diff --git a/resources/recipes/metro_montreal.recipe b/resources/recipes/metro_montreal.recipe index 8272c760cc..c2054bdeec 100644 --- a/resources/recipes/metro_montreal.recipe +++ b/resources/recipes/metro_montreal.recipe @@ -4,7 +4,7 @@ class Metro_Montreal(BasicNewsRecipe): title = u'M\xe9tro Montr\xe9al' __author__ = 'Jerry Clapperton' - description = 'Le quotidien le plus branché sur le monde' + description = 'Le quotidien le plus branch\xe9 sur le monde' language = 'fr' oldest_article = 7 @@ -16,7 +16,7 @@ class Metro_Montreal(BasicNewsRecipe): extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' remove_tags = [dict(attrs={'id':'buttons'})] - + feeds = [ (u"L'info", u'http://journalmetro.com/linfo/rss'), (u'Monde', u'http://journalmetro.com/monde/rss'), @@ -26,4 +26,4 @@ class Metro_Montreal(BasicNewsRecipe): ] def print_version(self, url): - return url.replace('article', 'ArticlePrint') + '?language=fr' \ No newline at end of file + return url.replace('article', 'ArticlePrint') + '?language=fr' diff --git a/resources/recipes/newsstraitstimes.recipe b/resources/recipes/newsstraitstimes.recipe new file mode 100644 index 0000000000..ebbaca1a0e --- /dev/null +++ b/resources/recipes/newsstraitstimes.recipe @@ -0,0 +1,35 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +www.nst.com.my +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Newstraitstimes(BasicNewsRecipe): + title = 'New Straits Times from Malaysia' + __author__ = 'Darko Miletic' + description = 'Learning Curve, Sunday People, New Straits Times from Malaysia' + publisher = 'nst.com.my' + category = 'news, politics, Malaysia' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + language = 'en' + masthead_url = 'http://www.nst.com.my/Current_News/NST/Images/new-nstonline.jpg' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [dict(name=['link','table'])] + keep_only_tags = dict(name='div',attrs={'id':'haidah'}) + + feeds = [(u'Articles', u'http://www.nst.com.my/rss/allSec')] + diff --git a/resources/recipes/pagina12.recipe b/resources/recipes/pagina12.recipe index 2fb433dc82..a5ee18a7ed 100644 --- a/resources/recipes/pagina12.recipe +++ b/resources/recipes/pagina12.recipe @@ -1,13 +1,12 @@ - __license__ = 'GPL v3' __copyright__ = '2008-2010, Darko Miletic ' ''' pagina12.com.ar ''' -import time -from calibre import strftime +import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup class Pagina12(BasicNewsRecipe): title = 'Pagina - 12' @@ -22,7 +21,8 @@ class Pagina12(BasicNewsRecipe): use_embedded_content = False language = 'es' remove_empty_feeds = True - extra_css = ' body{font-family: sans-serif} ' + masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h2{color: #028CCD} img{margin-bottom: 0.4em} .epigrafe{font-size: x-small; background-color: #EBEAE5; color: #565144 } 
.intro{font-size: 1.1em} ' conversion_options = { 'comment' : description @@ -52,7 +52,11 @@ class Pagina12(BasicNewsRecipe): return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/') def get_cover_url(self): - imgnames = ['tapan.jpg','tapagn.jpg','tapan_gr.jpg','tapagn.jpg','tapagn.jpg','tapan.jpg','tapagn.jpg'] - weekday = time.localtime().tm_wday - return strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/') + imgnames[weekday] + rawc = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html',True) + rawc2 = re.sub(r'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN','PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',rawc) + soup = BeautifulSoup(rawc2,fromEncoding=self.encoding,smartQuotesTo=None) + for image in soup.findAll('img',alt=True): + if image['alt'].startswith('Tapa de la fecha'): + return image['src'] + return None diff --git a/resources/recipes/people_us_mashup.recipe b/resources/recipes/people_us_mashup.recipe index 38d750cd4c..ed43e24e56 100644 --- a/resources/recipes/people_us_mashup.recipe +++ b/resources/recipes/people_us_mashup.recipe @@ -31,7 +31,7 @@ class PeopleMag(BasicNewsRecipe): keep_only_tags = [ - dict(name='div', attrs={'class': 'panel_news_article_main'}), + dict(name='div', attrs={'class': 'panel_news_article_main'}), dict(name='div', attrs={'class':'article_content'}), dict(name='div', attrs={'class': 'headline'}), dict(name='div', attrs={'class': 'post'}), @@ -51,6 +51,7 @@ class PeopleMag(BasicNewsRecipe): dict(name='div', attrs={'class':'sharelinkcont'}), dict(name='div', attrs={'class':'categories'}), dict(name='ul', attrs={'class':'categories'}), + dict(name='div', attrs={'class':'related_content'}), dict(name='div', attrs={'id':'promo'}), dict(name='div', attrs={'class':'linksWrapper'}), dict(name='p', attrs={'class':'tag tvnews'}), diff --git a/resources/recipes/readitlater.recipe b/resources/recipes/readitlater.recipe new file mode 100644 index 0000000000..4bd8fc2bd6 --- /dev/null +++ b/resources/recipes/readitlater.recipe @@ -0,0 +1,64 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +readitlaterlist.com +''' + +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class Readitlater(BasicNewsRecipe): + title = 'Read It Later' + __author__ = 'Darko Miletic' + description = '''Personalized news feeds. Go to readitlaterlist.com to + setup up your news. 
Fill in your account + username, and optionally you can add password.''' + publisher = 'readitlater.com' + category = 'news, custom' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + needs_subscription = True + INDEX = u'http://readitlaterlist.com' + LOGIN = INDEX + u'/l' + + + feeds = [(u'Unread articles' , INDEX + u'/unread')] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None: + br.open(self.LOGIN) + br.select_form(nr=0) + br['feed_id'] = self.username + if self.password is not None: + br['password'] = self.password + br.submit() + return br + + def parse_index(self): + totalfeeds = [] + lfeeds = self.get_feeds() + for feedobj in lfeeds: + feedtitle, feedurl = feedobj + self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + articles = [] + soup = self.index_to_soup(feedurl) + ritem = soup.find('ul',attrs={'id':'list'}) + for item in ritem.findAll('li'): + description = '' + atag = item.find('a',attrs={'class':'text'}) + if atag and atag.has_key('href'): + url = self.INDEX + atag['href'] + title = self.tag_to_string(item.div) + date = strftime(self.timefmt) + articles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':description + }) + totalfeeds.append((feedtitle, articles)) + return totalfeeds + diff --git a/resources/recipes/the_gazette.recipe b/resources/recipes/the_gazette.recipe deleted file mode 100644 index 19afff986e..0000000000 --- a/resources/recipes/the_gazette.recipe +++ /dev/null @@ -1,22 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - -class The_Gazette(BasicNewsRecipe): - - cover_url = 'file:///D:/Documents/Pictures/Covers/The_Gazette.jpg' - title = u'The Gazette' - __author__ = 'Jerry Clapperton' - description = 'Montreal news in English' - language = 'en_CA' - - oldest_article = 7 - max_articles_per_feed = 20 - use_embedded_content = False - remove_javascript = True - no_stylesheets = True - encoding = 'utf-8' - - keep_only_tags = [dict(name='div', attrs={'id':['storyheader','page1']})] - - extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' - - feeds = [(u'News', u'http://feeds.canada.com/canwest/F297'), (u'Opinion', u'http://feeds.canada.com/canwest/F7383'), (u'Arts', u'http://feeds.canada.com/canwest/F7366'), (u'Life', u'http://rss.canada.com/get/?F6934'), (u'Business', u'http://feeds.canada.com/canwest/F6939'), (u'Travel', u'http://rss.canada.com/get/?F6938'), (u'Health', u'http://feeds.canada.com/canwest/F7397'), (u'Technology', u'http://feeds.canada.com/canwest/F7411')] diff --git a/resources/recipes/the_new_republic.recipe b/resources/recipes/the_new_republic.recipe index 482dba1af0..59ccef3607 100644 --- a/resources/recipes/the_new_republic.recipe +++ b/resources/recipes/the_new_republic.recipe @@ -9,6 +9,7 @@ class The_New_Republic(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 + no_stylesheets = True remove_tags = [ dict(name='div', attrs={'class':['print-logo', 'print-site_name', 'img-left', 'print-source_url']}), @@ -21,14 +22,15 @@ class The_New_Republic(BasicNewsRecipe): ('Economy', 'http://www.tnr.com/rss/articles/Economy'), ('Environment and Energy', 'http://www.tnr.com/rss/articles/Environment-%2526-Energy'), ('Health Care', 'http://www.tnr.com/rss/articles/Health-Care'), - ('Urban Policy', 'http://www.tnr.com/rss/articles/Urban-Policy'), + ('Metro Policy', 'http://www.tnr.com/rss/articles/Metro-Policy'), ('World', 
'http://www.tnr.com/rss/articles/World'), ('Film', 'http://www.tnr.com/rss/articles/Film'), ('Books', 'http://www.tnr.com/rss/articles/books'), + ('The Book', 'http://www.tnr.com/rss/book'), + ('Jonathan Chait', 'http://www.tnr.com/rss/blogs/Jonathan-Chait'), ('The Plank', 'http://www.tnr.com/rss/blogs/The-Plank'), ('The Treatment', 'http://www.tnr.com/rss/blogs/The-Treatment'), ('The Spine', 'http://www.tnr.com/rss/blogs/The-Spine'), - ('The Stash', 'http://www.tnr.com/rss/blogs/The-Stash'), ('The Vine', 'http://www.tnr.com/rss/blogs/The-Vine'), ('The Avenue', 'http://www.tnr.com/rss/blogs/The-Avenue'), ('William Galston', 'http://www.tnr.com/rss/blogs/William-Galston'), @@ -40,3 +42,4 @@ class The_New_Republic(BasicNewsRecipe): def print_version(self, url): return url.replace('http://www.tnr.com/', 'http://www.tnr.com/print/') + diff --git a/resources/recipes/tidbits.recipe b/resources/recipes/tidbits.recipe new file mode 100644 index 0000000000..702c65e9e4 --- /dev/null +++ b/resources/recipes/tidbits.recipe @@ -0,0 +1,53 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +db.tidbits.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class TidBITS(BasicNewsRecipe): + title = 'TidBITS: Mac News for the Rest of Us' + __author__ = 'Darko Miletic' + description = 'Insightful news, reviews, and analysis of the Macintosh and Internet worlds' + publisher = 'TidBITS Publishing Inc.' + category = 'news, Apple, Macintosh, IT, Internet' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = True + language = 'en' + remove_empty_feeds = True + masthead_url = 'http://db.tidbits.com/images/tblogo9.gif' + extra_css = ' body{font-family: Georgia,"Times New Roman",Times,serif} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes = ['width','height'] + remove_tags = [dict(name='small')] + remove_tags_after = dict(name='small') + + feeds = [ + (u'Business Apps' , u'http://db.tidbits.com/feeds/business.rss' ) + ,(u'Entertainment' , u'http://db.tidbits.com/feeds/entertainment.rss') + ,(u'External Links' , u'http://db.tidbits.com/feeds/links.rss' ) + ,(u'Home Mac' , u'http://db.tidbits.com/feeds/home.rss' ) + ,(u'Inside TidBITS' , u'http://db.tidbits.com/feeds/inside.rss' ) + ,(u'iPod & iPhone' , u'http://db.tidbits.com/feeds/ipod-iphone.rss' ) + ,(u'Just for Fun' , u'http://db.tidbits.com/feeds/fun.rss' ) + ,(u'Macs & Mac OS X' , u'http://db.tidbits.com/feeds/macs.rss' ) + ,(u'Media Creation' , u'http://db.tidbits.com/feeds/creative.rss' ) + ,(u'Networking & Communications', u'http://db.tidbits.com/feeds/net.rss' ) + ,(u'Opinion & Editorial' , u'http://db.tidbits.com/feeds/opinion.rss' ) + ,(u'Support & Problem Solving' , u'http://db.tidbits.com/feeds/support.rss' ) + ,(u'Safe Computing' , u'http://db.tidbits.com/feeds/security.rss' ) + ,(u'Tech News' , u'http://db.tidbits.com/feeds/tech.rss' ) + ,(u'Software Watchlist' , u'http://db.tidbits.com/feeds/watchlist.rss' ) + ] diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index b190f43849..e29bfe3dde 100644 --- a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -215,7 +215,7 @@ class WSJ(BasicNewsRecipe): # first, check if there is an h3 tag which provides a section name stag = divtag.find('h3') if stag: - if stag.parent['class'] == 'dynamic': + if stag.parent.get('class', '') == 'dynamic': # a 
carousel of articles is too complex to extract a section name # for each article, so we'll just call the section "Carousel" section_name = 'Carousel' diff --git a/setup/resources.py b/setup/resources.py index d40d31bbf5..977d753828 100644 --- a/setup/resources.py +++ b/setup/resources.py @@ -48,7 +48,9 @@ class Resources(Command): dest = self.j(self.RESOURCES, 'builtin_recipes.xml') if self.newer(dest, files): self.info('\tCreating builtin_recipes.xml') - open(dest, 'wb').write(serialize_builtin_recipes()) + xml = serialize_builtin_recipes() + with open(dest, 'wb') as f: + f.write(xml) dest = self.j(self.RESOURCES, 'ebook-convert-complete.pickle') files = [] diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 9f98147032..552af1590f 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -262,7 +262,6 @@ class Region(object): max_lines = max(max_lines, len(c)) return max_lines - @property def is_small(self): return self.line_count < 3 @@ -438,9 +437,8 @@ class Page(object): # absorb into a neighboring region (prefer the one with number of cols # closer to the avg number of cols in the set, if equal use larger # region) - # merge contiguous regions that can contain each other - '''absorbed = set([]) found = True + absorbed = set([]) while found: found = False for i, region in enumerate(self.regions): @@ -452,10 +450,33 @@ class Page(object): regions.append(self.regions[j]) else: break - prev = None if i == 0 else i-1 - next = j if self.regions[j] not in regions else None - ''' - pass + prev_region = None if i == 0 else i-1 + next_region = j if self.regions[j] not in regions else None + if prev_region is None and next_region is not None: + absorb_into = next_region + elif next_region is None and prev_region is not None: + absorb_into = prev_region + elif prev_region is None and next_region is None: + if len(regions) > 1: + absorb_into = regions[0] + regions = regions[1:] + else: + absorb_into = None + else: + absorb_into = prev_region + if next_region.line_count >= prev_region.line_count: + avg_column_count = sum([len(r.columns) for r in + regions])/float(len(regions)) + if next_region.line_count > prev_region.line_count \ + or abs(avg_column_count - len(prev_region.columns)) \ + > abs(avg_column_count - len(next_region.columns)): + absorb_into = next_region + if absorb_into is not None: + absorb_into.absorb_region(regions) + absorbed.update(regions) + i = j + for region in absorbed: + self.regions.remove(region) diff --git a/src/calibre/ebooks/rtf2xml/tokenize.py b/src/calibre/ebooks/rtf2xml/tokenize.py index 45887f33e7..ad12daa211 100755 --- a/src/calibre/ebooks/rtf2xml/tokenize.py +++ b/src/calibre/ebooks/rtf2xml/tokenize.py @@ -72,7 +72,7 @@ class Tokenize: return line def __compile_expressions(self): self.__ms_hex_exp = re.compile(r"\\\'(..)") - self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") + self.__utf_exp = re.compile(r"\\u(-?\d{3,6})") self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)") self.__par_exp = re.compile(r'\\$') self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") diff --git a/src/calibre/gui2/catalog/catalog_epub_mobi.ui b/src/calibre/gui2/catalog/catalog_epub_mobi.ui index 91fcbdc364..dab8c972c7 100644 --- a/src/calibre/gui2/catalog/catalog_epub_mobi.ui +++ b/src/calibre/gui2/catalog/catalog_epub_mobi.ui @@ -80,7 +80,7 @@ Regex tips: -- The default regex - \[[\w]*\] - excludes genre tags of the form [tag], e.g., [Amazon Freebie] +- The default regex - \[[\w ]*\] - excludes genre 
tags of the form [tag], e.g., [Amazon Freebie] - A regex pattern of a single dot excludes all genre tags, generating no Genre Section diff --git a/src/calibre/gui2/convert/gui_conversion.py b/src/calibre/gui2/convert/gui_conversion.py index 5f339bf91d..d035a0ff93 100644 --- a/src/calibre/gui2/convert/gui_conversion.py +++ b/src/calibre/gui2/convert/gui_conversion.py @@ -57,7 +57,8 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name, sync, fmt_options, setattr(opts,option, fmt_options[option]) # Fetch and run the plugin for fmt + # Returns 0 if successful, 1 if no catalog built plugin = plugin_for_catalog_format(fmt) - plugin.run(out_file_name, opts, db, notification=notification) + return plugin.run(out_file_name, opts, db, notification=notification) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 5a977b37a6..679e86ab48 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -149,7 +149,7 @@ class DeviceManager(Thread): possibly_connected_devices.append((device, detected_device)) if possibly_connected_devices: if not self.do_connect(possibly_connected_devices): - print 'Connect to device failed, retying in 5 seconds...' + print 'Connect to device failed, retrying in 5 seconds...' time.sleep(5) if not self.do_connect(possibly_connected_devices): print 'Device connect failed again, giving up' diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index f385b76c4c..8fab6a922a 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -594,6 +594,11 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.rating.setValue(int(book.rating)) if book.tags: self.tags.setText(', '.join(book.tags)) + if book.series is not None: + if self.series.text() is None or self.series.text() == '': + self.series.setText(book.series) + if book.series_index is not None: + self.series_index.setValue(book.series_index) else: error_dialog(self, _('Cannot fetch metadata'), _('You must specify at least one of ISBN, Title, ' diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py index fd4f8999b4..9b8210c75e 100644 --- a/src/calibre/gui2/library.py +++ b/src/calibre/gui2/library.py @@ -903,9 +903,13 @@ class OnDeviceSearch(SearchQueryParser): locations[i] = q[v] for i, r in enumerate(self.model.db): for loc in locations: - if query in loc(r): - matches.add(i) - break + try: + if query in loc(r): + matches.add(i) + break + except ValueError: # Unicode errors + import traceback + traceback.print_exc() return matches diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 7f3ca297fd..37cea05b49 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -1394,6 +1394,11 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): self.status_bar.showMessage(_('Generating %s catalog...')%fmt) def catalog_generated(self, job): + if job.result: + # Search terms nulled catalog results + return error_dialog(self, _('No books found'), + _("No books to catalog\nCheck exclude tags"), + show=True) if job.failed: return self.job_exception(job) id = self.library_view.model().add_catalog(job.catalog_file_path, job.catalog_title) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 78155326dc..9b82c7310e 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -927,8 +927,16 @@ class EPUB_MOBI(CatalogPlugin): for record in data: this_title = {} - title = this_title['title'] = 
self.convertHTMLEntities(record['title']) - this_title['title_sort'] = self.generateSortTitle(title) + this_title['title'] = self.convertHTMLEntities(record['title']) + if record['series']: + this_title['series'] = record['series'] + this_title['series_index'] = record['series_index'] + this_title['title'] = self.generateSeriesTitle(this_title) + else: + this_title['series'] = None + this_title['series_index'] = 0.0 + + this_title['title_sort'] = self.generateSortTitle(this_title['title']) if 'authors' in record and len(record['authors']): this_title['author'] = " & ".join(record['authors']) else: @@ -984,12 +992,59 @@ class EPUB_MOBI(CatalogPlugin): def fetchBooksByAuthor(self): # Generate a list of titles sorted by author from the database + def author_compare(x,y): + # Return -1 if xy + # Different authors - sort by author_sort + if x['author_sort'] > y['author_sort']: + return 1 + elif x['author_sort'] < y['author_sort']: + return -1 + else: + # Same author + if x['series'] != y['series']: + # Different series + if x['title_sort'].lstrip() > y['title_sort'].lstrip(): + return 1 + else: + return -1 + else: + # Same series + if x['series'] == y['series']: + if float(x['series_index']) > float(y['series_index']): + return 1 + elif float(x['series_index']) < float(y['series_index']): + return -1 + else: + return 0 + else: + if x['series'] > y['series']: + return 1 + else: + return -1 self.updateProgressFullStep("Sorting database") - # Sort titles case-insensitive + ''' + # Sort titles case-insensitive, by author self.booksByAuthor = sorted(self.booksByTitle, key=lambda x:(x['author_sort'].upper(), x['author_sort'].upper())) + ''' + + self.booksByAuthor = list(self.booksByTitle) + self.booksByAuthor.sort(author_compare) + + if False and self.verbose: + self.opts.log.info("fetchBooksByAuthor(): %d books" % len(self.booksByAuthor)) + self.opts.log.info(" %-30s %-20s %s" % ('title', 'title_sort','series', 'series_index')) + for title in self.booksByAuthor: + self.opts.log.info((u" %-30s %-20s %-20s%5s " % \ + (title['title'][:30], + title['series'][:20] if title['series'] else '', + title['series_index'], + )).encode('utf-8')) + raise SystemExit # Build the unique_authors set from existing data authors = [(record['author'], record['author_sort']) for record in self.booksByAuthor] @@ -1063,7 +1118,17 @@ class EPUB_MOBI(CatalogPlugin): # Insert the book title #

<p class="title"><em>Book Title</em></p>

emTag = Tag(soup, "em") - emTag.insert(0, NavigableString(escape(title['title']))) + if title['series']: + # title
series series_index + brTag = Tag(soup,'br') + title_tokens = title['title'].split(': ') + emTag.insert(0, NavigableString(title_tokens[1])) + emTag.insert(1, brTag) + smallTag = Tag(soup,'small') + smallTag.insert(0,NavigableString(title_tokens[0])) + emTag.insert(2, smallTag) + else: + emTag.insert(0, NavigableString(escape(title['title']))) titleTag = body.find(attrs={'class':'title'}) titleTag.insert(0,emTag) @@ -1073,7 +1138,12 @@ class EPUB_MOBI(CatalogPlugin): aTag['href'] = "%s.html#%s" % ("ByAlphaAuthor", self.generateAuthorAnchor(title['author'])) #aTag.insert(0, escape(title['author'])) aTag.insert(0, title['author']) - authorTag.insert(0, NavigableString("by ")) + + # Insert READ_SYMBOL + if title['read']: + authorTag.insert(0, NavigableString(self.READ_SYMBOL + "by ")) + else: + authorTag.insert(0, NavigableString(self.NOT_READ_SYMBOL + "by ")) authorTag.insert(1, aTag) ''' @@ -1085,6 +1155,27 @@ class EPUB_MOBI(CatalogPlugin): tagsTag.insert(0,emTag) ''' + ''' + # Insert Series info or remove. + seriesTag = body.find(attrs={'class':'series'}) + if title['series']: + # Insert a spacer to match the author indent + stc = 0 + fontTag = Tag(soup,"font") + fontTag['style'] = 'color:white;font-size:large' + if self.opts.fmt == 'epub': + fontTag['style'] += ';opacity: 0.0' + fontTag.insert(0, NavigableString("by ")) + seriesTag.insert(stc, fontTag) + stc += 1 + if float(title['series_index']) - int(title['series_index']): + series_str = 'Series: %s [%4.2f]' % (title['series'], title['series_index']) + else: + series_str = '%s [%d]' % (title['series'], title['series_index']) + seriesTag.insert(stc,NavigableString(series_str)) + else: + seriesTag.extract() + ''' # Insert linked genres if 'tags' in title: tagsTag = body.find(attrs={'class':'tags'}) @@ -1118,7 +1209,12 @@ class EPUB_MOBI(CatalogPlugin): else: imgTag['src'] = "../images/thumbnail_default.jpg" imgTag['alt'] = "cover" - imgTag['style'] = 'width: %dpx; height:%dpx;' % (self.THUMB_WIDTH, self.THUMB_HEIGHT) + + # Tweak image size if we're building for Sony, not sure why this is needed + if self.opts.fmt == 'epub' and self.opts.output_profile.startswith("sony"): + imgTag['style'] = 'width: %dpx; height:%dpx;' % (self.THUMB_WIDTH * 2, self.THUMB_HEIGHT * 2) + else: + imgTag['style'] = 'width: %dpx; height:%dpx;' % (self.THUMB_WIDTH, self.THUMB_HEIGHT) thumbnailTag = body.find(attrs={'class':'thumbnail'}) thumbnailTag.insert(0,imgTag) @@ -1310,8 +1406,9 @@ class EPUB_MOBI(CatalogPlugin): dtc = 0 current_letter = "" current_author = "" + current_series = None - # Loop through books_by_author + # Loop through booksByAuthor book_count = 0 for book in self.booksByAuthor: book_count += 1 @@ -1349,11 +1446,23 @@ class EPUB_MOBI(CatalogPlugin): divTag.insert(dtc,pAuthorTag) dtc += 1 + # Check for series + if book['series'] and book['series'] != current_series: + # Start a new series + current_series = book['series'] + pSeriesTag = Tag(soup,'p') + pSeriesTag['class'] = "series" + pSeriesTag.insert(0,NavigableString(self.NOT_READ_SYMBOL + book['series'])) + divTag.insert(dtc,pSeriesTag) + dtc += 1 + if current_series and not book['series']: + current_series = None + # Add books pBookTag = Tag(soup, "p") ptc = 0 - # Prefix book with read/unread symbol + # book with read/unread symbol if book['read']: # check mark pBookTag.insert(ptc,NavigableString(self.READ_SYMBOL)) @@ -1367,7 +1476,11 @@ class EPUB_MOBI(CatalogPlugin): aTag = Tag(soup, "a") aTag['href'] = "book_%d.html" % (int(float(book['id']))) - aTag.insert(0,escape(book['title'])) + 
# Use series, series index if avail else just title + if current_series: + aTag.insert(0,escape(book['title'][len(book['series'])+1:])) + else: + aTag.insert(0,escape(book['title'])) pBookTag.insert(ptc, aTag) ptc += 1 @@ -1419,6 +1532,7 @@ class EPUB_MOBI(CatalogPlugin): divTag.insert(dtc,pIndexTag) dtc += 1 current_author = None + current_series = None for new_entry in this_months_list: if new_entry['author'] != current_author: @@ -1435,6 +1549,18 @@ class EPUB_MOBI(CatalogPlugin): divTag.insert(dtc,pAuthorTag) dtc += 1 + # Check for series + if new_entry['series'] and new_entry['series'] != current_series: + # Start a new series + current_series = new_entry['series'] + pSeriesTag = Tag(soup,'p') + pSeriesTag['class'] = "series" + pSeriesTag.insert(0,NavigableString(self.NOT_READ_SYMBOL + new_entry['series'])) + divTag.insert(dtc,pSeriesTag) + dtc += 1 + if current_series and not new_entry['series']: + current_series = None + # Add books pBookTag = Tag(soup, "p") ptc = 0 @@ -1453,7 +1579,10 @@ class EPUB_MOBI(CatalogPlugin): aTag = Tag(soup, "a") aTag['href'] = "book_%d.html" % (int(float(new_entry['id']))) - aTag.insert(0,escape(new_entry['title'])) + if current_series: + aTag.insert(0,escape(new_entry['title'][len(new_entry['series'])+1:])) + else: + aTag.insert(0,escape(new_entry['title'])) pBookTag.insert(ptc, aTag) ptc += 1 @@ -1554,6 +1683,7 @@ class EPUB_MOBI(CatalogPlugin): this_book['author_sort'] = book['author_sort'] this_book['read'] = book['read'] this_book['id'] = book['id'] + this_book['series'] = book['series'] normalized_tag = self.genre_tags_dict[friendly_tag] genre_tag_list = [key for genre in genre_list for key in genre] if normalized_tag in genre_tag_list: @@ -1579,7 +1709,9 @@ class EPUB_MOBI(CatalogPlugin): for genre in genre_list: for key in genre: - self.opts.log.info(" %s: %d titles" % (key, len(genre[key]))) + self.opts.log.info(" %s: %d %s" % (self.getFriendlyGenreTag(key), + len(genre[key]), + 'titles' if len(genre[key]) > 1 else 'title')) # Write the results # genre_list = [ {friendly_tag:[{book},{book}]}, {friendly_tag:[{book},{book}]}, ...] 
@@ -1786,7 +1918,9 @@ class EPUB_MOBI(CatalogPlugin): mtc += 1 # HTML files - add books to manifest and spine - for book in self.booksByTitle: + sort_descriptions_by = self.booksByAuthor if self.opts.sort_descriptions_by_author \ + else self.booksByTitle + for book in sort_descriptions_by: # manifest itemTag = Tag(soup, "item") itemTag['href'] = "content/book_%d.html" % int(book['id']) @@ -1912,7 +2046,9 @@ class EPUB_MOBI(CatalogPlugin): nptc += 1 # Loop over the titles - for book in self.booksByTitle: + sort_descriptions_by = self.booksByAuthor if self.opts.sort_descriptions_by_author \ + else self.booksByTitle + for book in sort_descriptions_by: navPointVolumeTag = Tag(ncx_soup, 'navPoint') navPointVolumeTag['class'] = "article" navPointVolumeTag['id'] = "book%dID" % int(book['id']) @@ -1920,7 +2056,11 @@ class EPUB_MOBI(CatalogPlugin): self.playOrder += 1 navLabelTag = Tag(ncx_soup, "navLabel") textTag = Tag(ncx_soup, "text") - textTag.insert(0, NavigableString(self.formatNCXText(book['title']))) + if book['series']: + tokens = book['title'].split(': ') + textTag.insert(0, NavigableString(self.formatNCXText('%s (%s)' % (tokens[1], tokens[0])))) + else: + textTag.insert(0, NavigableString(self.formatNCXText(book['title']))) navLabelTag.insert(0,textTag) navPointVolumeTag.insert(0,navLabelTag) @@ -2426,15 +2566,25 @@ class EPUB_MOBI(CatalogPlugin): else: yield tag - self.opts.log.info(u' %d available genre tags in database (exclude_genre: %s):' % \ + self.opts.log.info(u' %d genre tags in database (excluding genres matching %s):' % \ (len(genre_tags_dict), self.opts.exclude_genre)) # Display friendly/normalized genres # friendly => normalized - sorted_tags = ['%s => %s' % (key, genre_tags_dict[key]) for key in sorted(genre_tags_dict.keys())] - - for tag in next_tag(sorted_tags): - self.opts.log(u' %s' % tag) + if False: + sorted_tags = ['%s => %s' % (key, genre_tags_dict[key]) for key in sorted(genre_tags_dict.keys())] + for tag in next_tag(sorted_tags): + self.opts.log(u' %s' % tag) + else: + sorted_tags = ['%s' % (key) for key in sorted(genre_tags_dict.keys())] + out_str = '' + line_break = 70 + for tag in next_tag(sorted_tags): + out_str += tag + if len(out_str) >= line_break: + self.opts.log.info(' %s' % out_str) + out_str = '' + self.opts.log.info(' %s' % out_str) return genre_tags_dict @@ -2474,19 +2624,15 @@ class EPUB_MOBI(CatalogPlugin): body.insert(btc,aTag) btc += 1 - # Find the first instance of friendly_tag matching genre - for friendly_tag in self.genre_tags_dict: - if self.genre_tags_dict[friendly_tag] == genre: - break - titleTag = body.find(attrs={'class':'title'}) - titleTag.insert(0,NavigableString('%s' % escape(friendly_tag))) + titleTag.insert(0,NavigableString('%s' % escape(self.getFriendlyGenreTag(genre)))) # Insert the books by author list divTag = body.find(attrs={'class':'authors'}) dtc = 0 current_author = '' + current_series = None for book in books: if book['author'] != current_author: # Start a new author with link @@ -2502,6 +2648,19 @@ class EPUB_MOBI(CatalogPlugin): divTag.insert(dtc,pAuthorTag) dtc += 1 + # Check for series + if book['series'] and book['series'] != current_series: + # Start a new series + current_series = book['series'] + pSeriesTag = Tag(soup,'p') + pSeriesTag['class'] = "series" + pSeriesTag.insert(0,NavigableString(self.NOT_READ_SYMBOL + book['series'])) + divTag.insert(dtc,pSeriesTag) + dtc += 1 + + if current_series and not book['series']: + current_series = None + # Add books pBookTag = Tag(soup, "p") ptc = 0 @@ -2518,7 +2677,11 @@ 
class EPUB_MOBI(CatalogPlugin): # Add the book title aTag = Tag(soup, "a") aTag['href'] = "book_%d.html" % (int(float(book['id']))) - aTag.insert(0,escape(book['title'])) + # Use series, series index if avail else just title + if current_series: + aTag.insert(0,escape(book['title'][len(book['series'])+1:])) + else: + aTag.insert(0,escape(book['title'])) pBookTag.insert(ptc, aTag) ptc += 1 @@ -2553,6 +2716,7 @@ class EPUB_MOBI(CatalogPlugin):

{0}

+

 

@@ -2678,6 +2842,17 @@ class EPUB_MOBI(CatalogPlugin): draw.text((left, top), text, fill=(0,0,0), font=font) img.save(open(out_path, 'wb'), 'GIF') + def generateSeriesTitle(self, title): + if float(title['series_index']) - int(title['series_index']): + series_title = '%s %4.2f: %s' % (title['series'], + title['series_index'], + title['title']) + else: + series_title = '%s %d: %s' % (title['series'], + title['series_index'], + title['title']) + return series_title + def generateShortDescription(self, description): # Truncate the description to description_clip, on word boundaries if necessary if not description: @@ -2775,33 +2950,115 @@ class EPUB_MOBI(CatalogPlugin): else: return char + def getFriendlyGenreTag(self, genre): + # Find the first instance of friendly_tag matching genre + for friendly_tag in self.genre_tags_dict: + if self.genre_tags_dict[friendly_tag] == genre: + return friendly_tag + def markdownComments(self, comments): - ''' Convert random comment text to normalized, xml-legal block of

<p>s'''
-        # reformat illegal xml
-        desc = prepare_string_for_xml(comments)
+        '''
+        Convert random comment text to normalized, xml-legal block of <p>s
+        'plain text' returns as
+        <p>plain text</p>
-        # normalize <br> tags
-        desc = re.sub(r'<br[/]{0,1}>', '<br/>', desc)
+        'plain text with minimal markup' returns as
+        <p>plain text with minimal markup</p>
-        # tokenize double line breaks
-        desc = comments.replace('\r', '')
-        tokens = comments.split('\n\n')
+        '<p>pre-formatted text</p> returns untouched
-        soup = BeautifulSoup()
-        ptc = 0
-        for token in tokens:
-            pTag = Tag(soup, 'p')
-            pTag.insert(0,token)
-            soup.insert(ptc, pTag)
-            ptc += 1
-        return soup.renderContents(encoding=None)
+        'A line of text\n\nFollowed by a line of text' returns as
+        <p>A line of text</p>
+        <p>Followed by a line of text</p>
+
+        'A line of text.\nA second line of text.\rA third line of text' returns as
+        <p>A line of text.<br />A second line of text.<br />A third line of text.</p>
+
+        '...end of a paragraph.Somehow the break was lost...' returns as
+        <p>...end of a paragraph.</p>
+        <p>Somehow the break was lost...</p>
+
+        Deprecated HTML returns as HTML via BeautifulSoup()
+
+        '''
+
+        # Explode lost CRs to \n\n
+        # Hackish - ignoring sentences ending or beginning in numbers to avoid
+        # confusion with decimal points.
+        for lost_cr in re.finditer('([a-z])([\.\?!])([A-Z])',comments):
+            comments = comments.replace(lost_cr.group(),
+                                        '%s%s\n\n%s' % (lost_cr.group(1),
+                                                        lost_cr.group(2),
+                                                        lost_cr.group(3)))
+
+        # Convert \n\n to <p>s
+        if re.search('\n\n', comments):
+            soup = BeautifulSoup()
+            split_ps = comments.split('\n\n')
+            tsc = 0
+            for p in split_ps:
+                pTag = Tag(soup,'p')
+                pTag.insert(0,p)
+                soup.insert(tsc,pTag)
+                tsc += 1
+            comments = soup.renderContents()
+
+        # Convert solo returns to <br />
+        comments = re.sub('[\r\n]','<br />
', comments) + + soup = BeautifulSoup(comments) + + result = BeautifulSoup() + rtc = 0 + open_pTag = False + + all_tokens = list(soup.contents) + for token in all_tokens: + if type(token) is NavigableString: + if not open_pTag: + pTag = Tag(result,'p') + open_pTag = True + ptc = 0 + pTag.insert(ptc,prepare_string_for_xml(token)) + ptc += 1 + + elif token.name in ['br','b','i']: + if not open_pTag: + pTag = Tag(result,'p') + open_pTag = True + ptc = 0 + pTag.insert(ptc, token) + ptc += 1 + + else: + if open_pTag: + result.insert(rtc, pTag) + rtc += 1 + open_pTag = False + ptc = 0 + # Clean up NavigableStrings for xml + sub_tokens = list(token.contents) + for sub_token in sub_tokens: + if type(sub_token) is NavigableString: + sub_token.replaceWith(prepare_string_for_xml(sub_token)) + result.insert(rtc, token) + rtc += 1 + + if open_pTag: + result.insert(rtc, pTag) + + paras = result.findAll('p') + for p in paras: + p['class'] = 'description' + + return result.renderContents(encoding=None) def processSpecialTags(self, tags, this_title, opts): tag_list = [] for tag in tags: tag = self.convertHTMLEntities(tag) if tag.startswith(opts.note_tag): - this_title['notes'] = tag[1:] + this_title['notes'] = tag[len(self.opts.note_tag):] elif tag == opts.read_tag: this_title['read'] = True elif re.search(opts.exclude_genre, tag): @@ -2847,6 +3104,8 @@ class EPUB_MOBI(CatalogPlugin): opts.basename = "Catalog" opts.plugin_path = self.plugin_path opts.cli_environment = not hasattr(opts,'sync') + # GwR *** hardwired to sort by author, could be an option if passed in opts + opts.sort_descriptions_by_author = True if opts.verbose: opts_dict = vars(opts) @@ -2855,15 +3114,30 @@ class EPUB_MOBI(CatalogPlugin): 'CLI' if opts.cli_environment else 'GUI')) if opts_dict['ids']: log(" Book count: %d" % len(opts_dict['ids'])) + + sections_list = ['Descriptions','Authors'] + if opts.generate_titles: + sections_list.append('Titles') + if opts.generate_recently_added: + sections_list.append('Recently Added') + if not opts.exclude_genre.strip() == '.': + sections_list.append('Genres') + log(u"Creating Sections for %s" % ', '.join(sections_list)) + + # If exclude_genre is blank, assume user wants all genre tags included + if opts.exclude_genre.strip() == '': + opts.exclude_genre = '\[^.\]' + log(" converting empty exclude_genre to '\[^.\]'") + # Display opts keys = opts_dict.keys() keys.sort() log(" opts:") for key in keys: - if key in ['catalog_title','exclude_genre','exclude_tags','generate_titles', - 'generate_recently_added','note_tag','numbers_as_text','read_tag', - 'search_text','sort_by','sync']: + if key in ['catalog_title','exclude_genre','exclude_tags', + 'note_tag','numbers_as_text','read_tag', + 'search_text','sort_by','sort_descriptions_by_author','sync']: log(" %s: %s" % (key, opts_dict[key])) # Launch the Catalog builder diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 9bdd9aaa6b..a3c5bd32c4 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -62,7 +62,7 @@ How do I convert my file containing non-English characters, or smart quotes? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are two aspects to this problem: 1. Knowing the encoding of the source file: |app| tries to guess what character encoding your source files use, but often, this is impossible, so you need to tell it what encoding to use. This can be done in the GUI via the :guilabel:`Input character encoding` field in the :guilabel:`Look & Feel` section. 
The command-line tools all have an :option:`--input-encoding` option. - 2. When adding HTML files to |app|, you may need to tell |app| what encoding the files are in. To do this go to Preferences->Plugins->File Type plugins and customize the HTML2Zip plugin, telling it what encoding your HTML files are in. Now when you add HTML files to |app| they will be correctly processed. HTML files from different sources often have different encodings, so you may have to change this setting repeatedly. A common encoding for many files from the web is ``cp1252`` and I would suggest you try that first. + 2. When adding HTML files to |app|, you may need to tell |app| what encoding the files are in. To do this go to Preferences->Plugins->File Type plugins and customize the HTML2Zip plugin, telling it what encoding your HTML files are in. Now when you add HTML files to |app| they will be correctly processed. HTML files from different sources often have different encodings, so you may have to change this setting repeatedly. A common encoding for many files from the web is ``cp1252`` and I would suggest you try that first. Note that when converting HTML files, leave the input encoding setting mentioned above blank. This is because the HTML2ZIP plugin automatically converts the HTML files to a standard encoding (utf-8). 3. Embedding fonts: If you are generating an LRF file to read on your SONY Reader, you are limited by the fact that the Reader only supports a few non-English characters in the fonts it comes pre-loaded with. You can work around this problem by embedding a unicode-aware font that supports the character set your file uses into the LRF file. You should embed atleast a serif and a sans-serif font. Be aware that embedding fonts significantly slows down page-turn speed on the reader. diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index de8eaf6ac5..540f7cd93a 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -615,10 +615,12 @@ class BasicNewsRecipe(Recipe): del o['onload'] for script in list(soup.findAll('noscript')): - script.extract() + script.extract() for attr in self.remove_attributes: for x in soup.findAll(attrs={attr:True}): del x[attr] + for base in list(soup.findAll('base')): + base.extract() return self.postprocess_html(soup, first_fetch)
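Context for the new .recipe files in this change: they all follow the same BasicNewsRecipe pattern from calibre.web.feeds.news. Below is a minimal sketch of that pattern; the class name, feed title, and URL are placeholders, and only attributes actually used by the recipes above are shown.

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    # Placeholder metadata; real recipes also set language, description, etc.
    title                 = 'Example Feed'
    oldest_article        = 7      # how many days of articles to fetch
    max_articles_per_feed = 100
    no_stylesheets        = True   # drop the site's CSS; style with extra_css instead
    use_embedded_content  = False  # fetch full articles rather than the RSS body

    # Keep only the article container, dropping navigation and ads
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]

    # (feed title, feed URL) pairs; both values here are placeholders
    feeds = [(u'News', u'http://example.com/rss.xml')]

    def print_version(self, url):
        # Several recipes above rewrite article URLs to printer-friendly pages
        return url.replace('article', 'print')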