From 5804d188d9267e968582d3e0e0482368dd77a41f Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 11 Sep 2011 08:32:54 -0400 Subject: [PATCH 01/45] Fix for bug #846183: unhandled exception converting to PDF. --- src/calibre/ebooks/pdf/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index ebe6533419..ac3708ff47 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -202,7 +202,7 @@ class PDFWriter(QObject): # {{{ inputPDF = PdfFileReader(item_stream) for page in inputPDF.pages: outPDF.addPage(page) - outPDF.write(self.out_stream) + outPDF.write(self.out_stream) finally: self._delete_tmpdir() self.loop.exit(0) From f9a358f77041052253ef643cafe82f4a60f12b09 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 11 Sep 2011 08:50:41 -0400 Subject: [PATCH 02/45] Fix PDF output on OSX: Force the use of OSX's internal PDF engine instead of using Qt's. --- src/calibre/ebooks/pdf/writer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index ac3708ff47..fe095ad441 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -11,6 +11,7 @@ Write content to PDF. import os import shutil +from calibre import isosx from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ebooks.pdf.pageoptions import unit, paper_size, \ orientation @@ -164,6 +165,8 @@ class PDFWriter(QObject): # {{{ self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) printer = get_pdf_printer(self.opts) printer.setOutputFileName(item_path) + if isosx: + printer.setOutputFormat(QPrinter.NativeFormat) self.view.print_(printer) printer.abort() self._render_book() From be691bf8fcfb6eb5a5c3d08d00c8076a1a1eb412 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 11 Sep 2011 11:12:21 -0400 Subject: [PATCH 03/45] Add more places to set the PDF engine for OS X. --- src/calibre/ebooks/pdf/writer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index fe095ad441..05d874c9c3 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -165,6 +165,10 @@ class PDFWriter(QObject): # {{{ self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) printer = get_pdf_printer(self.opts) printer.setOutputFileName(item_path) + # We have to set the engine to Native on OS X after the call to set + # filename. Setting a filename with .pdf as the extension causes + # Qt to set the format to use Qt's PDF engine even if native was + # previously set on the printer. 
if isosx: printer.setOutputFormat(QPrinter.NativeFormat) self.view.print_(printer) @@ -182,6 +186,8 @@ class PDFWriter(QObject): # {{{ item_path = os.path.join(self.tmp_path, 'cover.pdf') printer = get_pdf_printer(self.opts) printer.setOutputFileName(item_path) + if isosx: + printer.setOutputFormat(QPrinter.NativeFormat) self.combine_queue.insert(0, item_path) p = QPixmap() p.loadFromData(self.cover_data) @@ -232,6 +238,8 @@ class ImagePDFWriter(object): def render_images(self, outpath, mi, items): printer = get_pdf_printer(self.opts, for_comic=True) printer.setOutputFileName(outpath) + if isosx: + printer.setOutputFormat(QPrinter.NativeFormat) printer.setDocName(mi.title) printer.setCreator(u'%s [%s]'%(__appname__, __version__)) # Seems to be no way to set author From bd58a50675d9382a47e7e88a1bd1cb8ac7cc9bfc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 09:29:40 -0600 Subject: [PATCH 04/45] Improved Guardian/Observer --- recipes/guardian.recipe | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe index 124820d0a1..05d6616ace 100644 --- a/recipes/guardian.recipe +++ b/recipes/guardian.recipe @@ -15,8 +15,10 @@ class Guardian(BasicNewsRecipe): title = u'The Guardian and The Observer' if date.today().weekday() == 6: base_url = "http://www.guardian.co.uk/theobserver" + cover_pic = 'Observer digital edition' else: base_url = "http://www.guardian.co.uk/theguardian" + cover_pic = 'Guardian digital edition' __author__ = 'Seabound and Sujata Raman' language = 'en_GB' @@ -79,7 +81,7 @@ class Guardian(BasicNewsRecipe): # soup = self.index_to_soup("http://www.guardian.co.uk/theobserver") soup = self.index_to_soup(self.base_url) # find cover pic - img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'}) + img = soup.find( 'img',attrs ={'alt':self.cover_pic}) if img is not None: self.cover_url = img['src'] # end find cover pic From f3d9c59cfb8bccce17a7b19d9d981731eca27295 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 09:44:41 -0600 Subject: [PATCH 05/45] Fix Business Week --- recipes/business_week.recipe | 168 +++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 78 deletions(-) diff --git a/recipes/business_week.recipe b/recipes/business_week.recipe index fcb28d1d3e..ca9078a112 100644 --- a/recipes/business_week.recipe +++ b/recipes/business_week.recipe @@ -1,93 +1,105 @@ -#!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - +__copyright__ = '2008 Kovid Goyal kovid@kovidgoyal.net, 2010 Darko Miletic ' ''' -businessweek.com +www.businessweek.com ''' from calibre.web.feeds.news import BasicNewsRecipe class BusinessWeek(BasicNewsRecipe): - title = 'Business Week' - description = 'Business News, Stock Market and Financial Advice' - __author__ = 'ChuckEggDotCom and Sujata Raman' - language = 'en' + title = 'Business Week' + __author__ = 'Kovid Goyal and Darko Miletic' + description = 'Read the latest international business news & stock market news. Get updated company profiles, financial advice, global economy and technology news.' + publisher = 'Bloomberg L.P.' 
+ category = 'Business, business news, stock market, stock market news, financial advice, company profiles, financial advice, global economy, technology news' + oldest_article = 7 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + publication_type = 'magazine' + cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg' + masthead_url = 'http://assets.businessweek.com/images/bw-logo.png' + extra_css = """ + body{font-family: Helvetica,Arial,sans-serif } + img{margin-bottom: 0.4em; display:block} + .tagline{color: gray; font-style: italic} + .photoCredit{font-size: small; color: gray} + """ - oldest_article = 7 - max_articles_per_feed = 10 - no_stylesheets = True + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } - recursions = 1 - match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*'] - extra_css = ''' - h1{font-family :Arial,Helvetica,sans-serif; font-size:large;} - .news_story_title{font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;} - h2{font-family :Arial,Helvetica,sans-serif; font-size:medium;color:#666666;} - h3{text-transform:uppercase;font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;} - h4{font-family :Arial,Helvetica,sans-serif; font-size:small;font-weight:bold;} - p{font-family :Arial,Helvetica,sans-serif; } - #lede600{font-size:x-small;} - #storybody{font-size:x-small;} - p{font-family :Arial,Helvetica,sans-serif;} - .strap{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#064599;} - .byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .postedBy{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .trackback{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .date{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .wrapper{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .photoCredit{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .tagline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .pageCount{color:#666666;font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .note{font-family :Arial,Helvetica,sans-serif; font-size:small;color:#666666;font-style:italic;} - .highlight{font-family :Arial,Helvetica,sans-serif; font-size:small;background-color:#FFF200;} - .annotation{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - ''' - - remove_tags = [ dict(name='div', attrs={'id':["log","feedback","footer","secondarynav","secondnavbar","header","email","bw2-header","column2","wrapper-bw2-footer","wrapper-mgh-footer","inset","commentForm","commentDisplay","bwExtras","bw2-umbrella","readerComments","leg","rightcol"]}), - dict(name='div', attrs={'class':["menu",'sponsorbox smallertext',"TopNavTile","graybottom leaderboard"]}), - dict(name='img', alt ="News"), - dict(name='td', width ="1"), - ] + remove_tags = [ + dict(attrs={'class':'inStory'}) + ,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td']) + ,dict(attrs={'id':['inset','videoDisplay']}) + ] + keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody','article_body','articleBody']})] + remove_attributes = ['lang'] + match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*'] - feeds = [ - (u'Top Stories', 
u'http://www.businessweek.com/topStories/rss/topStories.rss'), - (u'Top News', u'http://www.businessweek.com/rss/bwdaily.rss'), - (u'Asia', u'http://www.businessweek.com/rss/asia.rss'), - (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'), - (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'), - (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'), - (u'Europe', u'http://www.businessweek.com/rss/europe.rss'), - (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'), - (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'), - (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'), - (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'), - (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'), - (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'), - (u'Technology', u'http://www.businessweek.com/rss/technology.rss'), - (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'), - (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'), - (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'), - (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'), - (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'), - (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'), - ] + + feeds = [ + (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'), + (u'Top News' , u'http://www.businessweek.com/rss/bwdaily.rss' ), + (u'Asia', u'http://www.businessweek.com/rss/asia.rss'), + (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'), + (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'), + (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'), + (u'Europe', u'http://www.businessweek.com/rss/europe.rss'), + (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'), + (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'), + (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'), + (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'), + (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'), + (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'), + (u'Technology', u'http://www.businessweek.com/rss/technology.rss'), + (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'), + (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'), + (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'), + (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'), + (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'), + (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'), + ] def get_article_url(self, article): - url = article.get('guid', None) + if 'podcasts' in url: + return None + if 'surveys' in url: + return None + if 'images' in url: + return None + if 'feedroom' in url: + return None + if '/magazine/toc/' in url: + return None + rurl, sep, rest = url.rpartition('?') + if rurl: + return rurl + return rest - if 'podcasts' in url or 'surveys' in url: - url = None - - return url - - def postprocess_html(self, soup, first): - - for tag in soup.findAll(name=['ul','li','table','td','tr','span']): - tag.name = 'div' - for tag in soup.findAll(name= 'div',attrs={ 'id':'pageNav'}): - tag.extract() - return soup + def print_version(self, url): + if '/news/' in url or '/blog/ in url': + return url + if '/magazine' in url: + rurl 
= url.replace('http://www.businessweek.com/','http://www.businessweek.com/printer/') + else: + rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/') + return rurl.replace('/investing/','/investor/') + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup From 9d199ec40af48b4603fde4572921734142eebe78 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 10:18:10 -0600 Subject: [PATCH 06/45] Dummy commit to record that PDF output regression in 0.8.18 was fixed. Fixes #846183 (unhandled exception converting to PDF) From 1411e439200c859764abac541ea676debe544503 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 10:19:25 -0600 Subject: [PATCH 07/45] ... --- recipes/business_week.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/business_week.recipe b/recipes/business_week.recipe index ca9078a112..fe98d9fa00 100644 --- a/recipes/business_week.recipe +++ b/recipes/business_week.recipe @@ -87,7 +87,7 @@ class BusinessWeek(BasicNewsRecipe): return rest def print_version(self, url): - if '/news/' in url or '/blog/ in url': + if '/news/' in url or '/blog/' in url: return url if '/magazine' in url: rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/printer/') From 2cd448687d263a9d926ad2b910b5ff16c1f419d8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 10:51:37 -0600 Subject: [PATCH 08/45] author_to_author_sort(): handle multiple suffixes --- resources/default_tweaks.py | 2 +- src/calibre/ebooks/metadata/__init__.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index ead9995eb3..f12121dd89 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -61,7 +61,7 @@ authors_completer_append_separator = False # selecting 'manage authors', and pressing 'Recalculate all author sort values'. # The author name suffixes are words that are ignored when they occur at the # end of an author name. The case of the suffix is ignored and trailing -# periods are automatically handled. +# periods are automatically handled. The same is true for prefixes. # The author name copy words are a set of words which if they occur in an # author name cause the automatically generated author sort string to be # identical to the author name. This means that the sort for a string like Acme diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index c3a229fe3c..07fae187ba 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -65,20 +65,27 @@ def author_to_author_sort(author, method=None): suffixes = set([x.lower() for x in tweaks['author_name_suffixes']]) suffixes |= set([x+u'.' 
for x in suffixes]) - last = tokens[-1].lower() - suffix = None - if last in suffixes: - suffix = tokens[-1] - tokens = tokens[:-1] + suffix = u'' + while True: + if not tokens: + return author + last = tokens[-1].lower() + if last in suffixes: + suffix = tokens[-1] + ' ' + suffix + tokens = tokens[:-1] + else: + break + suffix = suffix.strip() if method == u'comma' and u',' in u''.join(tokens): return author atokens = tokens[-1:] + tokens[:-1] + num_toks = len(atokens) if suffix: atokens.append(suffix) - if method != u'nocomma' and len(atokens) > 1: + if method != u'nocomma' and num_toks > 1: atokens[0] += u',' return u' '.join(atokens) From 400c68e20fc35426717d3dc9fca0808ea176522e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 14:00:51 -0600 Subject: [PATCH 09/45] Hindustan Times by Krittika Goyal --- recipes/hindustan_times.recipe | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 recipes/hindustan_times.recipe diff --git a/recipes/hindustan_times.recipe b/recipes/hindustan_times.recipe new file mode 100644 index 0000000000..f228757c70 --- /dev/null +++ b/recipes/hindustan_times.recipe @@ -0,0 +1,29 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class HindustanTimes(BasicNewsRecipe): + title = u'Hindustan Times' + language = 'en_IN' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + max_articles_per_feed = 25 + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True + + feeds = [ + ('News', + 'http://feeds.hindustantimes.com/HT-NewsSectionPage-Topstories'), + ('Views', + 'http://feeds.hindustantimes.com/HT-ViewsSectionpage-Topstories'), + ('Cricket', + 'http://feeds.hindustantimes.com/HT-Cricket-TopStories'), + ('Business', + 'http://feeds.hindustantimes.com/HT-BusinessSectionpage-TopStories'), + ('Entertainment', + 'http://feeds.hindustantimes.com/HT-HomePage-Entertainment'), + ('Lifestyle', + 'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'), +] + + From 8a7100b3386056c352926ba79236a4abd93452a9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 15:15:16 -0600 Subject: [PATCH 10/45] ... 
--- src/calibre/library/server/mobile.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/calibre/library/server/mobile.py b/src/calibre/library/server/mobile.py index 3ce96a2b49..0cb7a86126 100644 --- a/src/calibre/library/server/mobile.py +++ b/src/calibre/library/server/mobile.py @@ -277,12 +277,15 @@ class MobileServer(object): cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8' cherrypy.response.headers['Last-Modified'] = self.last_modified(updated) - url_base = "/mobile?search=" + search+";order="+order+";sort="+sort+";num="+str(num) - return html.tostring(build_index(books, num, search, sort, order, + raw = html.tostring(build_index(books, num, search, sort, order, start, len(ids), url_base, CKEYS, self.opts.url_prefix), - encoding='utf-8', include_meta_content_type=True, + encoding='utf-8', pretty_print=True) + # tostring's include_meta_content_type is broken + raw = raw.replace('', '\n' + '') + return raw From d76c312c89c89a7990d247e0a9e81f9aa571b7c8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 15:49:03 -0600 Subject: [PATCH 11/45] Fix Inquirer.net --- recipes/inquirer_net.recipe | 45 +++++++++---------------------------- 1 file changed, 11 insertions(+), 34 deletions(-) diff --git a/recipes/inquirer_net.recipe b/recipes/inquirer_net.recipe index 3a3d5b9e89..30f2519f8b 100644 --- a/recipes/inquirer_net.recipe +++ b/recipes/inquirer_net.recipe @@ -7,56 +7,33 @@ www.inquirer.net ''' from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class InquirerNet(BasicNewsRecipe): title = 'Inquirer.net' - __author__ = 'Darko Miletic' + __author__ = 'Krittika Goyal' description = 'News from Philipines' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - encoding = 'cp1252' + encoding = 'utf8' publisher = 'inquirer.net' category = 'news, politics, philipines' lang = 'en' language = 'en' - extra_css = ' .fontheadline{font-size: x-large} .fontsubheadline{font-size: large} .fontkick{font-size: medium}' + use_embedded_content = False - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' - - remove_tags = [dict(name=['object','link','script','iframe','form'])] + no_stylesheets = True + auto_cleanup = True feeds = [ - (u'Breaking news', u'http://services.inquirer.net/rss/breakingnews.xml' ) - ,(u'Top stories' , u'http://services.inquirer.net/rss/topstories.xml' ) - ,(u'Sports' , u'http://services.inquirer.net/rss/brk_breakingnews.xml' ) - ,(u'InfoTech' , u'http://services.inquirer.net/rss/infotech_tech.xml' ) - ,(u'InfoTech' , u'http://services.inquirer.net/rss/infotech_tech.xml' ) - ,(u'Business' , u'http://services.inquirer.net/rss/inq7money_breaking_news.xml' ) - ,(u'Editorial' , u'http://services.inquirer.net/rss/opinion_editorial.xml' ) - ,(u'Global Nation', u'http://services.inquirer.net/rss/globalnation_breakingnews.xml') + (u'Inquirer', u'http://www.inquirer.net/fullfeed') ] - def preprocess_html(self, soup): - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) - soup.head.insert(0,mlang) - soup.head.insert(1,mcharset) - for item in soup.findAll(style=True): - del item['style'] - return soup 
+ def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.set_handle_gzip(True) + return br + - def print_version(self, url): - rest, sep, art = url.rpartition('/view/') - art_id, sp, rrest = art.partition('/') - return 'http://services.inquirer.net/print/print.php?article_id=' + art_id From 7a3babf49eb10d4ca964212c395cda5f32e6673b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 15:53:10 -0600 Subject: [PATCH 12/45] India Today by Krittika Goyal --- recipes/india_today.recipe | 83 ++++----------------- src/calibre/web/feeds/recipes/collection.py | 2 +- 2 files changed, 17 insertions(+), 68 deletions(-) diff --git a/recipes/india_today.recipe b/recipes/india_today.recipe index 604a7f57ad..7b53fe3d65 100644 --- a/recipes/india_today.recipe +++ b/recipes/india_today.recipe @@ -1,76 +1,25 @@ + from calibre.web.feeds.news import BasicNewsRecipe class IndiaToday(BasicNewsRecipe): - - title = 'India Today' - __author__ = 'Kovid Goyal' - language = 'en_IN' - timefmt = ' [%d %m, %Y]' - - oldest_article = 700 - max_articles_per_feed = 10 + title = u'India Today' + language = 'en_IN' + __author__ = 'Krittika Goyal' + oldest_article = 15 #days + max_articles_per_feed = 25 no_stylesheets = True + auto_cleanup = True - remove_tags_before = dict(id='content_story_title') - remove_tags_after = dict(id='rightblockdiv') - remove_tags = [dict(id=['rightblockdiv', 'share_links'])] - - extra_css = '#content_story_title { font-size: 170%; font-weight: bold;}' - conversion_options = { 'linearize_tables': True } - - def it_get_index(self): - soup = self.index_to_soup('http://indiatoday.intoday.in/site/archive') - a = soup.find('a', href=lambda x: x and 'issueId=' in x) - url = 'http://indiatoday.intoday.in/site/'+a.get('href') - img = a.find('img') - self.cover_url = img.get('src') - return self.index_to_soup(url) - - def parse_index(self): - soup = self.it_get_index() - feeds, current_section, current_articles = [], None, [] - for x in soup.findAll(name=['h1', 'a']): - if x.name == 'h1': - if current_section and current_articles: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - elif x.name == 'a' and 'Story' in x.get('href', ''): - title = self.tag_to_string(x) - url = x.get('href') - url = url.replace(' ', '%20') - if not url.startswith('/'): - url = 'http://indiatoday.intoday.in/site/' + url - if title and url: - url += '?complete=1' - self.log('\tFound article:', title) - self.log('\t\t', url) - desc = '' - h3 = x.parent.findNextSibling('h3') - if h3 is not None: - desc = 'By ' + self.tag_to_string(h3) - h4 = h3.findNextSibling('h4') - if h4 is not None: - desc = self.tag_to_string(h4) + ' ' + desc - if desc: - self.log('\t\t', desc) - current_articles.append({'title':title, 'description':desc, - 'url':url, 'date':''}) - - if current_section and current_articles: - feeds.append((current_section, current_articles)) - - return feeds - - def postprocess_html(self, soup, first): - a = soup.find(text='Print') - if a is not None: - tr = a.findParent('tr') - if tr is not None: - tr.extract() - return soup + feeds = [ +('Latest News', 'http://indiatoday.intoday.in/rss/article.jsp?sid=4'), +('Cover Story', 'http://indiatoday.intoday.in/rss/article.jsp?sid=30'), +('Nation', 'http://indiatoday.intoday.in/rss/article.jsp?sid=36'), +('States', 'http://indiatoday.intoday.in/rss/article.jsp?sid=21'), +('Economy', 'http://indiatoday.intoday.in/rss/article.jsp?sid=34'), 
+('World', 'http://indiatoday.intoday.in/rss/article.jsp?sid=61'), +('Sport', 'http://indiatoday.intoday.in/rss/article.jsp?sid=41'), +] diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py index 13bae3a554..6b9c3a2129 100644 --- a/src/calibre/web/feeds/recipes/collection.py +++ b/src/calibre/web/feeds/recipes/collection.py @@ -22,7 +22,7 @@ E = ElementMaker(namespace=NS, nsmap={None:NS}) def iterate_over_builtin_recipe_files(): exclude = ['craigslist', 'iht', 'toronto_sun', - 'india_today', 'livemint'] + 'livemint'] d = os.path.dirname base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'recipes') for f in os.listdir(base): From bd5ddfc7ed499b0a9278cf5d1b55ee258d5d0db2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 15:56:00 -0600 Subject: [PATCH 13/45] CIO Magazine by Julio Map --- recipes/cio_magazine.recipe | 128 ++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 recipes/cio_magazine.recipe diff --git a/recipes/cio_magazine.recipe b/recipes/cio_magazine.recipe new file mode 100644 index 0000000000..084a45ff93 --- /dev/null +++ b/recipes/cio_magazine.recipe @@ -0,0 +1,128 @@ +# Los primeros comentarios son las dificultades que he tenido con el Piton +# Cuando da error UTF8 revisa los comentarios (acentos). En notepad++ Search, Goto, posicion y lo ves. +# Editar con Notepad++ Si pone - donde no debe es que ha indentado mal... Edit - Blank operations - tab to space +# He entendido lo que significa el from... son paths dentro de pylib.zip... +# Con from importa solo un simbolo...con import,la libreria completa +from calibre.web.feeds.news import BasicNewsRecipe +# sys no hace falta... lo intente usar para escribir en stderr +from calibre import strftime +# Para convertir el tiempo del articulo +import string, re +# Para usar expresiones regulares +# Visto en pylib.zip... la primera letra es mayuscula +# Estas dos ultimas han sido un vago intento de establecer una cookie (no usado) + +class CIO_Magazine(BasicNewsRecipe): + title = 'CIO Magazine' + oldest_article = 14 + max_articles_per_feed = 100 + auto_cleanup = True + __author__ = 'Julio Map' + description = 'CIO is the leading information brand for today-s busy Chief information Officer - CIO Magazine bi-monthly ' + language = 'en' + encoding = 'utf8' + cover_url = 'http://www.cio.com/homepage/images/hp-cio-logo-linkedin.png' + + remove_tags_before = dict(name='div', attrs={'id':'container'}) +# Absolutamente innecesario... al final he visto un print_version (ver mas adelante) + +# Dentro de una revista dada... +# issue_details contiene el titulo y las secciones de este ejemplar +# DetailModule esta dentro de issue_details contiene las urls y resumenes +# Dentro de un articulo dado... +# Article-default-body contiene el texto. Pero como digo, he encontrado una print_version + + no_stylesheets = True + remove_javascript = True + + def print_version(self,url): + # A esta funcion le llama el sistema... 
no hay que llamarla uno mismo (porque seria llamada dos veces) + # Existe una version imprimible de los articulos cambiando + # http://www.cio.com/article// por + # http://www.cio.com/article/print/ que contiene todas las paginas dentro del div id=container + if url.startswith('/'): + url = 'http://www.cio.com'+url + segments = url.split('/') + printURL = '/'.join(segments[0:4]) + '/print/' + segments[4] +'#' + return printURL + + + def parse_index(self): + ########################################################################### + # This method should be implemented in recipes that parse a website + # instead of feeds to generate a list of articles. Typical uses are for + # news sources that have a Print Edition webpage that lists all the + # articles in the current print edition. If this function is implemented, + # it will be used in preference to BasicNewsRecipe.parse_feeds(). + # + # It must return a list. Each element of the list must be a 2-element + # tuple of the form ('feed title', list of articles). + # + # Each list of articles must contain dictionaries of the form: + # + # { + # 'title' : article title, + # 'url' : URL of print version, + # 'date' : The publication date of the article as a string, + # 'description' : A summary of the article + # 'content' : The full article (can be an empty string). This is used by FullContentProfile + # } + # + # For an example, see the recipe for downloading The Atlantic. + # In addition, you can add 'author' for the author of the article. + ############################################################################### + + # Primero buscamos cual es la ultima revista que se ha creado + soupinicial = self.index_to_soup('http://www.cio.com/magazine') + # Es el primer enlace que hay en el DIV con class content_body + a= soupinicial.find(True, attrs={'class':'content_body'}).find('a', href=True) + INDEX = re.sub(r'\?.*', '', a['href']) + # Como cio.com usa enlaces relativos, le anteponemos el domain name. + if INDEX.startswith('/'): # protegiendonos de que dejen de usarlos + INDEX = 'http://www.cio.com'+INDEX + # Y nos aseguramos en los logs que lo estamos haciendo bien + print ("INDEX en parse_index: ", INDEX) + + # Ya sabemos cual es la revista... procesemosla. + soup = self.index_to_soup(INDEX) + + articles = {} + key = None + feeds = [] + # Para empezar nos quedamos solo con dos DIV, 'heading' y ' issue_item' + # Del primero sacamos las categorias (key) y del segundo las urls y resumenes + for div in soup.findAll(True, + attrs={'class':['heading', 'issue_item']}): + + if div['class'] == 'heading': + key = string.capwords(self.tag_to_string(div.span)) + print ("Key: ",key) # Esto es para depurar + articles[key] = [] + feeds.append(key) + + elif div['class'] == 'issue_item': + a = div.find('a', href=True) + if not a: + continue + url = re.sub(r'\?.*', '', a['href']) + print("url: ",url) # Esto es para depurar + title = self.tag_to_string(a, use_alt=True).strip() # Ya para nota, quitar al final las dos ultimas palabras + pubdate = strftime('%a, %d %b') # No es la fecha de publicacion sino la de colecta + summary = div.find('p') # Dentro de la div 'issue_item' el unico parrafo que hay es el resumen + description = '' # Si hay summary la description sera el summary... 
si no, la dejamos en blanco + + if summary: + description = self.tag_to_string(summary, use_alt=False) + print ("Description = ", description) + + + feed = key if key is not None else 'Uncategorized' # Esto esta copiado del NY times + if not articles.has_key(feed): + articles[feed] = [] + if not 'podcasts' in url: + articles[feed].append( + dict(title=title, url=url, date=pubdate, + description=description, + content='')) + feeds = [(key, articles[key]) for key in feeds if articles.has_key(key)] + return feeds From fe21bf186f74a8b5e50278f2bb1917288b547dca Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 16:12:56 -0600 Subject: [PATCH 14/45] ... --- src/calibre/library/server/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/server/base.py b/src/calibre/library/server/base.py index 26e4d3469e..d18bffc6a2 100644 --- a/src/calibre/library/server/base.py +++ b/src/calibre/library/server/base.py @@ -34,7 +34,7 @@ class DispatchController(object): # {{{ def __init__(self, prefix, wsgi=False): self.dispatcher = cherrypy.dispatch.RoutesDispatcher() self.funcs = [] - self.seen = set([]) + self.seen = set() self.prefix = prefix if prefix else '' if wsgi: self.prefix = '' From 7eadf1c5c14bb18160ef1751037430278f7ada4e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2011 20:29:04 -0600 Subject: [PATCH 15/45] ... --- src/cherrypy/lib/httpauth.py | 6 +++--- src/cherrypy/process/servers.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cherrypy/lib/httpauth.py b/src/cherrypy/lib/httpauth.py index 0b4743d668..f5d87d2b43 100644 --- a/src/cherrypy/lib/httpauth.py +++ b/src/cherrypy/lib/httpauth.py @@ -75,7 +75,7 @@ MD5_SESS = "MD5-sess" AUTH = "auth" AUTH_INT = "auth-int" -SUPPORTED_ALGORITHM = ('md5', MD5, MD5_SESS) +SUPPORTED_ALGORITHM = ('md5', MD5, MD5_SESS) # Changed by Kovid SUPPORTED_QOP = (AUTH, AUTH_INT) ################################################################################ @@ -83,7 +83,7 @@ SUPPORTED_QOP = (AUTH, AUTH_INT) # DIGEST_AUTH_ENCODERS = { MD5: lambda val: md5(val).hexdigest(), - 'md5': lambda val:md5(val).hexdigest(), + 'md5': lambda val:md5(val).hexdigest(), # Added by Kovid MD5_SESS: lambda val: md5(val).hexdigest(), # SHA: lambda val: sha(val).hexdigest(), } @@ -225,7 +225,7 @@ def _A1(params, password): algorithm = params.get ("algorithm", MD5) H = DIGEST_AUTH_ENCODERS[algorithm] - if algorithm in (MD5, 'md5'): + if algorithm in (MD5, 'md5'): # Changed by Kovid # If the "algorithm" directive's value is "MD5" or is # unspecified, then A1 is: # A1 = unq(username-value) ":" unq(realm-value) ":" passwd diff --git a/src/cherrypy/process/servers.py b/src/cherrypy/process/servers.py index 932d28d01f..da469bfad2 100644 --- a/src/cherrypy/process/servers.py +++ b/src/cherrypy/process/servers.py @@ -241,10 +241,10 @@ def wait_for_free_port(host, port): for trial in xrange(50): try: # we are expecting a free port, so reduce the timeout - check_port(host, port, timeout=0.2) + check_port(host, port, timeout=0.2) # Changed by Kovid except IOError: # Give the old server thread time to free the port. - time.sleep(0.2) + time.sleep(0.2) # Changed by Kovid else: return From 1ecfb81a0708b4f7027d5dde6f8b189f8e060933 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 Sep 2011 11:48:53 -0600 Subject: [PATCH 16/45] Keyboard shortcuts: Allow use of symbol keys like >,*,etc. 
Fixes #847378 (Error in shortcut-handler) --- src/calibre/gui2/keyboard.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/keyboard.py b/src/calibre/gui2/keyboard.py index 9b0b1d8f69..362a074304 100644 --- a/src/calibre/gui2/keyboard.py +++ b/src/calibre/gui2/keyboard.py @@ -443,7 +443,13 @@ class Editor(QFrame): # {{{ return QWidget.keyPressEvent(self, ev) button = getattr(self, 'button%d'%which) button.setStyleSheet('QPushButton { font-weight: normal}') - sequence = QKeySequence(code|(int(ev.modifiers())&~Qt.KeypadModifier)) + mods = int(ev.modifiers()) & ~Qt.KeypadModifier + txt = unicode(ev.text()) + if txt and txt.lower() == txt.upper(): + # We have a symbol like ! or > etc. In this case the value of code + # already includes Shift, so remove it + mods &= ~Qt.ShiftModifier + sequence = QKeySequence(code|mods) button.setText(sequence.toString(QKeySequence.NativeText)) self.capture = 0 dup_desc = self.dup_check(sequence) From 2bf6e7bed0600aa139ded457d28a9f9746a8994f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 Sep 2011 14:21:34 -0600 Subject: [PATCH 17/45] New MOBI writer: Change values of dictype and cdetype fields to be the same as for the old writer. Fixes #847766 (8.18 doesn't overwrite previous days newsfeeds for same publications) --- recipes/usatoday.recipe | 1 + src/calibre/ebooks/mobi/writer2/main.py | 26 ++- src/calibre/library/server/base.py | 6 + src/calibre/utils/browser.py | 4 + src/cherrypy/lib/sessions.py | 209 ++++++++++++------------ 5 files changed, 133 insertions(+), 113 deletions(-) diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe index a4899b7187..18aeab2648 100644 --- a/recipes/usatoday.recipe +++ b/recipes/usatoday.recipe @@ -13,6 +13,7 @@ class USAToday(BasicNewsRecipe): title = 'USA Today' __author__ = 'Kovid Goyal' oldest_article = 1 + publication_type = 'newspaper' timefmt = '' max_articles_per_feed = 20 language = 'en' diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 7e748aac95..987d22afd3 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -61,6 +61,13 @@ class MobiWriter(object): def __call__(self, oeb, path_or_stream): self.log = oeb.log + pt = None + if oeb.metadata.publication_type: + x = unicode(oeb.metadata.publication_type[0]).split(':') + if len(x) > 1: + pt = x[1].lower() + self.publication_type = pt + if hasattr(path_or_stream, 'write'): return self.dump_stream(oeb, path_or_stream) with open(path_or_stream, 'w+b') as stream: @@ -351,7 +358,7 @@ class MobiWriter(object): elif self.indexer.is_periodical: # If you change this, remember to change the cdetype in the EXTH # header as well - bt = 0x103 + bt = {'newspaper':0x101}.get(self.publication_type, 0x103) record0.write(pack(b'>IIIII', 0xe8, bt, 65001, uid, 6)) @@ -525,15 +532,16 @@ class MobiWriter(object): nrecs += 1 # Write cdetype - if self.is_periodical: - # If you set the book type header field to 0x101 use NWPR here if - # you use 0x103 use MAGZ - data = b'MAGZ' + if not self.is_periodical: + exth.write(pack(b'>II', 501, 12)) + exth.write(b'EBOK') + nrecs += 1 else: - data = b'EBOK' - exth.write(pack(b'>II', 501, len(data)+8)) - exth.write(data) - nrecs += 1 + # Should be b'NWPR' for doc type of 0x101 and b'MAGZ' for doctype + # of 0x103 but the old writer didn't write them, and I dont know + # what it should be for type 0x102 (b'BLOG'?) 
so write nothing + # instead + pass # Add a publication date entry if oeb.metadata['date']: diff --git a/src/calibre/library/server/base.py b/src/calibre/library/server/base.py index d18bffc6a2..9ffe1915f8 100644 --- a/src/calibre/library/server/base.py +++ b/src/calibre/library/server/base.py @@ -146,6 +146,11 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache, self.config = {} self.is_running = False self.exception = None + self.config['/'] = { + 'tools.sessions.on' : True, + 'tools.sessions.timeout': 60, # Session times out after 60 minutes + } + if not wsgi: self.setup_loggers() cherrypy.engine.bonjour.subscribe() @@ -154,6 +159,7 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache, 'tools.gzip.mime_types': ['text/html', 'text/plain', 'text/xml', 'text/javascript', 'text/css'], } + if opts.password: self.config['/'] = { 'tools.digest_auth.on' : True, diff --git a/src/calibre/utils/browser.py b/src/calibre/utils/browser.py index 6f8703ab49..430ced9fdd 100644 --- a/src/calibre/utils/browser.py +++ b/src/calibre/utils/browser.py @@ -28,6 +28,10 @@ class Browser(B): B.set_cookiejar(self, *args, **kwargs) self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs) + @property + def cookiejar(self): + return self._clone_actions['set_cookiejar'][1][0] + def set_handle_redirect(self, *args, **kwargs): B.set_handle_redirect(self, *args, **kwargs) self._clone_actions['set_handle_redirect'] = ('set_handle_redirect', diff --git a/src/cherrypy/lib/sessions.py b/src/cherrypy/lib/sessions.py index f9b52d4e37..326e72c2b2 100644 --- a/src/cherrypy/lib/sessions.py +++ b/src/cherrypy/lib/sessions.py @@ -33,13 +33,13 @@ missing = object() class Session(object): """A CherryPy dict-like Session object (one per request).""" - + __metaclass__ = cherrypy._AttributeDocstrings - + _id = None id_observers = None id_observers__doc = "A list of callbacks to which to pass new id's." - + id__doc = "The current session ID." def _get_id(self): return self._id @@ -48,33 +48,33 @@ class Session(object): for o in self.id_observers: o(value) id = property(_get_id, _set_id, doc=id__doc) - + timeout = 60 timeout__doc = "Number of minutes after which to delete session data." - + locked = False locked__doc = """ If True, this session instance has exclusive read/write access to session data.""" - + loaded = False loaded__doc = """ If True, data has been retrieved from storage. This should happen automatically on the first attempt to access session data.""" - + clean_thread = None clean_thread__doc = "Class-level Monitor which calls self.clean_up." - + clean_freq = 5 clean_freq__doc = "The poll rate for expired session cleanup in minutes." - + def __init__(self, id=None, **kwargs): self.id_observers = [] self._data = {} - + for k, v in kwargs.iteritems(): setattr(self, k, v) - + if id is None: self.regenerate() else: @@ -84,30 +84,30 @@ class Session(object): # See http://www.cherrypy.org/ticket/709. self.id = None self.regenerate() - + def regenerate(self): """Replace the current session (with a new id).""" if self.id is not None: self.delete() - + old_session_was_locked = self.locked if old_session_was_locked: self.release_lock() - + self.id = None while self.id is None: self.id = self.generate_id() # Assert that the generated id is not already stored. 
if self._exists(): self.id = None - + if old_session_was_locked: self.acquire_lock() - + def clean_up(self): """Clean up expired sessions.""" pass - + try: os.urandom(20) except (AttributeError, NotImplementedError): @@ -119,7 +119,7 @@ class Session(object): def generate_id(self): """Return a new session id.""" return os.urandom(20).encode('hex') - + def save(self): """Save session data.""" try: @@ -129,12 +129,12 @@ class Session(object): t = datetime.timedelta(seconds = self.timeout * 60) expiration_time = datetime.datetime.now() + t self._save(expiration_time) - + finally: if self.locked: # Always release the lock if the user didn't release it self.release_lock() - + def load(self): """Copy stored session data into this session instance.""" data = self._load() @@ -145,7 +145,7 @@ class Session(object): else: self._data = data[0] self.loaded = True - + # Stick the clean_thread in the class, not the instance. # The instances are created and destroyed per-request. cls = self.__class__ @@ -157,23 +157,23 @@ class Session(object): t.subscribe() cls.clean_thread = t t.start() - + def delete(self): """Delete stored session data.""" self._delete() - + def __getitem__(self, key): if not self.loaded: self.load() return self._data[key] - + def __setitem__(self, key, value): if not self.loaded: self.load() self._data[key] = value - + def __delitem__(self, key): if not self.loaded: self.load() del self._data[key] - + def pop(self, key, default=missing): """Remove the specified key and return the corresponding value. If key is not found, default is returned if given, @@ -184,46 +184,46 @@ class Session(object): return self._data.pop(key) else: return self._data.pop(key, default) - + def __contains__(self, key): if not self.loaded: self.load() return key in self._data - + def has_key(self, key): """D.has_key(k) -> True if D has a key k, else False.""" if not self.loaded: self.load() return self._data.has_key(key) - + def get(self, key, default=None): """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" if not self.loaded: self.load() return self._data.get(key, default) - + def update(self, d): """D.update(E) -> None. Update D from E: for k in E: D[k] = E[k].""" if not self.loaded: self.load() self._data.update(d) - + def setdefault(self, key, default=None): """D.setdefault(k[,d]) -> D.get(k,d), also set D[k]=d if k not in D.""" if not self.loaded: self.load() return self._data.setdefault(key, default) - + def clear(self): """D.clear() -> None. Remove all items from D.""" if not self.loaded: self.load() self._data.clear() - + def keys(self): """D.keys() -> list of D's keys.""" if not self.loaded: self.load() return self._data.keys() - + def items(self): """D.items() -> list of D's (key, value) pairs, as 2-tuples.""" if not self.loaded: self.load() return self._data.items() - + def values(self): """D.values() -> list of D's values.""" if not self.loaded: self.load() @@ -231,11 +231,11 @@ class Session(object): class RamSession(Session): - + # Class-level objects. Don't rebind these! 
cache = {} locks = {} - + def clean_up(self): """Clean up expired sessions.""" now = datetime.datetime.now() @@ -249,29 +249,29 @@ class RamSession(Session): del self.locks[id] except KeyError: pass - + def _exists(self): return self.id in self.cache - + def _load(self): return self.cache.get(self.id) - + def _save(self, expiration_time): self.cache[self.id] = (self._data, expiration_time) - + def _delete(self): del self.cache[self.id] - + def acquire_lock(self): """Acquire an exclusive lock on the currently-loaded session data.""" self.locked = True self.locks.setdefault(self.id, threading.RLock()).acquire() - + def release_lock(self): """Release the lock on the currently-loaded session data.""" self.locks[self.id].release() self.locked = False - + def __len__(self): """Return the number of active sessions.""" return len(self.cache) @@ -279,32 +279,32 @@ class RamSession(Session): class FileSession(Session): """Implementation of the File backend for sessions - + storage_path: the folder where session data will be saved. Each session will be saved as pickle.dump(data, expiration_time) in its own file; the filename will be self.SESSION_PREFIX + self.id. """ - + SESSION_PREFIX = 'session-' LOCK_SUFFIX = '.lock' - + def __init__(self, id=None, **kwargs): # The 'storage_path' arg is required for file-based sessions. kwargs['storage_path'] = os.path.abspath(kwargs['storage_path']) Session.__init__(self, id=id, **kwargs) - + def setup(cls, **kwargs): """Set up the storage system for file-based sessions. - + This should only be called once per process; this will be done automatically when using sessions.init (as the built-in Tool does). """ # The 'storage_path' arg is required for file-based sessions. kwargs['storage_path'] = os.path.abspath(kwargs['storage_path']) - + for k, v in kwargs.iteritems(): setattr(cls, k, v) - + # Warn if any lock files exist at startup. lockfiles = [fname for fname in os.listdir(cls.storage_path) if (fname.startswith(cls.SESSION_PREFIX) @@ -316,17 +316,17 @@ class FileSession(Session): "manually delete the lockfiles found at %r." 
% (len(lockfiles), plural, cls.storage_path)) setup = classmethod(setup) - + def _get_file_path(self): f = os.path.join(self.storage_path, self.SESSION_PREFIX + self.id) if not os.path.abspath(f).startswith(self.storage_path): raise cherrypy.HTTPError(400, "Invalid session id in cookie.") return f - + def _exists(self): path = self._get_file_path() return os.path.exists(path) - + def _load(self, path=None): if path is None: path = self._get_file_path() @@ -338,20 +338,20 @@ class FileSession(Session): f.close() except (IOError, EOFError): return None - + def _save(self, expiration_time): f = open(self._get_file_path(), "wb") try: pickle.dump((self._data, expiration_time), f) finally: f.close() - + def _delete(self): try: os.unlink(self._get_file_path()) except OSError: pass - + def acquire_lock(self, path=None): """Acquire an exclusive lock on the currently-loaded session data.""" if path is None: @@ -363,17 +363,17 @@ class FileSession(Session): except OSError: time.sleep(0.1) else: - os.close(lockfd) + os.close(lockfd) break self.locked = True - + def release_lock(self, path=None): """Release the lock on the currently-loaded session data.""" if path is None: path = self._get_file_path() os.unlink(path + self.LOCK_SUFFIX) self.locked = False - + def clean_up(self): """Clean up expired sessions.""" now = datetime.datetime.now() @@ -395,7 +395,7 @@ class FileSession(Session): os.unlink(path) finally: self.release_lock(path) - + def __len__(self): """Return the number of active sessions.""" return len([fname for fname in os.listdir(self.storage_path) @@ -412,38 +412,38 @@ class PostgresqlSession(Session): data text, expiration_time timestamp ) - + You must provide your own get_db function. """ - + def __init__(self, id=None, **kwargs): Session.__init__(self, id, **kwargs) self.cursor = self.db.cursor() - + def setup(cls, **kwargs): """Set up the storage system for Postgres-based sessions. - + This should only be called once per process; this will be done automatically when using sessions.init (as the built-in Tool does). 
""" for k, v in kwargs.iteritems(): setattr(cls, k, v) - + self.db = self.get_db() setup = classmethod(setup) - + def __del__(self): if self.cursor: self.cursor.close() self.db.commit() - + def _exists(self): # Select session data from table self.cursor.execute('select data, expiration_time from session ' 'where id=%s', (self.id,)) rows = self.cursor.fetchall() return bool(rows) - + def _load(self): # Select session data from table self.cursor.execute('select data, expiration_time from session ' @@ -451,34 +451,34 @@ class PostgresqlSession(Session): rows = self.cursor.fetchall() if not rows: return None - + pickled_data, expiration_time = rows[0] data = pickle.loads(pickled_data) return data, expiration_time - + def _save(self, expiration_time): pickled_data = pickle.dumps(self._data) self.cursor.execute('update session set data = %s, ' 'expiration_time = %s where id = %s', (pickled_data, expiration_time, self.id)) - + def _delete(self): self.cursor.execute('delete from session where id=%s', (self.id,)) - + def acquire_lock(self): """Acquire an exclusive lock on the currently-loaded session data.""" # We use the "for update" clause to lock the row self.locked = True self.cursor.execute('select id from session where id=%s for update', (self.id,)) - + def release_lock(self): """Release the lock on the currently-loaded session data.""" # We just close the cursor and that will remove the lock # introduced by the "for update" clause self.cursor.close() self.locked = False - + def clean_up(self): """Clean up expired sessions.""" self.cursor.execute('delete from session where expiration_time < %s', @@ -486,43 +486,43 @@ class PostgresqlSession(Session): class MemcachedSession(Session): - + # The most popular memcached client for Python isn't thread-safe. # Wrap all .get and .set operations in a single lock. mc_lock = threading.RLock() - + # This is a seperate set of locks per session id. locks = {} - + servers = ['127.0.0.1:11211'] - + def setup(cls, **kwargs): """Set up the storage system for memcached-based sessions. - + This should only be called once per process; this will be done automatically when using sessions.init (as the built-in Tool does). """ for k, v in kwargs.iteritems(): setattr(cls, k, v) - + import memcache cls.cache = memcache.Client(cls.servers) setup = classmethod(setup) - + def _exists(self): self.mc_lock.acquire() try: return bool(self.cache.get(self.id)) finally: self.mc_lock.release() - + def _load(self): self.mc_lock.acquire() try: return self.cache.get(self.id) finally: self.mc_lock.release() - + def _save(self, expiration_time): # Send the expiration time as "Unix time" (seconds since 1/1/1970) td = int(time.mktime(expiration_time.timetuple())) @@ -532,20 +532,20 @@ class MemcachedSession(Session): raise AssertionError("Session data for id %r not set." 
% self.id) finally: self.mc_lock.release() - + def _delete(self): self.cache.delete(self.id) - + def acquire_lock(self): """Acquire an exclusive lock on the currently-loaded session data.""" self.locked = True self.locks.setdefault(self.id, threading.RLock()).acquire() - + def release_lock(self): """Release the lock on the currently-loaded session data.""" self.locks[self.id].release() self.locked = False - + def __len__(self): """Return the number of active sessions.""" raise NotImplementedError @@ -555,15 +555,15 @@ class MemcachedSession(Session): def save(): """Save any changed session data.""" - + if not hasattr(cherrypy.serving, "session"): return - + # Guard against running twice if hasattr(cherrypy.request, "_sessionsaved"): return cherrypy.request._sessionsaved = True - + if cherrypy.response.stream: # If the body is being streamed, we have to save the data # *after* the response has been written out @@ -589,7 +589,7 @@ close.priority = 90 def init(storage_type='ram', path=None, path_header=None, name='session_id', timeout=60, domain=None, secure=False, clean_freq=5, **kwargs): """Initialize session object (using cookies). - + storage_type: one of 'ram', 'file', 'postgresql'. This will be used to look up the corresponding class in cherrypy.lib.sessions globals. For example, 'file' will use the FileSession class. @@ -603,31 +603,31 @@ def init(storage_type='ram', path=None, path_header=None, name='session_id', secure: if False (the default) the cookie 'secure' value will not be set. If True, the cookie 'secure' value will be set (to 1). clean_freq (minutes): the poll rate for expired session cleanup. - + Any additional kwargs will be bound to the new Session instance, and may be specific to the storage type. See the subclass of Session you're using for more information. """ - + request = cherrypy.request - + # Guard against running twice if hasattr(request, "_session_init_flag"): return request._session_init_flag = True - + # Check if request came with a session ID id = None if name in request.cookie: id = request.cookie[name].value - + # Find the storage class and call setup (first time only). storage_class = storage_type.title() + 'Session' storage_class = globals()[storage_class] if not hasattr(cherrypy, "session"): if hasattr(storage_class, "setup"): storage_class.setup(**kwargs) - + # Create and attach a new Session instance to cherrypy.serving. # It will possess a reference to (and lock, and lazily load) # the requested session data. @@ -638,11 +638,11 @@ def init(storage_type='ram', path=None, path_header=None, name='session_id', """Update the cookie every time the session id changes.""" cherrypy.response.cookie[name] = id sess.id_observers.append(update_cookie) - + # Create cherrypy.session which will proxy to cherrypy.serving.session if not hasattr(cherrypy, "session"): cherrypy.session = cherrypy._ThreadLocalProxy('session') - + set_response_cookie(path=path, path_header=path_header, name=name, timeout=timeout, domain=domain, secure=secure) @@ -650,7 +650,7 @@ def init(storage_type='ram', path=None, path_header=None, name='session_id', def set_response_cookie(path=None, path_header=None, name='session_id', timeout=60, domain=None, secure=False): """Set a response cookie for the client. - + path: the 'path' value to stick in the response cookie metadata. path_header: if 'path' is None (the default), then the response cookie 'path' will be pulled from request.headers[path_header]. 
@@ -665,14 +665,15 @@ def set_response_cookie(path=None, path_header=None, name='session_id', cookie[name] = cherrypy.serving.session.id cookie[name]['path'] = (path or cherrypy.request.headers.get(path_header) or '/') - + # We'd like to use the "max-age" param as indicated in # http://www.faqs.org/rfcs/rfc2109.html but IE doesn't # save it to disk and the session is lost if people close # the browser. So we have to use the old "expires" ... sigh ... ## cookie[name]['max-age'] = timeout * 60 - if timeout: - cookie[name]['expires'] = http.HTTPDate(time.time() + (timeout * 60)) + if False and timeout: # Changed by Kovid, we want the user to have to + # re-authenticate on browser restart + cookie[name]['expires'] = http.HTTPDate(time.time() + timeout) if domain is not None: cookie[name]['domain'] = domain if secure: From 2de76958ce55857fe3f8aaff86932ac62fc02187 Mon Sep 17 00:00:00 2001 From: Timothy Legge Date: Mon, 12 Sep 2011 22:44:46 -0300 Subject: [PATCH 18/45] Kobo - Only process supported collections --- src/calibre/devices/kobo/driver.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index 528057dad9..fa4796a5a9 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -653,6 +653,15 @@ class KOBO(USBMS): debug_print(' Commit: Set FavouritesIndex') def update_device_database_collections(self, booklists, collections_attributes, oncard): + # Only process categories in this list + supportedcategories = { + "Im_Reading":1, + "Read":2, + "Closed":3, + "Shortlist":4, + # "Preview":99, # Unsupported as we don't want to change it + } + # Define lists for the ReadStatus readstatuslist = { "Im_Reading":1, @@ -692,6 +701,7 @@ class KOBO(USBMS): # Process any collections that exist for category, books in collections.items(): + if category in supportedcategories: debug_print("Category: ", category, " id = ", readstatuslist.get(category)) for book in books: debug_print(' Title:', book.title, 'category: ', category) From be7114ba3acc8c650b48ff8881f40ae704b20c5f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 Sep 2011 21:21:21 -0600 Subject: [PATCH 19/45] ... 
--- recipes/businessworldin.recipe | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/recipes/businessworldin.recipe b/recipes/businessworldin.recipe index e44682d7e1..a4c774ccdb 100644 --- a/recipes/businessworldin.recipe +++ b/recipes/businessworldin.recipe @@ -4,6 +4,7 @@ __copyright__ = '2009-2010, Darko Miletic ' www.businessworld.in ''' +import re from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe @@ -15,7 +16,7 @@ class BusinessWorldMagazine(BasicNewsRecipe): category = 'news, politics, finances, India, Asia' delay = 1 no_stylesheets = True - INDEX = 'http://www.businessworld.in/bw/Magazine_Current_Issue' + INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php' ROOT = 'http://www.businessworld.in' use_embedded_content = False encoding = 'utf-8' @@ -38,13 +39,17 @@ class BusinessWorldMagazine(BasicNewsRecipe): if litem == url: return True return False - - + + def parse_index(self): articles = [] linklist = [] - soup = self.index_to_soup(self.INDEX) - + br = self.browser + br.open(self.ROOT) + raw = br.open(br.click_link(text_regex=re.compile('Current.*Issue', + re.I))).read() + soup = self.index_to_soup(raw) + tough = soup.find('div', attrs={'id':'tough'}) if tough: for item in tough.findAll('h1'): @@ -63,7 +68,7 @@ class BusinessWorldMagazine(BasicNewsRecipe): ,'description':description }) linklist.append(url) - + for item in soup.findAll('div', attrs={'class':'nametitle'}): description = '' title_prefix = '' @@ -82,7 +87,7 @@ class BusinessWorldMagazine(BasicNewsRecipe): linklist.append(url) return [(soup.head.title.string, articles)] - + keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})] remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])] From 7198f843283f36686ad9d6bca4f87cf9986de25e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 Sep 2011 21:58:53 -0600 Subject: [PATCH 20/45] Fix Business World India. 
Fixes #848431 (businessworldin.recipe should be updated) --- recipes/businessworldin.recipe | 113 +++++++++++++-------------------- 1 file changed, 43 insertions(+), 70 deletions(-) diff --git a/recipes/businessworldin.recipe b/recipes/businessworldin.recipe index a4c774ccdb..cb5f443e9f 100644 --- a/recipes/businessworldin.recipe +++ b/recipes/businessworldin.recipe @@ -5,12 +5,11 @@ www.businessworld.in ''' import re -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class BusinessWorldMagazine(BasicNewsRecipe): title = 'Business World Magazine' - __author__ = 'Darko Miletic' + __author__ = 'Kovid Goyal' description = 'News from India' publisher = 'ABP Pvt Ltd Publication' category = 'news, politics, finances, India, Asia' @@ -18,86 +17,60 @@ class BusinessWorldMagazine(BasicNewsRecipe): no_stylesheets = True INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php' ROOT = 'http://www.businessworld.in' - use_embedded_content = False encoding = 'utf-8' language = 'en_IN' - extra_css = """ - img{display: block; margin-bottom: 0.5em} - body{font-family: Arial,Helvetica,sans-serif} - h2{color: gray; display: block} - """ - - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } - - def is_in_list(self,linklist,url): - for litem in linklist: - if litem == url: - return True - return False - + auto_cleanup = True def parse_index(self): - articles = [] - linklist = [] br = self.browser br.open(self.ROOT) raw = br.open(br.click_link(text_regex=re.compile('Current.*Issue', re.I))).read() soup = self.index_to_soup(raw) + mc = soup.find(attrs={'class':'mag_cover'}) + if mc is not None: + img = mc.find('img', src=True) + if img is not None: + self.cover_url = img['src'] + + feeds = [] + current_section = None + articles = [] + for tag in soup.findAll(['h3', 'h2']): + inner_a = tag.find('a') + if tag.name == 'h3' and inner_a is not None: + continue + if tag.name == 'h2' and (inner_a is None or current_section is + None): + continue + + if tag.name == 'h3': + if current_section is not None and articles: + feeds.append((current_section, articles)) + current_section = self.tag_to_string(tag) + self.log('Found section:', current_section) + articles = [] + elif tag.name == 'h2': + url = inner_a.get('href', None) + if url is None: continue + if url.startswith('/'): url = self.ROOT + url + title = self.tag_to_string(inner_a) + h1 = tag.findPreviousSibling('h1') + if h1 is not None: + title = self.tag_to_string(h1) + title + self.log('\tFound article:', title) + articles.append({'title':title, 'url':url, 'date':'', + 'description':''}) + + if current_section and articles: + feeds.append((current_section, articles)) + + return feeds + + - tough = soup.find('div', attrs={'id':'tough'}) - if tough: - for item in tough.findAll('h1'): - description = '' - title_prefix = '' - feed_link = item.find('a') - if feed_link and feed_link.has_key('href'): - url = self.ROOT + feed_link['href'] - if not self.is_in_list(linklist,url): - title = title_prefix + self.tag_to_string(feed_link) - date = strftime(self.timefmt) - articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) - linklist.append(url) - for item in soup.findAll('div', attrs={'class':'nametitle'}): - description = '' - title_prefix = '' - feed_link = item.find('a') - if feed_link and feed_link.has_key('href'): - url = self.ROOT + feed_link['href'] - if not self.is_in_list(linklist,url): - title = 
title_prefix + self.tag_to_string(feed_link) - date = strftime(self.timefmt) - articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) - linklist.append(url) - return [(soup.head.title.string, articles)] - keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})] - remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])] - def print_version(self, url): - return url.replace('/bw/','/bw/storyContent/') - def get_cover_url(self): - cover_url = None - soup = self.index_to_soup(self.INDEX) - cover_item = soup.find('img',attrs={'class':'toughbor'}) - if cover_item: - cover_url = self.ROOT + cover_item['src'] - return cover_url From cf13881d0847e55c23eb3a9f53b6613227f83347 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 Sep 2011 23:39:02 -0600 Subject: [PATCH 21/45] ... --- src/calibre/ebooks/mobi/writer2/serializer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index ed6df6698a..6bc597ccb4 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -160,7 +160,7 @@ class Serializer(object): buf.write(b'title="') self.serialize_text(ref.title, quot=True) buf.write(b'" ') - if ref.title == 'start': + if ref.title == 'start' or ref.type in ('start', 'other.start'): self._start_href = ref.href self.serialize_href(ref.href) # Space required or won't work, I kid you not @@ -348,8 +348,9 @@ class Serializer(object): ''' buf = self.buf id_offsets = self.id_offsets + start_href = getattr(self, '_start_href', None) for href, hoffs in self.href_offsets.items(): - is_start = (href and href == getattr(self, '_start_href', None)) + is_start = (href and href == start_href) # Iterate over all filepos items if href not in id_offsets: self.logger.warn('Hyperlink target %r not found' % href) From 636496f1dd88bee5d7536de7b456dc920e7cf415 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Sep 2011 00:11:43 -0600 Subject: [PATCH 22/45] Fixed People/US Magazine mashup --- recipes/people_us_mashup.recipe | 72 ++------------------------------- 1 file changed, 3 insertions(+), 69 deletions(-) diff --git a/recipes/people_us_mashup.recipe b/recipes/people_us_mashup.recipe index ed43e24e56..28c76d820c 100644 --- a/recipes/people_us_mashup.recipe +++ b/recipes/people_us_mashup.recipe @@ -14,54 +14,10 @@ class PeopleMag(BasicNewsRecipe): use_embedded_content = False oldest_article = 2 max_articles_per_feed = 50 + use_embedded_content = False - extra_css = ''' - h1{font-family:verdana,arial,helvetica,sans-serif; font-size: large;} - h2{font-family:verdana,arial,helvetica,sans-serif; font-size: small;} - .body-content{font-family:verdana,arial,helvetica,sans-serif; font-size: small;} - .byline {font-size: small; color: #666666; font-style:italic; } - .lastline {font-size: small; color: #666666; font-style:italic;} - .contact {font-size: small; color: #666666;} - .contact p {font-size: small; color: #666666;} - .photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;} - .photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;} - .article_timestamp{font-size:x-small; color:#666666;} - a {font-family:verdana,arial,helvetica,sans-serif; font-size: x-small;} - ''' - - - keep_only_tags = [ - dict(name='div', attrs={'class': 'panel_news_article_main'}), - dict(name='div', attrs={'class':'article_content'}), - 
dict(name='div', attrs={'class': 'headline'}), - dict(name='div', attrs={'class': 'post'}), - dict(name='div', attrs={'class': 'packageheadlines'}), - dict(name='div', attrs={'class': 'snap_preview'}), - dict(name='div', attrs={'id': 'articlebody'}) - ] - - remove_tags = [ - dict(name='div', attrs={'class':'share_comments'}), - dict(name='p', attrs={'class':'twitter_facebook'}), - dict(name='div', attrs={'class':'share_comments_bottom'}), - dict(name='h2', attrs={'id':'related_content'}), - dict(name='div', attrs={'class':'next_article'}), - dict(name='div', attrs={'class':'prev_article'}), - dict(name='ul', attrs={'id':'sharebar'}), - dict(name='div', attrs={'class':'sharelinkcont'}), - dict(name='div', attrs={'class':'categories'}), - dict(name='ul', attrs={'class':'categories'}), - dict(name='div', attrs={'class':'related_content'}), - dict(name='div', attrs={'id':'promo'}), - dict(name='div', attrs={'class':'linksWrapper'}), - dict(name='p', attrs={'class':'tag tvnews'}), - dict(name='p', attrs={'class':'tag movienews'}), - dict(name='p', attrs={'class':'tag musicnews'}), - dict(name='p', attrs={'class':'tag couples'}), - dict(name='p', attrs={'class':'tag gooddeeds'}), - dict(name='p', attrs={'class':'tag weddings'}), - dict(name='p', attrs={'class':'tag health'}) -] + no_stylesheets = True + auto_cleanup = True feeds = [ @@ -69,26 +25,4 @@ class PeopleMag(BasicNewsRecipe): ('US Headlines', 'http://www.usmagazine.com/celebrity_news/rss') ] - def get_article_url(self, article): - ans = article.link - try: - self.log('Looking for full story link in', ans) - soup = self.index_to_soup(ans) - x = soup.find(text="View All") - - if x is not None: - ans = ans + '?viewAll=y' - self.log('Found full story link', ans) - except: - pass - return ans - - def postprocess_html(self, soup,first): - - for tag in soup.findAll(name='div',attrs={'class':"container_ate_qandatitle"}): - tag.extract() - for tag in soup.findAll(name='br'): - tag.extract() - - return soup From 5168e3c18b60fd8816b79bf9f6a54c699242ca5b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Sep 2011 00:12:11 -0600 Subject: [PATCH 23/45] ... --- src/calibre/ebooks/readability/readability.py | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py index 7713584d14..8d4a23b338 100644 --- a/src/calibre/ebooks/readability/readability.py +++ b/src/calibre/ebooks/readability/readability.py @@ -484,30 +484,29 @@ class HashableElement(): def __getattr__(self, tag): return getattr(self.node, tag) +def option_parser(): + from calibre.utils.config import OptionParser + parser = OptionParser(usage='%prog: [options] file') + parser.add_option('-v', '--verbose', default=False, action='store_true', + dest='verbose', + help=_('Show detailed output information. 
Useful for debugging')) + + return parser + def main(): - import logging - from optparse import OptionParser - parser = OptionParser(usage="%prog: [options] [file]") - parser.add_option('-v', '--verbose', action='store_true') - parser.add_option('-u', '--url', help="use URL instead of a local file") - (options, args) = parser.parse_args() + from calibre.utils.logging import default_log + parser = option_parser() + options, args = parser.parse_args() - if not (len(args) == 1 or options.url): + if len(args) != 1: parser.print_help() - sys.exit(1) - logging.basicConfig(level=logging.INFO) + raise SystemExit(1) + + with open(args[0], 'rb') as f: + raw = f.read() - file = None - if options.url: - import urllib - file = urllib.urlopen(options.url) - else: - file = open(args[0], 'rt') enc = sys.__stdout__.encoding or 'utf-8' - try: - print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace') - finally: - file.close() + print Document(raw, default_log, debug=options.verbose).summary().encode(enc, 'replace') if __name__ == '__main__': main() From 41792efac3ed46d7b31159f70d598986f1c4d7b8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Sep 2011 09:42:50 -0600 Subject: [PATCH 24/45] Fix #848717 (preferences - behaviour - wrong label) --- src/calibre/gui2/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index ccd1dac1ad..0e123bee8b 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -142,7 +142,7 @@ def _config(): # {{{ c.add_opt('upload_news_to_device', default=True, help=_('Upload downloaded news to device')) c.add_opt('delete_news_from_library_on_upload', default=False, - help=_('Delete books from library after uploading to device')) + help=_('Delete news books from library after uploading to device')) c.add_opt('separate_cover_flow', default=False, help=_('Show the cover flow in a separate window instead of in the main calibre window')) c.add_opt('disable_tray_notification', default=False, From dbb2ede515f12b69e5f2de78c985ff6a62e925fa Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Sep 2011 09:47:47 -0600 Subject: [PATCH 25/45] ... --- src/calibre/ebooks/conversion/plumber.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 3e5313eb96..3d345b50f3 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -394,8 +394,9 @@ OptionRecommendation(name='insert_blank_line_size', OptionRecommendation(name='remove_first_image', recommended_value=False, level=OptionRecommendation.LOW, help=_('Remove the first image from the input ebook. Useful if the ' - 'first image in the source file is a cover and you are specifying ' - 'an external cover.' + 'input document has a cover image that is not identified as a cover. ' + 'In this case, if you set a cover in calibre, the output document will ' + 'end up with two cover images if you do not specify this option.' 
) ), @@ -1024,7 +1025,7 @@ OptionRecommendation(name='sr3_replace', self.output_plugin.file_type not in ('mobi', 'lrf'): from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables LinearizeTables()(self.oeb, self.opts) - + if self.opts.unsmarten_punctuation: from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation UnsmartenPunctuation()(self.oeb, self.opts) From 7abf29c5ba73cdbcd06cf75579139d1db669aa72 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Sep 2011 09:49:13 -0600 Subject: [PATCH 26/45] Fix #848900 (Updated recipe for The Japan Times) --- recipes/icons/japan_times.png | Bin 0 -> 1264 bytes recipes/japan_times.recipe | 67 ++++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 16 deletions(-) create mode 100644 recipes/icons/japan_times.png diff --git a/recipes/icons/japan_times.png b/recipes/icons/japan_times.png new file mode 100644 index 0000000000000000000000000000000000000000..1b2ac895725ec55d4328a964f6c3f70f4076f111 GIT binary patch literal 1264 zcmeIw|5MTj7zgkNycVOlqwIE@xy97zo6-l+^v-oV^y39ra_UOg4}NKDUXxZ%@6) zIOvRn?l_ReLr*;1j)yx5a5n+^6X4ISFhGa{sKPB83(i9|3b zf+Z2GN$@fWrWjyjzzhTI2)GdNFkyiSe%9(NvLM8QWj3s^0pEw*;vusAw1oo1cNmqQ z!`doKrKi3c_!4~~IHWE`-N(?NutHITDSp6IBJ7a}(}=LqI&Az3Hrs&BHDJz`W?9=M ztF+bF)jHqR>XJzx_y0aT(B)Bdc^@gg8l_LIR%^9dy7heEJ{UX0y%C&c2D5!RPI`^9}BDdflr$5pI{?<6iLjd_h0HxVRV! zg_eSNXlZ3Rgonf7f&z?~sBAAr!!LpcXex1TdU5vUP($E9?%H)Ybe zx!Rw{IkZfPGBDQhePfwp(-wP2mJ0RY#Vu{VBZKql{ZDrn8~aSvC6XtBK%U6ihmTwp zw13uod0psh+EiL#!CD|_Tu`a5Q#=(FsH39!!s((&e$mPF`!$Mh;@|JtYn120txscSyw3`B zy}hse$znyyQDN)Jw7uaa<-NW{-C!Y*9;a|!!R2`c1n)V5ymb!KAPKXE69%V3?kV;UX#7+JwZyw_#jEx zdA>HPoE~#3{XA8psnlZ@gEsJ>GKtL?E}J{^ZjeC4iR2b`)rQr#;$`vC+ZiR*{{h9F BuS5U< literal 0 HcmV?d00001 diff --git a/recipes/japan_times.recipe b/recipes/japan_times.recipe index bb83b16f1e..229d5e4035 100644 --- a/recipes/japan_times.recipe +++ b/recipes/japan_times.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2011, Darko Miletic ' ''' japantimes.co.jp ''' @@ -9,24 +7,61 @@ japantimes.co.jp from calibre.web.feeds.news import BasicNewsRecipe class JapanTimes(BasicNewsRecipe): - title = u'The Japan Times' + title = 'The Japan Times' __author__ = 'Darko Miletic' - description = 'News from Japan' - language = 'en' - - oldest_article = 7 - max_articles_per_feed = 100 + description = "Daily news and features on Japan from the most widely read English-language newspaper in Japan. Coverage includes national news, business news, sports news, commentary and features on living in Japan, entertainment, the arts, education and more." 
+ language = 'en_JP' + category = 'news, politics, japan' + publisher = 'The Japan Times' + oldest_article = 5 + max_articles_per_feed = 150 no_stylesheets = True use_embedded_content = False + encoding = 'utf8' + publication_type = 'newspaper' + masthead_url = 'http://search.japantimes.co.jp/images/header_title.gif' + extra_css = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}' - keep_only_tags = [ dict(name='div', attrs={'id':'searchresult'}) ] - remove_tags_after = [ dict(name='div', attrs={'id':'mainbody' }) ] + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + + + keep_only_tags = [dict(name='div', attrs={'id':'printresult'})] remove_tags = [ - dict(name='div' , attrs={'id':'ads' }) - ,dict(name='table', attrs={'width':470}) + dict(name=['iframe','meta','link','embed','object','base']) + ,dict(attrs={'id':'searchfooter'}) ] + feeds = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')] + remove_attributes = ['border'] + def get_article_url(self, article): + rurl = BasicNewsRecipe.get_article_url(self, article) + return rurl.partition('?')[0] - feeds = [ - (u'The Japan Times', u'http://feedproxy.google.com/japantimes') - ] \ No newline at end of file + def print_version(self, url): + return url.replace('/cgi-bin/','/print/') + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + for item in soup.findAll('photo'): + item.name = 'div' + for item in soup.head.findAll('paragraph'): + item.extract() + for item in soup.findAll('wwfilename'): + item.extract() + for item in soup.findAll('jtcategory'): + item.extract() + for item in soup.findAll('nomooter'): + item.extract() + for item in soup.body.findAll('paragraph'): + item.name = 'p' + return soup From 2b315603771f56e57546523de5745794c8c8f138 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 13 Sep 2011 10:04:43 -0600 Subject: [PATCH 27/45] ... 
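Editor's note (not part of the patch series): the reworked Japan Times recipe above relies on two small URL rewrites, stripping the feed tracking query string and mapping article pages to their printer-friendly form. A standalone sketch of the same string handling; the sample URL is made up for illustration only:

def get_article_url(feed_url):
    # Drop FeedBurner-style tracking parameters from feed links.
    return feed_url.partition('?')[0]

def print_version(url):
    # The printable page is served from /print/ instead of /cgi-bin/.
    return url.replace('/cgi-bin/', '/print/')

sample = 'http://search.japantimes.co.jp/cgi-bin/nn20110913a1.html?utm_source=feedburner'
clean = get_article_url(sample)
print(clean)                 # .../cgi-bin/nn20110913a1.html
print(print_version(clean))  # .../print/nn20110913a1.html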
---
 src/calibre/ebooks/mobi/writer2/serializer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index 6bc597ccb4..eeef720144 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -160,7 +160,9 @@ class Serializer(object):
             buf.write(b'title="')
             self.serialize_text(ref.title, quot=True)
             buf.write(b'" ')
-            if ref.title == 'start' or ref.type in ('start', 'other.start'):
+            if (ref.title.lower() == 'start' or
+                    (ref.type and ref.type.lower() in ('start',
+                        'other.start'))):
                 self._start_href = ref.href
             self.serialize_href(ref.href)
             # Space required or won't work, I kid you not

From 371db4901f7aadd4b46f60da79a04c717a6dea22 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 13 Sep 2011 10:28:47 -0600
Subject: [PATCH 28/45] Conversion: Remove paragraph spacing: If you set the
 indent size to 0, calibre will now leave the indents specified in the input
 document

---
 src/calibre/ebooks/conversion/plumber.py     | 4 +++-
 src/calibre/ebooks/oeb/transforms/flatcss.py | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 3d345b50f3..7f38106229 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -368,7 +368,9 @@ OptionRecommendation(name='remove_paragraph_spacing_indent_size',
         recommended_value=1.5, level=OptionRecommendation.LOW,
         help=_('When calibre removes blank lines between paragraphs, it automatically '
             'sets a paragraph indent, to ensure that paragraphs can be easily '
-            'distinguished. This option controls the width of that indent (in em).')
+            'distinguished. This option controls the width of that indent (in em). '
+            'If you set this value to 0, then the indent specified in the input '
+            'document is used.')
         ),

 OptionRecommendation(name='prefer_metadata_cover',
diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py
index 1493a647ae..078174218e 100644
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@@ -320,9 +320,10 @@ class CSSFlattener(object):
             if self.context.insert_blank_line:
                 cssdict['margin-top'] = cssdict['margin-bottom'] = \
                     '%fem'%self.context.insert_blank_line_size
-            if (self.context.remove_paragraph_spacing and
+            indent_size = self.context.remove_paragraph_spacing_indent_size
+            if (self.context.remove_paragraph_spacing and indent_size != 0.0 and
                     cssdict.get('text-align', None) not in ('center', 'right')):
-                cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size
+                cssdict['text-indent'] = "%1.1fem" % indent_size
 
             if cssdict:
                 items = cssdict.items()

From 92fdad1ef3b4eee384ef5d69bc4ded66ffb72acc Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 13 Sep 2011 19:07:53 -0600
Subject: [PATCH 29/45] News download: Add an auto_cleanup_keep variable that
 allows recipe writers to tell the auto cleanup to never remove a specified
 element

---
 recipes/people_us_mashup.recipe               |  1 +
 src/calibre/ebooks/readability/readability.py | 65 ++++++++-----------
 src/calibre/web/feeds/news.py                 | 15 ++++-
 3 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/recipes/people_us_mashup.recipe b/recipes/people_us_mashup.recipe
index 28c76d820c..5d820bacc0 100644
--- a/recipes/people_us_mashup.recipe
+++ b/recipes/people_us_mashup.recipe
@@ -18,6 +18,7 @@ class PeopleMag(BasicNewsRecipe):
 
     no_stylesheets = True
     auto_cleanup = True
+    auto_cleanup_keep = '//div[@id="article-image"]'
 
 
     feeds = [
diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py
index 8d4a23b338..028a4d6ede 100644
--- a/src/calibre/ebooks/readability/readability.py
+++ b/src/calibre/ebooks/readability/readability.py
@@ -1,3 +1,8 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
 import re, sys
 
 from collections import defaultdict
@@ -72,10 +77,15 @@ class Document:
             self.options[k] = v
         self.html = None
         self.log = log
+        self.keep_elements = set()
 
     def _html(self, force=False):
         if force or self.html is None:
             self.html = self._parse(self.input)
+            path = self.options['keep_elements']
+            if path is not None:
+                self.keep_elements = set(self.html.xpath(path))
+
         return self.html
 
     def _parse(self, input):
@@ -152,8 +162,9 @@ class Document:
             append = False
             if sibling is best_elem:
                 append = True
-            sibling_key = sibling #HashableElement(sibling)
-            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+            if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
+                append = True
+            if sibling in self.keep_elements:
                 append = True
 
             if sibling.tag == "p":
@@ -283,6 +294,8 @@ class Document:
 
     def remove_unlikely_candidates(self):
         for elem in self.html.iter():
+            if elem in self.keep_elements:
+                continue
             s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
             #self.debug(s)
             if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
@@ -337,7 +350,7 @@ class Document:
         allowed = {}
         # Conditionally clean <table>s, <ul>s, and <div>s
         for el in self.reverse_tags(node, "table", "ul", "div"):
-            if el in allowed:
+            if el in allowed or el in self.keep_elements:
                 continue
             weight = self.class_weight(el)
             if el in candidates:
@@ -450,46 +463,17 @@ class Document:
                         #self.debug("pname %s pweight %.3f" %(pname, pweight))
                         el.drop_tree()
 
-        for el in ([node] + [n for n in node.iter()]):
-            if not (self.options['attributes']):
-                #el.attrib = {} #FIXME:Checkout the effects of disabling this
-                pass
-
         return clean_attributes(tounicode(node))
 
-class HashableElement():
-    def __init__(self, node):
-        self.node = node
-        self._path = None
-
-    def _get_path(self):
-        if self._path is None:
-            reverse_path = []
-            node = self.node
-            while node is not None:
-                node_id = (node.tag, tuple(node.attrib.items()), node.text)
-                reverse_path.append(node_id)
-                node = node.getparent()
-            self._path = tuple(reverse_path)
-        return self._path
-    path = property(_get_path)
-
-    def __hash__(self):
-        return hash(self.path)
-
-    def __eq__(self, other):
-        return self.path == other.path
-
-    def __getattr__(self, tag):
-        return getattr(self.node, tag)
-
 def option_parser():
     from calibre.utils.config import OptionParser
     parser = OptionParser(usage='%prog: [options] file')
     parser.add_option('-v', '--verbose', default=False, action='store_true',
-            dest='verbose',
-            help=_('Show detailed output information. Useful for debugging'))
+            dest='verbose',
+            help='Show detailed output information. Useful for debugging')
+    parser.add_option('-k', '--keep-elements', default=None, action='store',
+            dest='keep_elements',
+            help='XPath specifying elements that should not be removed')
 
     return parser
 
@@ -506,7 +490,12 @@ def main():
         raw = f.read()
 
     enc = sys.__stdout__.encoding or 'utf-8'
-    print Document(raw, default_log, debug=options.verbose).summary().encode(enc, 'replace')
+    if options.verbose:
+        default_log.filter_level = default_log.DEBUG
+    print (Document(raw, default_log,
+            debug=options.verbose,
+            keep_elements=options.keep_elements).summary().encode(enc, 'replace'))
 
 if __name__ == '__main__':
     main()
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 436612af7e..b7efd611e0 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe):
     #: manually (though manual cleanup will always be superior).
     auto_cleanup = False
 
+    #: Specify elements that the auto cleanup algorithm should never remove
+    #: The syntax is an XPath expression. For example::
+    #:
+    #: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
+    #: id="article-image"
+    #: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
+    #: with class="important"
+    #: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
+    #: will keep all divs with id="article-image" and spans
+    #: with class="important"
+    auto_cleanup_keep = None
+
     #: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
     #: It will be inserted into `