diff --git a/recipes/business_week.recipe b/recipes/business_week.recipe index fcb28d1d3e..fe98d9fa00 100644 --- a/recipes/business_week.recipe +++ b/recipes/business_week.recipe @@ -1,93 +1,105 @@ -#!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - +__copyright__ = '2008 Kovid Goyal kovid@kovidgoyal.net, 2010 Darko Miletic ' ''' -businessweek.com +www.businessweek.com ''' from calibre.web.feeds.news import BasicNewsRecipe class BusinessWeek(BasicNewsRecipe): - title = 'Business Week' - description = 'Business News, Stock Market and Financial Advice' - __author__ = 'ChuckEggDotCom and Sujata Raman' - language = 'en' + title = 'Business Week' + __author__ = 'Kovid Goyal and Darko Miletic' + description = 'Read the latest international business news & stock market news. Get updated company profiles, financial advice, global economy and technology news.' + publisher = 'Bloomberg L.P.' + category = 'Business, business news, stock market, stock market news, financial advice, company profiles, financial advice, global economy, technology news' + oldest_article = 7 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + publication_type = 'magazine' + cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg' + masthead_url = 'http://assets.businessweek.com/images/bw-logo.png' + extra_css = """ + body{font-family: Helvetica,Arial,sans-serif } + img{margin-bottom: 0.4em; display:block} + .tagline{color: gray; font-style: italic} + .photoCredit{font-size: small; color: gray} + """ - oldest_article = 7 - max_articles_per_feed = 10 - no_stylesheets = True + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } - recursions = 1 - match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*'] - extra_css = ''' - h1{font-family :Arial,Helvetica,sans-serif; font-size:large;} - .news_story_title{font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;} - h2{font-family :Arial,Helvetica,sans-serif; font-size:medium;color:#666666;} - h3{text-transform:uppercase;font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;} - h4{font-family :Arial,Helvetica,sans-serif; font-size:small;font-weight:bold;} - p{font-family :Arial,Helvetica,sans-serif; } - #lede600{font-size:x-small;} - #storybody{font-size:x-small;} - p{font-family :Arial,Helvetica,sans-serif;} - .strap{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#064599;} - .byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .postedBy{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .trackback{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .date{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .wrapper{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .photoCredit{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .tagline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - .pageCount{color:#666666;font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .note{font-family :Arial,Helvetica,sans-serif; font-size:small;color:#666666;font-style:italic;} - .highlight{font-family :Arial,Helvetica,sans-serif; font-size:small;background-color:#FFF200;} - 
.annotation{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;} - ''' - - remove_tags = [ dict(name='div', attrs={'id':["log","feedback","footer","secondarynav","secondnavbar","header","email","bw2-header","column2","wrapper-bw2-footer","wrapper-mgh-footer","inset","commentForm","commentDisplay","bwExtras","bw2-umbrella","readerComments","leg","rightcol"]}), - dict(name='div', attrs={'class':["menu",'sponsorbox smallertext',"TopNavTile","graybottom leaderboard"]}), - dict(name='img', alt ="News"), - dict(name='td', width ="1"), - ] + remove_tags = [ + dict(attrs={'class':'inStory'}) + ,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td']) + ,dict(attrs={'id':['inset','videoDisplay']}) + ] + keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody','article_body','articleBody']})] + remove_attributes = ['lang'] + match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*'] - feeds = [ - (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'), - (u'Top News', u'http://www.businessweek.com/rss/bwdaily.rss'), - (u'Asia', u'http://www.businessweek.com/rss/asia.rss'), - (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'), - (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'), - (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'), - (u'Europe', u'http://www.businessweek.com/rss/europe.rss'), - (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'), - (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'), - (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'), - (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'), - (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'), - (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'), - (u'Technology', u'http://www.businessweek.com/rss/technology.rss'), - (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'), - (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'), - (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'), - (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'), - (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'), - (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'), - ] + + feeds = [ + (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'), + (u'Top News' , u'http://www.businessweek.com/rss/bwdaily.rss' ), + (u'Asia', u'http://www.businessweek.com/rss/asia.rss'), + (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'), + (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'), + (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'), + (u'Europe', u'http://www.businessweek.com/rss/europe.rss'), + (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'), + (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'), + (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'), + (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'), + (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'), + (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'), + (u'Technology', u'http://www.businessweek.com/rss/technology.rss'), + (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'), + (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'), + (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'), + 
(u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'),
+                  (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'),
+                  (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'),
+                ]

    def get_article_url(self, article):
        url = article.get('guid', None)
+        if 'podcasts' in url:
+            return None
+        if 'surveys' in url:
+            return None
+        if 'images' in url:
+            return None
+        if 'feedroom' in url:
+            return None
+        if '/magazine/toc/' in url:
+            return None
+        rurl, sep, rest = url.rpartition('?')
+        if rurl:
+            return rurl
+        return rest
-        if 'podcasts' in url or 'surveys' in url:
-            url = None
-
-        return url
-
-    def postprocess_html(self, soup, first):
-
-        for tag in soup.findAll(name=['ul','li','table','td','tr','span']):
-            tag.name = 'div'
-        for tag in soup.findAll(name= 'div',attrs={ 'id':'pageNav'}):
-            tag.extract()
-        return soup
+    def print_version(self, url):
+        if '/news/' in url or '/blog/' in url:
+            return url
+        if '/magazine' in url:
+            rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/printer/')
+        else:
+            rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/')
+        return rurl.replace('/investing/','/investor/')

+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
+        return soup
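The rewritten get_article_url above filters out non-article links and then strips any tracking query string, while print_version maps article URLs onto the site's print pages. A standalone sketch of the query-string handling (the helper name is illustrative, not part of the recipe):

    def strip_query(url):
        # rpartition('?') splits at the LAST '?'; when there is no '?',
        # the first two elements are empty and the whole url ends up in 'rest'
        rurl, sep, rest = url.rpartition('?')
        return rurl if rurl else rest

    strip_query('http://www.businessweek.com/foo?chan=rss')  # -> 'http://www.businessweek.com/foo'
    strip_query('http://www.businessweek.com/foo')           # -> unchanged
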
diff --git a/recipes/businessworldin.recipe b/recipes/businessworldin.recipe
index e44682d7e1..cb5f443e9f 100644
--- a/recipes/businessworldin.recipe
+++ b/recipes/businessworldin.recipe
@@ -4,95 +4,73 @@ __copyright__ = '2009-2010, Darko Miletic '
 www.businessworld.in
 '''
-from calibre import strftime
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class BusinessWorldMagazine(BasicNewsRecipe):
     title = 'Business World Magazine'
-    __author__ = 'Darko Miletic'
+    __author__ = 'Kovid Goyal'
     description = 'News from India'
     publisher = 'ABP Pvt Ltd Publication'
     category = 'news, politics, finances, India, Asia'
     delay = 1
     no_stylesheets = True
-    INDEX = 'http://www.businessworld.in/bw/Magazine_Current_Issue'
+    INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php'
     ROOT = 'http://www.businessworld.in'
-    use_embedded_content = False
     encoding = 'utf-8'
     language = 'en_IN'
-    extra_css = """
-        img{display: block; margin-bottom: 0.5em}
-        body{font-family: Arial,Helvetica,sans-serif}
-        h2{color: gray; display: block}
-    """
+    auto_cleanup = True

-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-    }
-
-    def is_in_list(self,linklist,url):
-        for litem in linklist:
-            if litem == url:
-                return True
-        return False
-
     def parse_index(self):
+        br = self.browser
+        br.open(self.ROOT)
+        raw = br.open(br.click_link(text_regex=re.compile('Current.*Issue',
+            re.I))).read()
+        soup = self.index_to_soup(raw)
+        mc = soup.find(attrs={'class':'mag_cover'})
+        if mc is not None:
+            img = mc.find('img', src=True)
+            if img is not None:
+                self.cover_url = img['src']
+
+        feeds = []
+        current_section = None
         articles = []
-        linklist = []
-        soup = self.index_to_soup(self.INDEX)
-
-        tough = soup.find('div', attrs={'id':'tough'})
-        if tough:
-            for item in tough.findAll('h1'):
-                description = ''
-                title_prefix = ''
-                feed_link = item.find('a')
-                if feed_link and feed_link.has_key('href'):
-                    url = self.ROOT + feed_link['href']
-                    if not self.is_in_list(linklist,url):
-                        title = title_prefix + self.tag_to_string(feed_link)
-                        date = strftime(self.timefmt)
-                        articles.append({
-                            'title' :title
-                            ,'date' :date
-                            ,'url' :url
-                            ,'description':description
-                        })
-                        linklist.append(url)
-
-        for item in soup.findAll('div', attrs={'class':'nametitle'}):
-            description = ''
-            title_prefix = ''
-            feed_link = item.find('a')
-            if feed_link and feed_link.has_key('href'):
-                url = self.ROOT + feed_link['href']
-                if not self.is_in_list(linklist,url):
-                    title = title_prefix + self.tag_to_string(feed_link)
-                    date = strftime(self.timefmt)
-                    articles.append({
-                        'title' :title
-                        ,'date' :date
-                        ,'url' :url
-                        ,'description':description
-                    })
-                    linklist.append(url)
-        return [(soup.head.title.string, articles)]
+        for tag in soup.findAll(['h3', 'h2']):
+            inner_a = tag.find('a')
+            if tag.name == 'h3' and inner_a is not None:
+                continue
+            if tag.name == 'h2' and (inner_a is None or current_section is
+                    None):
+                continue
+
+            if tag.name == 'h3':
+                if current_section is not None and articles:
+                    feeds.append((current_section, articles))
+                current_section = self.tag_to_string(tag)
+                self.log('Found section:', current_section)
+                articles = []
+            elif tag.name == 'h2':
+                url = inner_a.get('href', None)
+                if url is None: continue
+                if url.startswith('/'): url = self.ROOT + url
+                title = self.tag_to_string(inner_a)
+                h1 = tag.findPreviousSibling('h1')
+                if h1 is not None:
+                    title = self.tag_to_string(h1) + title
+                self.log('\tFound article:', title)
+                articles.append({'title':title, 'url':url, 'date':'',
+                    'description':''})
+
+        if current_section and articles:
+            feeds.append((current_section, articles))
+
+        return feeds
+
+
-    keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})]
-    remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])]
-
-    def print_version(self, url):
-        return url.replace('/bw/','/bw/storyContent/')
-
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup(self.INDEX)
-        cover_item = soup.find('img',attrs={'class':'toughbor'})
-        if cover_item:
-            cover_url = self.ROOT + cover_item['src']
-        return cover_url
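parse_index implementations like the two recipes around this point all return the same shape, spelled out in the long comment block inside cio_magazine.recipe below: a list of ('feed title', list of article dicts) tuples. A minimal sketch of that return value, with placeholder titles and URLs (not real data):

    # Each feed is a (title, articles) tuple; each article dict needs at
    # least 'title' and 'url'; 'date' and 'description' may be empty strings.
    feeds = [
        ('Economy', [
            {'title': 'A sample article',
             'url': 'http://example.com/a',
             'date': '',
             'description': ''},
        ]),
    ]
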
diff --git a/recipes/cio_magazine.recipe b/recipes/cio_magazine.recipe
new file mode 100644
index 0000000000..084a45ff93
--- /dev/null
+++ b/recipes/cio_magazine.recipe
@@ -0,0 +1,128 @@
+# The first comments describe the trouble I had with Python
+# When you get a UTF8 error, check the comments (accents). In Notepad++: Search, Goto, position, and you will see it.
+# Edit with Notepad++. If a - shows up where it should not, the indentation is wrong... Edit - Blank operations - tab to space
+# I now understand what 'from' means... these are paths inside pylib.zip...
+# With 'from' you import just one symbol... with 'import', the whole library
+from calibre.web.feeds.news import BasicNewsRecipe
+# sys is not needed... I tried to use it to write to stderr
+from calibre import strftime
+# To convert the article's date
+import string, re
+# To use regular expressions
+# Seen in pylib.zip... the first letter is uppercase
+# These last two were a half-hearted attempt at setting a cookie (not used)
+
+class CIO_Magazine(BasicNewsRecipe):
+    title = 'CIO Magazine'
+    oldest_article = 14
+    max_articles_per_feed = 100
+    auto_cleanup = True
+    __author__ = 'Julio Map'
+    description = 'CIO is the leading information brand for today\'s busy Chief Information Officer - CIO Magazine bi-monthly '
+    language = 'en'
+    encoding = 'utf8'
+    cover_url = 'http://www.cio.com/homepage/images/hp-cio-logo-linkedin.png'
+
+    remove_tags_before = dict(name='div', attrs={'id':'container'})
+# Absolutely unnecessary... in the end I found a print_version (see below)
+
+# Within a given issue...
+# issue_details contains the title and the sections of this issue
+# DetailModule, inside issue_details, contains the urls and summaries
+# Within a given article...
+# Article-default-body contains the text. But as I said, I found a print_version
+
+    no_stylesheets = True
+    remove_javascript = True
+
+    def print_version(self,url):
+        # This function is called by the framework... do not call it yourself (it would then be called twice)
+        # A printable version of the articles exists: changing
+        # http://www.cio.com/article/<id>/<title> into
+        # http://www.cio.com/article/print/<id> yields all the pages inside the div id=container
+        if url.startswith('/'):
+            url = 'http://www.cio.com'+url
+        segments = url.split('/')
+        printURL = '/'.join(segments[0:4]) + '/print/' + segments[4] +'#'
+        return printURL
+
+
+    def parse_index(self):
+        ###########################################################################
+        # This method should be implemented in recipes that parse a website
+        # instead of feeds to generate a list of articles. Typical uses are for
+        # news sources that have a Print Edition webpage that lists all the
+        # articles in the current print edition. If this function is implemented,
+        # it will be used in preference to BasicNewsRecipe.parse_feeds().
+        #
+        # It must return a list. Each element of the list must be a 2-element
+        # tuple of the form ('feed title', list of articles).
+        #
+        # Each list of articles must contain dictionaries of the form:
+        #
+        # {
+        #     'title'       : article title,
+        #     'url'         : URL of print version,
+        #     'date'        : The publication date of the article as a string,
+        #     'description' : A summary of the article
+        #     'content'     : The full article (can be an empty string). This is used by FullContentProfile
+        # }
+        #
+        # For an example, see the recipe for downloading The Atlantic.
+        # In addition, you can add 'author' for the author of the article.
+        ###############################################################################
+
+        # First, find the latest issue that has been created
+        soupinicial = self.index_to_soup('http://www.cio.com/magazine')
+        # It is the first link inside the DIV with class content_body
+        a= soupinicial.find(True, attrs={'class':'content_body'}).find('a', href=True)
+        INDEX = re.sub(r'\?.*', '', a['href'])
+        # Since cio.com uses relative links, prepend the domain name.
+        if INDEX.startswith('/'): # guarding against the day they stop using them
+            INDEX = 'http://www.cio.com'+INDEX
+        # And confirm in the logs that we are doing it right
+        print ("INDEX in parse_index: ", INDEX)
+
+        # Now we know which issue it is... let's process it.
+        soup = self.index_to_soup(INDEX)
+
+        articles = {}
+        key = None
+        feeds = []
+        # To start, keep only two DIVs, 'heading' and 'issue_item'
+        # From the first we take the categories (key), from the second the urls and summaries
+        for div in soup.findAll(True,
+            attrs={'class':['heading', 'issue_item']}):
+
+            if div['class'] == 'heading':
+                key = string.capwords(self.tag_to_string(div.span))
+                print ("Key: ",key) # This is for debugging
+                articles[key] = []
+                feeds.append(key)
+
+            elif div['class'] == 'issue_item':
+                a = div.find('a', href=True)
+                if not a:
+                    continue
+                url = re.sub(r'\?.*', '', a['href'])
+                print("url: ",url) # This is for debugging
+                title = self.tag_to_string(a, use_alt=True).strip() # For extra credit: strip the last two words
+                pubdate = strftime('%a, %d %b') # Not the publication date but the collection date
+                summary = div.find('p') # Inside the 'issue_item' div the only paragraph is the summary
+                description = '' # If there is a summary, description will be the summary... if not, leave it blank
+
+                if summary:
+                    description = self.tag_to_string(summary, use_alt=False)
+                    print ("Description = ", description)
+
+
+                feed = key if key is not None else 'Uncategorized' # This is copied from the NY Times recipe
+                if not articles.has_key(feed):
+                    articles[feed] = []
+                if not 'podcasts' in url:
+                    articles[feed].append(
+                        dict(title=title, url=url, date=pubdate,
+                            description=description,
+                            content=''))
+        feeds = [(key, articles[key]) for key in feeds if articles.has_key(key)]
+        return feeds
diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index 124820d0a1..05d6616ace 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -15,8 +15,10 @@ class Guardian(BasicNewsRecipe):
     title = u'The Guardian and The Observer'
     if date.today().weekday() == 6:
         base_url = "http://www.guardian.co.uk/theobserver"
+        cover_pic = 'Observer digital edition'
     else:
         base_url = "http://www.guardian.co.uk/theguardian"
+        cover_pic = 'Guardian digital edition'

     __author__ = 'Seabound and Sujata Raman'
     language = 'en_GB'
@@ -79,7 +81,7 @@ class Guardian(BasicNewsRecipe):
         # soup = self.index_to_soup("http://www.guardian.co.uk/theobserver")
         soup = self.index_to_soup(self.base_url)
         # find cover pic
-        img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'})
+        img = soup.find( 'img',attrs ={'alt':self.cover_pic})
         if img is not None:
             self.cover_url = img['src']
         # end find cover pic
diff --git a/recipes/hindustan_times.recipe b/recipes/hindustan_times.recipe
new file mode 100644
index 0000000000..f228757c70
--- /dev/null
+++ b/recipes/hindustan_times.recipe
@@ -0,0 +1,29 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class HindustanTimes(BasicNewsRecipe):
+    title = u'Hindustan Times'
+    language = 'en_IN'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    use_embedded_content = False
+
+    no_stylesheets = True
+    auto_cleanup = True
+
+    feeds = [
+        ('News',
+         'http://feeds.hindustantimes.com/HT-NewsSectionPage-Topstories'),
+        ('Views',
+         'http://feeds.hindustantimes.com/HT-ViewsSectionpage-Topstories'),
+        ('Cricket',
+         'http://feeds.hindustantimes.com/HT-Cricket-TopStories'),
+        ('Business',
+         'http://feeds.hindustantimes.com/HT-BusinessSectionpage-TopStories'),
+        ('Entertainment',
+         'http://feeds.hindustantimes.com/HT-HomePage-Entertainment'),
+        ('Lifestyle',
+         'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
+]
+
+
diff --git a/recipes/icons/japan_times.png b/recipes/icons/japan_times.png
new file mode 100644
index 0000000000..1b2ac89572 Binary files /dev/null and b/recipes/icons/japan_times.png differ diff --git a/recipes/icons/rtnews.png b/recipes/icons/rtnews.png new file mode 100644 index 0000000000..f29cc707a8 Binary files /dev/null and b/recipes/icons/rtnews.png differ diff --git a/recipes/icons/twitchfilms.png b/recipes/icons/twitchfilms.png new file mode 100644 index 0000000000..1a958eb4d1 Binary files /dev/null and b/recipes/icons/twitchfilms.png differ diff --git a/recipes/india_today.recipe b/recipes/india_today.recipe index 604a7f57ad..7b53fe3d65 100644 --- a/recipes/india_today.recipe +++ b/recipes/india_today.recipe @@ -1,76 +1,25 @@ + from calibre.web.feeds.news import BasicNewsRecipe class IndiaToday(BasicNewsRecipe): - - title = 'India Today' - __author__ = 'Kovid Goyal' - language = 'en_IN' - timefmt = ' [%d %m, %Y]' - - oldest_article = 700 - max_articles_per_feed = 10 + title = u'India Today' + language = 'en_IN' + __author__ = 'Krittika Goyal' + oldest_article = 15 #days + max_articles_per_feed = 25 no_stylesheets = True + auto_cleanup = True - remove_tags_before = dict(id='content_story_title') - remove_tags_after = dict(id='rightblockdiv') - remove_tags = [dict(id=['rightblockdiv', 'share_links'])] - - extra_css = '#content_story_title { font-size: 170%; font-weight: bold;}' - conversion_options = { 'linearize_tables': True } - - def it_get_index(self): - soup = self.index_to_soup('http://indiatoday.intoday.in/site/archive') - a = soup.find('a', href=lambda x: x and 'issueId=' in x) - url = 'http://indiatoday.intoday.in/site/'+a.get('href') - img = a.find('img') - self.cover_url = img.get('src') - return self.index_to_soup(url) - - def parse_index(self): - soup = self.it_get_index() - feeds, current_section, current_articles = [], None, [] - for x in soup.findAll(name=['h1', 'a']): - if x.name == 'h1': - if current_section and current_articles: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - elif x.name == 'a' and 'Story' in x.get('href', ''): - title = self.tag_to_string(x) - url = x.get('href') - url = url.replace(' ', '%20') - if not url.startswith('/'): - url = 'http://indiatoday.intoday.in/site/' + url - if title and url: - url += '?complete=1' - self.log('\tFound article:', title) - self.log('\t\t', url) - desc = '' - h3 = x.parent.findNextSibling('h3') - if h3 is not None: - desc = 'By ' + self.tag_to_string(h3) - h4 = h3.findNextSibling('h4') - if h4 is not None: - desc = self.tag_to_string(h4) + ' ' + desc - if desc: - self.log('\t\t', desc) - current_articles.append({'title':title, 'description':desc, - 'url':url, 'date':''}) - - if current_section and current_articles: - feeds.append((current_section, current_articles)) - - return feeds - - def postprocess_html(self, soup, first): - a = soup.find(text='Print') - if a is not None: - tr = a.findParent('tr') - if tr is not None: - tr.extract() - return soup + feeds = [ +('Latest News', 'http://indiatoday.intoday.in/rss/article.jsp?sid=4'), +('Cover Story', 'http://indiatoday.intoday.in/rss/article.jsp?sid=30'), +('Nation', 'http://indiatoday.intoday.in/rss/article.jsp?sid=36'), +('States', 'http://indiatoday.intoday.in/rss/article.jsp?sid=21'), +('Economy', 'http://indiatoday.intoday.in/rss/article.jsp?sid=34'), +('World', 'http://indiatoday.intoday.in/rss/article.jsp?sid=61'), +('Sport', 'http://indiatoday.intoday.in/rss/article.jsp?sid=41'), +] diff --git a/recipes/inquirer_net.recipe 
b/recipes/inquirer_net.recipe
index 3a3d5b9e89..30f2519f8b 100644
--- a/recipes/inquirer_net.recipe
+++ b/recipes/inquirer_net.recipe
@@ -7,56 +7,33 @@ www.inquirer.net
 '''
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class InquirerNet(BasicNewsRecipe):
     title = 'Inquirer.net'
-    __author__ = 'Darko Miletic'
+    __author__ = 'Krittika Goyal'
     description = 'News from the Philippines'
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    encoding = 'cp1252'
+    encoding = 'utf8'
     publisher = 'inquirer.net'
     category = 'news, politics, philippines'
     lang = 'en'
     language = 'en'

-    extra_css = ' .fontheadline{font-size: x-large} .fontsubheadline{font-size: large} .fontkick{font-size: medium}'
+    use_embedded_content = False

-    html2lrf_options = [
-        '--comment', description
-        , '--category', category
-        , '--publisher', publisher
-        , '--ignore-tables'
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
-    remove_tags = [dict(name=['object','link','script','iframe','form'])]
+    no_stylesheets = True
+    auto_cleanup = True

     feeds = [
-        (u'Breaking news', u'http://services.inquirer.net/rss/breakingnews.xml' )
-        ,(u'Top stories' , u'http://services.inquirer.net/rss/topstories.xml' )
-        ,(u'Sports' , u'http://services.inquirer.net/rss/brk_breakingnews.xml' )
-        ,(u'InfoTech' , u'http://services.inquirer.net/rss/infotech_tech.xml' )
-        ,(u'InfoTech' , u'http://services.inquirer.net/rss/infotech_tech.xml' )
-        ,(u'Business' , u'http://services.inquirer.net/rss/inq7money_breaking_news.xml' )
-        ,(u'Editorial' , u'http://services.inquirer.net/rss/opinion_editorial.xml' )
-        ,(u'Global Nation', u'http://services.inquirer.net/rss/globalnation_breakingnews.xml')
+        (u'Inquirer', u'http://www.inquirer.net/fullfeed')
     ]

-    def preprocess_html(self, soup):
-        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mlang)
-        soup.head.insert(1,mcharset)
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        br.set_handle_gzip(True)
+        return br
+

-    def print_version(self, url):
-        rest, sep, art = url.rpartition('/view/')
-        art_id, sp, rrest = art.partition('/')
-        return 'http://services.inquirer.net/print/print.php?article_id=' + art_id
diff --git a/recipes/japan_times.recipe b/recipes/japan_times.recipe
index bb83b16f1e..229d5e4035 100644
--- a/recipes/japan_times.recipe
+++ b/recipes/japan_times.recipe
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic '
+__copyright__ = '2008-2011, Darko Miletic '
 '''
 japantimes.co.jp
 '''
@@ -9,24 +7,61 @@ japantimes.co.jp
 from calibre.web.feeds.news import BasicNewsRecipe

 class JapanTimes(BasicNewsRecipe):
-    title = u'The Japan Times'
+    title = 'The Japan Times'
     __author__ = 'Darko Miletic'
-    description = 'News from Japan'
-    language = 'en'
-
-    oldest_article = 7
-    max_articles_per_feed = 100
+    description = "Daily news and features on Japan from the most widely read English-language newspaper in Japan. Coverage includes national news, business news, sports news, commentary and features on living in Japan, entertainment, the arts, education and more."
+ language = 'en_JP' + category = 'news, politics, japan' + publisher = 'The Japan Times' + oldest_article = 5 + max_articles_per_feed = 150 no_stylesheets = True use_embedded_content = False + encoding = 'utf8' + publication_type = 'newspaper' + masthead_url = 'http://search.japantimes.co.jp/images/header_title.gif' + extra_css = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}' - keep_only_tags = [ dict(name='div', attrs={'id':'searchresult'}) ] - remove_tags_after = [ dict(name='div', attrs={'id':'mainbody' }) ] + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + + + keep_only_tags = [dict(name='div', attrs={'id':'printresult'})] remove_tags = [ - dict(name='div' , attrs={'id':'ads' }) - ,dict(name='table', attrs={'width':470}) + dict(name=['iframe','meta','link','embed','object','base']) + ,dict(attrs={'id':'searchfooter'}) ] + feeds = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')] + remove_attributes = ['border'] + def get_article_url(self, article): + rurl = BasicNewsRecipe.get_article_url(self, article) + return rurl.partition('?')[0] - feeds = [ - (u'The Japan Times', u'http://feedproxy.google.com/japantimes') - ] \ No newline at end of file + def print_version(self, url): + return url.replace('/cgi-bin/','/print/') + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + for item in soup.findAll('photo'): + item.name = 'div' + for item in soup.head.findAll('paragraph'): + item.extract() + for item in soup.findAll('wwfilename'): + item.extract() + for item in soup.findAll('jtcategory'): + item.extract() + for item in soup.findAll('nomooter'): + item.extract() + for item in soup.body.findAll('paragraph'): + item.name = 'p' + return soup diff --git a/recipes/people_us_mashup.recipe b/recipes/people_us_mashup.recipe index ed43e24e56..5d820bacc0 100644 --- a/recipes/people_us_mashup.recipe +++ b/recipes/people_us_mashup.recipe @@ -14,54 +14,11 @@ class PeopleMag(BasicNewsRecipe): use_embedded_content = False oldest_article = 2 max_articles_per_feed = 50 + use_embedded_content = False - extra_css = ''' - h1{font-family:verdana,arial,helvetica,sans-serif; font-size: large;} - h2{font-family:verdana,arial,helvetica,sans-serif; font-size: small;} - .body-content{font-family:verdana,arial,helvetica,sans-serif; font-size: small;} - .byline {font-size: small; color: #666666; font-style:italic; } - .lastline {font-size: small; color: #666666; font-style:italic;} - .contact {font-size: small; color: #666666;} - .contact p {font-size: small; color: #666666;} - .photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;} - .photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;} - .article_timestamp{font-size:x-small; color:#666666;} - a {font-family:verdana,arial,helvetica,sans-serif; font-size: x-small;} - ''' - - - keep_only_tags = [ - dict(name='div', attrs={'class': 'panel_news_article_main'}), - dict(name='div', attrs={'class':'article_content'}), - dict(name='div', attrs={'class': 'headline'}), - dict(name='div', attrs={'class': 'post'}), - dict(name='div', attrs={'class': 'packageheadlines'}), - dict(name='div', attrs={'class': 'snap_preview'}), - dict(name='div', attrs={'id': 'articlebody'}) - ] - - remove_tags = [ - dict(name='div', 
attrs={'class':'share_comments'}), - dict(name='p', attrs={'class':'twitter_facebook'}), - dict(name='div', attrs={'class':'share_comments_bottom'}), - dict(name='h2', attrs={'id':'related_content'}), - dict(name='div', attrs={'class':'next_article'}), - dict(name='div', attrs={'class':'prev_article'}), - dict(name='ul', attrs={'id':'sharebar'}), - dict(name='div', attrs={'class':'sharelinkcont'}), - dict(name='div', attrs={'class':'categories'}), - dict(name='ul', attrs={'class':'categories'}), - dict(name='div', attrs={'class':'related_content'}), - dict(name='div', attrs={'id':'promo'}), - dict(name='div', attrs={'class':'linksWrapper'}), - dict(name='p', attrs={'class':'tag tvnews'}), - dict(name='p', attrs={'class':'tag movienews'}), - dict(name='p', attrs={'class':'tag musicnews'}), - dict(name='p', attrs={'class':'tag couples'}), - dict(name='p', attrs={'class':'tag gooddeeds'}), - dict(name='p', attrs={'class':'tag weddings'}), - dict(name='p', attrs={'class':'tag health'}) -] + no_stylesheets = True + auto_cleanup = True + auto_cleanup_keep = '//div[@id="article-image"]' feeds = [ @@ -69,26 +26,4 @@ class PeopleMag(BasicNewsRecipe): ('US Headlines', 'http://www.usmagazine.com/celebrity_news/rss') ] - def get_article_url(self, article): - ans = article.link - try: - self.log('Looking for full story link in', ans) - soup = self.index_to_soup(ans) - x = soup.find(text="View All") - - if x is not None: - ans = ans + '?viewAll=y' - self.log('Found full story link', ans) - except: - pass - return ans - - def postprocess_html(self, soup,first): - - for tag in soup.findAll(name='div',attrs={'class':"container_ate_qandatitle"}): - tag.extract() - for tag in soup.findAll(name='br'): - tag.extract() - - return soup diff --git a/recipes/rtnews.recipe b/recipes/rtnews.recipe new file mode 100644 index 0000000000..22cdc6467f --- /dev/null +++ b/recipes/rtnews.recipe @@ -0,0 +1,64 @@ +__license__ = 'GPL v3' +__copyright__ = '2011, Darko Miletic ' +''' +rt.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class RT_eng(BasicNewsRecipe): + title = 'RT in English' + __author__ = 'Darko Miletic' + description = 'RT is the first Russian 24/7 English-language news channel which brings the Russian view on global news.' 
+ publisher = 'Autonomous Nonprofit Organization "TV-Novosti"' + category = 'news, politics, economy, finances, Russia, world' + oldest_article = 2 + no_stylesheets = True + encoding = 'utf8' + masthead_url = 'http://rt.com/s/css/img/printlogo.gif' + use_embedded_content = False + remove_empty_feeds = True + language = 'en_RU' + publication_type = 'newsportal' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif} + h1{font-family: Georgia,"Times New Roman",Times,serif} + .grey{color: gray} + .fs12{font-size: small} + """ + + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } + + keep_only_tags = [dict(name='div', attrs={'class':'all'})] + remove_tags = [ + dict(name=['object','link','embed','iframe','meta','link']) + ,dict(attrs={'class':'crumbs oh'}) + ] + remove_attributes = ['clear'] + + feeds = [ + (u'Politics' , u'http://rt.com/politics/rss/' ) + ,(u'USA' , u'http://rt.com/usa/news/rss/' ) + ,(u'Business' , u'http://rt.com/business/news/rss/' ) + ,(u'Sport' , u'http://rt.com/sport/rss/' ) + ,(u'Art&Culture', u'http://rt.com/art-and-culture/news/rss/') + ] + + def print_version(self, url): + return url + 'print/' + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + str = item.string + if str is None: + str = self.tag_to_string(item) + item.replaceWith(str) + return soup diff --git a/recipes/twitchfilms.recipe b/recipes/twitchfilms.recipe index 681eb05aba..dab0643410 100644 --- a/recipes/twitchfilms.recipe +++ b/recipes/twitchfilms.recipe @@ -1,12 +1,9 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2011, Darko Miletic ' ''' -twitchfilm.net/site/ +twitchfilm.net/news/ ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class Twitchfilm(BasicNewsRecipe): title = 'Twitch Films' @@ -15,29 +12,46 @@ class Twitchfilm(BasicNewsRecipe): oldest_article = 30 max_articles_per_feed = 100 no_stylesheets = True - use_embedded_content = True + use_embedded_content = False encoding = 'utf-8' publisher = 'Twitch' + masthead_url = 'http://twitchfilm.com/img/logo.png' category = 'twitch, twitchfilm, movie news, movie reviews, cult cinema, independent cinema, anime, foreign cinema, geek talk' - language = 'en' - - lang = 'en-US' + language = 'en' conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : lang - , 'pretty_print' : True + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language } - remove_tags = [dict(name='div', attrs={'class':'feedflare'})] + keep_only_tags=[dict(attrs={'class':'asset-header'})] + remove_tags_after=dict(attrs={'class':'asset-body'}) + remove_tags = [ dict(name='div', attrs={'class':['social','categories']}) + , dict(attrs={'id':'main-asset'}) + , dict(name=['meta','link','iframe','embed','object']) + ] - feeds = [(u'News', u'http://feedproxy.google.com/TwitchEverything')] + feeds = [(u'News', u'http://feeds.twitchfilm.net/TwitchEverything')] def preprocess_html(self, soup): - mtag = Tag(soup,'meta',[('http-equiv','Content-Type'),('context','text/html; charset=utf-8')]) - soup.head.insert(0,mtag) - soup.html['lang'] = self.lang - return self.adeify_images(soup) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string 
is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe index a4899b7187..18aeab2648 100644 --- a/recipes/usatoday.recipe +++ b/recipes/usatoday.recipe @@ -13,6 +13,7 @@ class USAToday(BasicNewsRecipe): title = 'USA Today' __author__ = 'Kovid Goyal' oldest_article = 1 + publication_type = 'newspaper' timefmt = '' max_articles_per_feed = 20 language = 'en' diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index 7a044aa5a7..f01e7ae858 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -94,9 +94,11 @@ class WallStreetJournal(BasicNewsRecipe): if date is not None: self.timefmt = ' [%s]'%self.tag_to_string(date) - cov = soup.find('a', attrs={'class':'icon pdf'}, href=True) + cov = soup.find('div', attrs={'class':'itpSectionHeaderPdf'}) if cov is not None: - self.cover_url = cov['href'] + a = cov.find('a', href=True) + if a is not None: + self.cover_url = a['href'] feeds = [] div = soup.find('div', attrs={'class':'itpHeader'}) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index ead9995eb3..f12121dd89 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -61,7 +61,7 @@ authors_completer_append_separator = False # selecting 'manage authors', and pressing 'Recalculate all author sort values'. # The author name suffixes are words that are ignored when they occur at the # end of an author name. The case of the suffix is ignored and trailing -# periods are automatically handled. +# periods are automatically handled. The same is true for prefixes. # The author name copy words are a set of words which if they occur in an # author name cause the automatically generated author sort string to be # identical to the author name. This means that the sort for a string like Acme diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index 528057dad9..fa4796a5a9 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -653,6 +653,15 @@ class KOBO(USBMS): debug_print(' Commit: Set FavouritesIndex') def update_device_database_collections(self, booklists, collections_attributes, oncard): + # Only process categories in this list + supportedcategories = { + "Im_Reading":1, + "Read":2, + "Closed":3, + "Shortlist":4, + # "Preview":99, # Unsupported as we don't want to change it + } + # Define lists for the ReadStatus readstatuslist = { "Im_Reading":1, @@ -692,6 +701,7 @@ class KOBO(USBMS): # Process any collections that exist for category, books in collections.items(): + if category in supportedcategories: debug_print("Category: ", category, " id = ", readstatuslist.get(category)) for book in books: debug_print(' Title:', book.title, 'category: ', category) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 3e5313eb96..df1098da8f 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -368,7 +368,10 @@ OptionRecommendation(name='remove_paragraph_spacing_indent_size', recommended_value=1.5, level=OptionRecommendation.LOW, help=_('When calibre removes blank lines between paragraphs, it automatically ' 'sets a paragraph indent, to ensure that paragraphs can be easily ' - 'distinguished. 
This option controls the width of that indent (in em).')
+        'distinguished. This option controls the width of that indent (in em). '
+        'If you set this value to 0, then the indent specified in the input '
+        'document is used, unless you also set the insert line between '
+        'paragraphs option.')
        ),

OptionRecommendation(name='prefer_metadata_cover',
@@ -394,8 +397,9 @@ OptionRecommendation(name='insert_blank_line_size',

OptionRecommendation(name='remove_first_image',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Remove the first image from the input ebook. Useful if the '
-            'first image in the source file is a cover and you are specifying '
-            'an external cover.'
+            'input document has a cover image that is not identified as a cover. '
+            'In this case, if you set a cover in calibre, the output document will '
+            'end up with two cover images if you do not specify this option.'
            )
        ),

@@ -1024,7 +1028,7 @@ OptionRecommendation(name='sr3_replace',
                self.output_plugin.file_type not in ('mobi', 'lrf'):
            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
            LinearizeTables()(self.oeb, self.opts)
-
+
        if self.opts.unsmarten_punctuation:
            from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
            UnsmartenPunctuation()(self.oeb, self.opts)
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 69eb493c7d..4ebf344a2a 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -75,7 +75,8 @@ class IgnoreFile(Exception):

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
-        self.doesnt_exist = errno == 2
+        from errno import ENOENT  # the errno parameter shadows the errno module
+        self.doesnt_exist = errno == ENOENT
        self.errno = errno

class HTMLFile(object):
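A caution on the IgnoreFile hunk above: inside __init__ the int parameter named errno shadows the errno module, so ENOENT has to be imported directly. A minimal sketch of the pitfall, using nothing beyond the standard library:

    from errno import ENOENT

    def is_missing(errno):
        # 'errno' here is the int parameter; the module of the same name is
        # shadowed, so writing 'errno.ENOENT' would raise AttributeError
        return errno == ENOENT

    is_missing(2)  # -> True on platforms where ENOENT == 2
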
diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py
index c3a229fe3c..07fae187ba 100644
--- a/src/calibre/ebooks/metadata/__init__.py
+++ b/src/calibre/ebooks/metadata/__init__.py
@@ -65,20 +65,27 @@ def author_to_author_sort(author, method=None):
    suffixes = set([x.lower() for x in tweaks['author_name_suffixes']])
    suffixes |= set([x+u'.' for x in suffixes])

-    last = tokens[-1].lower()
-    suffix = None
-    if last in suffixes:
-        suffix = tokens[-1]
-        tokens = tokens[:-1]
+    suffix = u''
+    while True:
+        if not tokens:
+            return author
+        last = tokens[-1].lower()
+        if last in suffixes:
+            suffix = tokens[-1] + ' ' + suffix
+            tokens = tokens[:-1]
+        else:
+            break
+    suffix = suffix.strip()

    if method == u'comma' and u',' in u''.join(tokens):
        return author

    atokens = tokens[-1:] + tokens[:-1]
+    num_toks = len(atokens)
    if suffix:
        atokens.append(suffix)

-    if method != u'nocomma' and len(atokens) > 1:
+    if method != u'nocomma' and num_toks > 1:
        atokens[0] += u','

    return u' '.join(atokens)
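The new suffix loop in author_to_author_sort peels any number of recognised suffixes off the end of a name before inverting it around the surname. A simplified sketch of that behaviour, assuming the default tweak list contains 'Jr' (and hence 'jr.' via the set union above):

    suffixes = {'jr', 'jr.', 'sr', 'sr.'}  # illustrative subset of the tweak

    def invert(author):
        tokens = author.split()
        suffix = ''
        while tokens and tokens[-1].lower() in suffixes:
            suffix = (tokens.pop() + ' ' + suffix).strip()
        if not tokens:
            return author  # the name was nothing but suffixes
        atokens = tokens[-1:] + tokens[:-1]  # move the surname to the front
        if len(atokens) > 1:
            atokens[0] += ','
        return ' '.join(atokens + ([suffix] if suffix else []))

    invert('John Smith Jr.')  # -> 'Smith, John Jr.'
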
diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py
index 74db3b3a58..2da9f74961 100644
--- a/src/calibre/ebooks/metadata/mobi.py
+++ b/src/calibre/ebooks/metadata/mobi.py
@@ -330,9 +330,11 @@ class MetadataUpdater(object):
            prefs = load_defaults('mobi_output')
            pas = prefs.get('prefer_author_sort', False)
            kindle_pdoc = prefs.get('personal_doc', None)
+            share_not_sync = prefs.get('share_not_sync', False)
        except:
            pas = False
            kindle_pdoc = None
+            share_not_sync = False
        if mi.author_sort and pas:
            authors = mi.author_sort
            update_exth_record((100, normalize(authors).encode(self.codec, 'replace')))
@@ -376,7 +378,7 @@ class MetadataUpdater(object):
        # Add a 113 record if not present to allow Amazon syncing
        if (113 not in self.original_exth_records and
                self.original_exth_records.get(501, None) == 'EBOK' and
-                not added_501):
+                not added_501 and not share_not_sync):
            from uuid import uuid4
            update_exth_record((113, str(uuid4())))
        if 503 in self.original_exth_records:
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 2d7bb73e9c..701394e1a5 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -116,7 +116,8 @@ def cap_author_token(token):
    lt = lower(token)
    if lt in ('von', 'de', 'el', 'van', 'le'):
        return lt
-    if re.match(r'([a-z]\.){2,}$', lt) is not None:
+    # no digits, no special characters
+    if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
        # Normalize tokens of the form J.K. to J. K.
        parts = token.split('.')
        return '. '.join(map(capitalize, parts)).strip()
diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py
index 3f5f956fae..fa9951c40c 100644
--- a/src/calibre/ebooks/metadata/sources/ozon.py
+++ b/src/calibre/ebooks/metadata/sources/ozon.py
@@ -28,7 +28,7 @@ class Ozon(Source):
    touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
                                'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
    # Test purpose only, the test function does not like it when some fields are sometimes empty
-    #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
+    # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
    #                             'publisher', 'pubdate', 'comments'])

    supports_gzip_transfer_encoding = True
@@ -109,8 +109,16 @@ class Ozon(Source):
    # }}}

    def get_metadata(self, log, entries, title, authors, identifiers): # {{{
+        # some book titles have extra characters like this
+        # TODO: make a tweak
+        reRemoveFromTitle = None
+        #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
+
        title = unicode(title).upper() if title else ''
+        if reRemoveFromTitle:
+            title = reRemoveFromTitle.sub('', title)
-        authors = map(unicode.upper, map(unicode, authors)) if authors else None
+        authors = map(_normalizeAuthorNameWithInitials,
+                      map(unicode.upper, map(unicode, authors))) if authors else None
        ozon_id = identifiers.get('ozon', None)

        unk = unicode(_('Unknown')).upper()
@@ -124,6 +132,7 @@
        def in_authors(authors, miauthors):
            for author in authors:
                for miauthor in miauthors:
+                    #log.debug(u'=> %s <> %s'%(author, miauthor))
                    if author in miauthor:
                        return True
            return None
@@ -131,7 +140,10 @@
            match = True
            if title:
                mititle = unicode(mi.title).upper() if mi.title else ''
+                if reRemoveFromTitle:
+                    mititle = reRemoveFromTitle.sub('', mititle)
                match = title in mititle
+                #log.debug(u't=> %s <> %s'%(title, mititle))
            if match and authors:
                miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
                match = in_authors(authors, miauthors)
@@ -190,7 +202,8 @@
        title = entry.xpath(xp_template.format('Name'))
        author = entry.xpath(xp_template.format('Author'))
-        mi = Metadata(title, author.split(','))
+        norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
+        mi = Metadata(title, norm_authors)
        ozon_id = entry.xpath(xp_template.format('ID'))
        mi.identifiers = {'ozon':ozon_id}
@@ -202,6 +215,11 @@
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

+        pub_year = entry.xpath(xp_template.format('Year'))
+        if pub_year:
+            mi.pubdate = toPubdate(log, pub_year)
+            #log.debug('pubdate %s'%mi.pubdate)
+
        rating = entry.xpath(xp_template.format('ClientRatingValue'))
        if rating:
            try:
@@ -269,13 +287,17 @@
        raw = self.browser.open_novisit(url, timeout=timeout).read()
        doc = html.fromstring(raw)

+        xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
+        xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
+
        # series
-        xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)'
+        xpt = xpt_prod_det_at % u'Сери'
+        # % u'Серия:'
        series = doc.xpath(xpt)
        if series:
            metadata.series = series

-        xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")'
+        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
        isbn_str = doc.xpath(xpt)
        if isbn_str:
            all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
@@ -283,38 +305,42 @@
                metadata.all_isbns = all_isbns
                metadata.isbn = all_isbns[0]

-        xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]'
+        xpt = xpt_prod_det_at % u'Издатель'
        publishers = doc.xpath(xpt)
        if publishers:
-            metadata.publisher = publishers[0].text
+            metadata.publisher = publishers

-            xpt = u'string(../text()[contains(., "г.")])'
-            yearIn = publishers[0].xpath(xpt)
+        displ_lang = None
+        xpt = xpt_prod_det_tx % u'Язык'
+        langs = doc.xpath(xpt)
+        if langs:
+            lng_splt = langs.split(u',')
+            if lng_splt:
+                displ_lang = lng_splt[0].strip()
+        metadata.language = _translageLanguageToCode(displ_lang)
+        #log.debug(u'language: %s'%displ_lang)
+
+        # can be set earlier from the xml search response
+        if not metadata.pubdate:
+            xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
+            yearIn = doc.xpath(xpt)
            if yearIn:
                matcher = re.search(r'\d{4}', yearIn)
                if matcher:
-                    year = int(matcher.group(0))
-                    # only year is available, so use 1st of Jan
-                    metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparison in identify.py
-                    #metadata.pubdate = datetime(year, 1, 1)
-            xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
-            displLang = publishers[0].xpath(xpt)
-            lang_code =_translageLanguageToCode(displLang)
-            if lang_code:
-                metadata.language = lang_code
+                    metadata.pubdate = toPubdate(log, matcher.group(0))

        # overwrite comments from HTML if any
-        # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
-        xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
-              u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
+        xpt = u'//table[@id="detail_description"]//tr/td'
        comment_elem = doc.xpath(xpt)
        if comment_elem:
            comments = unicode(etree.tostring(comment_elem[0]))
            if comments:
                # cleanup root tag, TODO: remove tags like object/embedded
-                comments = re.sub(r'^<td.*?>|</td>.+?$', u'', comments).strip()
-                if comments:
+                comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), flags=re.MULTILINE).strip()
+                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
                    metadata.comments = comments
+                else:
+                    log.debug('HTML book description skipped in favour of search service xml response')
        else:
            log.debug('No book description found in HTML')
# }}}

@@ -390,10 +416,40 @@ def _translageLanguageToCode(displayLang): # {{{
               u'Итальянский': 'it',
               u'Испанский': 'es',
               u'Китайский': 'zh',
-               u'Японский': 'ja' }
+               u'Японский': 'ja',
+               u'Финский' : 'fi',
+               u'Польский' : 'pl',}
    return langTbl.get(displayLang, None)
# }}}

+# [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
+def _normalizeAuthorNameWithInitials(name): # {{{
+    res = name
+    if name:
+        re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
+        re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
+        matcher = re.match(re1, unicode(name), re.UNICODE)
+        if not matcher:
+            matcher = re.match(re2, unicode(name), re.UNICODE)
+
+        if matcher:
+            d = matcher.groupdict()
+            res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
+    return res
+# }}}
+
+def toPubdate(log, yearAsString):
+    res = None
+    if yearAsString:
+        try:
+            year = int(yearAsString)
+            # only the year is available, so use the 1st of Jan
+            res = datetime.datetime(year, 1, 1)
+        except:
+            log.error('cannot parse to date %s'%yearAsString)
+    return res
+
+
 if __name__ == '__main__': # tests {{{
    # To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
    # comment out some touched_fields before running these tests
@@ -403,40 +459,45 @@
    test_identify_plugin(Ozon.name,
        [
-
-            (
+#            (
+#                {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
+#                    'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
+#                [title_test(u'Норвежский язык: Практический курс', exact=True),
+#                    authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
+#            ),
+            (
                {'identifiers':{'isbn': '9785916572629'} },
                [title_test(u'На все четыре стороны', exact=True),
                    authors_test([u'А. А. Гилл'])]
-            ),
-            (
+            ),
+            (
                {'identifiers':{}, 'title':u'Der Himmel Kennt Keine Gunstlinge',
                    'authors':[u'Erich Maria Remarque']},
                [title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True),
                    authors_test([u'Erich Maria Remarque'])]
-            ),
-            (
+            ),
+            (
                {'identifiers':{ }, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=False)]
-            ),
-            (
+            ),
+            (
                {'identifiers':{'isbn': '9785170727209'}, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=True),
                    authors_test([u'Дмитрий Глуховский']),
                    isbn_test('9785170727209')]
-            ),
-            (
+            ),
+            (
                {'identifiers':{'isbn': '5-699-13613-4'}, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=True),
                    authors_test([u'Дмитрий Глуховский'])]
-            ),
-            (
+            ),
+            (
                {'identifiers':{}, 'title':u'Метро',
                    'authors':[u'Глуховский']},
                [title_test(u'Метро', exact=False)]
-            ),
+            ),
        ])
# }}}
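_normalizeAuthorNameWithInitials above accepts initials on either side of the surname and emits a uniform 'И. О. Фамилия' form. A quick check of its first pattern (a standalone sketch, exercising the same re.match call the plugin makes):

    # -*- coding: utf-8 -*-
    import re

    re1 = u'^(?P<lname>\\S+)\\s+(?P<fname>[^\\d\\W]\\.)(?:\\s*(?P<mname>[^\\d\\W]\\.))?$'
    d = re.match(re1, u'Колесников В.П.', re.UNICODE).groupdict()
    print(' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x))
    # -> В. П. Колесников
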
diff --git a/src/calibre/ebooks/mobi/output.py b/src/calibre/ebooks/mobi/output.py
index 4f5d09c894..f22015d71f 100644
--- a/src/calibre/ebooks/mobi/output.py
+++ b/src/calibre/ebooks/mobi/output.py
@@ -55,6 +55,11 @@ class MOBIOutput(OutputFormatPlugin):
               ' specified directory. If the directory already '
               'exists, it will be deleted.')
        ),
+        OptionRecommendation(name='share_not_sync', recommended_value=False,
+            help=_('Enable sharing of book content via Facebook etc. '
+                ' on the Kindle. WARNING: Using this feature means that '
+                ' the book will not auto sync its last read position '
+                ' on multiple devices. Complain to Amazon.'))
    ])

    def check_for_periodical(self):
diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py
index 7e748aac95..1705a5a342 100644
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -61,6 +61,13 @@ class MobiWriter(object):

    def __call__(self, oeb, path_or_stream):
        self.log = oeb.log
+        pt = None
+        if oeb.metadata.publication_type:
+            x = unicode(oeb.metadata.publication_type[0]).split(':')
+            if len(x) > 1:
+                pt = x[1].lower()
+        self.publication_type = pt
+
        if hasattr(path_or_stream, 'write'):
            return self.dump_stream(oeb, path_or_stream)
        with open(path_or_stream, 'w+b') as stream:
@@ -346,12 +353,14 @@ class MobiWriter(object):
        bt = 0x002
        if self.primary_index_record_idx is not None:
-            if self.indexer.is_flat_periodical:
+            if False and self.indexer.is_flat_periodical:
+                # Disabled as setting this to 0x102 causes the Kindle to not
+                # auto archive the issues
                bt = 0x102
            elif self.indexer.is_periodical:
                # If you change this, remember to change the cdetype in the EXTH
                # header as well
-                bt = 0x103
+                bt = {'newspaper':0x101}.get(self.publication_type, 0x103)

        record0.write(pack(b'>IIIII', 0xe8, bt, 65001, uid, 6))
@@ -520,20 +529,22 @@ class MobiWriter(object):
        if isinstance(uuid, unicode):
            uuid = uuid.encode('utf-8')

-        exth.write(pack(b'>II', 113, len(uuid) + 8))
-        exth.write(uuid)
-        nrecs += 1
+        if not self.opts.share_not_sync:
+            exth.write(pack(b'>II', 113, len(uuid) + 8))
+            exth.write(uuid)
+            nrecs += 1

        # Write cdetype
-        if self.is_periodical:
-            # If you set the book type header field to 0x101 use NWPR here if
-            # you use 0x103 use MAGZ
-            data = b'MAGZ'
+        if not self.is_periodical:
+            exth.write(pack(b'>II', 501, 12))
+            exth.write(b'EBOK')
+            nrecs += 1
        else:
-            data = b'EBOK'
-        exth.write(pack(b'>II', 501, len(data)+8))
-        exth.write(data)
-        nrecs += 1
+            # Should be b'NWPR' for doc type of 0x101 and b'MAGZ' for doctype
+            # of 0x103 but the old writer didn't write them, and I don't know
+            # what it should be for type 0x102 (b'BLOG'?) 
so write nothing + # instead + pass # Add a publication date entry if oeb.metadata['date']: diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index ed6df6698a..eeef720144 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -160,7 +160,9 @@ class Serializer(object): buf.write(b'title="') self.serialize_text(ref.title, quot=True) buf.write(b'" ') - if ref.title == 'start': + if (ref.title.lower() == 'start' or + (ref.type and ref.type.lower() in ('start', + 'other.start'))): self._start_href = ref.href self.serialize_href(ref.href) # Space required or won't work, I kid you not @@ -348,8 +350,9 @@ class Serializer(object): ''' buf = self.buf id_offsets = self.id_offsets + start_href = getattr(self, '_start_href', None) for href, hoffs in self.href_offsets.items(): - is_start = (href and href == getattr(self, '_start_href', None)) + is_start = (href and href == start_href) # Iterate over all filepos items if href not in id_offsets: self.logger.warn('Hyperlink target %r not found' % href) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 1493a647ae..6458ca80b0 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -320,9 +320,11 @@ class CSSFlattener(object): if self.context.insert_blank_line: cssdict['margin-top'] = cssdict['margin-bottom'] = \ '%fem'%self.context.insert_blank_line_size - if (self.context.remove_paragraph_spacing and + indent_size = self.context.remove_paragraph_spacing_indent_size + keep_indents = indent_size == 0.0 and not self.context.insert_blank_line + if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')): - cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size + cssdict['text-indent'] = "%1.1fem" % indent_size if cssdict: items = cssdict.items() diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index a791dab48a..9d81c73c2a 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -53,7 +53,7 @@ def pdftohtml(output_dir, pdf_path, no_images): p = popen(cmd, stderr=logf._fd, stdout=logf._fd, stdin=subprocess.PIPE) except OSError as err: - if err.errno == 2: + if err.errno == errno.ENOENT: raise ConversionError(_('Could not find pdftohtml, check it is in your PATH')) else: raise diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index ebe6533419..05d874c9c3 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -11,6 +11,7 @@ Write content to PDF. import os import shutil +from calibre import isosx from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ebooks.pdf.pageoptions import unit, paper_size, \ orientation @@ -164,6 +165,12 @@ class PDFWriter(QObject): # {{{ self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue))) printer = get_pdf_printer(self.opts) printer.setOutputFileName(item_path) + # We have to set the engine to Native on OS X after the call to set + # filename. Setting a filename with .pdf as the extension causes + # Qt to set the format to use Qt's PDF engine even if native was + # previously set on the printer. 
+ if isosx: + printer.setOutputFormat(QPrinter.NativeFormat) self.view.print_(printer) printer.abort() self._render_book() @@ -179,6 +186,8 @@ class PDFWriter(QObject): # {{{ item_path = os.path.join(self.tmp_path, 'cover.pdf') printer = get_pdf_printer(self.opts) printer.setOutputFileName(item_path) + if isosx: + printer.setOutputFormat(QPrinter.NativeFormat) self.combine_queue.insert(0, item_path) p = QPixmap() p.loadFromData(self.cover_data) @@ -202,7 +211,7 @@ class PDFWriter(QObject): # {{{ inputPDF = PdfFileReader(item_stream) for page in inputPDF.pages: outPDF.addPage(page) - outPDF.write(self.out_stream) + outPDF.write(self.out_stream) finally: self._delete_tmpdir() self.loop.exit(0) @@ -229,6 +238,8 @@ class ImagePDFWriter(object): def render_images(self, outpath, mi, items): printer = get_pdf_printer(self.opts, for_comic=True) printer.setOutputFileName(outpath) + if isosx: + printer.setOutputFormat(QPrinter.NativeFormat) printer.setDocName(mi.title) printer.setCreator(u'%s [%s]'%(__appname__, __version__)) # Seems to be no way to set author diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py index 7713584d14..028a4d6ede 100644 --- a/src/calibre/ebooks/readability/readability.py +++ b/src/calibre/ebooks/readability/readability.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + import re, sys from collections import defaultdict @@ -72,10 +77,15 @@ class Document: self.options[k] = v self.html = None self.log = log + self.keep_elements = set() def _html(self, force=False): if force or self.html is None: self.html = self._parse(self.input) + path = self.options['keep_elements'] + if path is not None: + self.keep_elements = set(self.html.xpath(path)) + return self.html def _parse(self, input): @@ -152,8 +162,9 @@ class Document: append = False if sibling is best_elem: append = True - sibling_key = sibling #HashableElement(sibling) - if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold: + if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold: + append = True + if sibling in self.keep_elements: append = True if sibling.tag == "p": @@ -283,6 +294,8 @@ class Document: def remove_unlikely_candidates(self): for elem in self.html.iter(): + if elem in self.keep_elements: + continue s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) #self.debug(s) if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body': @@ -337,7 +350,7 @@ class Document: allowed = {} # Conditionally clean s,
<ul>s, and <div>
    s for el in self.reverse_tags(node, "table", "ul", "div"): - if el in allowed: + if el in allowed or el in self.keep_elements: continue weight = self.class_weight(el) if el in candidates: @@ -450,64 +463,39 @@ class Document: #self.debug("pname %s pweight %.3f" %(pname, pweight)) el.drop_tree() - for el in ([node] + [n for n in node.iter()]): - if not (self.options['attributes']): - #el.attrib = {} #FIXME:Checkout the effects of disabling this - pass - return clean_attributes(tounicode(node)) +def option_parser(): + from calibre.utils.config import OptionParser + parser = OptionParser(usage='%prog: [options] file') + parser.add_option('-v', '--verbose', default=False, action='store_true', + dest='verbose', + help='Show detailed output information. Useful for debugging') + parser.add_option('-k', '--keep-elements', default=None, action='store', + dest='keep_elements', + help='XPath specifying elements that should not be removed') -class HashableElement(): - def __init__(self, node): - self.node = node - self._path = None - - def _get_path(self): - if self._path is None: - reverse_path = [] - node = self.node - while node is not None: - node_id = (node.tag, tuple(node.attrib.items()), node.text) - reverse_path.append(node_id) - node = node.getparent() - self._path = tuple(reverse_path) - return self._path - path = property(_get_path) - - def __hash__(self): - return hash(self.path) - - def __eq__(self, other): - return self.path == other.path - - def __getattr__(self, tag): - return getattr(self.node, tag) + return parser def main(): - import logging - from optparse import OptionParser - parser = OptionParser(usage="%prog: [options] [file]") - parser.add_option('-v', '--verbose', action='store_true') - parser.add_option('-u', '--url', help="use URL instead of a local file") - (options, args) = parser.parse_args() + from calibre.utils.logging import default_log + parser = option_parser() + options, args = parser.parse_args() - if not (len(args) == 1 or options.url): + if len(args) != 1: parser.print_help() - sys.exit(1) - logging.basicConfig(level=logging.INFO) + raise SystemExit(1) + + with open(args[0], 'rb') as f: + raw = f.read() - file = None - if options.url: - import urllib - file = urllib.urlopen(options.url) - else: - file = open(args[0], 'rt') enc = sys.__stdout__.encoding or 'utf-8' - try: - print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace') - finally: - file.close() + if options.verbose: + default_log.filter_level = default_log.DEBUG + print (Document(raw, default_log, + debug=options.verbose, + keep_elements=options.keep_elements).summary().encode(enc, + 'replace')) if __name__ == '__main__': main() diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index ccd1dac1ad..0e123bee8b 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -142,7 +142,7 @@ def _config(): # {{{ c.add_opt('upload_news_to_device', default=True, help=_('Upload downloaded news to device')) c.add_opt('delete_news_from_library_on_upload', default=False, - help=_('Delete books from library after uploading to device')) + help=_('Delete news books from library after uploading to device')) c.add_opt('separate_cover_flow', default=False, help=_('Show the cover flow in a separate window instead of in the main calibre window')) c.add_opt('disable_tray_notification', default=False, diff --git a/src/calibre/gui2/actions/copy_to_library.py b/src/calibre/gui2/actions/copy_to_library.py index 14c61c91e6..fdcce87342 100644 --- 
a/src/calibre/gui2/actions/copy_to_library.py +++ b/src/calibre/gui2/actions/copy_to_library.py @@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en' import os from functools import partial from threading import Thread +from contextlib import closing from PyQt4.Qt import QToolButton @@ -52,7 +53,13 @@ class Worker(Thread): # {{{ def doit(self): from calibre.library.database2 import LibraryDatabase2 - newdb = LibraryDatabase2(self.loc) + newdb = LibraryDatabase2(self.loc, is_second_db=True) + with closing(newdb): + self._doit(newdb) + newdb.break_cycles() + del newdb + + def _doit(self, newdb): for i, x in enumerate(self.ids): mi = self.db.get_metadata(x, index_is_id=True, get_cover=True, cover_as_data=True) @@ -111,6 +118,7 @@ class Worker(Thread): # {{{ os.remove(path) except: pass + # }}} class CopyToLibraryAction(InterfaceAction): diff --git a/src/calibre/gui2/convert/mobi_output.py b/src/calibre/gui2/convert/mobi_output.py index f268ac8606..cd1d0430ae 100644 --- a/src/calibre/gui2/convert/mobi_output.py +++ b/src/calibre/gui2/convert/mobi_output.py @@ -23,7 +23,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['prefer_author_sort', 'rescale_images', 'toc_title', 'mobi_ignore_margins', 'mobi_toc_at_start', - 'dont_compress', 'no_inline_toc', + 'dont_compress', 'no_inline_toc', 'share_not_sync', 'personal_doc']#, 'mobi_navpoints_only_deepest'] ) self.db, self.book_id = db, book_id diff --git a/src/calibre/gui2/convert/mobi_output.ui b/src/calibre/gui2/convert/mobi_output.ui index 7643d791f3..68cd55ab95 100644 --- a/src/calibre/gui2/convert/mobi_output.ui +++ b/src/calibre/gui2/convert/mobi_output.ui @@ -75,6 +75,13 @@ + + + + Enable sharing of book content via Facebook, etc. WARNING: Disables last read syncing + + + diff --git a/src/calibre/gui2/jobs.py b/src/calibre/gui2/jobs.py index b7992eb319..a6011abaa0 100644 --- a/src/calibre/gui2/jobs.py +++ b/src/calibre/gui2/jobs.py @@ -266,7 +266,7 @@ class JobManager(QAbstractTableModel): # {{{ def kill_multiple_jobs(self, rows, view): jobs = [self.jobs[row] for row in rows] - devjobs = [j for j in jobs is isinstance(j, DeviceJob)] + devjobs = [j for j in jobs if isinstance(j, DeviceJob)] if devjobs: error_dialog(view, _('Cannot kill job'), _('Cannot kill jobs that communicate with the device')).exec_() diff --git a/src/calibre/gui2/keyboard.py b/src/calibre/gui2/keyboard.py index 9b0b1d8f69..362a074304 100644 --- a/src/calibre/gui2/keyboard.py +++ b/src/calibre/gui2/keyboard.py @@ -443,7 +443,13 @@ class Editor(QFrame): # {{{ return QWidget.keyPressEvent(self, ev) button = getattr(self, 'button%d'%which) button.setStyleSheet('QPushButton { font-weight: normal}') - sequence = QKeySequence(code|(int(ev.modifiers())&~Qt.KeypadModifier)) + mods = int(ev.modifiers()) & ~Qt.KeypadModifier + txt = unicode(ev.text()) + if txt and txt.lower() == txt.upper(): + # We have a symbol like ! or > etc. 
In this case the value of code + # already includes Shift, so remove it + mods &= ~Qt.ShiftModifier + sequence = QKeySequence(code|mods) button.setText(sequence.toString(QKeySequence.NativeText)) self.capture = 0 dup_desc = self.dup_check(sequence) diff --git a/src/calibre/gui2/metadata/basic_widgets.py b/src/calibre/gui2/metadata/basic_widgets.py index a349b8ca92..fe20be765f 100644 --- a/src/calibre/gui2/metadata/basic_widgets.py +++ b/src/calibre/gui2/metadata/basic_widgets.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import textwrap, re, os +import textwrap, re, os, errno from PyQt4.Qt import (Qt, QDateEdit, QDate, pyqtSignal, QMessageBox, QIcon, QToolButton, QWidget, QLabel, QGridLayout, QApplication, @@ -98,7 +98,7 @@ class TitleEdit(EnLineEdit): getattr(db, 'set_'+ self.TITLE_ATTR)(id_, title, notify=False, commit=False) except (IOError, OSError) as err: - if getattr(err, 'errno', -1) == 13: # Permission denied + if getattr(err, 'errno', -1) == errno.EACCES: # Permission denied import traceback fname = err.filename if err.filename else 'file' error_dialog(self, _('Permission denied'), @@ -262,7 +262,7 @@ class AuthorsEdit(MultiCompleteComboBox): self.books_to_refresh |= db.set_authors(id_, authors, notify=False, allow_case_change=True) except (IOError, OSError) as err: - if getattr(err, 'errno', -1) == 13: # Permission denied + if getattr(err, 'errno', -1) == errno.EACCES: # Permission denied import traceback fname = err.filename if err.filename else 'file' error_dialog(self, _('Permission denied'), diff --git a/src/calibre/gui2/metadata/single.py b/src/calibre/gui2/metadata/single.py index a2666b0351..bbc5f6fce4 100644 --- a/src/calibre/gui2/metadata/single.py +++ b/src/calibre/gui2/metadata/single.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os +import os, errno from functools import partial from PyQt4.Qt import (Qt, QVBoxLayout, QHBoxLayout, QWidget, QPushButton, @@ -427,7 +427,7 @@ class MetadataSingleDialogBase(ResizableDialog): self.books_to_refresh |= getattr(widget, 'books_to_refresh', set([])) except IOError as err: - if err.errno == 13: # Permission denied + if err.errno == errno.EACCES: # Permission denied import traceback fname = err.filename if err.filename else 'file' error_dialog(self, _('Permission denied'), diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py index 866c1c2732..3934ebbbb3 100644 --- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py +++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py @@ -80,13 +80,15 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): doc = html.fromstring(f.read()) # example where we are going to find formats - #
 <div class="box">
-        #   ...
-        #   Доступные форматы:
-        #   <div>.epub, .fb2, .pdf, .pdf, .txt</div>
-        #   ...
-        # </div>
-        xpt = u'normalize-space(//div[@class="box"]//*[contains(normalize-space(text()), "Доступные форматы:")][1]/following-sibling::div[1]/text())'
+        # <div class="product-detail">
+        #   <div>
+        #     Доступно:
+        #   </div>
+        #   <div>
+        #     <span>.epub, .fb2.zip, .pdf</span>
+        #   </div>
+        # </div>
    + xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])' formats = doc.xpath(xpt) if formats: result = True diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index c65484ff56..d6c2ddd659 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -161,7 +161,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): return path and os.path.exists(os.path.join(path, 'metadata.db')) def __init__(self, library_path, row_factory=False, default_prefs=None, - read_only=False): + read_only=False, is_second_db=False): + self.is_second_db = is_second_db try: if isbytestring(library_path): library_path = library_path.decode(filesystem_encoding) @@ -263,7 +264,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): migrate_preference('user_categories', {}) migrate_preference('saved_searches', {}) - set_saved_searches(self, 'saved_searches') + if not self.is_second_db: + set_saved_searches(self, 'saved_searches') # migrate grouped_search_terms if self.prefs.get('grouped_search_terms', None) is None: diff --git a/src/calibre/library/server/base.py b/src/calibre/library/server/base.py index 26e4d3469e..69322512a0 100644 --- a/src/calibre/library/server/base.py +++ b/src/calibre/library/server/base.py @@ -34,7 +34,7 @@ class DispatchController(object): # {{{ def __init__(self, prefix, wsgi=False): self.dispatcher = cherrypy.dispatch.RoutesDispatcher() self.funcs = [] - self.seen = set([]) + self.seen = set() self.prefix = prefix if prefix else '' if wsgi: self.prefix = '' @@ -146,6 +146,11 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache, self.config = {} self.is_running = False self.exception = None + #self.config['/'] = { + # 'tools.sessions.on' : True, + # 'tools.sessions.timeout': 60, # Session times out after 60 minutes + #} + if not wsgi: self.setup_loggers() cherrypy.engine.bonjour.subscribe() @@ -154,6 +159,7 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache, 'tools.gzip.mime_types': ['text/html', 'text/plain', 'text/xml', 'text/javascript', 'text/css'], } + if opts.password: self.config['/'] = { 'tools.digest_auth.on' : True, diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index e55970ccd7..bb6fe1c454 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -202,7 +202,7 @@ class ContentServer(object): mode='rb') if fmt is None: raise cherrypy.HTTPError(404, 'book: %d does not have format: %s'%(id, format)) - mi = self.db.get_metadata(id, index_is_id=True) + mi = newmi = self.db.get_metadata(id, index_is_id=True) if format == 'EPUB': # Get the original metadata @@ -214,9 +214,8 @@ class ContentServer(object): # Transform the metadata via the plugboard newmi = mi.deepcopy_metadata() newmi.template_to_attribute(mi, cpb) - else: - newmi = mi + if format in ('MOBI', 'EPUB'): # Write the updated file from calibre.ebooks.metadata.meta import set_metadata set_metadata(fmt, newmi, 'epub') diff --git a/src/calibre/library/server/mobile.py b/src/calibre/library/server/mobile.py index 3ce96a2b49..0cb7a86126 100644 --- a/src/calibre/library/server/mobile.py +++ b/src/calibre/library/server/mobile.py @@ -277,12 +277,15 @@ class MobileServer(object): cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8' 
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated) - url_base = "/mobile?search=" + search+";order="+order+";sort="+sort+";num="+str(num) - return html.tostring(build_index(books, num, search, sort, order, + raw = html.tostring(build_index(books, num, search, sort, order, start, len(ids), url_base, CKEYS, self.opts.url_prefix), - encoding='utf-8', include_meta_content_type=True, + encoding='utf-8', pretty_print=True) + # tostring's include_meta_content_type is broken + raw = raw.replace('', '\n' + '') + return raw diff --git a/src/calibre/utils/browser.py b/src/calibre/utils/browser.py index 6f8703ab49..430ced9fdd 100644 --- a/src/calibre/utils/browser.py +++ b/src/calibre/utils/browser.py @@ -28,6 +28,10 @@ class Browser(B): B.set_cookiejar(self, *args, **kwargs) self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs) + @property + def cookiejar(self): + return self._clone_actions['set_cookiejar'][1][0] + def set_handle_redirect(self, *args, **kwargs): B.set_handle_redirect(self, *args, **kwargs) self._clone_actions['set_handle_redirect'] = ('set_handle_redirect', diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py index 947ee823c6..3e9e133590 100644 --- a/src/calibre/utils/localization.py +++ b/src/calibre/utils/localization.py @@ -125,6 +125,7 @@ _extra_lang_codes = { 'en_HR' : _('English (Croatia)'), 'en_ID' : _('English (Indonesia)'), 'en_IL' : _('English (Israel)'), + 'en_RU' : _('English (Russia)'), 'en_SG' : _('English (Singapore)'), 'en_YE' : _('English (Yemen)'), 'en_IE' : _('English (Ireland)'), diff --git a/src/calibre/utils/pyparsing.py b/src/calibre/utils/pyparsing.py index bc5571ea5f..9be97dc287 100644 --- a/src/calibre/utils/pyparsing.py +++ b/src/calibre/utils/pyparsing.py @@ -1,6 +1,6 @@ # module pyparsing.py # -# Copyright (c) 2003-2010 Paul T. McGuire +# Copyright (c) 2003-2011 Paul T. McGuire # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the @@ -58,8 +58,8 @@ The pyparsing module handles some of the problems that are typically vexing when - embedded comments """ -__version__ = "1.5.5" -__versionTime__ = "12 Aug 2010 03:56" +__version__ = "1.5.6" +__versionTime__ = "26 June 2011 10:53" __author__ = "Paul McGuire " import string @@ -101,11 +101,12 @@ if _PY3K: basestring = str unichr = chr _ustr = str - _str2dict = set alphas = string.ascii_lowercase + string.ascii_uppercase else: _MAX_INT = sys.maxint range = xrange + set = lambda s : dict( [(c,0) for c in s] ) + alphas = string.lowercase + string.uppercase def _ustr(obj): """Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries @@ -134,9 +135,6 @@ else: #return unicode(obj).encode(sys.getdefaultencoding(), 'replace') # ... 
- def _str2dict(strg): - return dict( [(c,0) for c in strg] ) - alphas = string.lowercase + string.uppercase # build list of single arg builtins, tolerant of Python version, that can be used as parse actions @@ -606,10 +604,10 @@ class ParseResults(object): def __setstate__(self,state): self.__toklist = state[0] - self.__tokdict, \ - par, \ - inAccumNames, \ - self.__name = state[1] + (self.__tokdict, + par, + inAccumNames, + self.__name) = state[1] self.__accumNames = {} self.__accumNames.update(inAccumNames) if par is not None: @@ -667,6 +665,35 @@ def nullDebugAction(*args): """'Do-nothing' debug action, to suppress debugging output during parsing.""" pass +'decorator to trim function calls to match the arity of the target' +if not _PY3K: + def _trim_arity(func, maxargs=2): + limit = [0] + def wrapper(*args): + while 1: + try: + return func(*args[limit[0]:]) + except TypeError: + if limit[0] <= maxargs: + limit[0] += 1 + continue + raise + return wrapper +else: + def _trim_arity(func, maxargs=2): + limit = maxargs + def wrapper(*args): + #~ nonlocal limit + while 1: + try: + return func(*args[limit:]) + except TypeError: + if limit: + limit -= 1 + continue + raise + return wrapper + class ParserElement(object): """Abstract base level parser element class.""" DEFAULT_WHITE_CHARS = " \n\t\r" @@ -731,6 +758,9 @@ class ParserElement(object): see L{I{__call__}<__call__>}. """ newself = self.copy() + if name.endswith("*"): + name = name[:-1] + listAllMatches=True newself.resultsName = name newself.modalResults = not listAllMatches return newself @@ -753,104 +783,6 @@ class ParserElement(object): self._parse = self._parse._originalParseMethod return self - def _normalizeParseActionArgs( f ): - """Internal method used to decorate parse actions that take fewer than 3 arguments, - so that all parse actions can be called as C{f(s,l,t)}.""" - STAR_ARGS = 4 - - # special handling for single-argument builtins - if (f in singleArgBuiltins): - numargs = 1 - else: - try: - restore = None - if isinstance(f,type): - restore = f - f = f.__init__ - if not _PY3K: - codeObj = f.func_code - else: - codeObj = f.code - if codeObj.co_flags & STAR_ARGS: - return f - numargs = codeObj.co_argcount - if not _PY3K: - if hasattr(f,"im_self"): - numargs -= 1 - else: - if hasattr(f,"__self__"): - numargs -= 1 - if restore: - f = restore - except AttributeError: - try: - if not _PY3K: - call_im_func_code = f.__call__.im_func.func_code - else: - call_im_func_code = f.__code__ - - # not a function, must be a callable object, get info from the - # im_func binding of its bound __call__ method - if call_im_func_code.co_flags & STAR_ARGS: - return f - numargs = call_im_func_code.co_argcount - if not _PY3K: - if hasattr(f.__call__,"im_self"): - numargs -= 1 - else: - if hasattr(f.__call__,"__self__"): - numargs -= 0 - except AttributeError: - if not _PY3K: - call_func_code = f.__call__.func_code - else: - call_func_code = f.__call__.__code__ - # not a bound method, get info directly from __call__ method - if call_func_code.co_flags & STAR_ARGS: - return f - numargs = call_func_code.co_argcount - if not _PY3K: - if hasattr(f.__call__,"im_self"): - numargs -= 1 - else: - if hasattr(f.__call__,"__self__"): - numargs -= 1 - - - #~ print ("adding function %s with %d args" % (f.func_name,numargs)) - if numargs == 3: - return f - else: - if numargs > 3: - def tmp(s,l,t): - return f(f.__call__.__self__, s,l,t) - if numargs == 2: - def tmp(s,l,t): - return f(l,t) - elif numargs == 1: - def tmp(s,l,t): - return f(t) - else: #~ numargs == 
0: - def tmp(s,l,t): - return f() - try: - tmp.__name__ = f.__name__ - except (AttributeError,TypeError): - # no need for special handling if attribute doesnt exist - pass - try: - tmp.__doc__ = f.__doc__ - except (AttributeError,TypeError): - # no need for special handling if attribute doesnt exist - pass - try: - tmp.__dict__.update(f.__dict__) - except (AttributeError,TypeError): - # no need for special handling if attribute doesnt exist - pass - return tmp - _normalizeParseActionArgs = staticmethod(_normalizeParseActionArgs) - def setParseAction( self, *fns, **kwargs ): """Define action to perform when successfully matching parse element definition. Parse action fn is a callable method with 0-3 arguments, called as C{fn(s,loc,toks)}, @@ -868,13 +800,13 @@ class ParserElement(object): consistent view of the parsed string, the parse location, and line and column positions within the parsed string. """ - self.parseAction = list(map(self._normalizeParseActionArgs, list(fns))) + self.parseAction = list(map(_trim_arity, list(fns))) self.callDuringTry = ("callDuringTry" in kwargs and kwargs["callDuringTry"]) return self def addParseAction( self, *fns, **kwargs ): """Add parse action to expression's list of parse actions. See L{I{setParseAction}}.""" - self.parseAction += list(map(self._normalizeParseActionArgs, list(fns))) + self.parseAction += list(map(_trim_arity, list(fns))) self.callDuringTry = self.callDuringTry or ("callDuringTry" in kwargs and kwargs["callDuringTry"]) return self @@ -1012,9 +944,9 @@ class ParserElement(object): lookup = (self,instring,loc,callPreParse,doActions) if lookup in ParserElement._exprArgCache: value = ParserElement._exprArgCache[ lookup ] - if isinstance(value,Exception): + if isinstance(value, Exception): raise value - return value + return (value[0],value[1].copy()) else: try: value = self._parseNoCache( instring, loc, doActions, callPreParse ) @@ -1088,8 +1020,8 @@ class ParserElement(object): try: loc, tokens = self._parse( instring, 0 ) if parseAll: - #loc = self.preParse( instring, loc ) - se = StringEnd() + loc = self.preParse( instring, loc ) + se = Empty() + StringEnd() se._parse( instring, loc ) except ParseBaseException: if ParserElement.verbose_stacktrace: @@ -1101,10 +1033,11 @@ class ParserElement(object): else: return tokens - def scanString( self, instring, maxMatches=_MAX_INT ): + def scanString( self, instring, maxMatches=_MAX_INT, overlap=False ): """Scan the input string for expression matches. Each match will return the matching tokens, start location, and end location. May be called with optional - C{maxMatches} argument, to clip scanning after 'n' matches are found. + C{maxMatches} argument, to clip scanning after 'n' matches are found. If + C{overlap} is specified, then overlapping matches will be reported. Note that the start and end locations are reported relative to the string being parsed. 
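# A quick illustrative sketch (example is mine, not from the patch) of the
# new overlap flag: with overlap=True the scan may resume inside the
# previous match, so overlapping hits are reported:
from calibre.utils.pyparsing import Word, alphas

three = Word(alphas, exact=3)
print([t[0] for t, s, e in three.scanString('abcd')])                # ['abc']
print([t[0] for t, s, e in three.scanString('abcd', overlap=True)])  # ['abc', 'bcd']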
See L{I{parseString}} for more information on parsing @@ -1133,7 +1066,14 @@ class ParserElement(object): if nextLoc > loc: matches += 1 yield tokens, preloc, nextLoc - loc = nextLoc + if overlap: + nextloc = preparseFn( instring, loc ) + if nextloc > loc: + loc = nextLoc + else: + loc += 1 + else: + loc = nextLoc else: loc = preloc+1 except ParseBaseException: @@ -1168,6 +1108,7 @@ class ParserElement(object): out.append(t) lastE = e out.append(instring[lastE:]) + out = [o for o in out if o] return "".join(map(_ustr,_flatten(out))) except ParseBaseException: if ParserElement.verbose_stacktrace: @@ -1372,6 +1313,9 @@ class ParserElement(object): userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno") could be written as:: userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") + + If C{name} is given with a trailing C{'*'} character, then C{listAllMatches} will be + passed as C{True}. """ return self.setResultsName(name) @@ -1398,9 +1342,9 @@ class ParserElement(object): return self def parseWithTabs( self ): - """Overrides default behavior to expand s to spaces before parsing the input string. + """Overrides default behavior to expand C{}s to spaces before parsing the input string. Must be called before C{parseString} when the input grammar contains elements that - match characters.""" + match C{} characters.""" self.keepTabs = True return self @@ -1508,12 +1452,10 @@ class Token(ParserElement): """Abstract C{ParserElement} subclass, for defining atomic matching patterns.""" def __init__( self ): super(Token,self).__init__( savelist=False ) - #self.myException = ParseException("",0,"",self) def setName(self, name): s = super(Token,self).setName(name) self.errmsg = "Expected " + self.name - #s.myException.msg = self.errmsg return s @@ -1534,7 +1476,6 @@ class NoMatch(Token): self.mayReturnEmpty = True self.mayIndexError = False self.errmsg = "Unmatchable token" - #self.myException.msg = self.errmsg def parseImpl( self, instring, loc, doActions=True ): exc = self.myException @@ -1558,7 +1499,6 @@ class Literal(Token): self.name = '"%s"' % _ustr(self.match) self.errmsg = "Expected " + self.name self.mayReturnEmpty = False - #self.myException.msg = self.errmsg self.mayIndexError = False # Performance tuning: this routine gets called a *lot* @@ -1579,12 +1519,12 @@ _L = Literal class Keyword(Token): """Token to exactly match a specified string as a keyword, that is, it must be immediately followed by a non-keyword character. Compare with C{Literal}:: - Literal("if") will match the leading 'if' in 'ifAndOnlyIf'. - Keyword("if") will not; it will only match the leading 'if in 'if x=1', or 'if(y==2)' + Literal("if") will match the leading C{'if'} in C{'ifAndOnlyIf'}. + Keyword("if") will not; it will only match the leading C{'if'} in C{'if x=1'}, or C{'if(y==2)'} Accepts two optional constructor arguments in addition to the keyword string: C{identChars} is a string of characters that would be valid identifier characters, defaulting to all alphanumerics + "_" and "$"; C{caseless} allows case-insensitive - matching, default is False. + matching, default is C{False}. 
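# The Keyword/Literal distinction drawn above, as a small runnable sketch:
from calibre.utils.pyparsing import Keyword, Literal

print(Literal('if').searchString('ifAndOnlyIf'))  # [['if']] -- matches the prefix
print(Keyword('if').searchString('ifAndOnlyIf'))  # []       -- no keyword boundary
print(Keyword('if').searchString('if x=1'))       # [['if']]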
""" DEFAULT_KEYWORD_CHARS = alphanums+"_$" @@ -1600,13 +1540,12 @@ class Keyword(Token): self.name = '"%s"' % self.match self.errmsg = "Expected " + self.name self.mayReturnEmpty = False - #self.myException.msg = self.errmsg self.mayIndexError = False self.caseless = caseless if caseless: self.caselessmatch = matchString.upper() identChars = identChars.upper() - self.identChars = _str2dict(identChars) + self.identChars = set(identChars) def parseImpl( self, instring, loc, doActions=True ): if self.caseless: @@ -1648,7 +1587,6 @@ class CaselessLiteral(Literal): self.returnString = matchString self.name = "'%s'" % self.returnString self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg def parseImpl( self, instring, loc, doActions=True ): if instring[ loc:loc+self.matchLen ].upper() == self.match: @@ -1680,18 +1618,25 @@ class Word(Token): defaults to the initial character set), and an optional minimum, maximum, and/or exact length. The default value for C{min} is 1 (a minimum value < 1 is not valid); the default values for C{max} and C{exact} - are 0, meaning no maximum or exact length restriction. + are 0, meaning no maximum or exact length restriction. An optional + C{exclude} parameter can list characters that might be found in + the input C{bodyChars} string; useful to define a word of all printables + except for one or two characters, for instance. """ - def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False ): + def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False, excludeChars=None ): super(Word,self).__init__() + if excludeChars: + initChars = ''.join([c for c in initChars if c not in excludeChars]) + if bodyChars: + bodyChars = ''.join([c for c in bodyChars if c not in excludeChars]) self.initCharsOrig = initChars - self.initChars = _str2dict(initChars) + self.initChars = set(initChars) if bodyChars : self.bodyCharsOrig = bodyChars - self.bodyChars = _str2dict(bodyChars) + self.bodyChars = set(bodyChars) else: self.bodyCharsOrig = initChars - self.bodyChars = _str2dict(initChars) + self.bodyChars = set(initChars) self.maxSpecified = max > 0 @@ -1711,7 +1656,6 @@ class Word(Token): self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg self.mayIndexError = False self.asKeyword = asKeyword @@ -1743,7 +1687,7 @@ class Word(Token): raise exc loc = result.end() - return loc,result.group() + return loc, result.group() if not(instring[ loc ] in self.initChars): #~ raise ParseException( instring, loc, self.errmsg ) @@ -1807,24 +1751,24 @@ class Regex(Token): """ compiledREtype = type(re.compile("[A-Z]")) def __init__( self, pattern, flags=0): - """The parameters pattern and flags are passed to the re.compile() function as-is. See the Python re module for an explanation of the acceptable patterns and flags.""" + """The parameters C{pattern} and C{flags} are passed to the C{re.compile()} function as-is. 
See the Python C{re} module for an explanation of the acceptable patterns and flags.""" super(Regex,self).__init__() if isinstance(pattern, basestring): - if len(pattern) == 0: - warnings.warn("null string passed to Regex; use Empty() instead", - SyntaxWarning, stacklevel=2) - - self.pattern = pattern - self.flags = flags - - try: - self.re = re.compile(self.pattern, self.flags) - self.reString = self.pattern - except sre_constants.error: - warnings.warn("invalid pattern (%s) passed to Regex" % pattern, - SyntaxWarning, stacklevel=2) - raise + if len(pattern) == 0: + warnings.warn("null string passed to Regex; use Empty() instead", + SyntaxWarning, stacklevel=2) + + self.pattern = pattern + self.flags = flags + + try: + self.re = re.compile(self.pattern, self.flags) + self.reString = self.pattern + except sre_constants.error: + warnings.warn("invalid pattern (%s) passed to Regex" % pattern, + SyntaxWarning, stacklevel=2) + raise elif isinstance(pattern, Regex.compiledREtype): self.re = pattern @@ -1837,7 +1781,6 @@ class Regex(Token): self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg self.mayIndexError = False self.mayReturnEmpty = True @@ -1929,7 +1872,8 @@ class QuotedString(Token): self.pattern += (r'|(?:%s)' % re.escape(escQuote)) if escChar: self.pattern += (r'|(?:%s.)' % re.escape(escChar)) - self.escCharReplacePattern = re.escape(self.escChar)+"(.)" + charset = ''.join(set(self.quoteChar[0]+self.endQuoteChar[0])).replace('^',r'\^').replace('-',r'\-') + self.escCharReplacePattern = re.escape(self.escChar)+("([%s])" % charset) self.pattern += (r')*%s' % re.escape(self.endQuoteChar)) try: @@ -1942,7 +1886,6 @@ class QuotedString(Token): self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg self.mayIndexError = False self.mayReturnEmpty = True @@ -2014,7 +1957,6 @@ class CharsNotIn(Token): self.name = _ustr(self) self.errmsg = "Expected " + self.name self.mayReturnEmpty = ( self.minLen == 0 ) - #self.myException.msg = self.errmsg self.mayIndexError = False def parseImpl( self, instring, loc, doActions=True ): @@ -2077,7 +2019,6 @@ class White(Token): self.name = ("".join([White.whiteStrs[c] for c in self.matchWhite])) self.mayReturnEmpty = True self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg self.minLen = min @@ -2150,7 +2091,6 @@ class LineStart(_PositionToken): super(LineStart,self).__init__() self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) self.errmsg = "Expected start of line" - #self.myException.msg = self.errmsg def preParse( self, instring, loc ): preloc = super(LineStart,self).preParse(instring,loc) @@ -2175,7 +2115,6 @@ class LineEnd(_PositionToken): super(LineEnd,self).__init__() self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) self.errmsg = "Expected end of line" - #self.myException.msg = self.errmsg def parseImpl( self, instring, loc, doActions=True ): if loc" ] ) tagAttrValue = quotedString.copy().setParseAction( removeQuotes ) | Word(printablesLessRAbrack) - openTag = Suppress("<") + tagStr + \ + openTag = Suppress("<") + tagStr("tag") + \ Dict(ZeroOrMore(Group( tagAttrName.setParseAction(downcaseTokens) + \ Optional( Suppress("=") + tagAttrValue ) ))) + \ Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">") @@ -3508,19 +3464,21 @@ def makeXMLTags(tagStr): def withAttribute(*args,**attrDict): """Helper to create a validating parse action 
to be used with start tags created - with makeXMLTags or makeHTMLTags. Use withAttribute to qualify a starting tag + with C{makeXMLTags} or C{makeHTMLTags}. Use C{withAttribute} to qualify a starting tag with a required attribute value, to avoid false matches on common tags such as -
<TD> or <DIV>. + C{<TD>} or C{<DIV>
}. - Call withAttribute with a series of attribute names and values. Specify the list + Call C{withAttribute} with a series of attribute names and values. Specify the list of filter attributes names and values as: - - keyword arguments, as in (class="Customer",align="right"), or + - keyword arguments, as in C{(align="right")}, or + - as an explicit dict with C{**} operator, when an attribute name is also a Python + reserved word, as in C{**{"class":"Customer", "align":"right"}} - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") ) For attribute names with a namespace prefix, you must use the second form. Attribute names are matched insensitive to upper/lower case. To verify that the attribute exists, but without specifying a value, pass - withAttribute.ANY_VALUE as the value. + C{withAttribute.ANY_VALUE} as the value. """ if args: attrs = args[:] @@ -3631,12 +3589,12 @@ def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.cop expression will capture all whitespace-delimited content between delimiters as a list of separate values. - Use the ignoreExpr argument to define expressions that may contain + Use the C{ignoreExpr} argument to define expressions that may contain opening or closing characters that should not be treated as opening or closing characters for nesting, such as quotedString or a comment - expression. Specify multiple expressions using an Or or MatchFirst. - The default is quotedString, but if no expressions are to be ignored, - then pass None for this argument. + expression. Specify multiple expressions using an C{L{Or}} or C{L{MatchFirst}}. + The default is L{quotedString}, but if no expressions are to be ignored, + then pass C{None} for this argument. """ if opener == closer: raise ValueError("opening and closing strings cannot be the same") @@ -3683,7 +3641,7 @@ def indentedBlock(blockStatementExpr, indentStack, indent=True): the current level; set to False for block of left-most statements (default=True) - A valid block must contain at least one blockStatement. + A valid block must contain at least one C{blockStatement}. """ def checkPeerIndent(s,l,t): if l >= len(s): return diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py index 3c41498107..a937e055ac 100644 --- a/src/calibre/utils/search_query_parser.py +++ b/src/calibre/utils/search_query_parser.py @@ -16,11 +16,11 @@ methods :method:`SearchQueryParser.universal_set` and If this module is run, it will perform a series of unit tests. 
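# A bare-bones illustration of the subclassing contract this docstring
# describes, assuming the constructor takes the list of search locations as
# in this version of calibre; ListParser is a hypothetical example:
from calibre.utils.search_query_parser import SearchQueryParser

class ListParser(SearchQueryParser):

    def __init__(self, items):
        SearchQueryParser.__init__(self, locations=['all'])
        self.items = items

    def universal_set(self):
        # Candidate ids are simply list indices
        return set(range(len(self.items)))

    def get_matches(self, location, query):
        q = query.lower()
        return set(i for i, x in enumerate(self.items) if q in x.lower())

# ListParser(['red fox', 'red dog', 'blue fox']).parse('red and fox') == {0}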
''' -import sys, operator +import sys, operator, weakref -from calibre.utils.pyparsing import CaselessKeyword, Group, Forward, \ - CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral, \ - Optional, NoMatch, ParseException, QuotedString +from calibre.utils.pyparsing import (CaselessKeyword, Group, Forward, + CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral, + Optional, NoMatch, ParseException, QuotedString) from calibre.constants import preferred_encoding from calibre.utils.icu import sort_key from calibre import prints @@ -37,11 +37,19 @@ class SavedSearchQueries(object): def __init__(self, db, _opt_name): self.opt_name = _opt_name; - self.db = db if db is not None: self.queries = db.prefs.get(self.opt_name, {}) else: self.queries = {} + try: + self._db = weakref.ref(db) + except: + # db could be None + self._db = lambda : None + + @property + def db(self): + return self._db() def force_unicode(self, x): if not isinstance(x, unicode): @@ -49,21 +57,27 @@ class SavedSearchQueries(object): return x def add(self, name, value): - self.queries[self.force_unicode(name)] = self.force_unicode(value).strip() - self.db.prefs[self.opt_name] = self.queries + db = self.db + if db is not None: + self.queries[self.force_unicode(name)] = self.force_unicode(value).strip() + db.prefs[self.opt_name] = self.queries def lookup(self, name): return self.queries.get(self.force_unicode(name), None) def delete(self, name): - self.queries.pop(self.force_unicode(name), False) - self.db.prefs[self.opt_name] = self.queries + db = self.db + if db is not None: + self.queries.pop(self.force_unicode(name), False) + db.prefs[self.opt_name] = self.queries def rename(self, old_name, new_name): - self.queries[self.force_unicode(new_name)] = \ - self.queries.get(self.force_unicode(old_name), None) - self.queries.pop(self.force_unicode(old_name), False) - self.db.prefs[self.opt_name] = self.queries + db = self.db + if db is not None: + self.queries[self.force_unicode(new_name)] = \ + self.queries.get(self.force_unicode(old_name), None) + self.queries.pop(self.force_unicode(old_name), False) + db.prefs[self.opt_name] = self.queries def names(self): return sorted(self.queries.keys(),key=sort_key) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 436612af7e..b7efd611e0 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe): #: manually (though manual cleanup will always be superior). auto_cleanup = False + #: Specify elements that the auto cleanup algorithm should never remove + #: The syntax is a XPath expression. For example:: + #: + #: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with + #: id="article-image" + #: auto_cleanup_keep = '//*[@class="important"]' will keep all elements + #: with class="important" + #: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]' + #: will keep all divs with id="article-image" and spans + #: with class="important" + auto_cleanup_keep = None + #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files #: It will be inserted into `