diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe new file mode 100644 index 0000000000..a44d5e560a --- /dev/null +++ b/recipes/android_com_pl.recipe @@ -0,0 +1,12 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Android_com_pl(BasicNewsRecipe): + title = u'Android.com.pl' + __author__ = 'fenuks' + description = 'Android.com.pl - biggest polish Android site' + category = 'Android, mobile' + language = 'pl' + cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png' + oldest_article = 8 + max_articles_per_feed = 100 + feeds = [(u'Android', u'http://android.com.pl/component/content/frontpage/frontpage.feed?type=rss')] diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe new file mode 100644 index 0000000000..037870ed6c --- /dev/null +++ b/recipes/bash_org_pl.recipe @@ -0,0 +1,15 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class Bash_org_pl(BasicNewsRecipe): + title = u'Bash.org.pl' + __author__ = 'fenuks' + description = 'Bash.org.pl - funny quotations from IRC discussions' + category = 'funny quotations, humour' + language = 'pl' + oldest_article = 15 + cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png' + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags= [dict(name='div', attrs={'class':'quote post-content post-body'})] + feeds = [(u'Cytaty', u'http://bash.org.pl/rss')] diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe index 9c8b92f25c..2bccbaf4ae 100644 --- a/recipes/bbc.recipe +++ b/recipes/bbc.recipe @@ -36,8 +36,9 @@ class BBC(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper', \ - 'story-feature wide ', 'story-feature narrow']}) + dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper', + 'story-feature wide ', 'story-feature narrow']}), + dict(id=['hypertab', 
'comment-form']), ] remove_attributes = ['width','height'] diff --git a/recipes/brasil_de_fato.recipe b/recipes/brasil_de_fato.recipe new file mode 100644 index 0000000000..d060544ece --- /dev/null +++ b/recipes/brasil_de_fato.recipe @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class BrasilDeFato(BasicNewsRecipe): + news = True + title = u'Brasil de Fato' + __author__ = 'Alex Mitrani' + description = u'Uma visão popular do Brasil e do mundo.' + publisher = u'SOCIEDADE EDITORIAL BRASIL DE FATO' + category = 'news, politics, Brazil, rss, Portuguese' + oldest_article = 10 + max_articles_per_feed = 100 + summary_length = 1000 + language = 'pt_BR' + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + masthead_url = 'http://www.brasildefato.com.br/sites/default/files/zeropoint_logo.jpg' + keep_only_tags = [dict(name='div', attrs={'id':'main'})] + remove_tags = [dict(name='div', attrs={'class':'links'})] + remove_tags_after = [dict(name='div', attrs={'class':'links'})] + + feeds = [(u'Nacional', u'http://www.brasildefato.com.br/rss_nacional') + ,(u'Internacional', u'http://www.brasildefato.com.br/rss_internacional') + ,(u'Entrevista', u'http://www.brasildefato.com.br/rss_entrevista') + ,(u'Cultura', u'http://www.brasildefato.com.br/rss_cultura') + ,(u'Análise', u'http://www.brasildefato.com.br/rss_analise') + ] diff --git a/recipes/bugun_gazetesi.recipe b/recipes/bugun_gazetesi.recipe new file mode 100644 index 0000000000..0a1d27f517 --- /dev/null +++ b/recipes/bugun_gazetesi.recipe @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class Bugun (BasicNewsRecipe): + + title = u'BUGÜN Gazetesi' + __author__ = u'thomass' + oldest_article = 2 + max_articles_per_feed =100 + #no_stylesheets = True + #delay = 1 + use_embedded_content = False + encoding = 'UTF-8' + publisher = 'thomass' + category = 'news, 
haberler,TR,gazete' + language = 'tr' + publication_type = 'newspaper ' + extra_css = ' div{font-size: small} h2{font-size: small;font-weight: bold} #ctl00_ortayer_haberBaslik{font-size:20px;font-weight: bold} '#h1{ font-size:10%;font-weight: bold} '#ctl00_ortayer_haberBaslik{ 'font-size:10%;font-weight: bold'} + #introduction{} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + cover_img_url = 'http://www.bugun.com.tr/images/bugunLogo2011.png' + masthead_url = 'http://www.bugun.com.tr/images/bugunLogo2011.png' + + keep_only_tags = [dict(name='h1', attrs={'class':[ 'haberBaslik']}),dict(name='h2', attrs={'class':[ 'haberOzet']}), dict(name='div', attrs={'class':['haberGriDivvvv']}), dict(name='div', attrs={'id':[ 'haberTextDiv']}), ] + + #keep_only_tags = [dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}) ] + #remove_tags = [ dict(name='div', attrs={'id':['news-detail-news-text-font-size','news-detail-gallery','news-detail-news-bottom-social']}),dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']})] + + + #remove_attributes = ['width','height'] + remove_empty_feeds= True + + feeds = [ + ( u'Son Dakika', u'http://www.bugun.com.tr/haberler.xml'), + ( u'Yazarlar', u'http://www.bugun.com.tr/rss/yazarlar.xml'), + ( u'Gündem', u'http://www.bugun.com.tr/rss/gundem.xml'), + ( u'Ekonomi', u'http://www.bugun.com.tr/rss/ekonomi.xml'), + ( u'Spor', 
u'http://www.bugun.com.tr/rss/spor.xml'), + ( u'Magazin', u'http://www.bugun.com.tr/rss/magazin.xml'), + ( u'Teknoloji', u'http://www.bugun.com.tr/rss/teknoloji.xml'), + ( u'Yaşam', u'http://www.bugun.com.tr/rss/yasam.xml'), + ( u'Medya', u'http://www.bugun.com.tr/rss/medya.xml'), + ( u'Dünya', u'http://www.bugun.com.tr/rss/dunya.xml'), + ( u'Politika', u'http://www.bugun.com.tr/rss/politika.xml'), + ( u'Sağlık', u'http://www.bugun.com.tr/rss/saglik.xml'), + ( u'Tarifler', u'http://www.bugun.com.tr/rss/yemek-tarifi.xml'), + + + + + ] diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe new file mode 100644 index 0000000000..b4cf6b326c --- /dev/null +++ b/recipes/cd_action.recipe @@ -0,0 +1,16 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class CD_Action(BasicNewsRecipe): + title = u'CD-Action' + __author__ = 'fenuks' + description = 'cdaction.pl - polish magazine about games site' + category = 'games' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG' + keep_only_tags= dict(id='news_content') + remove_tags_after= dict(name='div', attrs={'class':'tresc'}) + feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')] diff --git a/recipes/cvecezla.recipe b/recipes/cvecezla.recipe new file mode 100644 index 0000000000..712c898a3e --- /dev/null +++ b/recipes/cvecezla.recipe @@ -0,0 +1,47 @@ + +__license__ = 'GPL v3' +__copyright__ = '2011, Darko Miletic ' +''' +cvecezla.wordpress.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class CveceZla(BasicNewsRecipe): + title = 'Cvece zla i naopakog' + __author__ = 'Darko Miletic' + description = 'Haoticnost razmisljanja poradja haoticnost pisanja. Muzika, stripovi, igre, knjige, generalno glupiranje...' 
+ oldest_article = 7 + max_articles_per_feed = 100 + language = 'sr' + encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = False + publication_type = 'blog' + extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{display: block } ' + + conversion_options = { + 'comment' : description + , 'tags' : 'igre, muzika, film, blog, Srbija' + , 'publisher': 'Mehmet Krljic' + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + remove_tags_before = dict(attrs={'class':'navigation'}) + remove_tags_after = dict(attrs={'class':'commentlist'}) + remove_tags = [ + dict(attrs={'class':['postmetadata alt','sharedaddy sharedaddy-dark sd-like-enabled sd-sharing-enabled','reply','navigation']}) + ,dict(attrs={'id':'respond'}) + ] + + feeds = [(u'Clanci', u'http://cvecezla.wordpress.com/feed/')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe new file mode 100644 index 0000000000..d9b2db591d --- /dev/null +++ b/recipes/dobreprogamy.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class Dobreprogramy_pl(BasicNewsRecipe): + title = 'Dobreprogramy.pl' + __author__ = 'fenuks' + __licence__ ='GPL v3' + category = 'IT' + language = 'pl' + cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' + description = u'Aktualności i blogi z dobreprogramy.pl' + encoding = 'utf-8' + no_stylesheets = True + language = 'pl' + extra_css = '.title {font-size:22px;}' + oldest_article = 8 + max_articles_per_feed = 100 + remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 
'menuBar', 'topBar']})] + keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})] + feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), + ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/fairbanks_daily.recipe b/recipes/fairbanks_daily.recipe new file mode 100644 index 0000000000..282925728e --- /dev/null +++ b/recipes/fairbanks_daily.recipe @@ -0,0 +1,128 @@ +#import re # Provides preprocess_regexps re.compile + +from calibre.web.feeds.news import BasicNewsRecipe + +class FairbanksDailyNewsminer(BasicNewsRecipe): + title = u'Fairbanks Daily News-miner' + __author__ = 'Roger' + oldest_article = 7 + max_articles_per_feed = 100 + + description = ''''The voice of interior Alaska since 1903''' + publisher = 'http://www.newsminer.com/' + category = 'news, Alaska, Fairbanks' + language = 'en' + #extra_css = ''' + # p{font-weight: normal;text-align: justify} + # ''' + + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'en' + encoding = 'utf8' + conversion_options = {'linearize_tables':True} + # TODO: I don't see any photos in my Mobi file with this masterhead_url! + masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg' + + + # In order to omit seeing number of views, number of posts and the pipe + # symbol for divider after the title and date of the article, a regex or + # manual processing is needed to get just the "story_item_date updated" + # (which contains the date). Everything else on this line is pretty much not needed. + # + # HTML line containing story_item_date: + #
Aug 22, 2011 | 2370 views | 52 52 comments | 9 9 recommendations | email to a friend | print
+ + # The following was suggested, but it looks like I also need to define self & soup + # (as well as bring in extra soup depends?) + #date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})) + + #preprocess_regexps = [(re.compile(r']*addthis_separator*>'), lambda match: '') ] + #preprocess_regexps = [(re.compile(r'span class="addthis_separator">|'), lambda match: '') ] + + #preprocess_regexps = [ + # (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''), + # ] + + #def get_browser(self): + #def preprocess_html(soup, first_fetch): + # date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})) + # return + + + # Try to keep some tags - some might not be needed here + keep_only_tags = [ + #date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})), + dict(name='div', attrs={'class':'hnews hentry item'}), + dict(name='div', attrs={'class':'story_item_headline entry-title'}), + #dict(name='span', attrs={'class':'story_item_date updated'}), + dict(name='div', attrs={'class':'full_story'}) + ] + #remove_tags = [ + # dict(name='div', attrs={'class':'story_tools'}), + # dict(name='p', attrs={'class':'ad_label'}), + # ] + + # Try to remove some bothersome tags + remove_tags = [ + #dict(name='img', attrs={'alt'}), + dict(name='img', attrs={'class':'dont_touch_me'}), + dict(name='span', attrs={'class':'number_recommendations'}), + #dict(name='div', attrs={'class':'signature_line'}), + dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}), + dict(name='div', attrs={'class':['addthis_toolbox','addthis_default_style']}), + dict(name='span', attrs={'class':'addthis_separator'}), + dict(name='div', attrs={'class':'related_content'}), + dict(name='div', attrs={'class':'comments_container'}), + #dict(name='div', attrs={'class':'signature_line'}), + dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}), + dict(name='div', 
attrs={'id':'comments_container'}) + ] + + + # This one works but only gets title, date and clips article content! + #remove_tags_after = [ + # dict(name='span', attrs={'class':'story_item_date updated'}) + # ] + + #remove_tags_after = [ + # dict(name='div', attrs={'class':'advertisement'}), + # ] + + # Try clipping tags before and after to prevent pulling img views/posts numbers after date? + #remove_tags_before = [ + # dict(name='span', attrs={'class':'story_item_date updated'}) + # ] + + #extra_css # tweak the appearance # TODO: Change article titles to bold? + + + # Comment-out or uncomment any of the following RSS feeds according to your + # liking. + # + # TODO: Adding more then one RSS Feed, and newline will be omitted for + # entries within the Table of Contents or Index of Articles + # + # TODO: Some random bits of text is trailing the last page (or TOC on MOBI + # files), these are bits of public posts and comments and need to also be + # removed. + # + feeds = [ + (u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'), + (u'Local News', u'http://newsminer.com/rss/rss_feeds/local_news?content_type=article&tags=local_news&page_name=rss_feeds&offset=0&instance=local_news'), + (u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'), + (u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'), + (u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'), + # (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'), + (u'Sundays', 
u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'), + # (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'), + # (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'), + (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'), + # (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'), + # (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'), + # (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'), + # (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'), + (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin') + ] + diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe new file mode 100644 index 
0000000000..0061573742 --- /dev/null +++ b/recipes/film_web.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Filmweb_pl(BasicNewsRecipe): + title = u'FilmWeb' + __author__ = 'fenuks' + description = 'FilmWeb - biggest polish movie site' + cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png' + category = 'movies' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + extra_css = '.hdrBig {font-size:22px;}' + remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] + keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})] + feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), + (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), + (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), + (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), + (u'News / Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'), + (u'News / Multimedia', u'http://www.filmweb.pl/feed/news/category/multimedia'), + (u'News / Dystrybucja dvd / blu-ray', u'http://www.filmweb.pl/feed/news/category/video'), + (u'News / Dystrybucja kinowa', u'http://www.filmweb.pl/feed/news/category/cinema'), + (u'News / off', u'http://www.filmweb.pl/feed/news/category/off'), + (u'News / Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'), + (u'News / Organizacje branżowe', u'http://www.filmweb.pl/feed/news/category/organizations'), + (u'News / Internet', u'http://www.filmweb.pl/feed/news/category/internet'), + (u'News / Różne', u'http://www.filmweb.pl/feed/news/category/other'), + (u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'), + (u'News / Telewizja', 
u'http://www.filmweb.pl/feed/news/category/tv'), + (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), + (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')] + + def skip_ad_pages(self, soup): + skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'})['href'] + #self.log.warn(skip_tag) + if skip_tag is not None: + return self.index_to_soup(skip_tag, raw=True) + else: + None diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe index f3ad824bc3..4c331f115f 100644 --- a/recipes/financial_times_uk.recipe +++ b/recipes/financial_times_uk.recipe @@ -5,6 +5,7 @@ www.ft.com/uk-edition ''' import datetime +from calibre.ptempfile import PersistentTemporaryFile from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe @@ -22,6 +23,8 @@ class FinancialTimes(BasicNewsRecipe): needs_subscription = True encoding = 'utf8' publication_type = 'newspaper' + articles_are_obfuscated = True + temp_files = [] masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg' LOGIN = 'https://registration.ft.com/registration/barrier/login' LOGIN2 = 'http://media.ft.com/h/subs3.html' @@ -47,7 +50,12 @@ class FinancialTimes(BasicNewsRecipe): br.submit() return br - keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})] + keep_only_tags = [ + dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']}) + ,dict(name='div', attrs={'class':'standfirst'}) + ,dict(name='div', attrs={'id' :'storyContent'}) + ,dict(name='div', attrs={'class':['ft-story-body','index-detail']}) + ] remove_tags = [ dict(name='div', attrs={'id':'floating-con'}) ,dict(name=['meta','iframe','base','object','embed','link']) @@ -69,18 +77,23 @@ class FinancialTimes(BasicNewsRecipe): def get_artlinks(self, elem): articles = [] + count = 0 for item in elem.findAll('a',href=True): + count = count + 1 + 
if self.test and count > 2: + return articles rawlink = item['href'] if rawlink.startswith('http://'): url = rawlink else: url = self.PREFIX + rawlink + urlverified = self.browser.open_novisit(url).geturl() # resolve redirect. title = self.tag_to_string(item) date = strftime(self.timefmt) articles.append({ 'title' :title ,'date' :date - ,'url' :url + ,'url' :urlverified ,'description':'' }) return articles @@ -97,7 +110,11 @@ class FinancialTimes(BasicNewsRecipe): st = wide.find('h4',attrs={'class':'section-no-arrow'}) if st: strest.insert(0,st) + count = 0 for item in strest: + count = count + 1 + if self.test and count > 2: + return feeds ftitle = self.tag_to_string(item) self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) feedarts = self.get_artlinks(item.parent.ul) @@ -136,4 +153,19 @@ class FinancialTimes(BasicNewsRecipe): if cdate.isoweekday() == 7: cdate -= datetime.timedelta(days=1) return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf') + + def get_obfuscated_article(self, url): + count = 0 + while (count < 10): + try: + response = self.browser.open(url) + html = response.read() + count = 10 + except: + print "Retrying download..." 
+ count += 1 + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write(html) + self.temp_files[-1].close() + return self.temp_files[-1].name \ No newline at end of file diff --git a/recipes/fluter_de.recipe b/recipes/fluter_de.recipe new file mode 100644 index 0000000000..1f8576cf81 --- /dev/null +++ b/recipes/fluter_de.recipe @@ -0,0 +1,39 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal ' + +''' +Fetch fluter.de +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1313693926(BasicNewsRecipe): + + title = u'Fluter' + description = 'fluter.de Magazin der Bundeszentrale für politische Bildung/bpb' + language = 'de' + encoding = 'UTF-8' + + __author__ = 'Armin Geller' # 2011-08-19 + + oldest_article = 7 + max_articles_per_feed = 50 + + + remove_tags = [ + dict(name='div', attrs={'id':["comments"]}), + dict(attrs={'class':['commentlink']}), + ] + + + keep_only_tags = [ + dict(name='div', attrs={'class':["grid_8 articleText"]}), + dict(name='div', attrs={'class':["articleTextInnerText"]}), + ] + + feeds = [ + (u'Inhalt:', u'http://www.fluter.de/de/?tpl=907'), + ] + + extra_css = '.cs_img {margin-right: 10pt;}' + diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe new file mode 100644 index 0000000000..091c0bb1dc --- /dev/null +++ b/recipes/gram_pl.recipe @@ -0,0 +1,16 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Gram_pl(BasicNewsRecipe): + title = u'Gram.pl' + __author__ = 'fenuks' + description = 'Gram.pl - site about computer games' + category = 'games' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' + remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 
'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] + keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})] + feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'), + (u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')] diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe index 1152a48784..214ae14f33 100644 --- a/recipes/hbr.recipe +++ b/recipes/hbr.recipe @@ -13,6 +13,8 @@ class HBR(BasicNewsRecipe): no_stylesheets = True LOGIN_URL = 'http://hbr.org/login?request_url=/' + LOGOUT_URL = 'http://hbr.org/logout?request_url=/' + INDEX = 'http://hbr.org/archive-toc/BR' keep_only_tags = [dict(name='div', id='pageContainer')] @@ -34,6 +36,9 @@ class HBR(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser(self) + self.logout_url = None + + #''' br.open(self.LOGIN_URL) br.select_form(name='signin-form') br['signin-form:username'] = self.username @@ -41,10 +46,13 @@ class HBR(BasicNewsRecipe): raw = br.submit().read() if 'My Account' not in raw: raise Exception('Failed to login, are you sure your username and password are correct?') - self.logout_url = None - link = br.find_link(text='Sign out') - if link: - self.logout_url = link.absolute_url + try: + link = br.find_link(text='Sign out') + if link: + self.logout_url = link.absolute_url + except: + self.logout_url = self.LOGOUT_URL + #''' return br def cleanup(self): @@ -57,6 +65,8 @@ class HBR(BasicNewsRecipe): def hbr_get_toc(self): + #return self.index_to_soup(open('/t/hbr.html').read()) + today = date.today() future = today + timedelta(days=30) for x in [x.strftime('%y%m') for x in (future, today)]: @@ -66,53 +76,43 @@ class HBR(BasicNewsRecipe): return soup raise Exception('Could not find current issue') - def hbr_parse_section(self, container, 
feeds): - current_section = None - current_articles = [] - for x in container.findAll(name=['li', 'h3', 'h4']): - if x.name in ['h3', 'h4'] and not x.findAll(True): - if current_section and current_articles: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - if x.name == 'li': - a = x.find('a', href=True) - if a is not None: - title = self.tag_to_string(a) - url = a.get('href') - if '/ar/' not in url: - continue - if url.startswith('/'): - url = 'http://hbr.org'+url - url = self.map_url(url) - p = x.find('p') - desc = '' - if p is not None: - desc = self.tag_to_string(p) - if not title or not url: - continue - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - self.log('\t\t\t', desc) - current_articles.append({'title':title, 'url':url, - 'description':desc, 'date':''}) - if current_section and current_articles: - feeds.append((current_section, current_articles)) - - - def hbr_parse_toc(self, soup): feeds = [] - features = soup.find(id='issueFeaturesContent') - self.hbr_parse_section(features, feeds) - departments = soup.find(id='issueDepartments') - self.hbr_parse_section(departments, feeds) + current_section = None + articles = [] + for x in soup.find(id='archiveToc').findAll(['h3', 'h4']): + if x.name == 'h3': + if current_section is not None and articles: + feeds.append((current_section, articles)) + current_section = self.tag_to_string(x).capitalize() + articles = [] + self.log('\tFound section:', current_section) + else: + a = x.find('a', href=True) + if a is None: continue + title = self.tag_to_string(a) + url = a['href'] + if '/ar/' not in url: + continue + if url.startswith('/'): + url = 'http://hbr.org' + url + url = self.map_url(url) + p = x.parent.find('p') + desc = '' + if p is not None: + desc = self.tag_to_string(p) + self.log('\t\tFound article:', title) + self.log('\t\t\t', url) + self.log('\t\t\t', desc) + + 
articles.append({'title':title, 'url':url, 'description':desc, + 'date':''}) return feeds def parse_index(self): soup = self.hbr_get_toc() + #open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8')) feeds = self.hbr_parse_toc(soup) return feeds diff --git a/recipes/hbr_blogs.recipe b/recipes/hbr_blogs.recipe index acee567d8d..0deaef7a73 100644 --- a/recipes/hbr_blogs.recipe +++ b/recipes/hbr_blogs.recipe @@ -6,33 +6,21 @@ class HBR(BasicNewsRecipe): title = 'Harvard Business Review Blogs' description = 'To subscribe go to http://hbr.harvardbusiness.org' needs_subscription = True - __author__ = 'Kovid Goyal, enhanced by BrianG' + __author__ = 'Kovid Goyal' language = 'en' no_stylesheets = True LOGIN_URL = 'http://hbr.org/login?request_url=/' + LOGOUT_URL = 'http://hbr.org/logout?request_url=/' + INDEX = 'http://hbr.org/current' - # - # Blog Stuff - # - - - INCLUDE_BLOGS = True - INCLUDE_ARTICLES = False - - # option-specific settings. - - if INCLUDE_BLOGS == True: - remove_tags_after = dict(id='articleBody') - remove_tags_before = dict(id='pageFeature') - feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')] - oldest_article = 30 - max_articles_per_feed = 100 - use_embedded_content = False - else: - timefmt = ' [%B %Y]' - + remove_tags_after = dict(id='articleBody') + remove_tags_before = dict(id='pageFeature') + feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')] + oldest_article = 30 + max_articles_per_feed = 100 + use_embedded_content = False keep_only_tags = [ dict(name='div', id='pageContainer') ] @@ -41,21 +29,15 @@ class HBR(BasicNewsRecipe): 'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR', 'articleToolbarTop','articleToolbarBottom', 'articleToolbarRD', - 'mailingListTout', 'partnerCenter', 'pageFooter']), - dict(name='iframe')] + 'mailingListTout', 'partnerCenter', 'pageFooter', 'shareWidgetTop']), + dict(name=['iframe', 'style'])] - extra_css = ''' 
- a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; } - .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;} - h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; } - h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; } - #articleBody{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} - #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} - ''' -#------------------------------------------------------------------------------------------------- def get_browser(self): br = BasicNewsRecipe.get_browser(self) + self.logout_url = None + + #''' br.open(self.LOGIN_URL) br.select_form(name='signin-form') br['signin-form:username'] = self.username @@ -63,11 +45,15 @@ class HBR(BasicNewsRecipe): raw = br.submit().read() if 'My Account' not in raw: raise Exception('Failed to login, are you sure your username and password are correct?') - self.logout_url = None - link = br.find_link(text='Sign out') - if link: - self.logout_url = link.absolute_url + try: + link = br.find_link(text='Sign out') + if link: + self.logout_url = link.absolute_url + except: + self.logout_url = self.LOGOUT_URL + #''' return br + #------------------------------------------------------------------------------------------------- def cleanup(self): if self.logout_url is not None: @@ -76,99 +62,7 @@ class HBR(BasicNewsRecipe): def map_url(self, url): if url.endswith('/ar/1'): return url[:-1]+'pr' -#------------------------------------------------------------------------------------------------- - def hbr_get_toc(self): - soup = self.index_to_soup(self.INDEX) - url = soup.find('a', text=lambda t:'Full Table of Contents' in t).parent.get('href') - return self.index_to_soup('http://hbr.org'+url) - 
-#------------------------------------------------------------------------------------------------- - - def hbr_parse_section(self, container, feeds): - current_section = None - current_articles = [] - for x in container.findAll(name=['li', 'h3', 'h4']): - if x.name in ['h3', 'h4'] and not x.findAll(True): - if current_section and current_articles: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - if x.name == 'li': - a = x.find('a', href=True) - if a is not None: - title = self.tag_to_string(a) - url = a.get('href') - if '/ar/' not in url: - continue - if url.startswith('/'): - url = 'http://hbr.org'+url - url = self.map_url(url) - p = x.find('p') - desc = '' - if p is not None: - desc = self.tag_to_string(p) - if not title or not url: - continue - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - self.log('\t\t\t', desc) - current_articles.append({'title':title, 'url':url, - 'description':desc, 'date':''}) - if current_section and current_articles: - feeds.append((current_section, current_articles)) - -#------------------------------------------------------------------------------------------------- - - def hbr_parse_toc(self, soup): - feeds = [] - features = soup.find(id='issueFeaturesContent') - self.hbr_parse_section(features, feeds) - departments = soup.find(id='issueDepartments') - self.hbr_parse_section(departments, feeds) - return feeds -#------------------------------------------------------------------------------------------------- - def feed_to_index_append(self, feedObject, masterFeed): - # Loop thru the feed object and build the correct type of article list - for feed in feedObject: - # build the correct structure from the feed object - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date, - 'description' : article.text_summary - } - 
newArticles.append(newArt) - - # Append the earliest/latest dates of the feed to the feed title - startDate, endDate = self.get_feed_dates(feed, '%d-%b') - newFeedTitle = feed.title + ' (' + startDate + ' thru ' + endDate + ')' - - # append the newly-built list object to the index object passed in - # as masterFeed. - masterFeed.append( (newFeedTitle,newArticles) ) - -#------------------------------------------------------------------------------------------------- - def get_feed_dates(self, feedObject, dateMask): - startDate = feedObject.articles[len(feedObject.articles)-1].localtime.strftime(dateMask) - endDate = feedObject.articles[0].localtime.strftime(dateMask) - - return startDate, endDate - -#------------------------------------------------------------------------------------------------- - - def parse_index(self): - if self.INCLUDE_ARTICLES == True: - soup = self.hbr_get_toc() - feeds = self.hbr_parse_toc(soup) - else: - return BasicNewsRecipe.parse_index(self) - - return feeds -#------------------------------------------------------------------------------------------------- def get_cover_url(self): cover_url = None index = 'http://hbr.org/current' diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe index 3390228455..8d231dac16 100644 --- a/recipes/houston_chronicle.recipe +++ b/recipes/houston_chronicle.recipe @@ -1,8 +1,6 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -import string, pprint - from calibre.web.feeds.news import BasicNewsRecipe class HoustonChronicle(BasicNewsRecipe): @@ -13,53 +11,28 @@ class HoustonChronicle(BasicNewsRecipe): language = 'en' timefmt = ' [%a, %d %b, %Y]' no_stylesheets = True + use_embedded_content = False + remove_attributes = ['style'] - keep_only_tags = [ - dict(id=['story-head', 'story']) - ] - - remove_tags = [ - dict(id=['share-module', 'resource-box', - 'resource-box-header']) - ] - - extra_css = ''' - h1{font-family :Arial,Helvetica,sans-serif; 
font-size:large;} - h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;} - h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;} - h4{font-family :Arial,Helvetica,sans-serif; font-size: x-small;} - p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - #story-head h1{font-family :Arial,Helvetica,sans-serif; font-size: xx-large;} - #story-head h2{font-family :Arial,Helvetica,sans-serif; font-size: small; color:#000000;} - #story-head h3{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;} - #story-head h4{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;} - #story{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;} - #Text-TextSubhed BoldCond PoynterAgateZero h3{color:#444444;font-family :Arial,Helvetica,sans-serif; font-size:small;} - .p260x p{font-family :Arial,Helvetica,serif; font-size:x-small;font-style:italic;} - .p260x h6{color:#777777;font-family :Arial,Helvetica,sans-serif; font-size:xx-small;} - ''' - - - def parse_index(self): - categories = ['news', 'sports', 'business', 'entertainment', 'life', - 'travel'] - feeds = [] - for cat in categories: - articles = [] - soup = self.index_to_soup('http://www.chron.com/%s/'%cat) - for elem in soup.findAll(comptype='story', storyid=True): - a = elem.find('a', href=True) - if a is None: continue - url = a['href'] - if not url.startswith('http://'): - url = 'http://www.chron.com'+url - articles.append({'title':self.tag_to_string(a), 'url':url, - 'description':'', 'date':''}) - pprint.pprint(articles[-1]) - if articles: - feeds.append((string.capwords(cat), articles)) - return feeds + oldest_article = 2.0 + keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or + 'hst-articletext' in x or 'hst-galleryitem' in x)} + feeds = [ + ('News', "http://www.chron.com/rss/feed/News-270.php"), + ('Sports', + 'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'), + ('Neighborhood', + 
'http://www.chron.com/rss/feed/Neighborhood-305.php'), + ('Business', 'http://www.chron.com/rss/feed/Business-287.php'), + ('Entertainment', + 'http://www.chron.com/rss/feed/Entertainment-293.php'), + ('Editorials', + 'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'), + ('Life', 'http://www.chron.com/rss/feed/Life-297.php'), + ('Science & Tech', + 'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'), + ] diff --git a/recipes/icons/android_com_pl.png b/recipes/icons/android_com_pl.png new file mode 100644 index 0000000000..d68bac8810 Binary files /dev/null and b/recipes/icons/android_com_pl.png differ diff --git a/recipes/icons/bash_org_pl.png b/recipes/icons/bash_org_pl.png new file mode 100644 index 0000000000..5fc18a38e0 Binary files /dev/null and b/recipes/icons/bash_org_pl.png differ diff --git a/recipes/icons/cd_action.png b/recipes/icons/cd_action.png new file mode 100644 index 0000000000..823e09a43e Binary files /dev/null and b/recipes/icons/cd_action.png differ diff --git a/recipes/icons/dobreprogamy.png b/recipes/icons/dobreprogamy.png new file mode 100644 index 0000000000..fcb658cfe1 Binary files /dev/null and b/recipes/icons/dobreprogamy.png differ diff --git a/recipes/icons/film_web.png b/recipes/icons/film_web.png new file mode 100644 index 0000000000..3ddcdf1cde Binary files /dev/null and b/recipes/icons/film_web.png differ diff --git a/recipes/icons/gram_pl.png b/recipes/icons/gram_pl.png new file mode 100644 index 0000000000..0a87f28825 Binary files /dev/null and b/recipes/icons/gram_pl.png differ diff --git a/recipes/icons/niebezpiecznik.png b/recipes/icons/niebezpiecznik.png new file mode 100644 index 0000000000..4188d91d36 Binary files /dev/null and b/recipes/icons/niebezpiecznik.png differ diff --git a/recipes/icons/wnp.png b/recipes/icons/wnp.png new file mode 100644 index 0000000000..3781f671cd Binary files /dev/null and b/recipes/icons/wnp.png differ diff --git 
a/recipes/msdnmag_en.recipe b/recipes/msdnmag_en.recipe index 77b8da17a8..cf9cfc4f6a 100644 --- a/recipes/msdnmag_en.recipe +++ b/recipes/msdnmag_en.recipe @@ -6,11 +6,13 @@ __copyright__ = '2009, Darko Miletic ' msdn.microsoft.com/en-us/magazine ''' from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup class MSDNMagazine_en(BasicNewsRecipe): title = 'MSDN Magazine' __author__ = 'Darko Miletic' description = 'The Microsoft Journal for Developers' + masthead_url = 'http://i3.msdn.microsoft.com/Platform/MasterPages/MsdnMagazine/smalllogo.png' publisher = 'Microsoft Press' category = 'news, IT, Microsoft, programming, windows' oldest_article = 31 @@ -20,24 +22,44 @@ class MSDNMagazine_en(BasicNewsRecipe): encoding = 'utf-8' language = 'en' + base_url = 'http://msdn.microsoft.com/en-us/magazine/default.aspx' + rss_url = 'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1' - feeds = [(u'Articles', u'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1')] - - keep_only_tags = [dict(name='div', attrs={'class':'navpage'})] + keep_only_tags = [dict(name='div', attrs={'id':'MainContent'})] remove_tags = [ - dict(name=['object','link','base','table']) - ,dict(name='div', attrs={'class':'MTPS_CollapsibleRegion'}) + dict(name='div', attrs={'class':'DivRatingsOnly'}) + ,dict(name='div', attrs={'class':'ShareThisButton4'}) ] - remove_tags_after = dict(name='div', attrs={'class':'navpage'}) - def preprocess_html(self, soup): - for item in soup.findAll('div',attrs={'class':['FeatureSmallHead','ColumnTypeSubTitle']}): - item.name="h2" - for item in soup.findAll('div',attrs={'class':['FeatureHeadline','ColumnTypeTitle']}): - item.name="h1" - for item in soup.findAll('div',attrs={'class':'ArticleTypeTitle'}): - item.name="h3" - return soup + def find_articles(self): + idx_contents = self.browser.open(self.rss_url).read() + idx = BeautifulStoneSoup(idx_contents, 
convertEntities=BeautifulStoneSoup.XML_ENTITIES) + + for article in idx.findAll('item'): + desc_html = self.tag_to_string(article.find('description')) + description = self.tag_to_string(BeautifulSoup(desc_html)) + + a = { + 'title': self.tag_to_string(article.find('title')), + 'url': self.tag_to_string(article.find('link')), + 'description': description, + 'date' : self.tag_to_string(article.find('pubdate')), + } + yield a + + + def parse_index(self): + soup = self.index_to_soup(self.base_url) + + #find issue name, eg "August 2011" + issue_name = self.tag_to_string(soup.find('h1')) + + # find cover pic + img = soup.find('img',attrs ={'alt':issue_name}) + if img is not None: + self.cover_url = img['src'] + + return [(issue_name, list(self.find_articles()))] diff --git a/recipes/niebezpiecznik.recipe b/recipes/niebezpiecznik.recipe new file mode 100644 index 0000000000..b33a0a3513 --- /dev/null +++ b/recipes/niebezpiecznik.recipe @@ -0,0 +1,16 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Niebezpiecznik_pl(BasicNewsRecipe): + title = u'Niebezpiecznik.pl' + __author__ = 'fenuks' + description = 'Niebezpiecznik.pl' + category = 'hacking, IT' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets = True + cover_url =u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png' + remove_tags=[dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})] + keep_only_tags= [dict(name='div', attrs={'class':['title', 'entry']})] + feeds = [(u'Wiadomości', u'http://feeds.feedburner.com/niebezpiecznik/'), + ('Blog', 'http://feeds.feedburner.com/niebezpiecznik/linkblog/')] diff --git a/recipes/politifact.recipe b/recipes/politifact.recipe index e3550ce7f1..a0f0d786dd 100644 --- a/recipes/politifact.recipe +++ b/recipes/politifact.recipe @@ -5,7 +5,6 @@ class PolitiFactCom(BasicNewsRecipe): __author__ = u'Michael Heinz' oldest_article = 21 max_articles_per_feed = 100 - recursion = 0 
language = 'en' no_stylesheets = True diff --git a/recipes/svd_se.recipe b/recipes/svd_se.recipe index ef43caa7cd..7fa92c47f2 100644 --- a/recipes/svd_se.recipe +++ b/recipes/svd_se.recipe @@ -40,11 +40,11 @@ class SVD_se(BasicNewsRecipe): ,(u'Kultur' , u'http://www.svd.se/kulturnoje/nyheter/?service=rss') ] - keep_only_tags = [dict(name='div', attrs={'id':'articlecontent'})] - remove_tags_after = dict(name='div',attrs={'class':'articlebody normal'}) + keep_only_tags = [dict(name='div', attrs={'id':['article-content', 'articlecontent']})] + remove_tags_after = dict(name='div',attrs={'class':'articlebody'}) remove_tags = [ dict(name=['object','link','base']) - ,dict(name='div',attrs={'class':['articlead','factcolumn']}) + ,dict(name='div',attrs={'class':['articlead','factcolumn', 'article-ad']}) ,dict(name='ul', attrs={'class':'toolbar articletop clearfix'}) ,dict(name='p', attrs={'class':'more'}) ] diff --git a/recipes/wnp.recipe b/recipes/wnp.recipe new file mode 100644 index 0000000000..e53e4cc66b --- /dev/null +++ b/recipes/wnp.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class AdvancedUserRecipe1312886443(BasicNewsRecipe): + title = u'WNP' + cover_url= 'http://k.wnp.pl/images/wnpLogo.gif' + __author__ = 'fenuks' + description = u'Wirtualny Nowy Przemysł' + category = 'economy' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags = dict(name='div', attrs={'id':'contentText'}) + feeds = [(u'Wiadomości gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'), + (u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'), + (u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'), + (u'Serwis Hutnictwo', u'http://www.wnp.pl/rss/serwis_rss_3.xml'), + (u'Serwis Górnictwo', u'http://www.wnp.pl/rss/serwis_rss_4.xml'), + (u'Serwis Logistyka', u'http://www.wnp.pl/rss/serwis_rss_5.xml'), + (u'Serwis IT', u'http://www.wnp.pl/rss/serwis_rss_6.xml')] diff --git 
a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 331a393c03..42d791294a 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -53,6 +53,12 @@ class WallStreetJournal(BasicNewsRecipe): return soup + def abs_wsj_url(self, href): + if not href.startswith('http'): + href = 'http://online.wsj.com' + href + return href + + def wsj_get_index(self): return self.index_to_soup('http://online.wsj.com/itp') @@ -83,14 +89,14 @@ class WallStreetJournal(BasicNewsRecipe): pageone = a['href'].endswith('pageone') if pageone: title = 'Front Section' - url = 'http://online.wsj.com' + a['href'] + url = self.abs_wsj_url(a['href']) feeds = self.wsj_add_feed(feeds,title,url) title = 'What''s News' url = url.replace('pageone','whatsnews') feeds = self.wsj_add_feed(feeds,title,url) else: title = self.tag_to_string(a) - url = 'http://online.wsj.com' + a['href'] + url = self.abs_wsj_url(a['href']) feeds = self.wsj_add_feed(feeds,title,url) return feeds @@ -146,7 +152,7 @@ class WallStreetJournal(BasicNewsRecipe): title = self.tag_to_string(a).strip() + ' [%s]'%meta else: title = self.tag_to_string(a).strip() - url = 'http://online.wsj.com'+a['href'] + url = self.abs_wsj_url(a['href']) desc = '' for p in container.findAll('p'): desc = self.tag_to_string(p) diff --git a/recipes/yagmur_dergisi.recipe b/recipes/yagmur_dergisi.recipe new file mode 100644 index 0000000000..786a628a0c --- /dev/null +++ b/recipes/yagmur_dergisi.recipe @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class Yagmur(BasicNewsRecipe): + title = u'Yagmur Dergisi' + __author__ = u'thomass' + description = 'Üç Aylık Dil, Kültür ve Edebiyat Dergisi' + oldest_article = 90 + max_articles_per_feed =100 + no_stylesheets = True + #delay = 1 + #use_embedded_content = False + + #publisher = ' ' + category = 'dergi, ilim, kültür, edebiyat,Türkçe' + language = 'tr' + publication_type = 'magazine' + encoding = 'ISO 8859-9' + publisher = 'thomass' + + + + 
#extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + #keep_only_tags = [dict(name='h1', attrs={'class':['georgia_30']})] + + #remove_attributes = ['aria-describedby'] + #remove_tags = [dict(name='div', attrs={'id':['renk10']}) ] + cover_img_url = 'http://www.sizinti.com.tr/images/dergiler/d2.gif' + masthead_url = 'http://www.sizinti.com.tr/images/dergiler/d2.gif' + #remove_tags_before = dict(id='content-right') + + + #remove_empty_feeds= True + #remove_attributes = ['width','height'] + + feeds = [ + ( u'Yagmur', u'http://open.dapper.net/services/yagmur'), + ] + + #def preprocess_html(self, soup): + # return self.adeify_images(soup) + def print_version(self, url): #there is a problem caused by table format + return url.replace('http://www.yagmurdergisi.com.tr/konu_goster.php?konu_id=', 'http://www.yagmurdergisi.com.tr/yazformati.php?konu_id=') + diff --git a/recipes/yeni_umit_dergisi.recipe b/recipes/yeni_umit_dergisi.recipe new file mode 100644 index 0000000000..24b95acae4 --- /dev/null +++ b/recipes/yeni_umit_dergisi.recipe @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class YeniUmit(BasicNewsRecipe): + title = u'Yeni Umit Dergisi' + __author__ = u'thomass' + description = 'Aylık Dini İlimler ve Kültür Dergisi' + oldest_article = 45 + max_articles_per_feed =100 + no_stylesheets = True + #delay = 1 + 
#use_embedded_content = False + + #publisher = ' ' + category = 'dergi, ilim, kültür, edebiyat,Türkçe' + language = 'tr' + publication_type = 'magazine' + encoding = 'ISO 8859-9' + publisher = 'thomass' + + + + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + #keep_only_tags = [dict(name='h1', attrs={'class':['georgia_30']})] + + #remove_attributes = ['aria-describedby'] + #remove_tags = [dict(name='div', attrs={'id':['renk10']}) ] + cover_img_url = 'http://www.sizinti.com.tr/images/dergiler/d1.gif' + masthead_url = 'http://www.sizinti.com.tr/images/dergiler/d1.gif' + #remove_tags_before = dict(id='content-right') + + + #remove_empty_feeds= True + #remove_attributes = ['width','height'] + + feeds = [ + ( u'Yeni Umit', u'http://open.dapper.net/services/yeniumit'), + ] + + #def preprocess_html(self, soup): + # return self.adeify_images(soup) + def print_version(self, url): #there is a problem caused by table format + return url.replace('http://www.yeniumit.com.tr/konular', 'http://www.yeniumit.com.tr/yazdir') + diff --git a/recipes/yenisafak_gazetesi.recipe b/recipes/yenisafak_gazetesi.recipe new file mode 100644 index 0000000000..afcec76508 --- /dev/null +++ b/recipes/yenisafak_gazetesi.recipe @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class Bugun (BasicNewsRecipe): + + title = u'Yenişafak Gazetesi' + __author__ = 
u'thomass' + oldest_article = 2 + max_articles_per_feed =100 + no_stylesheets = True + #delay = 1 + use_embedded_content = False + encoding = 'ISO 8859-9' #'UTF-8' + publisher = 'thomass' + category = 'news, haberler,TR,gazete' + language = 'tr' + publication_type = 'newspaper ' + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + cover_img_url = 'http://yenisafak.com.tr/resim/logo.gif' + masthead_url = 'http://yenisafak.com.tr/resim/logo.gif' + + keep_only_tags = [dict(name='div', attrs={'id':[ 'ctghaberdetay2010']}) ] + extra_css = ' h1{font-size:20px;font-weight: bold}h2{font-size: small;font-weight: bold}div{font-size: small} '#h1{ font-size:10%;font-weight: bold} '#ctl00_ortayer_haberBaslik{ 'font-size:10%;font-weight: bold'} + + #keep_only_tags = [dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}) ] + remove_tags = [ dict(name='div', attrs={'id':['yasaluyari2010','divhaberdetayilisik2010']}),dict(name='font', attrs={'class':['haberdetaytarih']})]#,'news-detail-gallery','news-detail-news-bottom-social']}),dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']})] + + + #remove_attributes = ['width','height'] + remove_empty_feeds= True + + feeds = [ + ( u'SonDakika', u'http://yenisafak.com.tr/rss/?xml=anasayfa'), + ( u'Gündem', 
u'http://yenisafak.com.tr/rss/?xml=gundem'), + ( u'Politika', u'http://yenisafak.com.tr/rss/?xml=politika'), + ( u'Ekonomi', u'http://yenisafak.com.tr/rss/?xml=ekonomi'), + ( u'Dünya', u'http://yenisafak.com.tr/rss/?xml=dunya'), + ( u'Aktüel', u'http://yenisafak.com.tr/rss/?xml=aktuel'), + ( u'Eğitim', u'http://yenisafak.com.tr/rss/?xml=egitim'), + ( u'Spor', u'http://yenisafak.com.tr/rss/?xml=spor'), + ( u'Yazarlar', u'http://yenisafak.com.tr/rss/?xml=yazarlar'), + ( u'Televizyon', u'http://yenisafak.com.tr/rss/?xml=televizyon'), + ( u'Sağlık', u'http://yenisafak.com.tr/rss/?xml=saglik'), + ( u'Yurt Haberler', u'http://yenisafak.com.tr/rss/?xml=yurthaberler'), + ( u'Bilişim', u'http://yenisafak.com.tr/rss/?xml=bilisim'), + ( u'Diziler', u'http://yenisafak.com.tr/rss/?xml=diziler'), + ( u'Kültür-Sanat', u'http://yenisafak.com.tr/rss/?xml=kultursanat'), + ( u'Röportaj', u'http://yenisafak.com.tr/rss/?xml=roportaj'), + ( u'Sinema', u'http://yenisafak.com.tr/rss/?xml=sinema'), + ( u'Yorum', u'http://yenisafak.com.tr/rss/?xml=yorum'), + ( u' Yeni Şafak Pazar', u'http://yenisafak.com.tr/rss/?xml=pazar'), + ( u'Yeni Şafak Kitap', u'http://yenisafak.com.tr/rss/?xml=kitap'), + ( u'Yeni Şafak English', u'http://yenisafak.com.tr/rss/?xml=english'), + + + + ] diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 12731a8c42..f11a0b7bc0 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -62,10 +62,16 @@ authors_completer_append_separator = False # The author name suffixes are words that are ignored when they occur at the # end of an author name. The case of the suffix is ignored and trailing # periods are automatically handled. +# The author name copy words are a set of words which if they occur in an +# author name cause the automatically generated author sort string to be +# identical to the author name. This means that the sort for a string like Acme +# Inc. will be Acme Inc. 
instead of Inc., Acme author_sort_copy_method = 'comma' author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd', 'MD', 'M.D', 'I', 'II', 'III', 'IV', 'Junior', 'Senior') +author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council', + 'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team') #: Use author sort in Tag Browser # Set which author field to display in the tags pane (the list of authors, diff --git a/setup/gui.py b/setup/gui.py index 058a3f052f..912760ddf8 100644 --- a/setup/gui.py +++ b/setup/gui.py @@ -17,8 +17,8 @@ class GUI(Command): @classmethod def find_forms(cls): - from calibre.gui2 import find_forms - return find_forms(cls.SRC) + # We do not use the calibre function find_forms as + # importing calibre.gui2 may not work forms = [] for root, _, files in os.walk(cls.PATH): for name in files: @@ -29,8 +29,9 @@ class GUI(Command): @classmethod def form_to_compiled_form(cls, form): - from calibre.gui2 import form_to_compiled_form - return form_to_compiled_form(form) + # We do not use the calibre function form_to_compiled_form as + # importing calibre.gui2 may not work + return form.rpartition('.')[0]+'_ui.py' def run(self, opts): self.build_forms() diff --git a/setup/install.py b/setup/install.py index 42df360b56..4194f7ed26 100644 --- a/setup/install.py +++ b/setup/install.py @@ -55,7 +55,7 @@ class Develop(Command): short_description = 'Setup a development environment for calibre' MODE = 0755 - sub_commands = ['build', 'resources', 'gui'] + sub_commands = ['build', 'resources', 'iso639', 'gui',] def add_postinstall_options(self, parser): parser.add_option('--make-errors-fatal', action='store_true', default=False, diff --git a/setup/resources.py b/setup/resources.py index 41068f78a0..ee72a98cb6 100644 --- a/setup/resources.py +++ b/setup/resources.py @@ -219,12 +219,17 @@ class Resources(Command): json.dump(function_dict, open(dest, 'wb'), indent=4) def clean(self): - for x in ('scripts', 'recipes', 'ebook-convert-complete'): + 
for x in ('scripts', 'ebook-convert-complete'): x = self.j(self.RESOURCES, x+'.pickle') if os.path.exists(x): os.remove(x) from setup.commands import kakasi kakasi.clean() + for x in ('builtin_recipes.xml', 'builtin_recipes.zip', + 'template-functions.json'): + x = self.j(self.RESOURCES, x) + if os.path.exists(x): + os.remove(x) diff --git a/setup/translations.py b/setup/translations.py index 2e8e6d52f3..3523272770 100644 --- a/setup/translations.py +++ b/setup/translations.py @@ -206,6 +206,10 @@ class Translations(POT): # {{{ for x in (i, j, d): if os.path.exists(x): os.remove(x) + zf = self.DEST + '.zip' + if os.path.exists(zf): + os.remove(zf) + # }}} class GetTranslations(Translations): @@ -273,13 +277,14 @@ class GetTranslations(Translations): class ISO639(Command): description = 'Compile translations for ISO 639 codes' + DEST = os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization', + 'iso639.pickle') def run(self, opts): src = self.j(self.d(self.SRC), 'setup', 'iso639.xml') if not os.path.exists(src): raise Exception(src + ' does not exist') - dest = self.j(self.d(self.SRC), 'resources', 'localization', - 'iso639.pickle') + dest = self.DEST if not self.newer(dest, src): self.info('Pickled code is up to date') return @@ -322,3 +327,8 @@ class ISO639(Command): '3to2':m3to2, '3bto3t':m3bto3t, 'name_map':nm} dump(x, open(dest, 'wb'), -1) + def clean(self): + if os.path.exists(self.DEST): + os.remove(self.DEST) + + diff --git a/src/calibre/ebooks/cssselect.py b/src/calibre/ebooks/cssselect.py new file mode 100644 index 0000000000..1c2bfcc4fa --- /dev/null +++ b/src/calibre/ebooks/cssselect.py @@ -0,0 +1,1012 @@ +"""CSS Selectors based on XPath. + +This module supports selecting XML/HTML tags based on CSS selectors. +See the `CSSSelector` class for details. 
+""" + +import re +from lxml import etree + +__all__ = ['SelectorSyntaxError', 'ExpressionError', + 'CSSSelector'] + +try: + _basestring = basestring +except NameError: + _basestring = str + +class SelectorSyntaxError(SyntaxError): + pass + +class ExpressionError(RuntimeError): + pass + +class CSSSelector(etree.XPath): + """A CSS selector. + + Usage:: + + >>> from lxml import etree, cssselect + >>> select = cssselect.CSSSelector("a tag > child") + + >>> root = etree.XML("TEXT") + >>> [ el.tag for el in select(root) ] + ['child'] + + To use CSS namespaces, you need to pass a prefix-to-namespace + mapping as ``namespaces`` keyword argument:: + + >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + >>> select_ns = cssselect.CSSSelector('root > rdf|Description', + ... namespaces={'rdf': rdfns}) + + >>> rdf = etree.XML(( + ... '' + ... 'blah' + ... '') % rdfns) + >>> [(el.tag, el.text) for el in select_ns(rdf)] + [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] + """ + def __init__(self, css, namespaces=None): + path = css_to_xpath_no_case(css) + etree.XPath.__init__(self, path, namespaces=namespaces) + self.css = css + + def __repr__(self): + return '<%s %s for %r>' % ( + self.__class__.__name__, + hex(abs(id(self)))[2:], + self.css) + +############################## +## Token objects: + +try: + _unicode = unicode + _unichr = unichr +except NameError: + # Python 3 + _unicode = str + _unichr = chr + +class _UniToken(_unicode): + def __new__(cls, contents, pos): + obj = _unicode.__new__(cls, contents) + obj.pos = pos + return obj + + def __repr__(self): + return '%s(%s, %r)' % ( + self.__class__.__name__, + _unicode.__repr__(self), + self.pos) + +class Symbol(_UniToken): + pass + +class String(_UniToken): + pass + +class Token(_UniToken): + pass + +############################################################ +## Parsing +############################################################ + +############################## +## Syntax objects: + +class 
# NOTE(review): vendored copy of lxml's cssselect with ASCII case-insensitive
# matching tweaks (marked with "Kovid:" comments). Depends on names defined
# earlier in this file: Token/Symbol/String, SelectorSyntaxError,
# ExpressionError, _basestring/_unicode/_unichr.
# Fixes applied in this revision:
#   * Function._xpath_contains used the malformed format string
#     'contains(%s), %s)' which emitted invalid XPath 'contains(X), Y)'.
#   * parse_simple_selector validated `next` (already known to be a Symbol)
#     instead of the freshly read `element`.
#   * tokenize_symbol had an unreachable duplicate `if not match:` branch.

class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.selector = selector
        # Kovid: Lowercased
        self.class_name = class_name.lower()

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__,
            self.selector,
            self.class_name)

    def xpath(self):
        sel_xpath = self.selector.xpath()
        # Kovid: Lowercased
        sel_xpath.add_condition(
            "contains(concat(' ', normalize-space(%s), ' '), %s)" % (
                lower_case('@class'),
                xpath_literal(' '+self.class_name+' ')))
        return sel_xpath

class Function(object):
    """
    Represents selector:name(expr)
    """

    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__,
            self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is not supported" % self.name)
        method = '_xpath_' + self.name.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.name)
        method = getattr(self, method)
        return method(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            if last:
                b = 'last() - %s' % b
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        if a != 1:
            expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
        else:
            expr = []
        if b >= 0:
            expr.append('position() >= %s' % b)
        elif b < 0 and last:
            expr.append('position() < (last() %s)' % b)
        expr = ' and '.join(expr)
        if expr:
            xpath.add_condition(expr)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        # Kovid: Use ASCII lower case that works
        # Fixed: format string previously read 'contains(%s), %s)', which
        # produced syntactically invalid XPath.
        xpath.add_condition('contains(%s, %s)' % (
            lower_case('string(.)'),
            xpath_literal(expr.lower())))
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        expr = expr.xpath()
        cond = expr.condition
        # FIXME: should I do something about element_path?
        xpath.add_condition('not(%s)' % cond)
        return xpath

# Kovid: Python functions dont work in lxml, so use translate()
# instead of the python lowercase function
def lower_case(arg):
    'An ASCII lowercase function'
    return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
            "'abcdefghijklmnopqrstuvwxyz')")%arg

class Pseudo(object):
    """
    Represents selector:ident
    """

    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        return '%s[%r%s%s]' % (
            self.__class__.__name__,
            self.element,
            self.type, self.ident)

    def xpath(self):
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is unsupported" % self.ident)
        method = '_xpath_' + self.ident.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.ident)
        method = getattr(self, method)
        el_xpath = method(el_xpath)
        return el_xpath

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath

class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                self.__class__.__name__,
                self.selector,
                self._format_attrib())
        else:
            return '%s[%r[%s %s %r]]' % (
                self.__class__.__name__,
                self.selector,
                self._format_attrib(),
                self.operator,
                self.value)

    def _format_attrib(self):
        if self.namespace == '*':
            return self.attrib
        else:
            return '%s|%s' % (self.namespace, self.attrib)

    def _xpath_attrib(self):
        # FIXME: if attrib is *?
        if self.namespace == '*':
            return '@' + self.attrib
        else:
            return '@%s:%s' % (self.namespace, self.attrib)

    def xpath(self):
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        value = self.value
        if self.operator == 'exists':
            assert not value
            path.add_condition(attrib)
        elif self.operator == '=':
            path.add_condition('%s = %s' % (attrib,
                                            xpath_literal(value)))
        elif self.operator == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                path.add_condition('not(%s) or %s != %s'
                                   % (attrib, attrib, xpath_literal(value)))
            else:
                path.add_condition('%s != %s'
                                   % (attrib, xpath_literal(value)))
            #path.add_condition('%s != %s' % (attrib, xpath_literal(value)))
        elif self.operator == '~=':
            path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' ')))
        elif self.operator == '|=':
            # Weird, but true...
            path.add_condition('%s = %s or starts-with(%s, %s)' % (
                attrib, xpath_literal(value),
                attrib, xpath_literal(value + '-')))
        elif self.operator == '^=':
            path.add_condition('starts-with(%s, %s)' % (
                attrib, xpath_literal(value)))
        elif self.operator == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            path.add_condition('substring(%s, string-length(%s)-%s) = %s'
                               % (attrib, attrib, len(value)-1, xpath_literal(value)))
        elif self.operator == '*=':
            # FIXME: case sensitive?
            path.add_condition('contains(%s, %s)' % (
                attrib, xpath_literal(value)))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
        return path

class Element(object):
    """
    Represents namespace|element
    """

    def __init__(self, namespace, element):
        self.namespace = namespace
        self.element = element

    def __repr__(self):
        return '%s[%s]' % (
            self.__class__.__name__,
            self._format_element())

    def _format_element(self):
        if self.namespace == '*':
            return self.element
        else:
            return '%s|%s' % (self.namespace, self.element)

    def xpath(self):
        if self.namespace == '*':
            el = self.element.lower()
        else:
            # Kovid: Lowercased
            el = '%s:%s' % (self.namespace, self.element.lower())
        return XPathExpr(element=el)

class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector = selector
        self.id = id

    def __repr__(self):
        return '%s[%r#%s]' % (
            self.__class__.__name__,
            self.selector, self.id)

    def xpath(self):
        path = self.selector.xpath()
        path.add_condition('@id = %s' % xpath_literal(self.id))
        return path

class Or(object):
    """
    Represents a comma-separated group of selectors.
    """

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (
            self.__class__.__name__,
            self.items)

    def xpath(self):
        paths = [item.xpath() for item in self.items]
        return XPathExprOr(paths)

class CombinedSelector(object):
    """
    Represents selector combinator subselector (descendant, child,
    direct/indirect adjacent).
    """

    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        if self.combinator == ' ':
            comb = ''
        else:
            comb = self.combinator
        return '%s[%r %s %r]' % (
            self.__class__.__name__,
            self.selector,
            comb,
            self.subselector)

    def xpath(self):
        if self.combinator not in self._method_mapping:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        method = '_xpath_' + self._method_mapping[self.combinator]
        method = getattr(self, method)
        path = self.selector.xpath()
        return method(path, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        xpath.join('/descendant::', sub.xpath())
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        xpath.join('/', sub.xpath())
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        xpath.join('/following-sibling::', sub.xpath())
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        xpath.join('/following-sibling::', sub.xpath())
        return xpath

##############################
## XPathExpr objects:

_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)


def css_to_xpath_no_case(css_expr, prefix='descendant-or-self::'):
    """
    Translate a CSS selector into an XPath 1.0 expression that matches
    element and class names case-insensitively (ASCII only).
    """
    if isinstance(css_expr, _basestring):
        # Fast paths for the common simple selectors: bare element,
        # element#id and element.class.
        match = _el_re.search(css_expr)
        if match is not None:
            # Kovid: Lowercased
            return '%s%s' % (prefix, match.group(0).strip().lower())
        match = _id_re.search(css_expr)
        if match is not None:
            return "%s%s[@id = '%s']" % (
                prefix, match.group(1) or '*', match.group(2))
        match = _class_re.search(css_expr)
        if match is not None:
            # Kovid: lowercased
            return "%s%s[contains(concat(' ', normalize-space(%s), ' '), ' %s ')]" % (
                prefix, match.group(1).lower() or '*',
                lower_case('@class'), match.group(2).lower())
        css_expr = parse(css_expr)
    expr = css_expr.xpath()
    assert expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        expr.add_prefix(prefix)
    return _unicode(expr)

class XPathExpr(object):
    """
    A single XPath location path being built up from a parsed selector.
    """

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix = prefix
        self.path = path
        self.element = element
        self.condition = condition
        self.star_prefix = star_prefix

    def __str__(self):
        path = ''
        if self.prefix is not None:
            path += _unicode(self.prefix)
        if self.path is not None:
            path += _unicode(self.path)
        path += _unicode(self.element)
        if self.condition:
            path += '[%s]' % self.condition
        return path

    def __repr__(self):
        return '%s[%s]' % (
            self.__class__.__name__, self)

    def add_condition(self, condition):
        if self.condition:
            self.condition = '%s and (%s)' % (self.condition, condition)
        else:
            self.condition = condition

    def add_path(self, part):
        if self.path is None:
            self.path = self.element
        else:
            self.path += self.element
        self.element = part

    def add_prefix(self, prefix):
        if self.prefix:
            self.prefix = prefix + self.prefix
        else:
            self.prefix = prefix

    def add_name_test(self):
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_literal(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Adds a /* prefix if there is no prefix. This is when you need
        to keep context's constrained to a single parent.
        """
        if self.path:
            self.path += '*/'
        else:
            self.path = '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        prefix = _unicode(self)
        prefix += combiner
        path = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and path == '*/':
            path = ''
        self.prefix = prefix
        self.path = path
        self.element = other.element
        self.condition = other.condition

class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions. Note that unfortunately it isn't
    the union, it's the sum, so duplicate elements will appear.
    """

    def __init__(self, items, prefix=None):
        for item in items:
            assert item is not None
        self.items = items
        self.prefix = prefix

    def __str__(self):
        prefix = self.prefix or ''
        return ' | '.join(["%s%s" % (prefix,i) for i in self.items])

split_at_single_quotes = re.compile("('+)").split

def xpath_literal(s):
    """
    Quote *s* as an XPath string literal, using concat() when it
    contains both quote characters.
    """
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        s = s._format_element()
    else:
        s = _unicode(s)
    if "'" not in s:
        s = "'%s'" % s
    elif '"' not in s:
        s = '"%s"' % s
    else:
        s = "concat(%s)" % ','.join([
            (("'" in part) and '"%s"' or "'%s'") % part
            for part in split_at_single_quotes(s) if part
            ])
    return s

##############################
## Parsing functions

def parse(string):
    """
    Parse a CSS selector string into a selector object tree.
    """
    stream = TokenStream(tokenize(string))
    stream.source = string
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError:
        import sys
        e = sys.exc_info()[1]
        message = "%s at %s -> %r" % (
            e, stream.used, stream.peek())
        e.msg = message
        if sys.version_info < (2,6):
            e.message = message
        e.args = tuple([message])
        raise

def parse_selector_group(stream):
    # selector [ ',' selector ]*
    result = []
    while 1:
        result.append(parse_selector(stream))
        if stream.peek() == ',':
            stream.next()
        else:
            break
    if len(result) == 1:
        return result[0]
    else:
        return Or(result)

def parse_selector(stream):
    # simple_selector [ combinator simple_selector ]*
    result = parse_simple_selector(stream)
    while 1:
        peek = stream.peek()
        if peek == ',' or peek is None:
            return result
        elif peek in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
        else:
            combinator = ' '
        consumed = len(stream.used)
        next_selector = parse_simple_selector(stream)
        if consumed == len(stream.used):
            raise SelectorSyntaxError(
                "Expected selector, got '%s'" % stream.peek())
        result = CombinedSelector(result, combinator, next_selector)
    return result

def parse_simple_selector(stream):
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        element = namespace = '*'
    else:
        next = stream.next()
        if next != '*' and not isinstance(next, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got '%s'" % next)
        if stream.peek() == '|':
            namespace = next
            stream.next()
            element = stream.next()
            # Fixed: previously re-tested `next` (already known to be a
            # Symbol), so a bad element after '|' slipped through.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % next)
        else:
            namespace = '*'
            element = next
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            next = stream.next()
            if not next == ']':
                raise SelectorSyntaxError(
                    "] expected, got '%s'" % next)
            continue
        elif peek == ':' or peek == '::':
            type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    selector = parse_simple_selector(stream)
                next = stream.next()
                if not next == ')':
                    raise SelectorSyntaxError(
                        "Expected ')', got '%s' and '%s'"
                        % (next, selector))
                result = Function(result, type, ident, selector)
            else:
                result = Pseudo(result, type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result

def is_int(v):
    'True if *v* converts cleanly to int.'
    try:
        int(v)
    except ValueError:
        return False
    else:
        return True

def parse_attrib(selector, stream):
    # [ namespace '|' ]? attrib [ op value ]?
    attrib = stream.next()
    if stream.peek() == '|':
        namespace = attrib
        stream.next()
        attrib = stream.next()
    else:
        namespace = '*'
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got '%s'" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got '%s'" % value)
    return Attrib(selector, namespace, attrib, op, value)

def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    if s == 'odd':
        return (2, 1)
    elif s == 'even':
        return (2, 0)
    elif s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a == '-' or a == '+':
        a = int(a+'1')
    else:
        a = int(a)
    if not b:
        b = 0
    elif b == '-' or b == '+':
        b = int(b+'1')
    else:
        b = int(b)
    return (a, b)


############################################################
## Tokenizing
############################################################

_match_whitespace = re.compile(r'\s+', re.UNICODE).match

_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub

_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match

def tokenize(s):
    """
    Yield Token/Symbol/String objects for the selector string *s*.
    """
    pos = 0
    s = _replace_comments('', s)
    while 1:
        match = _match_whitespace(s, pos=pos)
        if match:
            preceding_whitespace_pos = pos
            pos = match.end()
        else:
            preceding_whitespace_pos = 0
        if pos >= len(s):
            return
        match = _match_count_number(s, pos=pos)
        if match and match.group() != 'n':
            sym = s[pos:match.end()]
            yield Symbol(sym, pos)
            pos = match.end()
            continue
        c = s[pos]
        c2 = s[pos:pos+2]
        if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
            yield Token(c2, pos)
            pos += 2
            continue
        if c in '>+~,.*=[]()|:#':
            if c in '.#[' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(c, pos)
            pos += 1
            continue
        if c == '"' or c == "'":
            # Quoted string
            old_pos = pos
            sym, pos = tokenize_escaped_string(s, pos)
            yield String(sym, old_pos)
            continue
        old_pos = pos
        sym, pos = tokenize_symbol(s, pos)
        yield Symbol(sym, old_pos)
        continue

split_at_string_escapes = re.compile(r'(\\(?:%s))'
                                     % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
                                                 '[^A-Fa-f0-9]'])).split

def unescape_string_literal(literal):
    """
    Resolve CSS backslash escapes (hex and literal) inside *literal*.
    """
    substrings = []
    for substring in split_at_string_escapes(literal):
        if not substring:
            continue
        elif '\\' in substring:
            if substring[0] == '\\' and len(substring) > 1:
                substring = substring[1:]
                if substring[0] in '0123456789ABCDEFabcdef':
                    # int() correctly ignores the potentially trailing whitespace
                    substring = _unichr(int(substring, 16))
            else:
                raise SelectorSyntaxError(
                    "Invalid escape sequence %r in string %r"
                    % (substring.split('\\')[1], literal))
        substrings.append(substring)
    return ''.join(substrings)

def tokenize_escaped_string(s, pos):
    # Scan a quoted string starting at s[pos]; returns (value, new_pos).
    quote = s[pos]
    assert quote in ('"', "'")
    pos = pos+1
    start = pos
    while 1:
        next = s.find(quote, pos)
        if next == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:next]
        if result.endswith('\\'):
            # next quote character is escaped
            pos = next+1
            continue
        if '\\' in result:
            result = unescape_string_literal(result)
        return result, next+1

_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)

def tokenize_symbol(s, pos):
    # Scan an identifier starting at s[pos]; returns (value, new_pos).
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to end of s
        return s[start:], len(s)
    if match.start() == pos:
        assert 0, (
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    # (removed an unreachable duplicate `if not match:` branch here)
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos

class TokenStream(object):
    """
    One-token-lookahead wrapper around the tokenize() generator.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        try:
            self.next_token = self.tokens.next
        except AttributeError:
            # Python 3
            self.next_token = self.tokens.__next__

    def next(self):
        if self._peeking:
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        else:
            try:
                next = self.next_token()
                self.used.append(next)
                return next
            except StopIteration:
                return None

    def __iter__(self):
        return iter(self.next, None)

    def peek(self):
        if not self._peeking:
            try:
                self.peeked = self.next_token()
            except StopIteration:
                return None
            self._peeking = True
        return self.peeked
method=None): return author if method is None: method = tweaks['author_sort_copy_method'] + + ltoks = frozenset(x.lower() for x in tokens) + copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords']) + if ltoks.intersection(copy_words): + method = u'copy' + if method == u'copy': return author + suffixes = set([x.lower() for x in tweaks['author_name_suffixes']]) suffixes |= set([x+u'.' for x in suffixes]) diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index c52d089c70..9b8ae12b10 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -1312,7 +1312,7 @@ class OPFCreator(Metadata): ncx_stream.flush() -def metadata_to_opf(mi, as_string=True): +def metadata_to_opf(mi, as_string=True, default_lang=None): from lxml import etree import textwrap from calibre.ebooks.oeb.base import OPF, DC @@ -1328,7 +1328,8 @@ def metadata_to_opf(mi, as_string=True): '[http://calibre-ebook.com]' if not mi.languages: - lang = get_lang().replace('_', '-').partition('-')[0] + lang = (get_lang().replace('_', '-').partition('-')[0] if default_lang + is None else default_lang) mi.languages = [lang] root = etree.fromstring(textwrap.dedent( diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index 97fbae4727..4987b8cead 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -481,7 +481,7 @@ def identify(log, abort, # {{{ log('The identify phase took %.2f seconds'%(time.time() - start_time)) log('The longest time (%f) was taken by:'%longest, lp) log('Merging results from different sources and finding earliest', - 'publication dates') + 'publication dates from the xisbn service') start_time = time.time() results = merge_identify_results(results, log) diff --git a/src/calibre/ebooks/metadata/worker.py b/src/calibre/ebooks/metadata/worker.py index ca8707258b..cab582a264 100644 --- 
a/src/calibre/ebooks/metadata/worker.py +++ b/src/calibre/ebooks/metadata/worker.py @@ -33,7 +33,7 @@ def serialize_metadata_for(formats, tdir, id_): if not mi.application_id: mi.application_id = '__calibre_dummy__' with open(os.path.join(tdir, '%s.opf'%id_), 'wb') as f: - f.write(metadata_to_opf(mi)) + f.write(metadata_to_opf(mi, default_lang='und')) if cdata: with open(os.path.join(tdir, str(id_)), 'wb') as f: f.write(cdata) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index eefa9d9e03..56a7a8b9ca 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -308,6 +308,11 @@ class MobiMLizer(object): istate = copy.copy(istates[-1]) istate.rendered = False istate.list_num = 0 + if tag == 'ol' and 'start' in elem.attrib: + try: + istate.list_num = int(elem.attrib['start'])-1 + except: + pass istates.append(istate) left = 0 display = style['display'] diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index ba2bd01c3c..8592392d93 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -504,6 +504,9 @@ class Indexer(object): # {{{ else: self.indices = self.create_book_index() + if not self.indices: + raise ValueError('No valid entries in TOC, cannot generate index') + self.records.append(self.create_index_record()) self.records.insert(0, self.create_header()) self.records.extend(self.cncx.records) diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index ed0e43a303..7e748aac95 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -590,7 +590,7 @@ class MobiWriter(object): Write the PalmDB header ''' title = ascii_filename(unicode(self.oeb.metadata.title[0])).replace( - ' ', '_') + ' ', '_')[:32] title = title + (b'\0' * (32 - len(title))) now = int(time.time()) nrecords = len(self.records) diff --git 
a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index 377b29655c..9bbaa436a7 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -116,6 +116,12 @@ class Serializer(object): buf.write(b'') self.end_offset = buf.tell() self.fixup_links() + if self.start_offset is None: + # If we don't set a start offset, the stupid Kindle will + # open the book at the location of the first IndexEntry, which + # could be anywhere. So ensure the book is always opened at the + # beginning, instead. + self.start_offset = self.body_start_offset return buf.getvalue() def serialize_head(self): diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index f6ff594701..5e4f389262 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -27,6 +27,7 @@ from calibre import force_unicode from calibre.ebooks import unit_convert from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize +from calibre.ebooks.cssselect import css_to_xpath_no_case cssutils_log.setLevel(logging.WARN) @@ -98,32 +99,71 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large']) -class CSSSelector(etree.XPath): - MIN_SPACE_RE = re.compile(r' *([>~+]) *') +class CSSSelector(object): + LOCAL_NAME_RE = re.compile(r"(?' 
% ( self.__class__.__name__, hex(abs(id(self)))[2:], self.css) +_selector_cache = {} + +MIN_SPACE_RE = re.compile(r' *([>~+]) *') + +def get_css_selector(raw_selector): + css = MIN_SPACE_RE.sub(r'\1', raw_selector) + if isinstance(css, unicode): + # Workaround for bug in lxml on windows/OS X that causes a massive + # memory leak with non ASCII selectors + css = css.encode('ascii', 'ignore').decode('ascii') + ans = _selector_cache.get(css, None) + if ans is None: + ans = CSSSelector(css) + _selector_cache[css] = ans + return ans class Stylizer(object): STYLESHEETS = WeakKeyDictionary() @@ -223,41 +263,12 @@ class Stylizer(object): rules.sort() self.rules = rules self._styles = {} - class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE) - capital_sel_pat = re.compile(r'h|[A-Z]+') for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') - try: - selector = CSSSelector(text) - except (AssertionError, ExpressionError, etree.XPathSyntaxError, - NameError, # thrown on OS X instead of SelectorSyntaxError - SelectorSyntaxError): - continue - try: - matches = selector(tree) - except etree.XPathEvalError: - continue - - if not matches: - ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text) - if ntext != text: - self.logger.warn('Transformed CSS selector', text, 'to', - ntext) - selector = CSSSelector(ntext) - matches = selector(tree) - - if not matches and class_sel_pat.match(text) and text.lower() != text: - found = False - ltext = text.lower() - for x in tree.xpath('//*[@class]'): - if ltext.endswith('.'+x.get('class').lower()): - matches.append(x) - found = True - if found: - self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s' - %(text, item.href)) + selector = get_css_selector(text) + matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py 
b/src/calibre/ebooks/oeb/transforms/flatcss.py index d006d8dd2d..1493a647ae 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -320,7 +320,8 @@ class CSSFlattener(object): if self.context.insert_blank_line: cssdict['margin-top'] = cssdict['margin-bottom'] = \ '%fem'%self.context.insert_blank_line_size - if self.context.remove_paragraph_spacing: + if (self.context.remove_paragraph_spacing and + cssdict.get('text-align', None) not in ('center', 'right')): cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size if cssdict: diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index fc02ad7fae..715696a89e 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -98,6 +98,7 @@ gprefs.defaults['book_display_fields'] = [ ] gprefs.defaults['default_author_link'] = 'http://en.wikipedia.org/w/index.php?search={author}' gprefs.defaults['preserve_date_on_ctl'] = True +gprefs.defaults['cb_fullscreen'] = False # }}} @@ -173,6 +174,8 @@ def _config(): # {{{ help='Search history for the plugin preferences') c.add_opt('shortcuts_search_history', default=[], help='Search history for the keyboard preferences') + c.add_opt('tweaks_search_history', default=[], + help='Search history for tweaks') c.add_opt('worker_limit', default=6, help=_( 'Maximum number of simultaneous conversion/news download jobs. ' @@ -186,7 +189,9 @@ def _config(): # {{{ c.add_opt('enforce_cpu_limit', default=True, help=_('Limit max simultaneous jobs to number of CPUs')) c.add_opt('gui_layout', choices=['wide', 'narrow'], - help=_('The layout of the user interface'), default='wide') + help=_('The layout of the user interface. 
Wide has the ' + 'book details panel on the right and narrow has ' + 'it at the bottom.'), default='wide') c.add_opt('show_avg_rating', default=True, help=_('Show the average rating per item indication in the tag browser')) c.add_opt('disable_animations', default=False, diff --git a/src/calibre/gui2/actions/catalog.py b/src/calibre/gui2/actions/catalog.py index e57ce06742..d18eb990b3 100644 --- a/src/calibre/gui2/actions/catalog.py +++ b/src/calibre/gui2/actions/catalog.py @@ -17,7 +17,7 @@ from calibre.gui2.actions import InterfaceAction class GenerateCatalogAction(InterfaceAction): name = 'Generate Catalog' - action_spec = (_('Create a catalog of the books in your calibre library'), 'catalog.png', 'Catalog builder', None) + action_spec = (_('Create catalog'), 'catalog.png', 'Catalog builder', None) dont_add_to = frozenset(['menubar-device', 'toolbar-device', 'context-menu-device']) def genesis(self): diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py index ca108a592e..67a8f08bcd 100644 --- a/src/calibre/gui2/cover_flow.py +++ b/src/calibre/gui2/cover_flow.py @@ -9,8 +9,8 @@ Module to implement the Cover Flow feature import sys, os, time -from PyQt4.Qt import (QImage, QSizePolicy, QTimer, QDialog, Qt, QSize, - QStackedLayout, QLabel, QByteArray, pyqtSignal) +from PyQt4.Qt import (QImage, QSizePolicy, QTimer, QDialog, Qt, QSize, QAction, + QStackedLayout, QLabel, QByteArray, pyqtSignal, QKeySequence) from calibre import plugins from calibre.gui2 import config, available_height, available_width, gprefs @@ -150,12 +150,39 @@ class CBDialog(QDialog): if not self.restoreGeometry(geom): h, w = available_height()-60, int(available_width()/1.5) self.resize(w, h) + self.action_fs_toggle = a = QAction(self) + self.addAction(a) + a.setShortcuts([QKeySequence('F11', QKeySequence.PortableText), + QKeySequence('Ctrl+Shift+F', QKeySequence.PortableText)]) + a.triggered.connect(self.toggle_fullscreen) + self.action_esc_fs = a = QAction(self) + 
a.triggered.connect(self.show_normal) + self.addAction(a) + a.setShortcuts([QKeySequence('Esc', QKeySequence.PortableText)]) + + self.pre_fs_geom = None def closeEvent(self, *args): - geom = bytearray(self.saveGeometry()) - gprefs['cover_browser_dialog_geometry'] = geom + if not self.isFullScreen(): + geom = bytearray(self.saveGeometry()) + gprefs['cover_browser_dialog_geometry'] = geom self.closed.emit() + def show_normal(self): + self.showNormal() + if self.pre_fs_geom is not None: + self.restoreGeometry(self.pre_fs_geom) + self.pre_fs_geom = None + + def toggle_fullscreen(self, *args): + if self.isFullScreen(): + self.show_normal() + else: + self.pre_fs_geom = bytearray(self.saveGeometry()) + self.showFullScreen() + + + class CoverFlowMixin(object): def __init__(self): @@ -228,7 +255,7 @@ class CoverFlowMixin(object): d.addAction(self.cb_splitter.action_toggle) self.cover_flow.setVisible(True) self.cover_flow.setFocus(Qt.OtherFocusReason) - d.show() + d.showFullScreen() if gprefs['cb_fullscreen'] else d.show() self.cb_splitter.button.set_state_to_hide() d.closed.connect(self.cover_browser_closed) self.cb_dialog = d diff --git a/src/calibre/gui2/languages.py b/src/calibre/gui2/languages.py index 1e192a0c94..d7f34df1b4 100644 --- a/src/calibre/gui2/languages.py +++ b/src/calibre/gui2/languages.py @@ -9,18 +9,20 @@ __docformat__ = 'restructuredtext en' from calibre.gui2.complete import MultiCompleteComboBox from calibre.utils.localization import lang_map -from calibre.utils.icu import sort_key +from calibre.utils.icu import sort_key, lower class LanguagesEdit(MultiCompleteComboBox): def __init__(self, parent=None): MultiCompleteComboBox.__init__(self, parent) + self.setSizeAdjustPolicy(self.AdjustToMinimumContentsLengthWithIcon) + self.setMinimumContentsLength(20) self._lang_map = lang_map() self.names_with_commas = [x for x in self._lang_map.itervalues() if ',' in x] self.comma_map = {k:k.replace(',', '|') for k in self.names_with_commas} self.comma_rmap = {v:k 
for k, v in self.comma_map.iteritems()} - self._rmap = {v:k for k,v in self._lang_map.iteritems()} + self._rmap = {lower(v):k for k,v in self._lang_map.iteritems()} all_items = sorted(self._lang_map.itervalues(), key=sort_key) @@ -44,7 +46,7 @@ class LanguagesEdit(MultiCompleteComboBox): ans = [] for name in vals: if name: - code = self._rmap.get(name, None) + code = self._rmap.get(lower(name), None) if code is not None: ans.append(code) return ans @@ -64,7 +66,7 @@ class LanguagesEdit(MultiCompleteComboBox): bad = [] for name in vals: if name: - code = self._rmap.get(name, None) + code = self._rmap.get(lower(name), None) if code is None: bad.append(name) return bad diff --git a/src/calibre/gui2/metadata/basic_widgets.py b/src/calibre/gui2/metadata/basic_widgets.py index 29f6fffa0b..3ec34938af 100644 --- a/src/calibre/gui2/metadata/basic_widgets.py +++ b/src/calibre/gui2/metadata/basic_widgets.py @@ -308,7 +308,7 @@ class AuthorSortEdit(EnLineEdit): LABEL = _('Author s&ort:') def __init__(self, parent, authors_edit, autogen_button, db, - copy_a_to_as_action, copy_as_to_a_action): + copy_a_to_as_action, copy_as_to_a_action, a_to_as, as_to_a): EnLineEdit.__init__(self, parent) self.authors_edit = authors_edit self.db = db @@ -333,6 +333,8 @@ class AuthorSortEdit(EnLineEdit): autogen_button.clicked.connect(self.auto_generate) copy_a_to_as_action.triggered.connect(self.auto_generate) copy_as_to_a_action.triggered.connect(self.copy_to_authors) + a_to_as.triggered.connect(self.author_to_sort) + as_to_a.triggered.connect(self.sort_to_author) self.update_state() @dynamic_property @@ -389,10 +391,21 @@ class AuthorSortEdit(EnLineEdit): def auto_generate(self, *args): au = unicode(self.authors_edit.text()) - au = re.sub(r'\s+et al\.$', '', au) + au = re.sub(r'\s+et al\.$', '', au).strip() authors = string_to_authors(au) self.current_val = self.db.author_sort_from_authors(authors) + def author_to_sort(self, *args): + au = unicode(self.authors_edit.text()) + au = 
re.sub(r'\s+et al\.$', '', au).strip() + if au: + self.current_val = au + + def sort_to_author(self, *args): + aus = self.current_val + if aus: + self.authors_edit.current_val = [aus] + def initialize(self, db, id_): self.current_val = db.author_sort(id_, index_is_id=True) diff --git a/src/calibre/gui2/metadata/single.py b/src/calibre/gui2/metadata/single.py index 7f2ea036d6..a2666b0351 100644 --- a/src/calibre/gui2/metadata/single.py +++ b/src/calibre/gui2/metadata/single.py @@ -130,10 +130,15 @@ class MetadataSingleDialogBase(ResizableDialog): ac = m.addAction(QIcon(I('forward.png')), _('Set author sort from author')) ac2 = m.addAction(QIcon(I('back.png')), _('Set author from author sort')) ac3 = m.addAction(QIcon(I('user_profile.png')), _('Manage authors')) + ac4 = m.addAction(QIcon(I('next.png')), + _('Copy author to author sort')) + ac5 = m.addAction(QIcon(I('previous.png')), + _('Copy author sort to author')) + b.setMenu(m) self.authors = AuthorsEdit(self, ac3) self.author_sort = AuthorSortEdit(self, self.authors, b, self.db, ac, - ac2) + ac2, ac4, ac5) self.basic_metadata_widgets.extend([self.authors, self.author_sort]) self.swap_title_author_button = QToolButton(self) @@ -723,7 +728,7 @@ class MetadataSingleDialogAlt1(MetadataSingleDialogBase): # {{{ tl.addWidget(self.swap_title_author_button, 0, 0, 2, 1) tl.addWidget(self.manage_authors_button, 2, 0, 1, 1) - tl.addWidget(self.paste_isbn_button, 11, 0, 1, 1) + tl.addWidget(self.paste_isbn_button, 12, 0, 1, 1) create_row(0, self.title, self.title_sort, button=self.deduce_title_sort_button, span=2, @@ -859,7 +864,7 @@ class MetadataSingleDialogAlt2(MetadataSingleDialogBase): # {{{ tl.addWidget(self.swap_title_author_button, 0, 0, 2, 1) tl.addWidget(self.manage_authors_button, 2, 0, 2, 1) - tl.addWidget(self.paste_isbn_button, 11, 0, 1, 1) + tl.addWidget(self.paste_isbn_button, 12, 0, 1, 1) create_row(0, self.title, self.title_sort, button=self.deduce_title_sort_button, span=2, diff --git 
a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py index b34c5e6042..c017fe69c2 100644 --- a/src/calibre/gui2/preferences/look_feel.py +++ b/src/calibre/gui2/preferences/look_feel.py @@ -6,16 +6,15 @@ __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' from PyQt4.Qt import (QApplication, QFont, QFontInfo, QFontDialog, - QAbstractListModel, Qt, QIcon) + QAbstractListModel, Qt, QIcon, QKeySequence) from calibre.gui2.preferences import ConfigWidgetBase, test_widget, CommaSeparatedList from calibre.gui2.preferences.look_feel_ui import Ui_Form -from calibre.gui2 import config, gprefs, qt_app +from calibre.gui2 import config, gprefs, qt_app, NONE from calibre.utils.localization import (available_translations, get_language, get_lang) from calibre.utils.config import prefs from calibre.utils.icu import sort_key -from calibre.gui2 import NONE from calibre.gui2.book_details import get_field_list from calibre.gui2.preferences.coloring import EditRules @@ -130,6 +129,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): r('disable_tray_notification', config) r('use_roman_numerals_for_series_number', config) r('separate_cover_flow', config, restart_required=True) + r('cb_fullscreen', gprefs) choices = [(_('Off'), 'off'), (_('Small'), 'small'), (_('Medium'), 'medium'), (_('Large'), 'large')] @@ -171,6 +171,11 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.tabWidget.addTab(self.edit_rules, QIcon(I('format-fill-color.png')), _('Column coloring')) self.tabWidget.setCurrentIndex(0) + keys = [QKeySequence('F11', QKeySequence.PortableText), QKeySequence( + 'Ctrl+Shift+F', QKeySequence.PortableText)] + keys = [unicode(x.toString(QKeySequence.NativeText)) for x in keys] + self.fs_help_msg.setText(unicode(self.fs_help_msg.text())%( + _(' or ').join(keys))) def initialize(self): ConfigWidgetBase.initialize(self) diff --git a/src/calibre/gui2/preferences/look_feel.ui b/src/calibre/gui2/preferences/look_feel.ui index 
07d533fdef..498013a68b 100644 --- a/src/calibre/gui2/preferences/look_feel.ui +++ b/src/calibre/gui2/preferences/look_feel.ui @@ -417,7 +417,7 @@ then the tags will be displayed each on their own line. - + Qt::Vertical @@ -430,6 +430,26 @@ then the tags will be displayed each on their own line. + + + + When showing cover browser in separate window, show it &fullscreen + + + + + + + margin-left: 1.5em + + + You can press the %s keys to toggle full screen mode. + + + true + + + diff --git a/src/calibre/gui2/preferences/tweaks.py b/src/calibre/gui2/preferences/tweaks.py index a1756bf1ba..04c11ad40e 100644 --- a/src/calibre/gui2/preferences/tweaks.py +++ b/src/calibre/gui2/preferences/tweaks.py @@ -9,14 +9,19 @@ import textwrap from calibre.gui2.preferences import ConfigWidgetBase, test_widget, AbortCommit from calibre.gui2.preferences.tweaks_ui import Ui_Form -from calibre.gui2 import error_dialog, NONE +from calibre.gui2 import error_dialog, NONE, info_dialog from calibre.utils.config import read_raw_tweaks, write_tweaks from calibre.gui2.widgets import PythonHighlighter from calibre import isbytestring +from calibre.utils.icu import lower +from calibre.utils.search_query_parser import (ParseException, + SearchQueryParser) from PyQt4.Qt import (QAbstractListModel, Qt, QStyledItemDelegate, QStyle, QStyleOptionViewItem, QFont, QDialogButtonBox, QDialog, - QVBoxLayout, QPlainTextEdit, QLabel) + QVBoxLayout, QPlainTextEdit, QLabel, QModelIndex) + +ROOT = QModelIndex() class Delegate(QStyledItemDelegate): # {{{ def __init__(self, view): @@ -35,7 +40,7 @@ class Delegate(QStyledItemDelegate): # {{{ class Tweak(object): # {{{ def __init__(self, name, doc, var_names, defaults, custom): - translate = __builtins__['_'] + translate = _ self.name = translate(name) self.doc = translate(doc.strip()) self.var_names = var_names @@ -87,10 +92,11 @@ class Tweak(object): # {{{ # }}} -class Tweaks(QAbstractListModel): # {{{ +class Tweaks(QAbstractListModel, SearchQueryParser): # {{{ def 
__init__(self, parent=None): QAbstractListModel.__init__(self, parent) + SearchQueryParser.__init__(self, ['all']) raw_defaults, raw_custom = read_raw_tweaks() self.parse_tweaks(raw_defaults, raw_custom) @@ -223,6 +229,54 @@ class Tweaks(QAbstractListModel): # {{{ def set_plugin_tweaks(self, d): self.plugin_tweaks = d + def universal_set(self): + return set(xrange(self.rowCount())) + + def get_matches(self, location, query, candidates=None): + if candidates is None: + candidates = self.universal_set() + ans = set() + if not query: + return ans + query = lower(query) + for r in candidates: + dat = self.data(self.index(r), Qt.UserRole) + if query in lower(dat.name):# or query in lower(dat.doc): + ans.add(r) + return ans + + def find(self, query): + query = query.strip() + if not query: + return ROOT + matches = self.parse(query) + if not matches: + return ROOT + matches = list(sorted(matches)) + return self.index(matches[0]) + + def find_next(self, idx, query, backwards=False): + query = query.strip() + if not query: + return idx + matches = self.parse(query) + if not matches: + return idx + loc = idx.row() + if loc not in matches: + return self.find(query) + if len(matches) == 1: + return ROOT + matches = list(sorted(matches)) + i = matches.index(loc) + if backwards: + ans = i - 1 if i - 1 >= 0 else len(matches)-1 + else: + ans = i + 1 if i + 1 < len(matches) else 0 + + ans = matches[ans] + return self.index(ans) + # }}} class PluginTweaks(QDialog): # {{{ @@ -257,12 +311,18 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.delegate = Delegate(self.tweaks_view) self.tweaks_view.setItemDelegate(self.delegate) self.tweaks_view.currentChanged = self.current_changed + self.view = self.tweaks_view self.highlighter = PythonHighlighter(self.edit_tweak.document()) self.restore_default_button.clicked.connect(self.restore_to_default) self.apply_button.clicked.connect(self.apply_tweak) self.plugin_tweaks_button.clicked.connect(self.plugin_tweaks) 
self.splitter.setStretchFactor(0, 1) self.splitter.setStretchFactor(1, 100) + self.next_button.clicked.connect(self.find_next) + self.previous_button.clicked.connect(self.find_previous) + self.search.initialize('tweaks_search_history', help_text= + _('Search for tweak')) + self.search.search.connect(self.find) def plugin_tweaks(self): @@ -290,7 +350,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.changed_signal.emit() def initialize(self): - self.tweaks = Tweaks() + self.tweaks = self._model = Tweaks() self.tweaks_view.setModel(self.tweaks) def restore_to_default(self, *args): @@ -338,6 +398,45 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): ConfigWidgetBase.commit(self) return True + def find(self, query): + if not query: + return + try: + idx = self._model.find(query) + except ParseException: + self.search.search_done(False) + return + self.search.search_done(True) + if not idx.isValid(): + info_dialog(self, _('No matches'), + _('Could not find any shortcuts matching %s')%query, + show=True, show_copy_button=False) + return + self.highlight_index(idx) + + def highlight_index(self, idx): + if not idx.isValid(): return + self.view.scrollTo(idx) + self.view.selectionModel().select(idx, + self.view.selectionModel().ClearAndSelect) + self.view.setCurrentIndex(idx) + + def find_next(self, *args): + idx = self.view.currentIndex() + if not idx.isValid(): + idx = self._model.index(0) + idx = self._model.find_next(idx, + unicode(self.search.currentText())) + self.highlight_index(idx) + + def find_previous(self, *args): + idx = self.view.currentIndex() + if not idx.isValid(): + idx = self._model.index(0) + idx = self._model.find_next(idx, + unicode(self.search.currentText()), backwards=True) + self.highlight_index(idx) + if __name__ == '__main__': from PyQt4.Qt import QApplication diff --git a/src/calibre/gui2/preferences/tweaks.ui b/src/calibre/gui2/preferences/tweaks.ui index ab3f6b2bc3..19f6c836d5 100644 --- a/src/calibre/gui2/preferences/tweaks.ui +++ 
b/src/calibre/gui2/preferences/tweaks.ui @@ -6,7 +6,7 @@ 0 0 - 660 + 756 531 @@ -14,8 +14,24 @@ Form + + + + Values for the tweaks are shown below. Edit them to change the behavior of calibre. Your changes will only take effect <b>after a restart</b> of calibre. + + + true + + + + + + 0 + 10 + + Qt::Horizontal @@ -24,16 +40,6 @@ - - - - Values for the tweaks are shown below. Edit them to change the behavior of calibre. Your changes will only take effect <b>after a restart</b> of calibre. - - - true - - - @@ -72,8 +78,8 @@ - - + + Help @@ -92,7 +98,7 @@ - + Edit tweak @@ -128,12 +134,59 @@ + + + + + 10 + 0 + + + + QComboBox::AdjustToMinimumContentsLength + + + 10 + + + + + + + &Next + + + + :/images/arrow-down.png:/images/arrow-down.png + + + + + + + &Previous + + + + :/images/arrow-up.png:/images/arrow-up.png + + + - + + + SearchBox2 + QComboBox +
calibre/gui2/search_box.h
+
+
+ + + diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 5f9dca6d23..62cad827c4 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -15,7 +15,7 @@ from calibre.utils.config import tweaks, prefs from calibre.utils.date import parse_date, now, UNDEFINED_DATE from calibre.utils.search_query_parser import SearchQueryParser from calibre.utils.pyparsing import ParseException -from calibre.utils.localization import canonicalize_lang +from calibre.utils.localization import canonicalize_lang, lang_map from calibre.ebooks.metadata import title_sort, author_to_author_sort from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre import prints @@ -728,7 +728,9 @@ class ResultCache(SearchQueryParser): # {{{ elif loc == db_col['languages']: q = canonicalize_lang(query) if q is None: - q = query + lm = lang_map() + rm = {v.lower():k for k,v in lm.iteritems()} + q = rm.get(query, query) else: q = query diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py index b5917f1a55..90d293ba64 100644 --- a/src/calibre/library/sqlite.py +++ b/src/calibre/library/sqlite.py @@ -290,7 +290,10 @@ class DatabaseException(Exception): def __init__(self, err, tb): tb = '\n\t'.join(('\tRemote'+tb).splitlines()) - msg = unicode(err) +'\n' + tb + try: + msg = unicode(err) +'\n' + tb + except: + msg = repr(err) + '\n' + tb Exception.__init__(self, msg) self.orig_err = err self.orig_tb = tb diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index d5bef449c4..4daec9d553 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -35,7 +35,7 @@ def load_icu(): if _icu is None: print plugins['icu'][1] else: - if not _icu.ok: + if not getattr(_icu, 'ok', False): print 'icu not ok' _icu = None return _icu diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 2017248cfc..1d513082f1 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ 
-28,6 +28,7 @@ from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import now as nowf from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image +from calibre.utils.localization import canonicalize_lang class LoginFailed(ValueError): pass @@ -1117,6 +1118,9 @@ class BasicNewsRecipe(Recipe): mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() mi.timestamp = nowf() mi.comments = self.description + language = canonicalize_lang(self.language) + if language is not None: + mi.language = language if not isinstance(mi.comments, unicode): mi.comments = mi.comments.decode('utf-8', 'replace') mi.pubdate = nowf()