From f5f81cbe85827e01cfc936eacca38ad614f4f4d0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 18:49:20 -0600 Subject: [PATCH 01/39] ... --- setup/install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup/install.py b/setup/install.py index 42df360b56..4194f7ed26 100644 --- a/setup/install.py +++ b/setup/install.py @@ -55,7 +55,7 @@ class Develop(Command): short_description = 'Setup a development environment for calibre' MODE = 0755 - sub_commands = ['build', 'resources', 'gui'] + sub_commands = ['build', 'resources', 'iso639', 'gui',] def add_postinstall_options(self, parser): parser.add_option('--make-errors-fatal', action='store_true', default=False, From 12072ac7d7d43f4289b126a5b53381d078e12d1b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 19:05:00 -0600 Subject: [PATCH 02/39] ... --- src/calibre/utils/icu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index d5bef449c4..4daec9d553 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -35,7 +35,7 @@ def load_icu(): if _icu is None: print plugins['icu'][1] else: - if not _icu.ok: + if not getattr(_icu, 'ok', False): print 'icu not ok' _icu = None return _icu From ee3baf7dcf460586d991d69a42dbbc5b264653c9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 19:29:41 -0600 Subject: [PATCH 03/39] Fix --clean-all --- setup/gui.py | 9 +++++---- setup/resources.py | 7 ++++++- setup/translations.py | 14 ++++++++++++-- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/setup/gui.py b/setup/gui.py index 058a3f052f..912760ddf8 100644 --- a/setup/gui.py +++ b/setup/gui.py @@ -17,8 +17,8 @@ class GUI(Command): @classmethod def find_forms(cls): - from calibre.gui2 import find_forms - return find_forms(cls.SRC) + # We do not use the calibre function find_forms as + # mporting calibre.gui2 may not work forms = [] for root, _, files in os.walk(cls.PATH): for name in files: @@ -29,8 +29,9 @@ class GUI(Command): @classmethod def form_to_compiled_form(cls, form): - from calibre.gui2 import form_to_compiled_form - return form_to_compiled_form(form) + # We do not use the calibre function form_to_compiled_form as + # importing calibre.gui2 may not work + return form.rpartition('.')[0]+'_ui.py' def run(self, opts): self.build_forms() diff --git a/setup/resources.py b/setup/resources.py index 41068f78a0..ee72a98cb6 100644 --- a/setup/resources.py +++ b/setup/resources.py @@ -219,12 +219,17 @@ class Resources(Command): json.dump(function_dict, open(dest, 'wb'), indent=4) def clean(self): - for x in ('scripts', 'recipes', 'ebook-convert-complete'): + for x in ('scripts', 'ebook-convert-complete'): x = self.j(self.RESOURCES, x+'.pickle') if os.path.exists(x): os.remove(x) from setup.commands import kakasi kakasi.clean() + for x in ('builtin_recipes.xml', 'builtin_recipes.zip', + 'template-functions.json'): + x = self.j(self.RESOURCES, x) + if os.path.exists(x): + os.remove(x) diff --git a/setup/translations.py b/setup/translations.py index 2e8e6d52f3..3523272770 100644 --- a/setup/translations.py +++ b/setup/translations.py @@ -206,6 +206,10 @@ class Translations(POT): # {{{ for x in (i, j, d): if os.path.exists(x): os.remove(x) + zf = self.DEST + '.zip' + if os.path.exists(zf): + os.remove(zf) + # }}} class GetTranslations(Translations): @@ -273,13 +277,14 @@ class GetTranslations(Translations): class ISO639(Command): description = 'Compile translations for ISO 639 codes' + DEST = 
os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization', + 'iso639.pickle') def run(self, opts): src = self.j(self.d(self.SRC), 'setup', 'iso639.xml') if not os.path.exists(src): raise Exception(src + ' does not exist') - dest = self.j(self.d(self.SRC), 'resources', 'localization', - 'iso639.pickle') + dest = self.DEST if not self.newer(dest, src): self.info('Pickled code is up to date') return @@ -322,3 +327,8 @@ class ISO639(Command): '3to2':m3to2, '3bto3t':m3bto3t, 'name_map':nm} dump(x, open(dest, 'wb'), -1) + def clean(self): + if os.path.exists(self.DEST): + os.remove(self.DEST) + + From f27438b44a713713ee3e3cacc61ef0b319793efd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 21:09:14 -0600 Subject: [PATCH 04/39] Fix HBR --- recipes/hbr.recipe | 85 +++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe index 1152a48784..30cf54bf8d 100644 --- a/recipes/hbr.recipe +++ b/recipes/hbr.recipe @@ -13,6 +13,8 @@ class HBR(BasicNewsRecipe): no_stylesheets = True LOGIN_URL = 'http://hbr.org/login?request_url=/' + LOGOUT_URL = 'http://hbr.org/logout?request_url=/' + INDEX = 'http://hbr.org/archive-toc/BR' keep_only_tags = [dict(name='div', id='pageContainer')] @@ -34,6 +36,7 @@ class HBR(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser(self) + #''' br.open(self.LOGIN_URL) br.select_form(name='signin-form') br['signin-form:username'] = self.username @@ -42,9 +45,13 @@ class HBR(BasicNewsRecipe): if 'My Account' not in raw: raise Exception('Failed to login, are you sure your username and password are correct?') self.logout_url = None - link = br.find_link(text='Sign out') - if link: - self.logout_url = link.absolute_url + try: + link = br.find_link(text='Sign out') + if link: + self.logout_url = link.absolute_url + except: + self.logout_url = self.LOGOUT_URL + #''' return br def cleanup(self): @@ -57,6 +64,8 @@ class HBR(BasicNewsRecipe): def hbr_get_toc(self): + #return self.index_to_soup(open('/t/hbr.html').read()) + today = date.today() future = today + timedelta(days=30) for x in [x.strftime('%y%m') for x in (future, today)]: @@ -66,53 +75,43 @@ class HBR(BasicNewsRecipe): return soup raise Exception('Could not find current issue') - def hbr_parse_section(self, container, feeds): - current_section = None - current_articles = [] - for x in container.findAll(name=['li', 'h3', 'h4']): - if x.name in ['h3', 'h4'] and not x.findAll(True): - if current_section and current_articles: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - if x.name == 'li': - a = x.find('a', href=True) - if a is not None: - title = self.tag_to_string(a) - url = a.get('href') - if '/ar/' not in url: - continue - if url.startswith('/'): - url = 'http://hbr.org'+url - url = self.map_url(url) - p = x.find('p') - desc = '' - if p is not None: - desc = self.tag_to_string(p) - if not title or not url: - continue - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - self.log('\t\t\t', desc) - current_articles.append({'title':title, 'url':url, - 'description':desc, 'date':''}) - if current_section and current_articles: - feeds.append((current_section, current_articles)) - - - def hbr_parse_toc(self, soup): feeds = [] - features = soup.find(id='issueFeaturesContent') - self.hbr_parse_section(features, feeds) - departments = soup.find(id='issueDepartments') - 
self.hbr_parse_section(departments, feeds) + current_section = None + articles = [] + for x in soup.find(id='archiveToc').findAll(['h3', 'h4']): + if x.name == 'h3': + if current_section is not None and articles: + feeds.append((current_section, articles)) + current_section = self.tag_to_string(x).capitalize() + articles = [] + self.log('\tFound section:', current_section) + else: + a = x.find('a', href=True) + if a is None: continue + title = self.tag_to_string(a) + url = a['href'] + if '/ar/' not in url: + continue + if url.startswith('/'): + url = 'http://hbr.org' + url + url = self.map_url(url) + p = x.parent.find('p') + desc = '' + if p is not None: + desc = self.tag_to_string(p) + self.log('\t\tFound article:', title) + self.log('\t\t\t', url) + self.log('\t\t\t', desc) + + articles.append({'title':title, 'url':url, 'description':desc, + 'date':''}) return feeds def parse_index(self): soup = self.hbr_get_toc() + #open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8')) feeds = self.hbr_parse_toc(soup) return feeds From 7a78fb5e9ac8e17cc718000ab9d56afdcd6dd98a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 21:18:29 -0600 Subject: [PATCH 05/39] Fix HBR Blogs --- recipes/hbr.recipe | 3 +- recipes/hbr_blogs.recipe | 148 ++++++--------------------------------- 2 files changed, 23 insertions(+), 128 deletions(-) diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe index 30cf54bf8d..214ae14f33 100644 --- a/recipes/hbr.recipe +++ b/recipes/hbr.recipe @@ -36,6 +36,8 @@ class HBR(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser(self) + self.logout_url = None + #''' br.open(self.LOGIN_URL) br.select_form(name='signin-form') @@ -44,7 +46,6 @@ class HBR(BasicNewsRecipe): raw = br.submit().read() if 'My Account' not in raw: raise Exception('Failed to login, are you sure your username and password are correct?') - self.logout_url = None try: link = br.find_link(text='Sign out') if link: diff --git a/recipes/hbr_blogs.recipe b/recipes/hbr_blogs.recipe index acee567d8d..0ca205ab5c 100644 --- a/recipes/hbr_blogs.recipe +++ b/recipes/hbr_blogs.recipe @@ -11,28 +11,16 @@ class HBR(BasicNewsRecipe): no_stylesheets = True LOGIN_URL = 'http://hbr.org/login?request_url=/' + LOGOUT_URL = 'http://hbr.org/logout?request_url=/' + INDEX = 'http://hbr.org/current' - # - # Blog Stuff - # - - - INCLUDE_BLOGS = True - INCLUDE_ARTICLES = False - - # option-specific settings. 
- - if INCLUDE_BLOGS == True: - remove_tags_after = dict(id='articleBody') - remove_tags_before = dict(id='pageFeature') - feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')] - oldest_article = 30 - max_articles_per_feed = 100 - use_embedded_content = False - else: - timefmt = ' [%B %Y]' - + remove_tags_after = dict(id='articleBody') + remove_tags_before = dict(id='pageFeature') + feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')] + oldest_article = 30 + max_articles_per_feed = 100 + use_embedded_content = False keep_only_tags = [ dict(name='div', id='pageContainer') ] @@ -41,21 +29,15 @@ class HBR(BasicNewsRecipe): 'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR', 'articleToolbarTop','articleToolbarBottom', 'articleToolbarRD', - 'mailingListTout', 'partnerCenter', 'pageFooter']), - dict(name='iframe')] + 'mailingListTout', 'partnerCenter', 'pageFooter', 'shareWidgetTop']), + dict(name=['iframe', 'style'])] - extra_css = ''' - a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; } - .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;} - h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; } - h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; } - #articleBody{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} - #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} - ''' -#------------------------------------------------------------------------------------------------- def get_browser(self): br = BasicNewsRecipe.get_browser(self) + self.logout_url = None + + #''' br.open(self.LOGIN_URL) br.select_form(name='signin-form') br['signin-form:username'] = self.username @@ -63,11 +45,15 @@ class HBR(BasicNewsRecipe): raw = br.submit().read() if 'My Account' not in raw: raise Exception('Failed to login, are you sure your username and password are correct?') - self.logout_url = None - link = br.find_link(text='Sign out') - if link: - self.logout_url = link.absolute_url + try: + link = br.find_link(text='Sign out') + if link: + self.logout_url = link.absolute_url + except: + self.logout_url = self.LOGOUT_URL + #''' return br + #------------------------------------------------------------------------------------------------- def cleanup(self): if self.logout_url is not None: @@ -76,99 +62,7 @@ class HBR(BasicNewsRecipe): def map_url(self, url): if url.endswith('/ar/1'): return url[:-1]+'pr' -#------------------------------------------------------------------------------------------------- - def hbr_get_toc(self): - soup = self.index_to_soup(self.INDEX) - url = soup.find('a', text=lambda t:'Full Table of Contents' in t).parent.get('href') - return self.index_to_soup('http://hbr.org'+url) - -#------------------------------------------------------------------------------------------------- - - def hbr_parse_section(self, container, feeds): - current_section = None - current_articles = [] - for x in container.findAll(name=['li', 'h3', 'h4']): - if x.name in ['h3', 'h4'] and not x.findAll(True): - if current_section and current_articles: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - if x.name == 'li': - a = x.find('a', href=True) - if a is not 
None: - title = self.tag_to_string(a) - url = a.get('href') - if '/ar/' not in url: - continue - if url.startswith('/'): - url = 'http://hbr.org'+url - url = self.map_url(url) - p = x.find('p') - desc = '' - if p is not None: - desc = self.tag_to_string(p) - if not title or not url: - continue - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - self.log('\t\t\t', desc) - current_articles.append({'title':title, 'url':url, - 'description':desc, 'date':''}) - if current_section and current_articles: - feeds.append((current_section, current_articles)) - -#------------------------------------------------------------------------------------------------- - - def hbr_parse_toc(self, soup): - feeds = [] - features = soup.find(id='issueFeaturesContent') - self.hbr_parse_section(features, feeds) - departments = soup.find(id='issueDepartments') - self.hbr_parse_section(departments, feeds) - return feeds -#------------------------------------------------------------------------------------------------- - def feed_to_index_append(self, feedObject, masterFeed): - # Loop thru the feed object and build the correct type of article list - for feed in feedObject: - # build the correct structure from the feed object - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date, - 'description' : article.text_summary - } - newArticles.append(newArt) - - # Append the earliest/latest dates of the feed to the feed title - startDate, endDate = self.get_feed_dates(feed, '%d-%b') - newFeedTitle = feed.title + ' (' + startDate + ' thru ' + endDate + ')' - - # append the newly-built list object to the index object passed in - # as masterFeed. - masterFeed.append( (newFeedTitle,newArticles) ) - -#------------------------------------------------------------------------------------------------- - def get_feed_dates(self, feedObject, dateMask): - startDate = feedObject.articles[len(feedObject.articles)-1].localtime.strftime(dateMask) - endDate = feedObject.articles[0].localtime.strftime(dateMask) - - return startDate, endDate - -#------------------------------------------------------------------------------------------------- - - def parse_index(self): - if self.INCLUDE_ARTICLES == True: - soup = self.hbr_get_toc() - feeds = self.hbr_parse_toc(soup) - else: - return BasicNewsRecipe.parse_index(self) - - return feeds -#------------------------------------------------------------------------------------------------- def get_cover_url(self): cover_url = None index = 'http://hbr.org/current' From 16af96badc84850fe2e707d0322783012626f304 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 21:52:51 -0600 Subject: [PATCH 06/39] Brasil de Fato by Alex Mitrani --- recipes/brasil_de_fato.recipe | 31 +++++++++++++++++++++++++++++++ recipes/hbr_blogs.recipe | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 recipes/brasil_de_fato.recipe diff --git a/recipes/brasil_de_fato.recipe b/recipes/brasil_de_fato.recipe new file mode 100644 index 0000000000..ba7636c703 --- /dev/null +++ b/recipes/brasil_de_fato.recipe @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class BrasilDeFato(BasicNewsRecipe): + news = True + title = u'Brasil de Fato' + __author__ = 'Alex Mitrani' + description = u'Uma visão popular do Brasil e do mundo.' 
+ publisher = u'SOCIEDADE EDITORIAL BRASIL DE FATO' + category = 'news, politics, Brazil, rss, Portuguese' + oldest_article = 10 + max_articles_per_feed = 100 + summary_length = 1000 + language = 'pt_BR' + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + masthead_url = 'http://www.brasildefato.com.br/sites/default/files/zeropoint_logo.jpg' + keep_only_tags = [dict(name='div', attrs={'id':'main'})] + remove_tags = [dict(name='div', attrs={'class':'links'})] + remove_tags_after = [dict(name='div', attrs={'class':'links'})] + + feeds = [(u'Nacional', u'http://www.brasildefato.com.br/rss_nacional') + ,(u'Internacional', u'http://www.brasildefato.com.br/rss_internacional') + ,(u'Entrevista', u'http://www.brasildefato.com.br/rss_entrevista') + ,(u'Cultura', u'http://www.brasildefato.com.br/rss_cultura') + ,(u'Análise', u'http://www.brasildefato.com.br/rss_analise') + ] diff --git a/recipes/hbr_blogs.recipe b/recipes/hbr_blogs.recipe index 0ca205ab5c..0deaef7a73 100644 --- a/recipes/hbr_blogs.recipe +++ b/recipes/hbr_blogs.recipe @@ -6,7 +6,7 @@ class HBR(BasicNewsRecipe): title = 'Harvard Business Review Blogs' description = 'To subscribe go to http://hbr.harvardbusiness.org' needs_subscription = True - __author__ = 'Kovid Goyal, enhanced by BrianG' + __author__ = 'Kovid Goyal' language = 'en' no_stylesheets = True From 039d2ae54f0bf8a3be4cbffa8a48ee602925f44c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 21:57:04 -0600 Subject: [PATCH 07/39] Fluter by Armin Geller --- recipes/brasil_de_fato.recipe | 62 +++++++++++++++++------------------ recipes/fluter_de.recipe | 39 ++++++++++++++++++++++ 2 files changed, 70 insertions(+), 31 deletions(-) create mode 100644 recipes/fluter_de.recipe diff --git a/recipes/brasil_de_fato.recipe b/recipes/brasil_de_fato.recipe index ba7636c703..d060544ece 100644 --- a/recipes/brasil_de_fato.recipe +++ b/recipes/brasil_de_fato.recipe @@ -1,31 +1,31 @@ -# -*- coding: utf-8 -*- - -from calibre.web.feeds.news import BasicNewsRecipe - -class BrasilDeFato(BasicNewsRecipe): - news = True - title = u'Brasil de Fato' - __author__ = 'Alex Mitrani' - description = u'Uma visão popular do Brasil e do mundo.' - publisher = u'SOCIEDADE EDITORIAL BRASIL DE FATO' - category = 'news, politics, Brazil, rss, Portuguese' - oldest_article = 10 - max_articles_per_feed = 100 - summary_length = 1000 - language = 'pt_BR' - - remove_javascript = True - no_stylesheets = True - use_embedded_content = False - remove_empty_feeds = True - masthead_url = 'http://www.brasildefato.com.br/sites/default/files/zeropoint_logo.jpg' - keep_only_tags = [dict(name='div', attrs={'id':'main'})] - remove_tags = [dict(name='div', attrs={'class':'links'})] - remove_tags_after = [dict(name='div', attrs={'class':'links'})] - - feeds = [(u'Nacional', u'http://www.brasildefato.com.br/rss_nacional') - ,(u'Internacional', u'http://www.brasildefato.com.br/rss_internacional') - ,(u'Entrevista', u'http://www.brasildefato.com.br/rss_entrevista') - ,(u'Cultura', u'http://www.brasildefato.com.br/rss_cultura') - ,(u'Análise', u'http://www.brasildefato.com.br/rss_analise') - ] +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class BrasilDeFato(BasicNewsRecipe): + news = True + title = u'Brasil de Fato' + __author__ = 'Alex Mitrani' + description = u'Uma visão popular do Brasil e do mundo.' 
+ publisher = u'SOCIEDADE EDITORIAL BRASIL DE FATO' + category = 'news, politics, Brazil, rss, Portuguese' + oldest_article = 10 + max_articles_per_feed = 100 + summary_length = 1000 + language = 'pt_BR' + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + masthead_url = 'http://www.brasildefato.com.br/sites/default/files/zeropoint_logo.jpg' + keep_only_tags = [dict(name='div', attrs={'id':'main'})] + remove_tags = [dict(name='div', attrs={'class':'links'})] + remove_tags_after = [dict(name='div', attrs={'class':'links'})] + + feeds = [(u'Nacional', u'http://www.brasildefato.com.br/rss_nacional') + ,(u'Internacional', u'http://www.brasildefato.com.br/rss_internacional') + ,(u'Entrevista', u'http://www.brasildefato.com.br/rss_entrevista') + ,(u'Cultura', u'http://www.brasildefato.com.br/rss_cultura') + ,(u'Análise', u'http://www.brasildefato.com.br/rss_analise') + ] diff --git a/recipes/fluter_de.recipe b/recipes/fluter_de.recipe new file mode 100644 index 0000000000..1f8576cf81 --- /dev/null +++ b/recipes/fluter_de.recipe @@ -0,0 +1,39 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal ' + +''' +Fetch fluter.de +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1313693926(BasicNewsRecipe): + + title = u'Fluter' + description = 'fluter.de Magazin der Bundeszentrale für politische Bildung/bpb' + language = 'de' + encoding = 'UTF-8' + + __author__ = 'Armin Geller' # 2011-08-19 + + oldest_article = 7 + max_articles_per_feed = 50 + + + remove_tags = [ + dict(name='div', attrs={'id':["comments"]}), + dict(attrs={'class':['commentlink']}), + ] + + + keep_only_tags = [ + dict(name='div', attrs={'class':["grid_8 articleText"]}), + dict(name='div', attrs={'class':["articleTextInnerText"]}), + ] + + feeds = [ + (u'Inhalt:', u'http://www.fluter.de/de/?tpl=907'), + ] + + extra_css = '.cs_img {margin-right: 10pt;}' + From 9d7ae9090bb12b7146b5e5ca4a2286dd84784a05 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Aug 2011 11:45:04 -0600 Subject: [PATCH 08/39] Various Polish news sources by fenuks --- recipes/android_com_pl.recipe | 12 ++++++++++ recipes/bash_org_pl.recipe | 15 ++++++++++++ recipes/cd_action.recipe | 16 +++++++++++++ recipes/dobreprogamy.recipe | 21 ++++++++++++++++ recipes/film_web.recipe | 40 +++++++++++++++++++++++++++++++ recipes/gram_pl.recipe | 16 +++++++++++++ recipes/icons/android_com_pl.png | Bin 0 -> 1452 bytes recipes/icons/bash_org_pl.png | Bin 0 -> 391 bytes recipes/icons/cd_action.png | Bin 0 -> 972 bytes recipes/icons/dobreprogamy.png | Bin 0 -> 1127 bytes recipes/icons/film_web.png | Bin 0 -> 3433 bytes recipes/icons/gram_pl.png | Bin 0 -> 1101 bytes recipes/icons/niebezpiecznik.png | Bin 0 -> 795 bytes recipes/icons/wnp.png | Bin 0 -> 576 bytes recipes/niebezpiecznik.recipe | 16 +++++++++++++ recipes/wnp.recipe | 21 ++++++++++++++++ 16 files changed, 157 insertions(+) create mode 100644 recipes/android_com_pl.recipe create mode 100644 recipes/bash_org_pl.recipe create mode 100644 recipes/cd_action.recipe create mode 100644 recipes/dobreprogamy.recipe create mode 100644 recipes/film_web.recipe create mode 100644 recipes/gram_pl.recipe create mode 100644 recipes/icons/android_com_pl.png create mode 100644 recipes/icons/bash_org_pl.png create mode 100644 recipes/icons/cd_action.png create mode 100644 recipes/icons/dobreprogamy.png create mode 100644 recipes/icons/film_web.png create mode 100644 recipes/icons/gram_pl.png create mode 100644 
recipes/icons/niebezpiecznik.png create mode 100644 recipes/icons/wnp.png create mode 100644 recipes/niebezpiecznik.recipe create mode 100644 recipes/wnp.recipe diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe new file mode 100644 index 0000000000..a44d5e560a --- /dev/null +++ b/recipes/android_com_pl.recipe @@ -0,0 +1,12 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Android_com_pl(BasicNewsRecipe): + title = u'Android.com.pl' + __author__ = 'fenuks' + description = 'Android.com.pl - biggest polish Android site' + category = 'Android, mobile' + language = 'pl' + cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png' + oldest_article = 8 + max_articles_per_feed = 100 + feeds = [(u'Android', u'http://android.com.pl/component/content/frontpage/frontpage.feed?type=rss')] diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe new file mode 100644 index 0000000000..037870ed6c --- /dev/null +++ b/recipes/bash_org_pl.recipe @@ -0,0 +1,15 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class Bash_org_pl(BasicNewsRecipe): + title = u'Bash.org.pl' + __author__ = 'fenuks' + description = 'Bash.org.pl - funny quotations from IRC discussions' + category = 'funny quotations, humour' + language = 'pl' + oldest_article = 15 + cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png' + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags= [dict(name='div', attrs={'class':'quote post-content post-body'})] + feeds = [(u'Cytaty', u'http://bash.org.pl/rss')] diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe new file mode 100644 index 0000000000..b4cf6b326c --- /dev/null +++ b/recipes/cd_action.recipe @@ -0,0 +1,16 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class CD_Action(BasicNewsRecipe): + title = u'CD-Action' + __author__ = 'fenuks' + description = 'cdaction.pl - polish magazine about games site' + category = 'games' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG' + keep_only_tags= dict(id='news_content') + remove_tags_after= dict(name='div', attrs={'class':'tresc'}) + feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')] diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe new file mode 100644 index 0000000000..d9b2db591d --- /dev/null +++ b/recipes/dobreprogamy.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class Dobreprogramy_pl(BasicNewsRecipe): + title = 'Dobreprogramy.pl' + __author__ = 'fenuks' + __licence__ ='GPL v3' + category = 'IT' + language = 'pl' + cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' + description = u'AktualnoÅ›ci i blogi z dobreprogramy.pl' + encoding = 'utf-8' + no_stylesheets = True + language = 'pl' + extra_css = '.title {font-size:22px;}' + oldest_article = 8 + max_articles_per_feed = 100 + remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] + keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})] + feeds = [(u'AktualnoÅ›ci', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), + ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe new file mode 100644 index 
0000000000..0061573742 --- /dev/null +++ b/recipes/film_web.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Filmweb_pl(BasicNewsRecipe): + title = u'FilmWeb' + __author__ = 'fenuks' + description = 'FilmWeb - biggest polish movie site' + cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png' + category = 'movies' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + extra_css = '.hdrBig {font-size:22px;}' + remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] + keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})] + feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), + (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), + (u'News / Festiwale, nagrody i przeglÄ…dy', u'http://www.filmweb.pl/feed/news/category/festival'), + (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), + (u'News / Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'), + (u'News / Multimedia', u'http://www.filmweb.pl/feed/news/category/multimedia'), + (u'News / Dystrybucja dvd / blu-ray', u'http://www.filmweb.pl/feed/news/category/video'), + (u'News / Dystrybucja kinowa', u'http://www.filmweb.pl/feed/news/category/cinema'), + (u'News / off', u'http://www.filmweb.pl/feed/news/category/off'), + (u'News / Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'), + (u'News / Organizacje branżowe', u'http://www.filmweb.pl/feed/news/category/organizations'), + (u'News / Internet', u'http://www.filmweb.pl/feed/news/category/internet'), + (u'News / Różne', u'http://www.filmweb.pl/feed/news/category/other'), + (u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'), + (u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), + (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), + (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')] + + def skip_ad_pages(self, soup): + skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'})['href'] + #self.log.warn(skip_tag) + if skip_tag is not None: + return self.index_to_soup(skip_tag, raw=True) + else: + None diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe new file mode 100644 index 0000000000..091c0bb1dc --- /dev/null +++ b/recipes/gram_pl.recipe @@ -0,0 +1,16 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Gram_pl(BasicNewsRecipe): + title = u'Gram.pl' + __author__ = 'fenuks' + description = 'Gram.pl - site about computer games' + category = 'games' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' + remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] + keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})] + feeds = [(u'gram.pl - informacje', 
u'http://www.gram.pl/feed_news.asp'), + (u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')] diff --git a/recipes/icons/android_com_pl.png b/recipes/icons/android_com_pl.png new file mode 100644 index 0000000000000000000000000000000000000000..d68bac8810ebf42e8f553459a72a49bc6eb211e2 GIT binary patch literal 1452 zcmV;d1ylNoP)@ryq#2ilutS!lk&H@`=)#y<);-tFB^t^oquVMa#%gyP zlgYo(e!g!z-@fzx<(V^Q-g%yLp67F3P^;g*mB}<3oh~r2yEp-k+Z!6*z4P_0ub-Lm_O7j+nvzJWt0yNVk{dV1#w3#R z^5J1GFVJWf7y10IE~Amd>FqTb+}!&6jYd9yaL{BD2nGgBCO*HX$6(;`y1EPo4yU!% zXmoad{Mcx8bZl(Y>+S67>lYSiw3?bvp8#&%djB4vy!`cRfGbxfClP}7_N66}sJGW- z;_(ItjYgqxWW;0=i@Uq$=Y_(ao^RiT!r|f7RiW_dQ!LZgwzA^l($ZowvDsLL!Fcds zagk21t(~7osH*D22Y`x-nHhl6(pRqt0y;Yv7d<`u`%NZycdhpOcTdl$sb9Z*e1?Yf z`fb~qnx?1u{DA?zUL?|LS64kewA%G`S68)qWyQ(q;lrgR2IJAAPSgul^BOS`v%<`V0nl_lb$MHHoCTd3IJLijOZXwYQhcQ&UM&F3-py$>`{! zB09aSY;=^#ynTCimdV8CFc{b}zOOAri3R1|=V%duks zI1jTxXsA-@>{x7?_d*;O~F-tc6-6I(ZU6E-x-7Nq_(B zY=oksZr((wt!-_MMhgm3sQ?h#yBFZxxt10xYO}!Z-C0>|wp6NA;@Ojv2_h)y;zg3& zx392}Mhg!wDWTEA!c;0NtH8kAT!h48yp4myp+f))MR_?vD3#BinTcS}9)OULq9ST` zfJ>J~N6r0%2P-O^oc#PUGXVqw?6RllnKKx1y>)9+5`eF7dO9{LFApJ^thgAVfPmaw zf{2W)ti+q)E!k|GO-aeb1VCBY)D%Er;q&KaftZ-eN*2q*1Jh9`5MWZF5TihV#oXL* z$Q;gz6Bd8n+!7M-%$F|%SXqUI0VF0iHd+p?sAzl~prGKN4ZOUPl9N-$?Z-w6lwh1n~AwO|^K9pCk2Sz_Y)2@wc#9z{MptmdUiYkB+uP$7IID z0602Qh&xoDsu2nerQ-dsAr#_Z0rK*+T7b;Vo*wfhJ3B{3u~;@X`}YHE*bot6neqR% zt!;QX07+tjQUj^7Ds>E!5fBeSHAw>0Mps4OlFNg2^NZ{C*LV%P|Fo zI64dB4nfFd01O6}R4RvtFg4{hu5)q*2LZCOF!lNK-QD;aT3ZR?H}0bkxBQDP05UK!F)c7SEiy1vF)=zbIXW>gEig7ZFfgh*e<}a~03~!q zSaf7zbY(hiZ)9m^c>ppnF)%GKHZ3tZR53C-H8eUfFfA}PIxsN0_{}*000003HJE%?WEV*q9g? zwc?(5>F6&4x>vQtHKHUXu_V7#doc7+8V0W>y9UH2&eGBUO@w1jBrshJ%L)WG2B>gTe~DWM4ffJS=q literal 0 HcmV?d00001 diff --git a/recipes/icons/cd_action.png b/recipes/icons/cd_action.png new file mode 100644 index 0000000000000000000000000000000000000000..823e09a43effb9a62061d4dd6740cc03af68c161 GIT binary patch literal 972 zcmV;-12g=IP)4ODa(m$M@_B57mSD zBKZ*c&1dhee`5U~;m& z{N{$^6pD`zoAK`8o0WuR3dbBH9n3|JZ>-u?+D1FqtQ{ZIGGFt zbh=*v_!u5w5|G+%@901XCUJPUzTVeI;?3sP)+~nSjYf=bZ)ayQ4h9Vdne6lP@iE)9 zOp0VO9uI|}UWfq3?Vg!wXdo*hF;;6^8$~6PCKFA+zAi7*^yw+XU>uJ-912ASz;P;- zL_%T`iAZE;CpR@&1c~wahK4W(KrA*rjq%9|%gW^$z}}ulBNBN${rwmN#_L^K>FUZ& z4Z9xC(vngMacD1S1pv>_&(rk&{>X?(lmTpS4-8OLDz&&s)8VinkW>SeLL^rc35dm5 z7USz{gF!H~T1`@_xfTcn>~@U?!Ng(u03b2DyMu#&{$#tqxY*Vf3_2VPLmo`h=w!rVQK_n` z5O@~+3tk4m`}^8jVd2-8(^*;?jT((vr;#2b=R^W*w&CIGYM;+)?e0cd;lc?F1Q05UK!F)c7SEiy1vF)=zcFgi3gEig7ZFfddkdOZLD03~!qSaf7zbY(hiZ)9m^ uc>ppnF)%GKH!U(TR4_I=Gc-ChGA%GRIxsNnMc3K@0000m2L|p{dOueaK!7MM_DiS8aN`?fL4EmAPh`gC)x*2mO$Z{K7o6|LH zv^h16Vh}~rDwzpxZ=Ms~1VL@j&Bw>#ocH~HIE19h>(~AL`T2(rzj$FX<>qeQobG#n zB^tHac)XoEf9Gzu(a7ViTNjHtogz^z7L5)M3x)alfq>O26c!ZB%nS~SL>5aZWVh$$ zZrb$Uf;QW&TUl8dxOCPzn~JXy7> zqT=1VzP`-N*(CyjEn7M|Qm%MQenp>;%LU@`sj1S^>(^7R+-|3rv$OsF`}YX} zzI_`R$<79T{CN423WY*ub4CUg^mu5DM!UOdoSd{+)~`=86bi98Wdja}R!cizwKg>o zl1MBpJbOlseE&W)R9p-!F1lRDkJH>{tF8tTiQ!?noW@Ax%9U6wW#K=0^hhqx%KGqu z?!ci#v;(7~S}mzY%nY?zIxTubMfNdy=fcJYEPcr zy4C447*wj#Qd+Uc)7nZaotSv|5cvAl?Y?@Ij%aXDt)|Ap;k$Q%&z~a^qfwzq+rZ_^ zb#+BW&COcvsZ-mx1Ac!;2W@pM=5k%T2F%ZojI_1U9eDawr2@vsdwZLjT&_2742IRK zSFTLkz|EU-d0E-EZ4C`FSz#fd(NtF#6#;7X>C+MkaOu*C6ExT96pE4(prxg;v8Kjq zZEwGGr@h_b$jhs#$;&G!AVex9Wcl(GEL2qyqS4S{&?Vfx+h(Ie`}a#E)IbFTP%WWU z0u2nHzW&06`g(xRzkQp}2V^pzPbTZ>Nx{I8BZPEy5z^dD4dmoJe@_1askF9MDh19l zfcFfbj{y`G9z9AsMo3Ewy;oW3_g7Yi!zmazc8n1E6)Tm57>x%Hn#~6ebaskFYuAcI zz&-}>m;p30fISSLyj-O!FLye{Vv|WM-nA?3ojAkbaMTQk!@vN}GXRO?*J!F3@wkOi zT%5M>p9B~yR_Gb6tr~{ct7MqX-Hfube|_&Kz1upkb-jiS0000bbVXQnWMOn=I%9HW 
zVRU5xGB7bQEigDOGB8vzF*-CcIx{gXFg7|cFm!|_lmGw#C3HntbYx+4WjbwdWNBu3 t05UK!FfB1LEiyJ#F)=zaF*-FgEigAaFfcwEzL5X`002ovPDHLkV1f$d05$*s literal 0 HcmV?d00001 diff --git a/recipes/icons/film_web.png b/recipes/icons/film_web.png new file mode 100644 index 0000000000000000000000000000000000000000..3ddcdf1cdeaab29afa1af2962ac7cd93a4ba87aa GIT binary patch literal 3433 zcmV-v4VLnWP)-23N)k{|A2B9GMdDMy2qJ<6C3v8?AeVrG96p@L z<%v%gK|zk0Uv0XJ&g}eKi|jD-P2WFUDe$)WXO9(PELWMt1D0O^~Jw;?o^~4 zI)tC6rJTY>-6cSrlFyqeB$pfFaIS) zPVW8A_Z=8Tk&_!qnKA_p+Pzzwa_iPqN_Y1X3cxxFz;+72R}_HC$}L+$LIBFjR;*aH z3Lr1f&CSb8EY8R$jAwl&d!f10Ea06zfb`FMHyIITJrMp^QHRiSxyf5ys6_SAAXoW-`98QRCI)*BI*Yp{AGp@qc}TjP>PHHP660W0dS=B z^q5e(yB#PJ30vt*N`HO&G%M@UrNQ;lqf@6+QUGq>Mu%i%n3`Hz*3}(6=7zVJb4l|AOO|ZcReLAa5Uxd<1Z)x@f5PMKmVMRBvp|p zG?a{g?wq^3mlwyy$J^UGJ9l@Nl+2nnZyvyd2XS#HPDtA~G#oe(9ZeiDX_BpNXJ>RY z*&56d1_o~(5t!6WO);50d9sHh-Pu^eP5@C+%a^ZQ+1iTzFJJ!Yrx`Q6yh=)nid(*g^ZS6O#BEyE^%BQ8Rppc0Wq&k@buz=Fs zj01;;q8c0XlM(ydkg{Y6<_PI{O-;Ch)zw%95)*B0ot(sC?3bU9x_&+C%$diJPn+iM zo|sr%?Be3#A(7m_&m3{^;E^Mws`&XWTLzGvOd_JT7W>Vb#TI0cB)4!O?rCY6;;SzO z0JDEv8}99p5VrUJJhEbAXH%r(H8g~WM@Ev+4+;tn2FS^oI@QfhA~}3`?%V|nm;*dL zQCayLKmNoCCns0e#KapntgRg#Ij*wO$H&hPptRJ^&e@raqpq%}$8pio3m5wNB_!D`fCsc0P8%*VMv#hgiu^u@C!zbLWd3=in?{H9i_W_ zGX-EPg(S0(5OOZ4tNZdx;t-K&!-g$e0E&xkY)B-WJn7&tb!t))j%#KnyS2uTPfsTy zpP!$bD=)9E4v?EWa-={Y)zs7p6RfS9oA>Se=%dApzxaY}$v<)f8t&$%OX=xZO#%3f z0pG*jOx%kDor>)ARUoSw|=;#>C8<=j-e3os%OF=<2q%=I5i~ zOtZ7Kv+aFs3CmI%p69rgG0q~{(;NAt$q5zWXHr9EIcjDDCZQDYdm|KzTXV_kaI;3I$+31z;wHZ4rQp6sZI8P5^*g1E8=F zCsLAB?BU+ttSn>W2@`sI3knn!wX`_y#*Gmp$asAH^;gpV#f$K8awpiilSRVW zvy&&=+dDWE6euZaX|=TQ`=z94Xz1%pB(mofJG;Jl} zN==Pqby>~L)zTsk+_Pux+I8z#e_Px1bSosi~+= zPJQLCzMf6kjl$}CR1^yp{)vj!cdY9(W}pLFTlo&avdTj5%o)_hi`fb=1K8iWxk90# zVMRr0sk}TIAgkA}qao;k%a^gAhsW8oHa3oqW5*U23Isx-NW|}#lA^6GbwGML>Xa#c z5l~gdCX5EL2nY*f2b_WGJMQVAAUR5RH{Su6OS-!;`^UxM_1Mh~w}hVFxpS75wze7? 
z*RJX78yg!L6%`2tIyyIRmX+be`I?tU9K>-E5$o4)*l_I{&d=NXmtU-`?CniVii=fM zwYBT&`2A8-wY2p0U*iBYAUm5)cqoPAu)LL&{DUH`@3MD=S+mgahY$INGsT?1aY9OS zGoCvtE3x{Nl@%9jXz1u%zkcI}y1GzUQgZXAl9HNQXJ>i2f&xiie9g?Xw6wMsi@*9R z_;=2o!+E*8XJ?NaH-5aeby=B;in{u1sudNAie!Rssz;Cd`1ty^wV?rr4xw6E6&C8~7#J*CgzY2}e!tVF$BiSo zNOmhaIu5~=2@ZZ&LzuGi_NMKtsuUDRhz_QCc}7O2rW_X*78n>5R9!7w&vkWg-BME% z3S(mU{mRQ#R7m+vNU*bWa_Z{B_Hl8j*49I8DK<7(4qIB@*5EM|u^8V2oB+Nrn;JJowhc9 zfBZkW6OhPx@&x-uM4+0Q_GNPg1&oB#r{B(Sj@v}Je;>zTXZ9^yP*<%&$3#afQ93&K zyYndau2n_F0LTvQafJ3M=a{WCLB*Q`MuGp4FaQ&U^}^UtxLM8Y4Zv~=`nBO`!_ zh>aUJZ6a4K!meE`XYiAHpCgo&u?)t<3}iT#O?I}gug88{x1zEa80o_UJ3nB=Y}mjb zPb5-SCKG0wlQU_Ol~r#q_G5M4-~a2c$!*lgsJdEHQ%5H#=8q@uU?I+u0G_o3?Gh1;gjs;U|q>hMc;nAX3b}xEm%-fLk!=u=eOU8;jdot`!UVPAipOt z2NoC0CQCy@{sQ%bg$5p2yu9#;eD^MY;<%@Kd-2wI;sn0^u?L6Y!?FG=DPhC}1Yp1Z z$*{hj7}Q_O%MA^UjXOJU-!?EXGOGI>;ux`bp!Sz9J2;T{gV4}z+d@KWYGgxx@?_Kz zBZdqH3k?=380z!q7RkCt#s|^hT z0Wti-1!rd$7qK`jj6AU1y^HhSzaQ1vnI-u_4gYgT2n2YUad)3YDJ;Yl?&%pUSq`T6 z@2jekZ2hKMTdS)}p1M*}X3w58ht>6)H&OTQMRj#$Ij!Gbe>X?40UaIZQH~rjraXCq z4jFi*2hvB61Ol?61F1-)uWw*LlB1&|3Gssm$%_=btX;f_%HAZ|rHb8F|AJqNSdKtt zPg6^mE~F$UbP;J_V%dLrlGP&$r&?VudJPegX{{kao?Z8{{frPYYyp9 zyS4xT03~!qSaf7zbY(hYa%Ew3WdJfTF)=MLI4v?TR539+G%`9eG%YYTIxsK+A+oLj z001R)MObuXVRU6WZEs|0W_bWIFfchSFf%POGgL4+Ix{&sG&e0UHaajcFe4^y00000 LNkvXXu0mjfe(H17 literal 0 HcmV?d00001 diff --git a/recipes/icons/gram_pl.png b/recipes/icons/gram_pl.png new file mode 100644 index 0000000000000000000000000000000000000000..0a87f28825b077d3fbf9f0f1e298c10239071712 GIT binary patch literal 1101 zcmV-T1hV^yP)qPA4t2{(~q{Gm4@t^D?6gg{VL$HS!YGjLtZ> zW*lioYVzrO-?y)WSy4p4&BNim=R41PJI64-k;yDraFjWA>>^WHd7g1N%9%e`JL7O% zU>uI)%!w0OOm?<@uVG;_GHx&p4fxO8XPTQOVXM_BWQvMDGQbxG_{7llzr8S(no$5V zGoqTUOi77R#FUgwGQeksR08Aip`qbnV084-rK&1mV4$q*^l8%4(vv5F*RSpN@^ZlM zr_6viZ!Rk+zN)Ia8b~B26qJY<(1~ql4jeExGd5d@F&)R_?-W31=jzpK*8MXX~R!qB^dnB`wWj0b;R!1yw*2nqq)$%)WibCdOu)m=LF_lmvRcckbN1YaZO% zN~0qgUbzB6zb@3I)VQF($Z6>fX9#L&7&8$x98+6T?$N24l2xZb)s_&_!ugY z1ipQH^JZP0X`rKH*|O!!f$8a(Vs5X4!5#$}FpRV`GO&2@k|jViDiug1QYmU{fcc4m zF$PE`+Xp33&2^w%cq08R+e$^JL9`X~4r!=5Yn!^SxBi7OYuATi|pS6s%kcJb!-p z@R1|5e$~~qZm(WdD1h$noja*gdS+a%yu5+}N=On5hb19xK-T}efnB?_fnZRkG#HF1 z0JrfxJhVE=w&J!7^& z2l#w`1w9y5(1%9WX?(m>LE5gMIG2mA1A%7>Dz!Wpjmj+f{c^1}$mkaA-D}9tVa19z z#^XsQWmv)t5Q#`7bZaOhx*v_jRVb z+r_+p?_|8*A64$p)t|?CJax>|r(2jkdyL;>wWc$pT00000NkvXXu0mjfPN(ne literal 0 HcmV?d00001 diff --git a/recipes/icons/niebezpiecznik.png b/recipes/icons/niebezpiecznik.png new file mode 100644 index 0000000000000000000000000000000000000000..4188d91d3646c53624267c08378ed6cae2355cbe GIT binary patch literal 795 zcmY+ANlX&~7=`~bEu{^GwsfOA?Myq;j<~lnjRY%{QsUNymLNe~hznlajRy@u z3@QgTDQ=0&$+U1$BNFwZh>=!Jidq@H7!#x77OS2-y!XA!cX>yB4Rs=+S_l9lFXL{^ zQ~W1INWO7rX%he^=BxL6EEe2~6F6ZbY<8R7VRtwk4yVm#M=^^TwHPZ*CNqXvte6GI zaht`88;mB6R&O$+W)#yKDlp7yG@9lbWG7Z|2i&|`gWw#4==8>YiwaSFd4XdtH3(M77wMuo`%}FpYyBU6T0~?2l_0Z}N z7?t7!biwu21uX=3AB`xKs?I9DNnZSG3w-H%U$4yXg6%4wO2d!Y&N*T21>3X=`4C#&Vcpbb9!&nQ>KEh_emsuCtvr^ zylV8xTKWTlu&=qP>Di;niOI=AA@03QvMl@lokpcrmk*sfb9!yCk`1;~ORSd5%eT94E(!s-r>Tu8DB<1$l*);d=)GmfMmKh%# zrBKwyT00nQl(?9%;B`n);erRZJaB1q23V$Ey zZx+g#o(7&^fI}Sf-%!W^0()J9FN6=D_UoLedZ;%+Mnmo?=!HyWK?i*c7DJr z`TO4eM?BImn3dkVd2?X@{ztruZ?j4sKYA1w8+-NY)nC_dzOQfp_xJDTRcn6z`t`!m z_o1rsePOjnj0(Eu#@9Bip8<3zV@Z%-FoVOh8)+a;lDE4HN87!rRX`4Bfk$L9koEv$ zx0Bg+K*ml_7sn8Z%dY3Ii!~Vtuv}2p5$kFR@LiO^Tz#|QD(|?Yc}*3G8`PZ{C-#=1Ka3hk^paX12LROlxzE zEpgcA@cFf_i~2XK6PC$y+F#sL&06u?wEE7pe`Oc=0vXO4Ys|hLy*8iGF2PA?Qt!O9 zTA(LXOI#yLQW8s2t&)pUffR$0fuW(UfrYM-L5QKDm5GU!p@pu2nU#TotHAm?6b-rg 
pDVb@NPz|QKM#doq7FLEvR)(e!4Z5pRLx36>JYD@<);T3K0RT3m;0*u( literal 0 HcmV?d00001 diff --git a/recipes/niebezpiecznik.recipe b/recipes/niebezpiecznik.recipe new file mode 100644 index 0000000000..b33a0a3513 --- /dev/null +++ b/recipes/niebezpiecznik.recipe @@ -0,0 +1,16 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Niebezpiecznik_pl(BasicNewsRecipe): + title = u'Niebezpiecznik.pl' + __author__ = 'fenuks' + description = 'Niebezpiecznik.pl' + category = 'hacking, IT' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets = True + cover_url =u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png' + remove_tags=[dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})] + keep_only_tags= [dict(name='div', attrs={'class':['title', 'entry']})] + feeds = [(u'WiadomoÅ›ci', u'http://feeds.feedburner.com/niebezpiecznik/'), + ('Blog', 'http://feeds.feedburner.com/niebezpiecznik/linkblog/')] diff --git a/recipes/wnp.recipe b/recipes/wnp.recipe new file mode 100644 index 0000000000..e53e4cc66b --- /dev/null +++ b/recipes/wnp.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe + + +class AdvancedUserRecipe1312886443(BasicNewsRecipe): + title = u'WNP' + cover_url= 'http://k.wnp.pl/images/wnpLogo.gif' + __author__ = 'fenuks' + description = u'Wirtualny Nowy PrzemysÅ‚' + category = 'economy' + language = 'pl' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags = dict(name='div', attrs={'id':'contentText'}) + feeds = [(u'WiadomoÅ›ci gospodarcze', u'http://www.wnp.pl/rss/serwis_rss.xml'), + (u'Serwis Energetyka - Gaz', u'http://www.wnp.pl/rss/serwis_rss_1.xml'), + (u'Serwis Nafta - Chemia', u'http://www.wnp.pl/rss/serwis_rss_2.xml'), + (u'Serwis Hutnictwo', u'http://www.wnp.pl/rss/serwis_rss_3.xml'), + (u'Serwis Górnictwo', u'http://www.wnp.pl/rss/serwis_rss_4.xml'), + (u'Serwis Logistyka', u'http://www.wnp.pl/rss/serwis_rss_5.xml'), + (u'Serwis IT', u'http://www.wnp.pl/rss/serwis_rss_6.xml')] From 1fdac7d5a014ca2f73e7cf855126c65ed1d20a5d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Aug 2011 11:46:08 -0600 Subject: [PATCH 09/39] Cvece Zla by Darko Miletic. Fixes #830143 (New recipe for serbian blog Cvece Zla) --- recipes/cvecezla.recipe | 47 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 recipes/cvecezla.recipe diff --git a/recipes/cvecezla.recipe b/recipes/cvecezla.recipe new file mode 100644 index 0000000000..712c898a3e --- /dev/null +++ b/recipes/cvecezla.recipe @@ -0,0 +1,47 @@ + +__license__ = 'GPL v3' +__copyright__ = '2011, Darko Miletic ' +''' +cvecezla.wordpress.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class CveceZla(BasicNewsRecipe): + title = 'Cvece zla i naopakog' + __author__ = 'Darko Miletic' + description = 'Haoticnost razmisljanja poradja haoticnost pisanja. Muzika, stripovi, igre, knjige, generalno glupiranje...' 
+ oldest_article = 7 + max_articles_per_feed = 100 + language = 'sr' + encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = False + publication_type = 'blog' + extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{display: block } ' + + conversion_options = { + 'comment' : description + , 'tags' : 'igre, muzika, film, blog, Srbija' + , 'publisher': 'Mehmet Krljic' + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + remove_tags_before = dict(attrs={'class':'navigation'}) + remove_tags_after = dict(attrs={'class':'commentlist'}) + remove_tags = [ + dict(attrs={'class':['postmetadata alt','sharedaddy sharedaddy-dark sd-like-enabled sd-sharing-enabled','reply','navigation']}) + ,dict(attrs={'id':'respond'}) + ] + + feeds = [(u'Clanci', u'http://cvecezla.wordpress.com/feed/')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + From 564ffc7e947ed468d0bc1a30a66f97cb0ed51d20 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Aug 2011 15:47:51 -0600 Subject: [PATCH 10/39] ... --- src/calibre/gui2/__init__.py | 4 +++- src/calibre/gui2/preferences/look_feel.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index fc02ad7fae..94f392ae65 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -186,7 +186,9 @@ def _config(): # {{{ c.add_opt('enforce_cpu_limit', default=True, help=_('Limit max simultaneous jobs to number of CPUs')) c.add_opt('gui_layout', choices=['wide', 'narrow'], - help=_('The layout of the user interface'), default='wide') + help=_('The layout of the user interface.\nWide has the ' + 'book details panel on the right and narrow has ' + 'it at the bottom.'), default='wide') c.add_opt('show_avg_rating', default=True, help=_('Show the average rating per item indication in the tag browser')) c.add_opt('disable_animations', default=False, diff --git a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py index b34c5e6042..c87cad7cad 100644 --- a/src/calibre/gui2/preferences/look_feel.py +++ b/src/calibre/gui2/preferences/look_feel.py @@ -10,12 +10,11 @@ from PyQt4.Qt import (QApplication, QFont, QFontInfo, QFontDialog, from calibre.gui2.preferences import ConfigWidgetBase, test_widget, CommaSeparatedList from calibre.gui2.preferences.look_feel_ui import Ui_Form -from calibre.gui2 import config, gprefs, qt_app +from calibre.gui2 import config, gprefs, qt_app, NONE from calibre.utils.localization import (available_translations, get_language, get_lang) from calibre.utils.config import prefs from calibre.utils.icu import sort_key -from calibre.gui2 import NONE from calibre.gui2.book_details import get_field_list from calibre.gui2.preferences.coloring import EditRules From 2337570c9f60220f33d1ebaf36af0bb1061bb929 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Aug 2011 19:48:25 -0600 Subject: [PATCH 11/39] Fix #829912 (Edit metadata dialog: Splitters' positions are not saved) --- src/calibre/gui2/__init__.py | 2 +- src/calibre/gui2/languages.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/__init__.py 
b/src/calibre/gui2/__init__.py index 94f392ae65..1967f734cc 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -186,7 +186,7 @@ def _config(): # {{{ c.add_opt('enforce_cpu_limit', default=True, help=_('Limit max simultaneous jobs to number of CPUs')) c.add_opt('gui_layout', choices=['wide', 'narrow'], - help=_('The layout of the user interface.\nWide has the ' + help=_('The layout of the user interface. Wide has the ' 'book details panel on the right and narrow has ' 'it at the bottom.'), default='wide') c.add_opt('show_avg_rating', default=True, diff --git a/src/calibre/gui2/languages.py b/src/calibre/gui2/languages.py index 1e192a0c94..3398081c5f 100644 --- a/src/calibre/gui2/languages.py +++ b/src/calibre/gui2/languages.py @@ -16,6 +16,8 @@ class LanguagesEdit(MultiCompleteComboBox): def __init__(self, parent=None): MultiCompleteComboBox.__init__(self, parent) + self.setSizeAdjustPolicy(self.AdjustToMinimumContentsLengthWithIcon) + self.setMinimumContentsLength(20) self._lang_map = lang_map() self.names_with_commas = [x for x in self._lang_map.itervalues() if ',' in x] self.comma_map = {k:k.replace(',', '|') for k in self.names_with_commas} From 2a80b4ac998ddb337d6182f66dc968e6780eaa41 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Aug 2011 21:12:50 -0600 Subject: [PATCH 12/39] Fix #830060 (Houston Chronicle news fetch fails) --- recipes/houston_chronicle.recipe | 67 ++++++++++---------------------- 1 file changed, 20 insertions(+), 47 deletions(-) diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe index 3390228455..8d231dac16 100644 --- a/recipes/houston_chronicle.recipe +++ b/recipes/houston_chronicle.recipe @@ -1,8 +1,6 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -import string, pprint - from calibre.web.feeds.news import BasicNewsRecipe class HoustonChronicle(BasicNewsRecipe): @@ -13,53 +11,28 @@ class HoustonChronicle(BasicNewsRecipe): language = 'en' timefmt = ' [%a, %d %b, %Y]' no_stylesheets = True + use_embedded_content = False + remove_attributes = ['style'] - keep_only_tags = [ - dict(id=['story-head', 'story']) - ] - - remove_tags = [ - dict(id=['share-module', 'resource-box', - 'resource-box-header']) - ] - - extra_css = ''' - h1{font-family :Arial,Helvetica,sans-serif; font-size:large;} - h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;} - h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;} - h4{font-family :Arial,Helvetica,sans-serif; font-size: x-small;} - p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - #story-head h1{font-family :Arial,Helvetica,sans-serif; font-size: xx-large;} - #story-head h2{font-family :Arial,Helvetica,sans-serif; font-size: small; color:#000000;} - #story-head h3{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;} - #story-head h4{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;} - #story{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;} - #Text-TextSubhed BoldCond PoynterAgateZero h3{color:#444444;font-family :Arial,Helvetica,sans-serif; font-size:small;} - .p260x p{font-family :Arial,Helvetica,serif; font-size:x-small;font-style:italic;} - .p260x h6{color:#777777;font-family :Arial,Helvetica,sans-serif; font-size:xx-small;} - ''' - - - def parse_index(self): - categories = ['news', 'sports', 'business', 'entertainment', 'life', - 'travel'] - feeds = [] - for cat in categories: - articles = [] - soup = 
self.index_to_soup('http://www.chron.com/%s/'%cat) - for elem in soup.findAll(comptype='story', storyid=True): - a = elem.find('a', href=True) - if a is None: continue - url = a['href'] - if not url.startswith('http://'): - url = 'http://www.chron.com'+url - articles.append({'title':self.tag_to_string(a), 'url':url, - 'description':'', 'date':''}) - pprint.pprint(articles[-1]) - if articles: - feeds.append((string.capwords(cat), articles)) - return feeds + oldest_article = 2.0 + keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or + 'hst-articletext' in x or 'hst-galleryitem' in x)} + feeds = [ + ('News', "http://www.chron.com/rss/feed/News-270.php"), + ('Sports', + 'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'), + ('Neighborhood', + 'http://www.chron.com/rss/feed/Neighborhood-305.php'), + ('Business', 'http://www.chron.com/rss/feed/Business-287.php'), + ('Entertainment', + 'http://www.chron.com/rss/feed/Entertainment-293.php'), + ('Editorials', + 'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'), + ('Life', 'http://www.chron.com/rss/feed/Life-297.php'), + ('Science & Tech', + 'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'), + ] From ab1ca47fa7478ed4e721841c91390ec5f78f871f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Aug 2011 21:47:04 -0600 Subject: [PATCH 13/39] ... --- src/calibre/gui2/actions/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/actions/catalog.py b/src/calibre/gui2/actions/catalog.py index e57ce06742..d18eb990b3 100644 --- a/src/calibre/gui2/actions/catalog.py +++ b/src/calibre/gui2/actions/catalog.py @@ -17,7 +17,7 @@ from calibre.gui2.actions import InterfaceAction class GenerateCatalogAction(InterfaceAction): name = 'Generate Catalog' - action_spec = (_('Create a catalog of the books in your calibre library'), 'catalog.png', 'Catalog builder', None) + action_spec = (_('Create catalog'), 'catalog.png', 'Catalog builder', None) dont_add_to = frozenset(['menubar-device', 'toolbar-device', 'context-menu-device']) def genesis(self): From af5cd6ba674b8dd125a5190017ea2cdb349b676e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Aug 2011 21:50:06 -0600 Subject: [PATCH 14/39] When downloading news set the language field correctly --- src/calibre/web/feeds/news.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 2017248cfc..1d513082f1 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -28,6 +28,7 @@ from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import now as nowf from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image +from calibre.utils.localization import canonicalize_lang class LoginFailed(ValueError): pass @@ -1117,6 +1118,9 @@ class BasicNewsRecipe(Recipe): mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() mi.timestamp = nowf() mi.comments = self.description + language = canonicalize_lang(self.language) + if language is not None: + mi.language = language if not isinstance(mi.comments, unicode): mi.comments = mi.comments.decode('utf-8', 'replace') mi.pubdate = nowf() From a4a6086d87d4f10ac58c3423de4341ad4d2a02fa Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Aug 2011 00:07:56 -0600 Subject: [PATCH 15/39] 
... --- src/calibre/ebooks/metadata/sources/identify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index 97fbae4727..4987b8cead 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -481,7 +481,7 @@ def identify(log, abort, # {{{ log('The identify phase took %.2f seconds'%(time.time() - start_time)) log('The longest time (%f) was taken by:'%longest, lp) log('Merging results from different sources and finding earliest', - 'publication dates') + 'publication dates from the xisbn service') start_time = time.time() results = merge_identify_results(results, log) From 1a5f6d741d53cb46aaf3352be3d215dfff46e71a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Aug 2011 10:06:25 -0600 Subject: [PATCH 16/39] Fix #830078 (New Language-function translated search error) --- src/calibre/library/caches.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 5f9dca6d23..62cad827c4 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -15,7 +15,7 @@ from calibre.utils.config import tweaks, prefs from calibre.utils.date import parse_date, now, UNDEFINED_DATE from calibre.utils.search_query_parser import SearchQueryParser from calibre.utils.pyparsing import ParseException -from calibre.utils.localization import canonicalize_lang +from calibre.utils.localization import canonicalize_lang, lang_map from calibre.ebooks.metadata import title_sort, author_to_author_sort from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre import prints @@ -728,7 +728,9 @@ class ResultCache(SearchQueryParser): # {{{ elif loc == db_col['languages']: q = canonicalize_lang(query) if q is None: - q = query + lm = lang_map() + rm = {v.lower():k for k,v in lm.iteritems()} + q = rm.get(query, query) else: q = query From 8812d9eadc756cd1226f8f22eab48cc93da46ad0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Aug 2011 10:16:04 -0600 Subject: [PATCH 17/39] Fix #830081 (Edit Metadata - layout error past isbn) --- src/calibre/gui2/metadata/single.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/metadata/single.py b/src/calibre/gui2/metadata/single.py index 7f2ea036d6..dc3983171b 100644 --- a/src/calibre/gui2/metadata/single.py +++ b/src/calibre/gui2/metadata/single.py @@ -723,7 +723,7 @@ class MetadataSingleDialogAlt1(MetadataSingleDialogBase): # {{{ tl.addWidget(self.swap_title_author_button, 0, 0, 2, 1) tl.addWidget(self.manage_authors_button, 2, 0, 1, 1) - tl.addWidget(self.paste_isbn_button, 11, 0, 1, 1) + tl.addWidget(self.paste_isbn_button, 12, 0, 1, 1) create_row(0, self.title, self.title_sort, button=self.deduce_title_sort_button, span=2, @@ -859,7 +859,7 @@ class MetadataSingleDialogAlt2(MetadataSingleDialogBase): # {{{ tl.addWidget(self.swap_title_author_button, 0, 0, 2, 1) tl.addWidget(self.manage_authors_button, 2, 0, 2, 1) - tl.addWidget(self.paste_isbn_button, 11, 0, 1, 1) + tl.addWidget(self.paste_isbn_button, 12, 0, 1, 1) create_row(0, self.title, self.title_sort, button=self.deduce_title_sort_button, span=2, From 0e4e082ed7efac305ba05894d3c7af8b1e8614c5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Aug 2011 10:57:06 -0600 Subject: [PATCH 18/39] When adding books that have no language specified, do not automaticallly set the langauge to calibre's 
interface language. Fixes #830092 (Downloading and adding books causes default language to be assigned) --- src/calibre/ebooks/metadata/opf2.py | 5 +++-- src/calibre/ebooks/metadata/worker.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index c52d089c70..9b8ae12b10 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -1312,7 +1312,7 @@ class OPFCreator(Metadata): ncx_stream.flush() -def metadata_to_opf(mi, as_string=True): +def metadata_to_opf(mi, as_string=True, default_lang=None): from lxml import etree import textwrap from calibre.ebooks.oeb.base import OPF, DC @@ -1328,7 +1328,8 @@ def metadata_to_opf(mi, as_string=True): '[http://calibre-ebook.com]' if not mi.languages: - lang = get_lang().replace('_', '-').partition('-')[0] + lang = (get_lang().replace('_', '-').partition('-')[0] if default_lang + is None else default_lang) mi.languages = [lang] root = etree.fromstring(textwrap.dedent( diff --git a/src/calibre/ebooks/metadata/worker.py b/src/calibre/ebooks/metadata/worker.py index ca8707258b..cab582a264 100644 --- a/src/calibre/ebooks/metadata/worker.py +++ b/src/calibre/ebooks/metadata/worker.py @@ -33,7 +33,7 @@ def serialize_metadata_for(formats, tdir, id_): if not mi.application_id: mi.application_id = '__calibre_dummy__' with open(os.path.join(tdir, '%s.opf'%id_), 'wb') as f: - f.write(metadata_to_opf(mi)) + f.write(metadata_to_opf(mi, default_lang='und')) if cdata: with open(os.path.join(tdir, str(id_)), 'wb') as f: f.write(cdata) From b22f38d71b155942406d55601a1346e0c7742ca9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Aug 2011 20:48:11 -0600 Subject: [PATCH 19/39] Fix #830800 (fix #760384 not applied to both WSJ recipes) --- recipes/wsj_free.recipe | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 331a393c03..42d791294a 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -53,6 +53,12 @@ class WallStreetJournal(BasicNewsRecipe): return soup + def abs_wsj_url(self, href): + if not href.startswith('http'): + href = 'http://online.wsj.com' + href + return href + + def wsj_get_index(self): return self.index_to_soup('http://online.wsj.com/itp') @@ -83,14 +89,14 @@ class WallStreetJournal(BasicNewsRecipe): pageone = a['href'].endswith('pageone') if pageone: title = 'Front Section' - url = 'http://online.wsj.com' + a['href'] + url = self.abs_wsj_url(a['href']) feeds = self.wsj_add_feed(feeds,title,url) title = 'What''s News' url = url.replace('pageone','whatsnews') feeds = self.wsj_add_feed(feeds,title,url) else: title = self.tag_to_string(a) - url = 'http://online.wsj.com' + a['href'] + url = self.abs_wsj_url(a['href']) feeds = self.wsj_add_feed(feeds,title,url) return feeds @@ -146,7 +152,7 @@ class WallStreetJournal(BasicNewsRecipe): title = self.tag_to_string(a).strip() + ' [%s]'%meta else: title = self.tag_to_string(a).strip() - url = 'http://online.wsj.com'+a['href'] + url = self.abs_wsj_url(a['href']) desc = '' for p in container.findAll('p'): desc = self.tag_to_string(p) From e48e7932fa01be41a041a1bc85019bc2fd979af3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Aug 2011 10:41:17 -0600 Subject: [PATCH 20/39] MOBI Output: Add support for the start attribute on
    tags --- src/calibre/ebooks/mobi/mobiml.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index eefa9d9e03..56a7a8b9ca 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -308,6 +308,11 @@ class MobiMLizer(object): istate = copy.copy(istates[-1]) istate.rendered = False istate.list_num = 0 + if tag == 'ol' and 'start' in elem.attrib: + try: + istate.list_num = int(elem.attrib['start'])-1 + except: + pass istates.append(istate) left = 0 display = style['display'] From 101817b1fc632723af00f47a48b74e36ac930783 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Aug 2011 14:46:39 -0600 Subject: [PATCH 21/39] Fix #831622 (Updated recipe for MSDN Magazine) --- recipes/msdnmag_en.recipe | 52 ++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/recipes/msdnmag_en.recipe b/recipes/msdnmag_en.recipe index 77b8da17a8..341ca027f6 100644 --- a/recipes/msdnmag_en.recipe +++ b/recipes/msdnmag_en.recipe @@ -6,11 +6,13 @@ __copyright__ = '2009, Darko Miletic ' msdn.microsoft.com/en-us/magazine ''' from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag class MSDNMagazine_en(BasicNewsRecipe): title = 'MSDN Magazine' __author__ = 'Darko Miletic' description = 'The Microsoft Journal for Developers' + masthead_url = 'http://i3.msdn.microsoft.com/Platform/MasterPages/MsdnMagazine/smalllogo.png' publisher = 'Microsoft Press' category = 'news, IT, Microsoft, programming, windows' oldest_article = 31 @@ -19,25 +21,45 @@ class MSDNMagazine_en(BasicNewsRecipe): use_embedded_content = False encoding = 'utf-8' language = 'en' + + base_url = 'http://msdn.microsoft.com/en-us/magazine/default.aspx' + rss_url = 'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1' - - feeds = [(u'Articles', u'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1')] - - keep_only_tags = [dict(name='div', attrs={'class':'navpage'})] + keep_only_tags = [dict(name='div', attrs={'id':'MainContent'})] remove_tags = [ - dict(name=['object','link','base','table']) - ,dict(name='div', attrs={'class':'MTPS_CollapsibleRegion'}) + dict(name='div', attrs={'class':'DivRatingsOnly'}) + ,dict(name='div', attrs={'class':'ShareThisButton4'}) ] - remove_tags_after = dict(name='div', attrs={'class':'navpage'}) + + def find_articles(self): + idx_contents = self.browser.open(self.rss_url).read() + idx = BeautifulStoneSoup(idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES) + + for article in idx.findAll('item'): + desc_html = self.tag_to_string(article.find('description')) + description = self.tag_to_string(BeautifulSoup(desc_html)) + + a = { + 'title': self.tag_to_string(article.find('title')), + 'url': self.tag_to_string(article.find('link')), + 'description': description, + 'date' : self.tag_to_string(article.find('pubdate')), + } + yield a - def preprocess_html(self, soup): - for item in soup.findAll('div',attrs={'class':['FeatureSmallHead','ColumnTypeSubTitle']}): - item.name="h2" - for item in soup.findAll('div',attrs={'class':['FeatureHeadline','ColumnTypeTitle']}): - item.name="h1" - for item in soup.findAll('div',attrs={'class':'ArticleTypeTitle'}): - item.name="h3" - return soup + def parse_index(self): + soup = self.index_to_soup(self.base_url) + + #find issue name, eg "August 2011" + issue_name = self.tag_to_string(soup.find('h1')) + + # find cover pic + img = 
soup.find('img',attrs ={'alt':issue_name}) + if img is not None: + self.cover_url = img['src'] + + return [(issue_name, list(self.find_articles()))] + From 161644a7527303aa6ac861c9c38b76f35ea68d54 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Aug 2011 14:47:58 -0600 Subject: [PATCH 22/39] Fix #831622 (Updated recipe for MSDN Magazine) --- recipes/svd_se.recipe | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes/svd_se.recipe b/recipes/svd_se.recipe index ef43caa7cd..7fa92c47f2 100644 --- a/recipes/svd_se.recipe +++ b/recipes/svd_se.recipe @@ -40,11 +40,11 @@ class SVD_se(BasicNewsRecipe): ,(u'Kultur' , u'http://www.svd.se/kulturnoje/nyheter/?service=rss') ] - keep_only_tags = [dict(name='div', attrs={'id':'articlecontent'})] - remove_tags_after = dict(name='div',attrs={'class':'articlebody normal'}) + keep_only_tags = [dict(name='div', attrs={'id':['article-content', 'articlecontent']})] + remove_tags_after = dict(name='div',attrs={'class':'articlebody'}) remove_tags = [ dict(name=['object','link','base']) - ,dict(name='div',attrs={'class':['articlead','factcolumn']}) + ,dict(name='div',attrs={'class':['articlead','factcolumn', 'article-ad']}) ,dict(name='ul', attrs={'class':'toolbar articletop clearfix'}) ,dict(name='p', attrs={'class':'more'}) ] From ac30f8edd472b755d18db484eaecba5bf7cafe48 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Aug 2011 18:18:46 -0600 Subject: [PATCH 23/39] Fix #831695 (Updated recipe for Financial times UK edition) --- recipes/financial_times_uk.recipe | 36 +++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe index f3ad824bc3..4c331f115f 100644 --- a/recipes/financial_times_uk.recipe +++ b/recipes/financial_times_uk.recipe @@ -5,6 +5,7 @@ www.ft.com/uk-edition ''' import datetime +from calibre.ptempfile import PersistentTemporaryFile from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe @@ -22,6 +23,8 @@ class FinancialTimes(BasicNewsRecipe): needs_subscription = True encoding = 'utf8' publication_type = 'newspaper' + articles_are_obfuscated = True + temp_files = [] masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg' LOGIN = 'https://registration.ft.com/registration/barrier/login' LOGIN2 = 'http://media.ft.com/h/subs3.html' @@ -47,7 +50,12 @@ class FinancialTimes(BasicNewsRecipe): br.submit() return br - keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})] + keep_only_tags = [ + dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']}) + ,dict(name='div', attrs={'class':'standfirst'}) + ,dict(name='div', attrs={'id' :'storyContent'}) + ,dict(name='div', attrs={'class':['ft-story-body','index-detail']}) + ] remove_tags = [ dict(name='div', attrs={'id':'floating-con'}) ,dict(name=['meta','iframe','base','object','embed','link']) @@ -69,18 +77,23 @@ class FinancialTimes(BasicNewsRecipe): def get_artlinks(self, elem): articles = [] + count = 0 for item in elem.findAll('a',href=True): + count = count + 1 + if self.test and count > 2: + return articles rawlink = item['href'] if rawlink.startswith('http://'): url = rawlink else: url = self.PREFIX + rawlink + urlverified = self.browser.open_novisit(url).geturl() # resolve redirect. 
title = self.tag_to_string(item) date = strftime(self.timefmt) articles.append({ 'title' :title ,'date' :date - ,'url' :url + ,'url' :urlverified ,'description':'' }) return articles @@ -97,7 +110,11 @@ class FinancialTimes(BasicNewsRecipe): st = wide.find('h4',attrs={'class':'section-no-arrow'}) if st: strest.insert(0,st) + count = 0 for item in strest: + count = count + 1 + if self.test and count > 2: + return feeds ftitle = self.tag_to_string(item) self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) feedarts = self.get_artlinks(item.parent.ul) @@ -136,4 +153,19 @@ class FinancialTimes(BasicNewsRecipe): if cdate.isoweekday() == 7: cdate -= datetime.timedelta(days=1) return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf') + + def get_obfuscated_article(self, url): + count = 0 + while (count < 10): + try: + response = self.browser.open(url) + html = response.read() + count = 10 + except: + print "Retrying download..." + count += 1 + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write(html) + self.temp_files[-1].close() + return self.temp_files[-1].name \ No newline at end of file From 2afef9211e75b0803edfaabc2730f819287a309b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Aug 2011 18:33:41 -0600 Subject: [PATCH 24/39] Conversion pipeline: More robust handling of case insensitve tag and class css selectors --- src/calibre/ebooks/cssselect.py | 1007 ++++++++++++++++++++++++++++ src/calibre/ebooks/oeb/stylizer.py | 94 +-- 2 files changed, 1060 insertions(+), 41 deletions(-) create mode 100644 src/calibre/ebooks/cssselect.py diff --git a/src/calibre/ebooks/cssselect.py b/src/calibre/ebooks/cssselect.py new file mode 100644 index 0000000000..c4167a8e4d --- /dev/null +++ b/src/calibre/ebooks/cssselect.py @@ -0,0 +1,1007 @@ +"""CSS Selectors based on XPath. + +This module supports selecting XML/HTML tags based on CSS selectors. +See the `CSSSelector` class for details. +""" + +import re +from lxml import etree + +__all__ = ['SelectorSyntaxError', 'ExpressionError', + 'CSSSelector'] + +try: + _basestring = basestring +except NameError: + _basestring = str + +class SelectorSyntaxError(SyntaxError): + pass + +class ExpressionError(RuntimeError): + pass + +class CSSSelector(etree.XPath): + """A CSS selector. + + Usage:: + + >>> from lxml import etree, cssselect + >>> select = cssselect.CSSSelector("a tag > child") + + >>> root = etree.XML("TEXT") + >>> [ el.tag for el in select(root) ] + ['child'] + + To use CSS namespaces, you need to pass a prefix-to-namespace + mapping as ``namespaces`` keyword argument:: + + >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + >>> select_ns = cssselect.CSSSelector('root > rdf|Description', + ... namespaces={'rdf': rdfns}) + + >>> rdf = etree.XML(( + ... '' + ... 'blah' + ... 
'') % rdfns) + >>> [(el.tag, el.text) for el in select_ns(rdf)] + [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] + """ + def __init__(self, css, namespaces=None): + path = css_to_xpath_no_case(css) + etree.XPath.__init__(self, path, namespaces=namespaces) + self.css = css + + def __repr__(self): + return '<%s %s for %r>' % ( + self.__class__.__name__, + hex(abs(id(self)))[2:], + self.css) + +############################## +## Token objects: + +try: + _unicode = unicode + _unichr = unichr +except NameError: + # Python 3 + _unicode = str + _unichr = chr + +class _UniToken(_unicode): + def __new__(cls, contents, pos): + obj = _unicode.__new__(cls, contents) + obj.pos = pos + return obj + + def __repr__(self): + return '%s(%s, %r)' % ( + self.__class__.__name__, + _unicode.__repr__(self), + self.pos) + +class Symbol(_UniToken): + pass + +class String(_UniToken): + pass + +class Token(_UniToken): + pass + +############################################################ +## Parsing +############################################################ + +############################## +## Syntax objects: + +class Class(object): + """ + Represents selector.class_name + """ + + def __init__(self, selector, class_name): + self.selector = selector + # Kovid: Lowercased + self.class_name = class_name.lower() + + def __repr__(self): + return '%s[%r.%s]' % ( + self.__class__.__name__, + self.selector, + self.class_name) + + def xpath(self): + sel_xpath = self.selector.xpath() + # Kovid: Lowercased + sel_xpath.add_condition( + "contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), %s)" % xpath_literal(' '+self.class_name+' ')) + return sel_xpath + +class Function(object): + """ + Represents selector:name(expr) + """ + + unsupported = [ + 'target', 'lang', 'enabled', 'disabled',] + + def __init__(self, selector, type, name, expr): + self.selector = selector + self.type = type + self.name = name + self.expr = expr + + def __repr__(self): + return '%s[%r%s%s(%r)]' % ( + self.__class__.__name__, + self.selector, + self.type, self.name, self.expr) + + def xpath(self): + sel_path = self.selector.xpath() + if self.name in self.unsupported: + raise ExpressionError( + "The pseudo-class %r is not supported" % self.name) + method = '_xpath_' + self.name.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The pseudo-class %r is unknown" % self.name) + method = getattr(self, method) + return method(sel_path, self.expr) + + def _xpath_nth_child(self, xpath, expr, last=False, + add_name_test=True): + a, b = parse_series(expr) + if not a and not b and not last: + # a=0 means nothing is returned... 
+ xpath.add_condition('false() and position() = 0') + return xpath + if add_name_test: + xpath.add_name_test() + xpath.add_star_prefix() + if a == 0: + if last: + b = 'last() - %s' % b + xpath.add_condition('position() = %s' % b) + return xpath + if last: + # FIXME: I'm not sure if this is right + a = -a + b = -b + if b > 0: + b_neg = str(-b) + else: + b_neg = '+%s' % (-b) + if a != 1: + expr = ['(position() %s) mod %s = 0' % (b_neg, a)] + else: + expr = [] + if b >= 0: + expr.append('position() >= %s' % b) + elif b < 0 and last: + expr.append('position() < (last() %s)' % b) + expr = ' and '.join(expr) + if expr: + xpath.add_condition(expr) + return xpath + # FIXME: handle an+b, odd, even + # an+b means every-a, plus b, e.g., 2n+1 means odd + # 0n+b means b + # n+0 means a=1, i.e., all elements + # an means every a elements, i.e., 2n means even + # -n means -1n + # -1n+6 means elements 6 and previous + + def _xpath_nth_last_child(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, last=True) + + def _xpath_nth_of_type(self, xpath, expr): + if xpath.element == '*': + raise NotImplementedError( + "*:nth-of-type() is not implemented") + return self._xpath_nth_child(xpath, expr, add_name_test=False) + + def _xpath_nth_last_of_type(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False) + + def _xpath_contains(self, xpath, expr): + # text content, minus tags, must contain expr + if isinstance(expr, Element): + expr = expr._format_element() + xpath.add_condition('contains(css:lower-case(string(.)), %s)' + % xpath_literal(expr.lower())) + # FIXME: Currently case insensitive matching doesn't seem to be happening + return xpath + + def _xpath_not(self, xpath, expr): + # everything for which not expr applies + expr = expr.xpath() + cond = expr.condition + # FIXME: should I do something about element_path? + xpath.add_condition('not(%s)' % cond) + return xpath + +def _make_lower_case(context, s): + return s.lower() + +ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') +ns.prefix = 'css' +ns['lower-case'] = _make_lower_case + +class Pseudo(object): + """ + Represents selector:ident + """ + + unsupported = ['indeterminate', 'first-line', 'first-letter', + 'selection', 'before', 'after', 'link', 'visited', + 'active', 'focus', 'hover'] + + def __init__(self, element, type, ident): + self.element = element + assert type in (':', '::') + self.type = type + self.ident = ident + + def __repr__(self): + return '%s[%r%s%s]' % ( + self.__class__.__name__, + self.element, + self.type, self.ident) + + def xpath(self): + el_xpath = self.element.xpath() + if self.ident in self.unsupported: + raise ExpressionError( + "The pseudo-class %r is unsupported" % self.ident) + method = '_xpath_' + self.ident.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The pseudo-class %r is unknown" % self.ident) + method = getattr(self, method) + el_xpath = method(el_xpath) + return el_xpath + + def _xpath_checked(self, xpath): + # FIXME: is this really all the elements? + xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) 
= 'option')") + return xpath + + def _xpath_root(self, xpath): + # if this element is the root element + raise NotImplementedError + + def _xpath_first_child(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_last_child(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + xpath.add_condition('position() = last()') + return xpath + + def _xpath_first_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:first-of-type is not implemented") + xpath.add_star_prefix() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_last_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:last-of-type is not implemented") + xpath.add_star_prefix() + xpath.add_condition('position() = last()') + return xpath + + def _xpath_only_child(self, xpath): + xpath.add_name_test() + xpath.add_star_prefix() + xpath.add_condition('last() = 1') + return xpath + + def _xpath_only_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:only-of-type is not implemented") + xpath.add_condition('last() = 1') + return xpath + + def _xpath_empty(self, xpath): + xpath.add_condition("not(*) and not(normalize-space())") + return xpath + +class Attrib(object): + """ + Represents selector[namespace|attrib operator value] + """ + + def __init__(self, selector, namespace, attrib, operator, value): + self.selector = selector + self.namespace = namespace + self.attrib = attrib + self.operator = operator + self.value = value + + def __repr__(self): + if self.operator == 'exists': + return '%s[%r[%s]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib()) + else: + return '%s[%r[%s %s %r]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib(), + self.operator, + self.value) + + def _format_attrib(self): + if self.namespace == '*': + return self.attrib + else: + return '%s|%s' % (self.namespace, self.attrib) + + def _xpath_attrib(self): + # FIXME: if attrib is *? + if self.namespace == '*': + return '@' + self.attrib + else: + return '@%s:%s' % (self.namespace, self.attrib) + + def xpath(self): + path = self.selector.xpath() + attrib = self._xpath_attrib() + value = self.value + if self.operator == 'exists': + assert not value + path.add_condition(attrib) + elif self.operator == '=': + path.add_condition('%s = %s' % (attrib, + xpath_literal(value))) + elif self.operator == '!=': + # FIXME: this seems like a weird hack... + if value: + path.add_condition('not(%s) or %s != %s' + % (attrib, attrib, xpath_literal(value))) + else: + path.add_condition('%s != %s' + % (attrib, xpath_literal(value))) + #path.add_condition('%s != %s' % (attrib, xpath_literal(value))) + elif self.operator == '~=': + path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' '))) + elif self.operator == '|=': + # Weird, but true... + path.add_condition('%s = %s or starts-with(%s, %s)' % ( + attrib, xpath_literal(value), + attrib, xpath_literal(value + '-'))) + elif self.operator == '^=': + path.add_condition('starts-with(%s, %s)' % ( + attrib, xpath_literal(value))) + elif self.operator == '$=': + # Oddly there is a starts-with in XPath 1.0, but not ends-with + path.add_condition('substring(%s, string-length(%s)-%s) = %s' + % (attrib, attrib, len(value)-1, xpath_literal(value))) + elif self.operator == '*=': + # FIXME: case sensitive? 
+ path.add_condition('contains(%s, %s)' % ( + attrib, xpath_literal(value))) + else: + assert 0, ("Unknown operator: %r" % self.operator) + return path + +class Element(object): + """ + Represents namespace|element + """ + + def __init__(self, namespace, element): + self.namespace = namespace + self.element = element + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, + self._format_element()) + + def _format_element(self): + if self.namespace == '*': + return self.element + else: + return '%s|%s' % (self.namespace, self.element) + + def xpath(self): + if self.namespace == '*': + el = self.element.lower() + else: + # Kovid: Lowercased + el = '%s:%s' % (self.namespace, self.element.lower()) + return XPathExpr(element=el) + +class Hash(object): + """ + Represents selector#id + """ + + def __init__(self, selector, id): + self.selector = selector + self.id = id + + def __repr__(self): + return '%s[%r#%s]' % ( + self.__class__.__name__, + self.selector, self.id) + + def xpath(self): + path = self.selector.xpath() + path.add_condition('@id = %s' % xpath_literal(self.id)) + return path + +class Or(object): + + def __init__(self, items): + self.items = items + def __repr__(self): + return '%s(%r)' % ( + self.__class__.__name__, + self.items) + + def xpath(self): + paths = [item.xpath() for item in self.items] + return XPathExprOr(paths) + +class CombinedSelector(object): + + _method_mapping = { + ' ': 'descendant', + '>': 'child', + '+': 'direct_adjacent', + '~': 'indirect_adjacent', + } + + def __init__(self, selector, combinator, subselector): + assert selector is not None + self.selector = selector + self.combinator = combinator + self.subselector = subselector + + def __repr__(self): + if self.combinator == ' ': + comb = '' + else: + comb = self.combinator + return '%s[%r %s %r]' % ( + self.__class__.__name__, + self.selector, + comb, + self.subselector) + + def xpath(self): + if self.combinator not in self._method_mapping: + raise ExpressionError( + "Unknown combinator: %r" % self.combinator) + method = '_xpath_' + self._method_mapping[self.combinator] + method = getattr(self, method) + path = self.selector.xpath() + return method(path, self.subselector) + + def _xpath_descendant(self, xpath, sub): + # when sub is a descendant in any way of xpath + xpath.join('/descendant::', sub.xpath()) + return xpath + + def _xpath_child(self, xpath, sub): + # when sub is an immediate child of xpath + xpath.join('/', sub.xpath()) + return xpath + + def _xpath_direct_adjacent(self, xpath, sub): + # when sub immediately follows xpath + xpath.join('/following-sibling::', sub.xpath()) + xpath.add_name_test() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_indirect_adjacent(self, xpath, sub): + # when sub comes somewhere after xpath as a sibling + xpath.join('/following-sibling::', sub.xpath()) + return xpath + +############################## +## XPathExpr objects: + +_el_re = re.compile(r'^\w+\s*$', re.UNICODE) +_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE) +_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE) + +def css_to_xpath_no_case(css_expr, prefix='descendant-or-self::'): + if isinstance(css_expr, _basestring): + match = _el_re.search(css_expr) + if match is not None: + # Kovid: Lowercased + return '%s%s' % (prefix, match.group(0).strip().lower()) + match = _id_re.search(css_expr) + if match is not None: + return "%s%s[@id = '%s']" % ( + prefix, match.group(1) or '*', match.group(2)) + match = _class_re.search(css_expr) + if match is not None: + # 
Kovid: lowercased + return "%s%s[contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), ' %s ')]" % ( + prefix, match.group(1).lower() or '*', match.group(2).lower()) + css_expr = parse(css_expr) + expr = css_expr.xpath() + assert expr is not None, ( + "Got None for xpath expression from %s" % repr(css_expr)) + if prefix: + expr.add_prefix(prefix) + return _unicode(expr) + +class XPathExpr(object): + + def __init__(self, prefix=None, path=None, element='*', condition=None, + star_prefix=False): + self.prefix = prefix + self.path = path + self.element = element + self.condition = condition + self.star_prefix = star_prefix + + def __str__(self): + path = '' + if self.prefix is not None: + path += _unicode(self.prefix) + if self.path is not None: + path += _unicode(self.path) + path += _unicode(self.element) + if self.condition: + path += '[%s]' % self.condition + return path + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, self) + + def add_condition(self, condition): + if self.condition: + self.condition = '%s and (%s)' % (self.condition, condition) + else: + self.condition = condition + + def add_path(self, part): + if self.path is None: + self.path = self.element + else: + self.path += self.element + self.element = part + + def add_prefix(self, prefix): + if self.prefix: + self.prefix = prefix + self.prefix + else: + self.prefix = prefix + + def add_name_test(self): + if self.element == '*': + # We weren't doing a test anyway + return + self.add_condition("name() = %s" % xpath_literal(self.element)) + self.element = '*' + + def add_star_prefix(self): + """ + Adds a /* prefix if there is no prefix. This is when you need + to keep context's constrained to a single parent. + """ + if self.path: + self.path += '*/' + else: + self.path = '*/' + self.star_prefix = True + + def join(self, combiner, other): + prefix = _unicode(self) + prefix += combiner + path = (other.prefix or '') + (other.path or '') + # We don't need a star prefix if we are joining to this other + # prefix; so we'll get rid of it + if other.star_prefix and path == '*/': + path = '' + self.prefix = prefix + self.path = path + self.element = other.element + self.condition = other.condition + +class XPathExprOr(XPathExpr): + """ + Represents |'d expressions. Note that unfortunately it isn't + the union, it's the sum, so duplicate elements will appear. + """ + + def __init__(self, items, prefix=None): + for item in items: + assert item is not None + self.items = items + self.prefix = prefix + + def __str__(self): + prefix = self.prefix or '' + return ' | '.join(["%s%s" % (prefix,i) for i in self.items]) + +split_at_single_quotes = re.compile("('+)").split + +def xpath_literal(s): + if isinstance(s, Element): + # This is probably a symbol that looks like an expression... 
+ s = s._format_element() + else: + s = _unicode(s) + if "'" not in s: + s = "'%s'" % s + elif '"' not in s: + s = '"%s"' % s + else: + s = "concat(%s)" % ','.join([ + (("'" in part) and '"%s"' or "'%s'") % part + for part in split_at_single_quotes(s) if part + ]) + return s + +############################## +## Parsing functions + +def parse(string): + stream = TokenStream(tokenize(string)) + stream.source = string + try: + return parse_selector_group(stream) + except SelectorSyntaxError: + import sys + e = sys.exc_info()[1] + message = "%s at %s -> %r" % ( + e, stream.used, stream.peek()) + e.msg = message + if sys.version_info < (2,6): + e.message = message + e.args = tuple([message]) + raise + +def parse_selector_group(stream): + result = [] + while 1: + result.append(parse_selector(stream)) + if stream.peek() == ',': + stream.next() + else: + break + if len(result) == 1: + return result[0] + else: + return Or(result) + +def parse_selector(stream): + result = parse_simple_selector(stream) + while 1: + peek = stream.peek() + if peek == ',' or peek is None: + return result + elif peek in ('+', '>', '~'): + # A combinator + combinator = stream.next() + else: + combinator = ' ' + consumed = len(stream.used) + next_selector = parse_simple_selector(stream) + if consumed == len(stream.used): + raise SelectorSyntaxError( + "Expected selector, got '%s'" % stream.peek()) + result = CombinedSelector(result, combinator, next_selector) + return result + +def parse_simple_selector(stream): + peek = stream.peek() + if peek != '*' and not isinstance(peek, Symbol): + element = namespace = '*' + else: + next = stream.next() + if next != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % next) + if stream.peek() == '|': + namespace = next + stream.next() + element = stream.next() + if element != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % next) + else: + namespace = '*' + element = next + result = Element(namespace, element) + has_hash = False + while 1: + peek = stream.peek() + if peek == '#': + if has_hash: + # You can't have two hashes + # (FIXME: is there some more general rule I'm missing?) + break + stream.next() + result = Hash(result, stream.next()) + has_hash = True + continue + elif peek == '.': + stream.next() + result = Class(result, stream.next()) + continue + elif peek == '[': + stream.next() + result = parse_attrib(result, stream) + next = stream.next() + if not next == ']': + raise SelectorSyntaxError( + "] expected, got '%s'" % next) + continue + elif peek == ':' or peek == '::': + type = stream.next() + ident = stream.next() + if not isinstance(ident, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % ident) + if stream.peek() == '(': + stream.next() + peek = stream.peek() + if isinstance(peek, String): + selector = stream.next() + elif isinstance(peek, Symbol) and is_int(peek): + selector = int(stream.next()) + else: + # FIXME: parse_simple_selector, or selector, or...? 
+ selector = parse_simple_selector(stream) + next = stream.next() + if not next == ')': + raise SelectorSyntaxError( + "Expected ')', got '%s' and '%s'" + % (next, selector)) + result = Function(result, type, ident, selector) + else: + result = Pseudo(result, type, ident) + continue + else: + if peek == ' ': + stream.next() + break + # FIXME: not sure what "negation" is + return result + +def is_int(v): + try: + int(v) + except ValueError: + return False + else: + return True + +def parse_attrib(selector, stream): + attrib = stream.next() + if stream.peek() == '|': + namespace = attrib + stream.next() + attrib = stream.next() + else: + namespace = '*' + if stream.peek() == ']': + return Attrib(selector, namespace, attrib, 'exists', None) + op = stream.next() + if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): + raise SelectorSyntaxError( + "Operator expected, got '%s'" % op) + value = stream.next() + if not isinstance(value, (Symbol, String)): + raise SelectorSyntaxError( + "Expected string or symbol, got '%s'" % value) + return Attrib(selector, namespace, attrib, op, value) + +def parse_series(s): + """ + Parses things like '1n+2', or 'an+b' generally, returning (a, b) + """ + if isinstance(s, Element): + s = s._format_element() + if not s or s == '*': + # Happens when there's nothing, which the CSS parser thinks of as * + return (0, 0) + if isinstance(s, int): + # Happens when you just get a number + return (0, s) + if s == 'odd': + return (2, 1) + elif s == 'even': + return (2, 0) + elif s == 'n': + return (1, 0) + if 'n' not in s: + # Just a b + return (0, int(s)) + a, b = s.split('n', 1) + if not a: + a = 1 + elif a == '-' or a == '+': + a = int(a+'1') + else: + a = int(a) + if not b: + b = 0 + elif b == '-' or b == '+': + b = int(b+'1') + else: + b = int(b) + return (a, b) + + +############################################################ +## Tokenizing +############################################################ + +_match_whitespace = re.compile(r'\s+', re.UNICODE).match + +_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub + +_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match + +def tokenize(s): + pos = 0 + s = _replace_comments('', s) + while 1: + match = _match_whitespace(s, pos=pos) + if match: + preceding_whitespace_pos = pos + pos = match.end() + else: + preceding_whitespace_pos = 0 + if pos >= len(s): + return + match = _match_count_number(s, pos=pos) + if match and match.group() != 'n': + sym = s[pos:match.end()] + yield Symbol(sym, pos) + pos = match.end() + continue + c = s[pos] + c2 = s[pos:pos+2] + if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): + yield Token(c2, pos) + pos += 2 + continue + if c in '>+~,.*=[]()|:#': + if c in '.#[' and preceding_whitespace_pos > 0: + yield Token(' ', preceding_whitespace_pos) + yield Token(c, pos) + pos += 1 + continue + if c == '"' or c == "'": + # Quoted string + old_pos = pos + sym, pos = tokenize_escaped_string(s, pos) + yield String(sym, old_pos) + continue + old_pos = pos + sym, pos = tokenize_symbol(s, pos) + yield Symbol(sym, old_pos) + continue + +split_at_string_escapes = re.compile(r'(\\(?:%s))' + % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', + '[^A-Fa-f0-9]'])).split + +def unescape_string_literal(literal): + substrings = [] + for substring in split_at_string_escapes(literal): + if not substring: + continue + elif '\\' in substring: + if substring[0] == '\\' and len(substring) > 1: + substring = substring[1:] + if substring[0] in '0123456789ABCDEFabcdef': + # int() correctly ignores the 
potentially trailing whitespace + substring = _unichr(int(substring, 16)) + else: + raise SelectorSyntaxError( + "Invalid escape sequence %r in string %r" + % (substring.split('\\')[1], literal)) + substrings.append(substring) + return ''.join(substrings) + +def tokenize_escaped_string(s, pos): + quote = s[pos] + assert quote in ('"', "'") + pos = pos+1 + start = pos + while 1: + next = s.find(quote, pos) + if next == -1: + raise SelectorSyntaxError( + "Expected closing %s for string in: %r" + % (quote, s[start:])) + result = s[start:next] + if result.endswith('\\'): + # next quote character is escaped + pos = next+1 + continue + if '\\' in result: + result = unescape_string_literal(result) + return result, next+1 + +_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) + +def tokenize_symbol(s, pos): + start = pos + match = _illegal_symbol.search(s, pos=pos) + if not match: + # Goes to end of s + return s[start:], len(s) + if match.start() == pos: + assert 0, ( + "Unexpected symbol: %r at %s" % (s[pos], pos)) + if not match: + result = s[start:] + pos = len(s) + else: + result = s[start:match.start()] + pos = match.start() + try: + result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') + except UnicodeDecodeError: + import sys + e = sys.exc_info()[1] + raise SelectorSyntaxError( + "Bad symbol %r: %s" % (result, e)) + return result, pos + +class TokenStream(object): + + def __init__(self, tokens, source=None): + self.used = [] + self.tokens = iter(tokens) + self.source = source + self.peeked = None + self._peeking = False + try: + self.next_token = self.tokens.next + except AttributeError: + # Python 3 + self.next_token = self.tokens.__next__ + + def next(self): + if self._peeking: + self._peeking = False + self.used.append(self.peeked) + return self.peeked + else: + try: + next = self.next_token() + self.used.append(next) + return next + except StopIteration: + return None + + def __iter__(self): + return iter(self.next, None) + + def peek(self): + if not self._peeking: + try: + self.peeked = self.next_token() + except StopIteration: + return None + self._peeking = True + return self.peeked diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index f6ff594701..88e074320d 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -27,6 +27,7 @@ from calibre import force_unicode from calibre.ebooks import unit_convert from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize +from calibre.ebooks.cssselect import css_to_xpath_no_case cssutils_log.setLevel(logging.WARN) @@ -98,32 +99,72 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large']) -class CSSSelector(etree.XPath): - MIN_SPACE_RE = re.compile(r' *([>~+]) *') +class CSSSelector(object): + LOCAL_NAME_RE = re.compile(r"(?' 
% ( self.__class__.__name__, hex(abs(id(self)))[2:], self.css) +_selector_cache = {} + +MIN_SPACE_RE = re.compile(r' *([>~+]) *') + +def get_css_selector(raw_selector): + css = MIN_SPACE_RE.sub(r'\1', raw_selector) + if isinstance(css, unicode): + # Workaround for bug in lxml on windows/OS X that causes a massive + # memory leak with non ASCII selectors + css = css.encode('ascii', 'ignore').decode('ascii') + ans = _selector_cache.get(css, None) + if ans is None: + ans = CSSSelector(css) + _selector_cache[css] = ans + return ans class Stylizer(object): STYLESHEETS = WeakKeyDictionary() @@ -223,41 +264,12 @@ class Stylizer(object): rules.sort() self.rules = rules self._styles = {} - class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE) - capital_sel_pat = re.compile(r'h|[A-Z]+') for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') - try: - selector = CSSSelector(text) - except (AssertionError, ExpressionError, etree.XPathSyntaxError, - NameError, # thrown on OS X instead of SelectorSyntaxError - SelectorSyntaxError): - continue - try: - matches = selector(tree) - except etree.XPathEvalError: - continue - - if not matches: - ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text) - if ntext != text: - self.logger.warn('Transformed CSS selector', text, 'to', - ntext) - selector = CSSSelector(ntext) - matches = selector(tree) - - if not matches and class_sel_pat.match(text) and text.lower() != text: - found = False - ltext = text.lower() - for x in tree.xpath('//*[@class]'): - if ltext.endswith('.'+x.get('class').lower()): - matches.append(x) - found = True - if found: - self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s' - %(text, item.href)) + selector = get_css_selector(text) + matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) From b3f5484cbe90f99c6f912cfaa9504e692f6ce89d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Aug 2011 00:29:02 -0600 Subject: [PATCH 25/39] Add a search for individual tweaks to Preferences->Tweaks --- src/calibre/gui2/__init__.py | 2 + src/calibre/gui2/preferences/tweaks.py | 109 +++++++++++++++++++++++-- src/calibre/gui2/preferences/tweaks.ui | 83 +++++++++++++++---- 3 files changed, 174 insertions(+), 20 deletions(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 1967f734cc..83ade61200 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -173,6 +173,8 @@ def _config(): # {{{ help='Search history for the plugin preferences') c.add_opt('shortcuts_search_history', default=[], help='Search history for the keyboard preferences') + c.add_opt('tweaks_search_history', default=[], + help='Search history for tweaks') c.add_opt('worker_limit', default=6, help=_( 'Maximum number of simultaneous conversion/news download jobs. 
' diff --git a/src/calibre/gui2/preferences/tweaks.py b/src/calibre/gui2/preferences/tweaks.py index a1756bf1ba..04c11ad40e 100644 --- a/src/calibre/gui2/preferences/tweaks.py +++ b/src/calibre/gui2/preferences/tweaks.py @@ -9,14 +9,19 @@ import textwrap from calibre.gui2.preferences import ConfigWidgetBase, test_widget, AbortCommit from calibre.gui2.preferences.tweaks_ui import Ui_Form -from calibre.gui2 import error_dialog, NONE +from calibre.gui2 import error_dialog, NONE, info_dialog from calibre.utils.config import read_raw_tweaks, write_tweaks from calibre.gui2.widgets import PythonHighlighter from calibre import isbytestring +from calibre.utils.icu import lower +from calibre.utils.search_query_parser import (ParseException, + SearchQueryParser) from PyQt4.Qt import (QAbstractListModel, Qt, QStyledItemDelegate, QStyle, QStyleOptionViewItem, QFont, QDialogButtonBox, QDialog, - QVBoxLayout, QPlainTextEdit, QLabel) + QVBoxLayout, QPlainTextEdit, QLabel, QModelIndex) + +ROOT = QModelIndex() class Delegate(QStyledItemDelegate): # {{{ def __init__(self, view): @@ -35,7 +40,7 @@ class Delegate(QStyledItemDelegate): # {{{ class Tweak(object): # {{{ def __init__(self, name, doc, var_names, defaults, custom): - translate = __builtins__['_'] + translate = _ self.name = translate(name) self.doc = translate(doc.strip()) self.var_names = var_names @@ -87,10 +92,11 @@ class Tweak(object): # {{{ # }}} -class Tweaks(QAbstractListModel): # {{{ +class Tweaks(QAbstractListModel, SearchQueryParser): # {{{ def __init__(self, parent=None): QAbstractListModel.__init__(self, parent) + SearchQueryParser.__init__(self, ['all']) raw_defaults, raw_custom = read_raw_tweaks() self.parse_tweaks(raw_defaults, raw_custom) @@ -223,6 +229,54 @@ class Tweaks(QAbstractListModel): # {{{ def set_plugin_tweaks(self, d): self.plugin_tweaks = d + def universal_set(self): + return set(xrange(self.rowCount())) + + def get_matches(self, location, query, candidates=None): + if candidates is None: + candidates = self.universal_set() + ans = set() + if not query: + return ans + query = lower(query) + for r in candidates: + dat = self.data(self.index(r), Qt.UserRole) + if query in lower(dat.name):# or query in lower(dat.doc): + ans.add(r) + return ans + + def find(self, query): + query = query.strip() + if not query: + return ROOT + matches = self.parse(query) + if not matches: + return ROOT + matches = list(sorted(matches)) + return self.index(matches[0]) + + def find_next(self, idx, query, backwards=False): + query = query.strip() + if not query: + return idx + matches = self.parse(query) + if not matches: + return idx + loc = idx.row() + if loc not in matches: + return self.find(query) + if len(matches) == 1: + return ROOT + matches = list(sorted(matches)) + i = matches.index(loc) + if backwards: + ans = i - 1 if i - 1 >= 0 else len(matches)-1 + else: + ans = i + 1 if i + 1 < len(matches) else 0 + + ans = matches[ans] + return self.index(ans) + # }}} class PluginTweaks(QDialog): # {{{ @@ -257,12 +311,18 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.delegate = Delegate(self.tweaks_view) self.tweaks_view.setItemDelegate(self.delegate) self.tweaks_view.currentChanged = self.current_changed + self.view = self.tweaks_view self.highlighter = PythonHighlighter(self.edit_tweak.document()) self.restore_default_button.clicked.connect(self.restore_to_default) self.apply_button.clicked.connect(self.apply_tweak) self.plugin_tweaks_button.clicked.connect(self.plugin_tweaks) self.splitter.setStretchFactor(0, 1) 
self.splitter.setStretchFactor(1, 100) + self.next_button.clicked.connect(self.find_next) + self.previous_button.clicked.connect(self.find_previous) + self.search.initialize('tweaks_search_history', help_text= + _('Search for tweak')) + self.search.search.connect(self.find) def plugin_tweaks(self): @@ -290,7 +350,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.changed_signal.emit() def initialize(self): - self.tweaks = Tweaks() + self.tweaks = self._model = Tweaks() self.tweaks_view.setModel(self.tweaks) def restore_to_default(self, *args): @@ -338,6 +398,45 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): ConfigWidgetBase.commit(self) return True + def find(self, query): + if not query: + return + try: + idx = self._model.find(query) + except ParseException: + self.search.search_done(False) + return + self.search.search_done(True) + if not idx.isValid(): + info_dialog(self, _('No matches'), + _('Could not find any shortcuts matching %s')%query, + show=True, show_copy_button=False) + return + self.highlight_index(idx) + + def highlight_index(self, idx): + if not idx.isValid(): return + self.view.scrollTo(idx) + self.view.selectionModel().select(idx, + self.view.selectionModel().ClearAndSelect) + self.view.setCurrentIndex(idx) + + def find_next(self, *args): + idx = self.view.currentIndex() + if not idx.isValid(): + idx = self._model.index(0) + idx = self._model.find_next(idx, + unicode(self.search.currentText())) + self.highlight_index(idx) + + def find_previous(self, *args): + idx = self.view.currentIndex() + if not idx.isValid(): + idx = self._model.index(0) + idx = self._model.find_next(idx, + unicode(self.search.currentText()), backwards=True) + self.highlight_index(idx) + if __name__ == '__main__': from PyQt4.Qt import QApplication diff --git a/src/calibre/gui2/preferences/tweaks.ui b/src/calibre/gui2/preferences/tweaks.ui index ab3f6b2bc3..19f6c836d5 100644 --- a/src/calibre/gui2/preferences/tweaks.ui +++ b/src/calibre/gui2/preferences/tweaks.ui @@ -6,7 +6,7 @@ 0 0 - 660 + 756 531 @@ -14,8 +14,24 @@ Form + + + + Values for the tweaks are shown below. Edit them to change the behavior of calibre. Your changes will only take effect <b>after a restart</b> of calibre. + + + true + + + + + + 0 + 10 + + Qt::Horizontal @@ -24,16 +40,6 @@ - - - - Values for the tweaks are shown below. Edit them to change the behavior of calibre. Your changes will only take effect <b>after a restart</b> of calibre. - - - true - - - @@ -72,8 +78,8 @@ - - + + Help @@ -92,7 +98,7 @@ - + Edit tweak @@ -128,12 +134,59 @@ + + + + + 10 + 0 + + + + QComboBox::AdjustToMinimumContentsLength + + + 10 + + + + + + + &Next + + + + :/images/arrow-down.png:/images/arrow-down.png + + + + + + + &Previous + + + + :/images/arrow-up.png:/images/arrow-up.png + + + - + + + SearchBox2 + QComboBox +
    calibre/gui2/search_box.h
    +
    +
+ + + From ec32d0f3f10724a55cdce3c952106414423d2a6d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Aug 2011 09:51:46 -0600 Subject: [PATCH 26/39] When adding a text indent to paragraphs as part of the remove spacing between paragraphs transformation, do not add an indent to paragraphs that are directly centered or right aligned. Fixes #830439 ([Enhancement]Indenting should ignore centered text) --- src/calibre/ebooks/oeb/transforms/flatcss.py | 3 ++- src/calibre/library/sqlite.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index d006d8dd2d..1493a647ae 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -320,7 +320,8 @@ class CSSFlattener(object): if self.context.insert_blank_line: cssdict['margin-top'] = cssdict['margin-bottom'] = \ '%fem'%self.context.insert_blank_line_size - if self.context.remove_paragraph_spacing: + if (self.context.remove_paragraph_spacing and + cssdict.get('text-align', None) not in ('center', 'right')): cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size if cssdict: diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py index b5917f1a55..90d293ba64 100644 --- a/src/calibre/library/sqlite.py +++ b/src/calibre/library/sqlite.py @@ -290,7 +290,10 @@ class DatabaseException(Exception): def __init__(self, err, tb): tb = '\n\t'.join(('\tRemote'+tb).splitlines()) - msg = unicode(err) +'\n' + tb + try: + msg = unicode(err) +'\n' + tb + except: + msg = repr(err) + '\n' + tb Exception.__init__(self, msg) self.orig_err = err self.orig_tb = tb From 2c33b9b4097d866e05a317a7f3ada4987e9d011f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Aug 2011 10:49:32 -0600 Subject: [PATCH 27/39] Add an option to Preferences->Look and Feel->Cover Browser to show the cover browser full screen.
Fixes #829855 ([Enhancements] in cover browser) --- src/calibre/gui2/__init__.py | 1 + src/calibre/gui2/cover_flow.py | 37 ++++++++++++++++++++--- src/calibre/gui2/preferences/look_feel.py | 8 ++++- src/calibre/gui2/preferences/look_feel.ui | 22 +++++++++++++- 4 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 83ade61200..715696a89e 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -98,6 +98,7 @@ gprefs.defaults['book_display_fields'] = [ ] gprefs.defaults['default_author_link'] = 'http://en.wikipedia.org/w/index.php?search={author}' gprefs.defaults['preserve_date_on_ctl'] = True +gprefs.defaults['cb_fullscreen'] = False # }}} diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py index ca108a592e..67a8f08bcd 100644 --- a/src/calibre/gui2/cover_flow.py +++ b/src/calibre/gui2/cover_flow.py @@ -9,8 +9,8 @@ Module to implement the Cover Flow feature import sys, os, time -from PyQt4.Qt import (QImage, QSizePolicy, QTimer, QDialog, Qt, QSize, - QStackedLayout, QLabel, QByteArray, pyqtSignal) +from PyQt4.Qt import (QImage, QSizePolicy, QTimer, QDialog, Qt, QSize, QAction, + QStackedLayout, QLabel, QByteArray, pyqtSignal, QKeySequence) from calibre import plugins from calibre.gui2 import config, available_height, available_width, gprefs @@ -150,12 +150,39 @@ class CBDialog(QDialog): if not self.restoreGeometry(geom): h, w = available_height()-60, int(available_width()/1.5) self.resize(w, h) + self.action_fs_toggle = a = QAction(self) + self.addAction(a) + a.setShortcuts([QKeySequence('F11', QKeySequence.PortableText), + QKeySequence('Ctrl+Shift+F', QKeySequence.PortableText)]) + a.triggered.connect(self.toggle_fullscreen) + self.action_esc_fs = a = QAction(self) + a.triggered.connect(self.show_normal) + self.addAction(a) + a.setShortcuts([QKeySequence('Esc', QKeySequence.PortableText)]) + + self.pre_fs_geom = None def closeEvent(self, *args): - geom = bytearray(self.saveGeometry()) - gprefs['cover_browser_dialog_geometry'] = geom + if not self.isFullScreen(): + geom = bytearray(self.saveGeometry()) + gprefs['cover_browser_dialog_geometry'] = geom self.closed.emit() + def show_normal(self): + self.showNormal() + if self.pre_fs_geom is not None: + self.restoreGeometry(self.pre_fs_geom) + self.pre_fs_geom = None + + def toggle_fullscreen(self, *args): + if self.isFullScreen(): + self.show_normal() + else: + self.pre_fs_geom = bytearray(self.saveGeometry()) + self.showFullScreen() + + + class CoverFlowMixin(object): def __init__(self): @@ -228,7 +255,7 @@ class CoverFlowMixin(object): d.addAction(self.cb_splitter.action_toggle) self.cover_flow.setVisible(True) self.cover_flow.setFocus(Qt.OtherFocusReason) - d.show() + d.showFullScreen() if gprefs['cb_fullscreen'] else d.show() self.cb_splitter.button.set_state_to_hide() d.closed.connect(self.cover_browser_closed) self.cb_dialog = d diff --git a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py index c87cad7cad..c017fe69c2 100644 --- a/src/calibre/gui2/preferences/look_feel.py +++ b/src/calibre/gui2/preferences/look_feel.py @@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' from PyQt4.Qt import (QApplication, QFont, QFontInfo, QFontDialog, - QAbstractListModel, Qt, QIcon) + QAbstractListModel, Qt, QIcon, QKeySequence) from calibre.gui2.preferences import ConfigWidgetBase, test_widget, CommaSeparatedList from calibre.gui2.preferences.look_feel_ui 
import Ui_Form @@ -129,6 +129,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): r('disable_tray_notification', config) r('use_roman_numerals_for_series_number', config) r('separate_cover_flow', config, restart_required=True) + r('cb_fullscreen', gprefs) choices = [(_('Off'), 'off'), (_('Small'), 'small'), (_('Medium'), 'medium'), (_('Large'), 'large')] @@ -170,6 +171,11 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.tabWidget.addTab(self.edit_rules, QIcon(I('format-fill-color.png')), _('Column coloring')) self.tabWidget.setCurrentIndex(0) + keys = [QKeySequence('F11', QKeySequence.PortableText), QKeySequence( + 'Ctrl+Shift+F', QKeySequence.PortableText)] + keys = [unicode(x.toString(QKeySequence.NativeText)) for x in keys] + self.fs_help_msg.setText(unicode(self.fs_help_msg.text())%( + _(' or ').join(keys))) def initialize(self): ConfigWidgetBase.initialize(self) diff --git a/src/calibre/gui2/preferences/look_feel.ui b/src/calibre/gui2/preferences/look_feel.ui index 07d533fdef..498013a68b 100644 --- a/src/calibre/gui2/preferences/look_feel.ui +++ b/src/calibre/gui2/preferences/look_feel.ui @@ -417,7 +417,7 @@ then the tags will be displayed each on their own line. - + Qt::Vertical @@ -430,6 +430,26 @@ then the tags will be displayed each on their own line. + + + + When showing cover browser in separate window, show it &fullscreen + + + + + + + margin-left: 1.5em + + + You can press the %s keys to toggle full screen mode. + + + true + + + From 967285b9f6dcc495b1f892587724732c51960323 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Aug 2011 11:33:33 -0600 Subject: [PATCH 28/39] When automatically computing author sort from author's name, if the name contains certain words like Inc., Company, Team, etc. use the author name as the sort string directly. The list of such words can be controlled via Preferences->Tweaks. Fixes #797895 (author name sort order copy keywords) --- resources/default_tweaks.py | 6 ++++++ src/calibre/ebooks/metadata/__init__.py | 7 +++++++ src/calibre/gui2/metadata/basic_widgets.py | 17 +++++++++++++++-- src/calibre/gui2/metadata/single.py | 7 ++++++- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 12731a8c42..f11a0b7bc0 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -62,10 +62,16 @@ authors_completer_append_separator = False # The author name suffixes are words that are ignored when they occur at the # end of an author name. The case of the suffix is ignored and trailing # periods are automatically handled. +# The author name copy words are a set of words which if they occur in an +# author name cause the automatically generated author sort string to be +# identical to the author name. This means that the sort for a string like Acme +# Inc. will be Acme Inc.
instead of Inc., Acme author_sort_copy_method = 'comma' author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd', 'MD', 'M.D', 'I', 'II', 'III', 'IV', 'Junior', 'Senior') +author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council', + 'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team') #: Use author sort in Tag Browser # Set which author field to display in the tags pane (the list of authors, diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 2c26d011b7..a9816db5ae 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -36,8 +36,15 @@ def author_to_author_sort(author, method=None): return author if method is None: method = tweaks['author_sort_copy_method'] + + ltoks = frozenset(x.lower() for x in tokens) + copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords']) + if ltoks.intersection(copy_words): + method = u'copy' + if method == u'copy': return author + suffixes = set([x.lower() for x in tweaks['author_name_suffixes']]) suffixes |= set([x+u'.' for x in suffixes]) diff --git a/src/calibre/gui2/metadata/basic_widgets.py b/src/calibre/gui2/metadata/basic_widgets.py index 29f6fffa0b..3ec34938af 100644 --- a/src/calibre/gui2/metadata/basic_widgets.py +++ b/src/calibre/gui2/metadata/basic_widgets.py @@ -308,7 +308,7 @@ class AuthorSortEdit(EnLineEdit): LABEL = _('Author s&ort:') def __init__(self, parent, authors_edit, autogen_button, db, - copy_a_to_as_action, copy_as_to_a_action): + copy_a_to_as_action, copy_as_to_a_action, a_to_as, as_to_a): EnLineEdit.__init__(self, parent) self.authors_edit = authors_edit self.db = db @@ -333,6 +333,8 @@ class AuthorSortEdit(EnLineEdit): autogen_button.clicked.connect(self.auto_generate) copy_a_to_as_action.triggered.connect(self.auto_generate) copy_as_to_a_action.triggered.connect(self.copy_to_authors) + a_to_as.triggered.connect(self.author_to_sort) + as_to_a.triggered.connect(self.sort_to_author) self.update_state() @dynamic_property @@ -389,10 +391,21 @@ class AuthorSortEdit(EnLineEdit): def auto_generate(self, *args): au = unicode(self.authors_edit.text()) - au = re.sub(r'\s+et al\.$', '', au) + au = re.sub(r'\s+et al\.$', '', au).strip() authors = string_to_authors(au) self.current_val = self.db.author_sort_from_authors(authors) + def author_to_sort(self, *args): + au = unicode(self.authors_edit.text()) + au = re.sub(r'\s+et al\.$', '', au).strip() + if au: + self.current_val = au + + def sort_to_author(self, *args): + aus = self.current_val + if aus: + self.authors_edit.current_val = [aus] + def initialize(self, db, id_): self.current_val = db.author_sort(id_, index_is_id=True) diff --git a/src/calibre/gui2/metadata/single.py b/src/calibre/gui2/metadata/single.py index dc3983171b..a2666b0351 100644 --- a/src/calibre/gui2/metadata/single.py +++ b/src/calibre/gui2/metadata/single.py @@ -130,10 +130,15 @@ class MetadataSingleDialogBase(ResizableDialog): ac = m.addAction(QIcon(I('forward.png')), _('Set author sort from author')) ac2 = m.addAction(QIcon(I('back.png')), _('Set author from author sort')) ac3 = m.addAction(QIcon(I('user_profile.png')), _('Manage authors')) + ac4 = m.addAction(QIcon(I('next.png')), + _('Copy author to author sort')) + ac5 = m.addAction(QIcon(I('previous.png')), + _('Copy author sort to author')) + b.setMenu(m) self.authors = AuthorsEdit(self, ac3) self.author_sort = AuthorSortEdit(self, self.authors, b, self.db, ac, - ac2) + ac2, ac4, ac5) 
self.basic_metadata_widgets.extend([self.authors, self.author_sort]) self.swap_title_author_button = QToolButton(self) From 0901c5807c5f956e044639350f41cdbb9ebcf07f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Aug 2011 16:08:08 -0600 Subject: [PATCH 29/39] Various Turkish news sources by thomass --- recipes/bugun_gazetesi.recipe | 57 +++++++++++++++++++++++++++ recipes/yagmur_dergisi.recipe | 52 +++++++++++++++++++++++++ recipes/yeni_umit_dergisi.recipe | 52 +++++++++++++++++++++++++ recipes/yenisafak_gazetesi.recipe | 64 +++++++++++++++++++++++++++++++ 4 files changed, 225 insertions(+) create mode 100644 recipes/bugun_gazetesi.recipe create mode 100644 recipes/yagmur_dergisi.recipe create mode 100644 recipes/yeni_umit_dergisi.recipe create mode 100644 recipes/yenisafak_gazetesi.recipe diff --git a/recipes/bugun_gazetesi.recipe b/recipes/bugun_gazetesi.recipe new file mode 100644 index 0000000000..0a1d27f517 --- /dev/null +++ b/recipes/bugun_gazetesi.recipe @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class Bugun (BasicNewsRecipe): + + title = u'BUGÜN Gazetesi' + __author__ = u'thomass' + oldest_article = 2 + max_articles_per_feed =100 + #no_stylesheets = True + #delay = 1 + use_embedded_content = False + encoding = 'UTF-8' + publisher = 'thomass' + category = 'news, haberler,TR,gazete' + language = 'tr' + publication_type = 'newspaper ' + extra_css = ' div{font-size: small} h2{font-size: small;font-weight: bold} #ctl00_ortayer_haberBaslik{font-size:20px;font-weight: bold} '#h1{ font-size:10%;font-weight: bold} '#ctl00_ortayer_haberBaslik{ 'font-size:10%;font-weight: bold'} + #introduction{} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + cover_img_url = 'http://www.bugun.com.tr/images/bugunLogo2011.png' + masthead_url = 'http://www.bugun.com.tr/images/bugunLogo2011.png' + + keep_only_tags = [dict(name='h1', attrs={'class':[ 'haberBaslik']}),dict(name='h2', attrs={'class':[ 'haberOzet']}), dict(name='div', attrs={'class':['haberGriDivvvv']}), dict(name='div', attrs={'id':[ 'haberTextDiv']}), ] + + #keep_only_tags = [dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}) ] + #remove_tags = [ dict(name='div', attrs={'id':['news-detail-news-text-font-size','news-detail-gallery','news-detail-news-bottom-social']}),dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']})] + + + #remove_attributes = ['width','height'] + remove_empty_feeds= True + + feeds = [ + ( u'Son Dakika', u'http://www.bugun.com.tr/haberler.xml'), + ( u'Yazarlar', u'http://www.bugun.com.tr/rss/yazarlar.xml'), + ( u'Gündem', u'http://www.bugun.com.tr/rss/gundem.xml'), + ( u'Ekonomi', u'http://www.bugun.com.tr/rss/ekonomi.xml'), + ( u'Spor', u'http://www.bugun.com.tr/rss/spor.xml'), + ( u'Magazin', u'http://www.bugun.com.tr/rss/magazin.xml'), + ( u'Teknoloji', u'http://www.bugun.com.tr/rss/teknoloji.xml'), + ( u'Yaşam', 
u'http://www.bugun.com.tr/rss/yasam.xml'), + ( u'Medya', u'http://www.bugun.com.tr/rss/medya.xml'), + ( u'Dünya', u'http://www.bugun.com.tr/rss/dunya.xml'), + ( u'Politika', u'http://www.bugun.com.tr/rss/politika.xml'), + ( u'Sağlık', u'http://www.bugun.com.tr/rss/saglik.xml'), + ( u'Tarifler', u'http://www.bugun.com.tr/rss/yemek-tarifi.xml'), + + + + + ] diff --git a/recipes/yagmur_dergisi.recipe b/recipes/yagmur_dergisi.recipe new file mode 100644 index 0000000000..786a628a0c --- /dev/null +++ b/recipes/yagmur_dergisi.recipe @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class Yagmur(BasicNewsRecipe): + title = u'Yagmur Dergisi' + __author__ = u'thomass' + description = 'Üç Aylık Dil, Kültür ve Edebiyat Dergisi' + oldest_article = 90 + max_articles_per_feed =100 + no_stylesheets = True + #delay = 1 + #use_embedded_content = False + + #publisher = ' ' + category = 'dergi, ilim, kültür, edebiyat,Türkçe' + language = 'tr' + publication_type = 'magazine' + encoding = 'ISO 8859-9' + publisher = 'thomass' + + + + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + #keep_only_tags = [dict(name='h1', attrs={'class':['georgia_30']})] + + #remove_attributes = ['aria-describedby'] + #remove_tags = [dict(name='div', attrs={'id':['renk10']}) ] + cover_img_url = 'http://www.sizinti.com.tr/images/dergiler/d2.gif' + masthead_url = 'http://www.sizinti.com.tr/images/dergiler/d2.gif' + #remove_tags_before = dict(id='content-right') + + + #remove_empty_feeds= True + #remove_attributes = ['width','height'] + + feeds = [ + ( u'Yagmur', u'http://open.dapper.net/services/yagmur'), + ] + + #def preprocess_html(self, soup): + # return self.adeify_images(soup) + def print_version(self, url): #there is a probem caused by table format + return url.replace('http://www.yagmurdergisi.com.tr/konu_goster.php?konu_id=', 'http://www.yagmurdergisi.com.tr/yazformati.php?konu_id=') + diff --git a/recipes/yeni_umit_dergisi.recipe b/recipes/yeni_umit_dergisi.recipe new file mode 100644 index 0000000000..24b95acae4 --- /dev/null +++ b/recipes/yeni_umit_dergisi.recipe @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class YeniUmit(BasicNewsRecipe): + title = u'Yeni Umit Dergisi' + __author__ = u'thomass' + description = 'Aylık Dini İlimler ve Kültür Dergisi' + oldest_article = 45 + max_articles_per_feed =100 + no_stylesheets = True + #delay = 1 + #use_embedded_content = False + + #publisher = ' ' + category = 'dergi, ilim, kültür, edebiyat,Türkçe' + language = 'tr' + publication_type = 'magazine' + encoding = 'ISO 8859-9' + publisher = 'thomass' + + + + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = 
{ + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + #keep_only_tags = [dict(name='h1', attrs={'class':['georgia_30']})] + + #remove_attributes = ['aria-describedby'] + #remove_tags = [dict(name='div', attrs={'id':['renk10']}) ] + cover_img_url = 'http://www.sizinti.com.tr/images/dergiler/d1.gif' + masthead_url = 'http://www.sizinti.com.tr/images/dergiler/d1.gif' + #remove_tags_before = dict(id='content-right') + + + #remove_empty_feeds= True + #remove_attributes = ['width','height'] + + feeds = [ + ( u'Yeni Umit', u'http://open.dapper.net/services/yeniumit'), + ] + + #def preprocess_html(self, soup): + # return self.adeify_images(soup) + def print_version(self, url): #there is a probem caused by table format + return url.replace('http://www.yeniumit.com.tr/konular', 'http://www.yeniumit.com.tr/yazdir') + diff --git a/recipes/yenisafak_gazetesi.recipe b/recipes/yenisafak_gazetesi.recipe new file mode 100644 index 0000000000..afcec76508 --- /dev/null +++ b/recipes/yenisafak_gazetesi.recipe @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class Bugun (BasicNewsRecipe): + + title = u'Yenişafak Gazetesi' + __author__ = u'thomass' + oldest_article = 2 + max_articles_per_feed =100 + no_stylesheets = True + #delay = 1 + use_embedded_content = False + encoding = 'ISO 8859-9' #'UTF-8' + publisher = 'thomass' + category = 'news, haberler,TR,gazete' + language = 'tr' + publication_type = 'newspaper ' + #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + cover_img_url = 'http://yenisafak.com.tr/resim/logo.gif' + masthead_url = 'http://yenisafak.com.tr/resim/logo.gif' + + keep_only_tags = [dict(name='div', attrs={'id':[ 'ctghaberdetay2010']}) ] + extra_css = ' h1{font-size:20px;font-weight: bold}h2{font-size: small;font-weight: bold}div{font-size: small} '#h1{ font-size:10%;font-weight: bold} '#ctl00_ortayer_haberBaslik{ 'font-size:10%;font-weight: bold'} + + #keep_only_tags = [dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}) ] + remove_tags = [ dict(name='div', attrs={'id':['yasaluyari2010','divhaberdetayilisik2010']}),dict(name='font', attrs={'class':['haberdetaytarih']})]#,'news-detail-gallery','news-detail-news-bottom-social']}),dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']})] + + + #remove_attributes = ['width','height'] + remove_empty_feeds= True + + feeds = [ + ( u'SonDakika', u'http://yenisafak.com.tr/rss/?xml=anasayfa'), + ( u'Gündem', u'http://yenisafak.com.tr/rss/?xml=gundem'), + ( u'Politika', 
u'http://yenisafak.com.tr/rss/?xml=politika'), + ( u'Ekonomi', u'http://yenisafak.com.tr/rss/?xml=ekonomi'), + ( u'Dünya', u'http://yenisafak.com.tr/rss/?xml=dunya'), + ( u'Aktüel', u'http://yenisafak.com.tr/rss/?xml=aktuel'), + ( u'Eğitim', u'http://yenisafak.com.tr/rss/?xml=egitim'), + ( u'Spor', u'http://yenisafak.com.tr/rss/?xml=spor'), + ( u'Yazarlar', u'http://yenisafak.com.tr/rss/?xml=yazarlar'), + ( u'Televizyon', u'http://yenisafak.com.tr/rss/?xml=televizyon'), + ( u'Sağlık', u'http://yenisafak.com.tr/rss/?xml=saglik'), + ( u'Yurt Haberler', u'http://yenisafak.com.tr/rss/?xml=yurthaberler'), + ( u'Bilişim', u'http://yenisafak.com.tr/rss/?xml=bilisim'), + ( u'Diziler', u'http://yenisafak.com.tr/rss/?xml=diziler'), + ( u'Kültür-Sanat', u'http://yenisafak.com.tr/rss/?xml=kultursanat'), + ( u'Röportaj', u'http://yenisafak.com.tr/rss/?xml=roportaj'), + ( u'Sinema', u'http://yenisafak.com.tr/rss/?xml=sinema'), + ( u'Yorum', u'http://yenisafak.com.tr/rss/?xml=yorum'), + ( u' Yeni Şafak Pazar', u'http://yenisafak.com.tr/rss/?xml=pazar'), + ( u'Yeni Şafak Kitap', u'http://yenisafak.com.tr/rss/?xml=kitap'), + ( u'Yeni Şafak English', u'http://yenisafak.com.tr/rss/?xml=english'), + + + + ] From 36c4b70030e5a516b6daad02c521959a603bbce9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Aug 2011 23:09:23 -0600 Subject: [PATCH 30/39] ... --- recipes/politifact.recipe | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/recipes/politifact.recipe b/recipes/politifact.recipe index e3550ce7f1..6f5344ae4e 100644 --- a/recipes/politifact.recipe +++ b/recipes/politifact.recipe @@ -5,7 +5,6 @@ class PolitiFactCom(BasicNewsRecipe): __author__ = u'Michael Heinz' oldest_article = 21 max_articles_per_feed = 100 - recursion = 0 language = 'en' no_stylesheets = True @@ -27,4 +26,9 @@ class PolitiFactCom(BasicNewsRecipe): (u'Statements', u'http://www.politifact.com/feeds/statements/truth-o-meter/') ] - + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup From fdc97e33284d984d4780a65eca798a9b314ee2e0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 09:38:31 -0600 Subject: [PATCH 31/39] ... 
--- recipes/politifact.recipe | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/recipes/politifact.recipe b/recipes/politifact.recipe index 6f5344ae4e..a0f0d786dd 100644 --- a/recipes/politifact.recipe +++ b/recipes/politifact.recipe @@ -26,9 +26,4 @@ class PolitiFactCom(BasicNewsRecipe): (u'Statements', u'http://www.politifact.com/feeds/statements/truth-o-meter/') ] - def preprocess_html(self, soup): - for alink in soup.findAll('a'): - if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) - return soup + From 6ee7c3661fe6b7e3f6847a506d5a77f14b49c15b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 10:32:05 -0600 Subject: [PATCH 32/39] Fix #832761 (translation input is case sensitive) --- src/calibre/gui2/languages.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/languages.py b/src/calibre/gui2/languages.py index 3398081c5f..d7f34df1b4 100644 --- a/src/calibre/gui2/languages.py +++ b/src/calibre/gui2/languages.py @@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en' from calibre.gui2.complete import MultiCompleteComboBox from calibre.utils.localization import lang_map -from calibre.utils.icu import sort_key +from calibre.utils.icu import sort_key, lower class LanguagesEdit(MultiCompleteComboBox): @@ -22,7 +22,7 @@ class LanguagesEdit(MultiCompleteComboBox): self.names_with_commas = [x for x in self._lang_map.itervalues() if ',' in x] self.comma_map = {k:k.replace(',', '|') for k in self.names_with_commas} self.comma_rmap = {v:k for k, v in self.comma_map.iteritems()} - self._rmap = {v:k for k,v in self._lang_map.iteritems()} + self._rmap = {lower(v):k for k,v in self._lang_map.iteritems()} all_items = sorted(self._lang_map.itervalues(), key=sort_key) @@ -46,7 +46,7 @@ class LanguagesEdit(MultiCompleteComboBox): ans = [] for name in vals: if name: - code = self._rmap.get(name, None) + code = self._rmap.get(lower(name), None) if code is not None: ans.append(code) return ans @@ -66,7 +66,7 @@ class LanguagesEdit(MultiCompleteComboBox): bad = [] for name in vals: if name: - code = self._rmap.get(name, None) + code = self._rmap.get(lower(name), None) if code is None: bad.append(name) return bad From 6becd633baf30be65e58dd769efb165694598ab8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 11:02:40 -0600 Subject: [PATCH 33/39] Fairbanks Daily by Roger --- recipes/fairbanks_daily.recipe | 128 +++++++++++++++++++++++++++++++++ recipes/msdnmag_en.recipe | 16 ++--- 2 files changed, 136 insertions(+), 8 deletions(-) create mode 100644 recipes/fairbanks_daily.recipe diff --git a/recipes/fairbanks_daily.recipe b/recipes/fairbanks_daily.recipe new file mode 100644 index 0000000000..282925728e --- /dev/null +++ b/recipes/fairbanks_daily.recipe @@ -0,0 +1,128 @@ +#import re # Provides preprocess_regexps re.compile + +from calibre.web.feeds.news import BasicNewsRecipe + +class FairbanksDailyNewsminer(BasicNewsRecipe): + title = u'Fairbanks Daily News-miner' + __author__ = 'Roger' + oldest_article = 7 + max_articles_per_feed = 100 + + description = ''''The voice of interior Alaska since 1903''' + publisher = 'http://www.newsminer.com/' + category = 'news, Alaska, Fairbanks' + language = 'en' + #extra_css = ''' + # p{font-weight: normal;text-align: justify} + # ''' + + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'en' + encoding = 'utf8' + conversion_options = {'linearize_tables':True} + # TODO: I don't see any photos in my Mobi file with 
this masthead_url!
+    masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg'
+
+
+    # In order to omit seeing the number of views, the number of posts and the pipe
+    # symbol for divider after the title and date of the article, a regex or
+    # manual processing is needed to get just the "story_item_date updated"
+    # (which contains the date). Everything else on this line is pretty much not needed.
+    #
+    # HTML line containing story_item_date:
+    #
+    # Aug 22, 2011 | 2370 views | 52 52 comments | 9 9 recommendations | email to a friend | print
    + + # The following was suggested, but it looks like I also need to define self & soup + # (as well as bring in extra soup depends?) + #date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})) + + #preprocess_regexps = [(re.compile(r']*addthis_separator*>'), lambda match: '') ] + #preprocess_regexps = [(re.compile(r'span class="addthis_separator">|'), lambda match: '') ] + + #preprocess_regexps = [ + # (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''), + # ] + + #def get_browser(self): + #def preprocess_html(soup, first_fetch): + # date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})) + # return + + + # Try to keep some tags - some might not be needed here + keep_only_tags = [ + #date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})), + dict(name='div', attrs={'class':'hnews hentry item'}), + dict(name='div', attrs={'class':'story_item_headline entry-title'}), + #dict(name='span', attrs={'class':'story_item_date updated'}), + dict(name='div', attrs={'class':'full_story'}) + ] + #remove_tags = [ + # dict(name='div', attrs={'class':'story_tools'}), + # dict(name='p', attrs={'class':'ad_label'}), + # ] + + # Try to remove some bothersome tags + remove_tags = [ + #dict(name='img', attrs={'alt'}), + dict(name='img', attrs={'class':'dont_touch_me'}), + dict(name='span', attrs={'class':'number_recommendations'}), + #dict(name='div', attrs={'class':'signature_line'}), + dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}), + dict(name='div', attrs={'class':['addthis_toolbox','addthis_default_style']}), + dict(name='span', attrs={'class':'addthis_separator'}), + dict(name='div', attrs={'class':'related_content'}), + dict(name='div', attrs={'class':'comments_container'}), + #dict(name='div', attrs={'class':'signature_line'}), + dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}), + dict(name='div', attrs={'id':'comments_container'}) + ] + + + # This one works but only gets title, date and clips article content! + #remove_tags_after = [ + # dict(name='span', attrs={'class':'story_item_date updated'}) + # ] + + #remove_tags_after = [ + # dict(name='div', attrs={'class':'advertisement'}), + # ] + + # Try clipping tags before and after to prevent pulling img views/posts numbers after date? + #remove_tags_before = [ + # dict(name='span', attrs={'class':'story_item_date updated'}) + # ] + + #extra_css # tweak the appearance # TODO: Change article titles to bold? + + + # Comment-out or uncomment any of the following RSS feeds according to your + # liking. + # + # TODO: Adding more then one RSS Feed, and newline will be omitted for + # entries within the Table of Contents or Index of Articles + # + # TODO: Some random bits of text is trailing the last page (or TOC on MOBI + # files), these are bits of public posts and comments and need to also be + # removed. 
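+    # A possible shape for the "manual processing" mentioned in the comments
+    # near the masthead_url setting above, left here as a commented-out sketch
+    # rather than as part of the recipe. It uses the (self, soup) signature
+    # that BasicNewsRecipe expects for preprocess_html, and assumes the date
+    # span and the views/comments counters from the example line share one
+    # parent element, which is collapsed down to just the date text.
+    #def preprocess_html(self, soup):
+    #    date_tag = soup.find('span', attrs={'class':'story_item_date updated'})
+    #    if date_tag is not None and date_tag.parent is not None:
+    #        date_text = self.tag_to_string(date_tag).strip()
+    #        date_tag.parent.replaceWith(date_text)
+    #    return soup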
+ # + feeds = [ + (u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'), + (u'Local News', u'http://newsminer.com/rss/rss_feeds/local_news?content_type=article&tags=local_news&page_name=rss_feeds&offset=0&instance=local_news'), + (u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'), + (u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'), + (u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'), + # (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'), + (u'Sundays', u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'), + # (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'), + # (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'), + (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'), + # (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'), + # (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'), + # (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'), + # (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'), + (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin') + ] + diff --git a/recipes/msdnmag_en.recipe b/recipes/msdnmag_en.recipe index 341ca027f6..cf9cfc4f6a 100644 --- a/recipes/msdnmag_en.recipe +++ b/recipes/msdnmag_en.recipe @@ -6,7 +6,7 @@ __copyright__ = '2009, Darko Miletic ' msdn.microsoft.com/en-us/magazine ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag +from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup class MSDNMagazine_en(BasicNewsRecipe): title = 'MSDN Magazine' @@ -21,7 +21,7 @@ class MSDNMagazine_en(BasicNewsRecipe): use_embedded_content = False encoding = 'utf-8' language = 'en' - + base_url = 
'http://msdn.microsoft.com/en-us/magazine/default.aspx' rss_url = 'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1' @@ -32,15 +32,15 @@ class MSDNMagazine_en(BasicNewsRecipe): dict(name='div', attrs={'class':'DivRatingsOnly'}) ,dict(name='div', attrs={'class':'ShareThisButton4'}) ] - + def find_articles(self): idx_contents = self.browser.open(self.rss_url).read() idx = BeautifulStoneSoup(idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES) - + for article in idx.findAll('item'): desc_html = self.tag_to_string(article.find('description')) description = self.tag_to_string(BeautifulSoup(desc_html)) - + a = { 'title': self.tag_to_string(article.find('title')), 'url': self.tag_to_string(article.find('link')), @@ -52,14 +52,14 @@ class MSDNMagazine_en(BasicNewsRecipe): def parse_index(self): soup = self.index_to_soup(self.base_url) - + #find issue name, eg "August 2011" issue_name = self.tag_to_string(soup.find('h1')) - + # find cover pic img = soup.find('img',attrs ={'alt':issue_name}) if img is not None: self.cover_url = img['src'] return [(issue_name, list(self.find_articles()))] - + From fb08ceb1e55c0880219ac0bfc68b44d7fbf79d52 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 11:13:31 -0600 Subject: [PATCH 34/39] ... --- src/calibre/ebooks/oeb/stylizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 88e074320d..5e4f389262 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -118,7 +118,6 @@ class CSSSelector(object): css_to_xpath_no_case(css)) self.sel2 = etree.XPath(path, namespaces=namespaces) except: - raise self.sel2 = lambda x: [] self.sel2_use_logged = False self.css = css From 23f4463f275b9caa0c978615e7ad8b422c36ea64 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 11:57:09 -0600 Subject: [PATCH 35/39] ... --- recipes/bbc.recipe | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe index 9c8b92f25c..2bccbaf4ae 100644 --- a/recipes/bbc.recipe +++ b/recipes/bbc.recipe @@ -36,8 +36,9 @@ class BBC(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper', \ - 'story-feature wide ', 'story-feature narrow']}) + dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper', + 'story-feature wide ', 'story-feature narrow']}), + dict(id=['hypertab', 'comment-form']), ] remove_attributes = ['width','height'] From 5119925922f3f0c0485d80fa957a7d83274d7394 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 14:45:21 -0600 Subject: [PATCH 36/39] Revert CSS pipeline changes, as python functions in lxml are broken, badly --- src/calibre/ebooks/cssselect.py | 1007 -------------------- src/calibre/ebooks/mobi/writer2/indexer.py | 3 + src/calibre/ebooks/oeb/stylizer.py | 93 +- 3 files changed, 44 insertions(+), 1059 deletions(-) delete mode 100644 src/calibre/ebooks/cssselect.py diff --git a/src/calibre/ebooks/cssselect.py b/src/calibre/ebooks/cssselect.py deleted file mode 100644 index c4167a8e4d..0000000000 --- a/src/calibre/ebooks/cssselect.py +++ /dev/null @@ -1,1007 +0,0 @@ -"""CSS Selectors based on XPath. - -This module supports selecting XML/HTML tags based on CSS selectors. -See the `CSSSelector` class for details. 
-""" - -import re -from lxml import etree - -__all__ = ['SelectorSyntaxError', 'ExpressionError', - 'CSSSelector'] - -try: - _basestring = basestring -except NameError: - _basestring = str - -class SelectorSyntaxError(SyntaxError): - pass - -class ExpressionError(RuntimeError): - pass - -class CSSSelector(etree.XPath): - """A CSS selector. - - Usage:: - - >>> from lxml import etree, cssselect - >>> select = cssselect.CSSSelector("a tag > child") - - >>> root = etree.XML("TEXT") - >>> [ el.tag for el in select(root) ] - ['child'] - - To use CSS namespaces, you need to pass a prefix-to-namespace - mapping as ``namespaces`` keyword argument:: - - >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' - >>> select_ns = cssselect.CSSSelector('root > rdf|Description', - ... namespaces={'rdf': rdfns}) - - >>> rdf = etree.XML(( - ... '' - ... 'blah' - ... '') % rdfns) - >>> [(el.tag, el.text) for el in select_ns(rdf)] - [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] - """ - def __init__(self, css, namespaces=None): - path = css_to_xpath_no_case(css) - etree.XPath.__init__(self, path, namespaces=namespaces) - self.css = css - - def __repr__(self): - return '<%s %s for %r>' % ( - self.__class__.__name__, - hex(abs(id(self)))[2:], - self.css) - -############################## -## Token objects: - -try: - _unicode = unicode - _unichr = unichr -except NameError: - # Python 3 - _unicode = str - _unichr = chr - -class _UniToken(_unicode): - def __new__(cls, contents, pos): - obj = _unicode.__new__(cls, contents) - obj.pos = pos - return obj - - def __repr__(self): - return '%s(%s, %r)' % ( - self.__class__.__name__, - _unicode.__repr__(self), - self.pos) - -class Symbol(_UniToken): - pass - -class String(_UniToken): - pass - -class Token(_UniToken): - pass - -############################################################ -## Parsing -############################################################ - -############################## -## Syntax objects: - -class Class(object): - """ - Represents selector.class_name - """ - - def __init__(self, selector, class_name): - self.selector = selector - # Kovid: Lowercased - self.class_name = class_name.lower() - - def __repr__(self): - return '%s[%r.%s]' % ( - self.__class__.__name__, - self.selector, - self.class_name) - - def xpath(self): - sel_xpath = self.selector.xpath() - # Kovid: Lowercased - sel_xpath.add_condition( - "contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), %s)" % xpath_literal(' '+self.class_name+' ')) - return sel_xpath - -class Function(object): - """ - Represents selector:name(expr) - """ - - unsupported = [ - 'target', 'lang', 'enabled', 'disabled',] - - def __init__(self, selector, type, name, expr): - self.selector = selector - self.type = type - self.name = name - self.expr = expr - - def __repr__(self): - return '%s[%r%s%s(%r)]' % ( - self.__class__.__name__, - self.selector, - self.type, self.name, self.expr) - - def xpath(self): - sel_path = self.selector.xpath() - if self.name in self.unsupported: - raise ExpressionError( - "The pseudo-class %r is not supported" % self.name) - method = '_xpath_' + self.name.replace('-', '_') - if not hasattr(self, method): - raise ExpressionError( - "The pseudo-class %r is unknown" % self.name) - method = getattr(self, method) - return method(sel_path, self.expr) - - def _xpath_nth_child(self, xpath, expr, last=False, - add_name_test=True): - a, b = parse_series(expr) - if not a and not b and not last: - # a=0 means nothing is returned... 
- xpath.add_condition('false() and position() = 0') - return xpath - if add_name_test: - xpath.add_name_test() - xpath.add_star_prefix() - if a == 0: - if last: - b = 'last() - %s' % b - xpath.add_condition('position() = %s' % b) - return xpath - if last: - # FIXME: I'm not sure if this is right - a = -a - b = -b - if b > 0: - b_neg = str(-b) - else: - b_neg = '+%s' % (-b) - if a != 1: - expr = ['(position() %s) mod %s = 0' % (b_neg, a)] - else: - expr = [] - if b >= 0: - expr.append('position() >= %s' % b) - elif b < 0 and last: - expr.append('position() < (last() %s)' % b) - expr = ' and '.join(expr) - if expr: - xpath.add_condition(expr) - return xpath - # FIXME: handle an+b, odd, even - # an+b means every-a, plus b, e.g., 2n+1 means odd - # 0n+b means b - # n+0 means a=1, i.e., all elements - # an means every a elements, i.e., 2n means even - # -n means -1n - # -1n+6 means elements 6 and previous - - def _xpath_nth_last_child(self, xpath, expr): - return self._xpath_nth_child(xpath, expr, last=True) - - def _xpath_nth_of_type(self, xpath, expr): - if xpath.element == '*': - raise NotImplementedError( - "*:nth-of-type() is not implemented") - return self._xpath_nth_child(xpath, expr, add_name_test=False) - - def _xpath_nth_last_of_type(self, xpath, expr): - return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False) - - def _xpath_contains(self, xpath, expr): - # text content, minus tags, must contain expr - if isinstance(expr, Element): - expr = expr._format_element() - xpath.add_condition('contains(css:lower-case(string(.)), %s)' - % xpath_literal(expr.lower())) - # FIXME: Currently case insensitive matching doesn't seem to be happening - return xpath - - def _xpath_not(self, xpath, expr): - # everything for which not expr applies - expr = expr.xpath() - cond = expr.condition - # FIXME: should I do something about element_path? - xpath.add_condition('not(%s)' % cond) - return xpath - -def _make_lower_case(context, s): - return s.lower() - -ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') -ns.prefix = 'css' -ns['lower-case'] = _make_lower_case - -class Pseudo(object): - """ - Represents selector:ident - """ - - unsupported = ['indeterminate', 'first-line', 'first-letter', - 'selection', 'before', 'after', 'link', 'visited', - 'active', 'focus', 'hover'] - - def __init__(self, element, type, ident): - self.element = element - assert type in (':', '::') - self.type = type - self.ident = ident - - def __repr__(self): - return '%s[%r%s%s]' % ( - self.__class__.__name__, - self.element, - self.type, self.ident) - - def xpath(self): - el_xpath = self.element.xpath() - if self.ident in self.unsupported: - raise ExpressionError( - "The pseudo-class %r is unsupported" % self.ident) - method = '_xpath_' + self.ident.replace('-', '_') - if not hasattr(self, method): - raise ExpressionError( - "The pseudo-class %r is unknown" % self.ident) - method = getattr(self, method) - el_xpath = method(el_xpath) - return el_xpath - - def _xpath_checked(self, xpath): - # FIXME: is this really all the elements? - xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) 
= 'option')") - return xpath - - def _xpath_root(self, xpath): - # if this element is the root element - raise NotImplementedError - - def _xpath_first_child(self, xpath): - xpath.add_star_prefix() - xpath.add_name_test() - xpath.add_condition('position() = 1') - return xpath - - def _xpath_last_child(self, xpath): - xpath.add_star_prefix() - xpath.add_name_test() - xpath.add_condition('position() = last()') - return xpath - - def _xpath_first_of_type(self, xpath): - if xpath.element == '*': - raise NotImplementedError( - "*:first-of-type is not implemented") - xpath.add_star_prefix() - xpath.add_condition('position() = 1') - return xpath - - def _xpath_last_of_type(self, xpath): - if xpath.element == '*': - raise NotImplementedError( - "*:last-of-type is not implemented") - xpath.add_star_prefix() - xpath.add_condition('position() = last()') - return xpath - - def _xpath_only_child(self, xpath): - xpath.add_name_test() - xpath.add_star_prefix() - xpath.add_condition('last() = 1') - return xpath - - def _xpath_only_of_type(self, xpath): - if xpath.element == '*': - raise NotImplementedError( - "*:only-of-type is not implemented") - xpath.add_condition('last() = 1') - return xpath - - def _xpath_empty(self, xpath): - xpath.add_condition("not(*) and not(normalize-space())") - return xpath - -class Attrib(object): - """ - Represents selector[namespace|attrib operator value] - """ - - def __init__(self, selector, namespace, attrib, operator, value): - self.selector = selector - self.namespace = namespace - self.attrib = attrib - self.operator = operator - self.value = value - - def __repr__(self): - if self.operator == 'exists': - return '%s[%r[%s]]' % ( - self.__class__.__name__, - self.selector, - self._format_attrib()) - else: - return '%s[%r[%s %s %r]]' % ( - self.__class__.__name__, - self.selector, - self._format_attrib(), - self.operator, - self.value) - - def _format_attrib(self): - if self.namespace == '*': - return self.attrib - else: - return '%s|%s' % (self.namespace, self.attrib) - - def _xpath_attrib(self): - # FIXME: if attrib is *? - if self.namespace == '*': - return '@' + self.attrib - else: - return '@%s:%s' % (self.namespace, self.attrib) - - def xpath(self): - path = self.selector.xpath() - attrib = self._xpath_attrib() - value = self.value - if self.operator == 'exists': - assert not value - path.add_condition(attrib) - elif self.operator == '=': - path.add_condition('%s = %s' % (attrib, - xpath_literal(value))) - elif self.operator == '!=': - # FIXME: this seems like a weird hack... - if value: - path.add_condition('not(%s) or %s != %s' - % (attrib, attrib, xpath_literal(value))) - else: - path.add_condition('%s != %s' - % (attrib, xpath_literal(value))) - #path.add_condition('%s != %s' % (attrib, xpath_literal(value))) - elif self.operator == '~=': - path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' '))) - elif self.operator == '|=': - # Weird, but true... - path.add_condition('%s = %s or starts-with(%s, %s)' % ( - attrib, xpath_literal(value), - attrib, xpath_literal(value + '-'))) - elif self.operator == '^=': - path.add_condition('starts-with(%s, %s)' % ( - attrib, xpath_literal(value))) - elif self.operator == '$=': - # Oddly there is a starts-with in XPath 1.0, but not ends-with - path.add_condition('substring(%s, string-length(%s)-%s) = %s' - % (attrib, attrib, len(value)-1, xpath_literal(value))) - elif self.operator == '*=': - # FIXME: case sensitive? 
- path.add_condition('contains(%s, %s)' % ( - attrib, xpath_literal(value))) - else: - assert 0, ("Unknown operator: %r" % self.operator) - return path - -class Element(object): - """ - Represents namespace|element - """ - - def __init__(self, namespace, element): - self.namespace = namespace - self.element = element - - def __repr__(self): - return '%s[%s]' % ( - self.__class__.__name__, - self._format_element()) - - def _format_element(self): - if self.namespace == '*': - return self.element - else: - return '%s|%s' % (self.namespace, self.element) - - def xpath(self): - if self.namespace == '*': - el = self.element.lower() - else: - # Kovid: Lowercased - el = '%s:%s' % (self.namespace, self.element.lower()) - return XPathExpr(element=el) - -class Hash(object): - """ - Represents selector#id - """ - - def __init__(self, selector, id): - self.selector = selector - self.id = id - - def __repr__(self): - return '%s[%r#%s]' % ( - self.__class__.__name__, - self.selector, self.id) - - def xpath(self): - path = self.selector.xpath() - path.add_condition('@id = %s' % xpath_literal(self.id)) - return path - -class Or(object): - - def __init__(self, items): - self.items = items - def __repr__(self): - return '%s(%r)' % ( - self.__class__.__name__, - self.items) - - def xpath(self): - paths = [item.xpath() for item in self.items] - return XPathExprOr(paths) - -class CombinedSelector(object): - - _method_mapping = { - ' ': 'descendant', - '>': 'child', - '+': 'direct_adjacent', - '~': 'indirect_adjacent', - } - - def __init__(self, selector, combinator, subselector): - assert selector is not None - self.selector = selector - self.combinator = combinator - self.subselector = subselector - - def __repr__(self): - if self.combinator == ' ': - comb = '' - else: - comb = self.combinator - return '%s[%r %s %r]' % ( - self.__class__.__name__, - self.selector, - comb, - self.subselector) - - def xpath(self): - if self.combinator not in self._method_mapping: - raise ExpressionError( - "Unknown combinator: %r" % self.combinator) - method = '_xpath_' + self._method_mapping[self.combinator] - method = getattr(self, method) - path = self.selector.xpath() - return method(path, self.subselector) - - def _xpath_descendant(self, xpath, sub): - # when sub is a descendant in any way of xpath - xpath.join('/descendant::', sub.xpath()) - return xpath - - def _xpath_child(self, xpath, sub): - # when sub is an immediate child of xpath - xpath.join('/', sub.xpath()) - return xpath - - def _xpath_direct_adjacent(self, xpath, sub): - # when sub immediately follows xpath - xpath.join('/following-sibling::', sub.xpath()) - xpath.add_name_test() - xpath.add_condition('position() = 1') - return xpath - - def _xpath_indirect_adjacent(self, xpath, sub): - # when sub comes somewhere after xpath as a sibling - xpath.join('/following-sibling::', sub.xpath()) - return xpath - -############################## -## XPathExpr objects: - -_el_re = re.compile(r'^\w+\s*$', re.UNICODE) -_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE) -_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE) - -def css_to_xpath_no_case(css_expr, prefix='descendant-or-self::'): - if isinstance(css_expr, _basestring): - match = _el_re.search(css_expr) - if match is not None: - # Kovid: Lowercased - return '%s%s' % (prefix, match.group(0).strip().lower()) - match = _id_re.search(css_expr) - if match is not None: - return "%s%s[@id = '%s']" % ( - prefix, match.group(1) or '*', match.group(2)) - match = _class_re.search(css_expr) - if match is not None: - # 
Kovid: lowercased - return "%s%s[contains(concat(' ', css:lower-case(normalize-space(@class)), ' '), ' %s ')]" % ( - prefix, match.group(1).lower() or '*', match.group(2).lower()) - css_expr = parse(css_expr) - expr = css_expr.xpath() - assert expr is not None, ( - "Got None for xpath expression from %s" % repr(css_expr)) - if prefix: - expr.add_prefix(prefix) - return _unicode(expr) - -class XPathExpr(object): - - def __init__(self, prefix=None, path=None, element='*', condition=None, - star_prefix=False): - self.prefix = prefix - self.path = path - self.element = element - self.condition = condition - self.star_prefix = star_prefix - - def __str__(self): - path = '' - if self.prefix is not None: - path += _unicode(self.prefix) - if self.path is not None: - path += _unicode(self.path) - path += _unicode(self.element) - if self.condition: - path += '[%s]' % self.condition - return path - - def __repr__(self): - return '%s[%s]' % ( - self.__class__.__name__, self) - - def add_condition(self, condition): - if self.condition: - self.condition = '%s and (%s)' % (self.condition, condition) - else: - self.condition = condition - - def add_path(self, part): - if self.path is None: - self.path = self.element - else: - self.path += self.element - self.element = part - - def add_prefix(self, prefix): - if self.prefix: - self.prefix = prefix + self.prefix - else: - self.prefix = prefix - - def add_name_test(self): - if self.element == '*': - # We weren't doing a test anyway - return - self.add_condition("name() = %s" % xpath_literal(self.element)) - self.element = '*' - - def add_star_prefix(self): - """ - Adds a /* prefix if there is no prefix. This is when you need - to keep context's constrained to a single parent. - """ - if self.path: - self.path += '*/' - else: - self.path = '*/' - self.star_prefix = True - - def join(self, combiner, other): - prefix = _unicode(self) - prefix += combiner - path = (other.prefix or '') + (other.path or '') - # We don't need a star prefix if we are joining to this other - # prefix; so we'll get rid of it - if other.star_prefix and path == '*/': - path = '' - self.prefix = prefix - self.path = path - self.element = other.element - self.condition = other.condition - -class XPathExprOr(XPathExpr): - """ - Represents |'d expressions. Note that unfortunately it isn't - the union, it's the sum, so duplicate elements will appear. - """ - - def __init__(self, items, prefix=None): - for item in items: - assert item is not None - self.items = items - self.prefix = prefix - - def __str__(self): - prefix = self.prefix or '' - return ' | '.join(["%s%s" % (prefix,i) for i in self.items]) - -split_at_single_quotes = re.compile("('+)").split - -def xpath_literal(s): - if isinstance(s, Element): - # This is probably a symbol that looks like an expression... 
- s = s._format_element() - else: - s = _unicode(s) - if "'" not in s: - s = "'%s'" % s - elif '"' not in s: - s = '"%s"' % s - else: - s = "concat(%s)" % ','.join([ - (("'" in part) and '"%s"' or "'%s'") % part - for part in split_at_single_quotes(s) if part - ]) - return s - -############################## -## Parsing functions - -def parse(string): - stream = TokenStream(tokenize(string)) - stream.source = string - try: - return parse_selector_group(stream) - except SelectorSyntaxError: - import sys - e = sys.exc_info()[1] - message = "%s at %s -> %r" % ( - e, stream.used, stream.peek()) - e.msg = message - if sys.version_info < (2,6): - e.message = message - e.args = tuple([message]) - raise - -def parse_selector_group(stream): - result = [] - while 1: - result.append(parse_selector(stream)) - if stream.peek() == ',': - stream.next() - else: - break - if len(result) == 1: - return result[0] - else: - return Or(result) - -def parse_selector(stream): - result = parse_simple_selector(stream) - while 1: - peek = stream.peek() - if peek == ',' or peek is None: - return result - elif peek in ('+', '>', '~'): - # A combinator - combinator = stream.next() - else: - combinator = ' ' - consumed = len(stream.used) - next_selector = parse_simple_selector(stream) - if consumed == len(stream.used): - raise SelectorSyntaxError( - "Expected selector, got '%s'" % stream.peek()) - result = CombinedSelector(result, combinator, next_selector) - return result - -def parse_simple_selector(stream): - peek = stream.peek() - if peek != '*' and not isinstance(peek, Symbol): - element = namespace = '*' - else: - next = stream.next() - if next != '*' and not isinstance(next, Symbol): - raise SelectorSyntaxError( - "Expected symbol, got '%s'" % next) - if stream.peek() == '|': - namespace = next - stream.next() - element = stream.next() - if element != '*' and not isinstance(next, Symbol): - raise SelectorSyntaxError( - "Expected symbol, got '%s'" % next) - else: - namespace = '*' - element = next - result = Element(namespace, element) - has_hash = False - while 1: - peek = stream.peek() - if peek == '#': - if has_hash: - # You can't have two hashes - # (FIXME: is there some more general rule I'm missing?) - break - stream.next() - result = Hash(result, stream.next()) - has_hash = True - continue - elif peek == '.': - stream.next() - result = Class(result, stream.next()) - continue - elif peek == '[': - stream.next() - result = parse_attrib(result, stream) - next = stream.next() - if not next == ']': - raise SelectorSyntaxError( - "] expected, got '%s'" % next) - continue - elif peek == ':' or peek == '::': - type = stream.next() - ident = stream.next() - if not isinstance(ident, Symbol): - raise SelectorSyntaxError( - "Expected symbol, got '%s'" % ident) - if stream.peek() == '(': - stream.next() - peek = stream.peek() - if isinstance(peek, String): - selector = stream.next() - elif isinstance(peek, Symbol) and is_int(peek): - selector = int(stream.next()) - else: - # FIXME: parse_simple_selector, or selector, or...? 
- selector = parse_simple_selector(stream) - next = stream.next() - if not next == ')': - raise SelectorSyntaxError( - "Expected ')', got '%s' and '%s'" - % (next, selector)) - result = Function(result, type, ident, selector) - else: - result = Pseudo(result, type, ident) - continue - else: - if peek == ' ': - stream.next() - break - # FIXME: not sure what "negation" is - return result - -def is_int(v): - try: - int(v) - except ValueError: - return False - else: - return True - -def parse_attrib(selector, stream): - attrib = stream.next() - if stream.peek() == '|': - namespace = attrib - stream.next() - attrib = stream.next() - else: - namespace = '*' - if stream.peek() == ']': - return Attrib(selector, namespace, attrib, 'exists', None) - op = stream.next() - if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): - raise SelectorSyntaxError( - "Operator expected, got '%s'" % op) - value = stream.next() - if not isinstance(value, (Symbol, String)): - raise SelectorSyntaxError( - "Expected string or symbol, got '%s'" % value) - return Attrib(selector, namespace, attrib, op, value) - -def parse_series(s): - """ - Parses things like '1n+2', or 'an+b' generally, returning (a, b) - """ - if isinstance(s, Element): - s = s._format_element() - if not s or s == '*': - # Happens when there's nothing, which the CSS parser thinks of as * - return (0, 0) - if isinstance(s, int): - # Happens when you just get a number - return (0, s) - if s == 'odd': - return (2, 1) - elif s == 'even': - return (2, 0) - elif s == 'n': - return (1, 0) - if 'n' not in s: - # Just a b - return (0, int(s)) - a, b = s.split('n', 1) - if not a: - a = 1 - elif a == '-' or a == '+': - a = int(a+'1') - else: - a = int(a) - if not b: - b = 0 - elif b == '-' or b == '+': - b = int(b+'1') - else: - b = int(b) - return (a, b) - - -############################################################ -## Tokenizing -############################################################ - -_match_whitespace = re.compile(r'\s+', re.UNICODE).match - -_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub - -_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match - -def tokenize(s): - pos = 0 - s = _replace_comments('', s) - while 1: - match = _match_whitespace(s, pos=pos) - if match: - preceding_whitespace_pos = pos - pos = match.end() - else: - preceding_whitespace_pos = 0 - if pos >= len(s): - return - match = _match_count_number(s, pos=pos) - if match and match.group() != 'n': - sym = s[pos:match.end()] - yield Symbol(sym, pos) - pos = match.end() - continue - c = s[pos] - c2 = s[pos:pos+2] - if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): - yield Token(c2, pos) - pos += 2 - continue - if c in '>+~,.*=[]()|:#': - if c in '.#[' and preceding_whitespace_pos > 0: - yield Token(' ', preceding_whitespace_pos) - yield Token(c, pos) - pos += 1 - continue - if c == '"' or c == "'": - # Quoted string - old_pos = pos - sym, pos = tokenize_escaped_string(s, pos) - yield String(sym, old_pos) - continue - old_pos = pos - sym, pos = tokenize_symbol(s, pos) - yield Symbol(sym, old_pos) - continue - -split_at_string_escapes = re.compile(r'(\\(?:%s))' - % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', - '[^A-Fa-f0-9]'])).split - -def unescape_string_literal(literal): - substrings = [] - for substring in split_at_string_escapes(literal): - if not substring: - continue - elif '\\' in substring: - if substring[0] == '\\' and len(substring) > 1: - substring = substring[1:] - if substring[0] in '0123456789ABCDEFabcdef': - # int() correctly ignores the 
potentially trailing whitespace - substring = _unichr(int(substring, 16)) - else: - raise SelectorSyntaxError( - "Invalid escape sequence %r in string %r" - % (substring.split('\\')[1], literal)) - substrings.append(substring) - return ''.join(substrings) - -def tokenize_escaped_string(s, pos): - quote = s[pos] - assert quote in ('"', "'") - pos = pos+1 - start = pos - while 1: - next = s.find(quote, pos) - if next == -1: - raise SelectorSyntaxError( - "Expected closing %s for string in: %r" - % (quote, s[start:])) - result = s[start:next] - if result.endswith('\\'): - # next quote character is escaped - pos = next+1 - continue - if '\\' in result: - result = unescape_string_literal(result) - return result, next+1 - -_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) - -def tokenize_symbol(s, pos): - start = pos - match = _illegal_symbol.search(s, pos=pos) - if not match: - # Goes to end of s - return s[start:], len(s) - if match.start() == pos: - assert 0, ( - "Unexpected symbol: %r at %s" % (s[pos], pos)) - if not match: - result = s[start:] - pos = len(s) - else: - result = s[start:match.start()] - pos = match.start() - try: - result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') - except UnicodeDecodeError: - import sys - e = sys.exc_info()[1] - raise SelectorSyntaxError( - "Bad symbol %r: %s" % (result, e)) - return result, pos - -class TokenStream(object): - - def __init__(self, tokens, source=None): - self.used = [] - self.tokens = iter(tokens) - self.source = source - self.peeked = None - self._peeking = False - try: - self.next_token = self.tokens.next - except AttributeError: - # Python 3 - self.next_token = self.tokens.__next__ - - def next(self): - if self._peeking: - self._peeking = False - self.used.append(self.peeked) - return self.peeked - else: - try: - next = self.next_token() - self.used.append(next) - return next - except StopIteration: - return None - - def __iter__(self): - return iter(self.next, None) - - def peek(self): - if not self._peeking: - try: - self.peeked = self.next_token() - except StopIteration: - return None - self._peeking = True - return self.peeked diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index ba2bd01c3c..8592392d93 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -504,6 +504,9 @@ class Indexer(object): # {{{ else: self.indices = self.create_book_index() + if not self.indices: + raise ValueError('No valid entries in TOC, cannot generate index') + self.records.append(self.create_index_record()) self.records.insert(0, self.create_header()) self.records.extend(self.cncx.records) diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 5e4f389262..f6ff594701 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -27,7 +27,6 @@ from calibre import force_unicode from calibre.ebooks import unit_convert from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize -from calibre.ebooks.cssselect import css_to_xpath_no_case cssutils_log.setLevel(logging.WARN) @@ -99,71 +98,32 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large']) -class CSSSelector(object): - +class CSSSelector(etree.XPath): + MIN_SPACE_RE = re.compile(r' *([>~+]) *') LOCAL_NAME_RE = re.compile(r"(?' 
% ( self.__class__.__name__, hex(abs(id(self)))[2:], self.css) -_selector_cache = {} - -MIN_SPACE_RE = re.compile(r' *([>~+]) *') - -def get_css_selector(raw_selector): - css = MIN_SPACE_RE.sub(r'\1', raw_selector) - if isinstance(css, unicode): - # Workaround for bug in lxml on windows/OS X that causes a massive - # memory leak with non ASCII selectors - css = css.encode('ascii', 'ignore').decode('ascii') - ans = _selector_cache.get(css, None) - if ans is None: - ans = CSSSelector(css) - _selector_cache[css] = ans - return ans class Stylizer(object): STYLESHEETS = WeakKeyDictionary() @@ -263,12 +223,41 @@ class Stylizer(object): rules.sort() self.rules = rules self._styles = {} + class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE) + capital_sel_pat = re.compile(r'h|[A-Z]+') for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') - selector = get_css_selector(text) - matches = selector(tree, self.logger) + try: + selector = CSSSelector(text) + except (AssertionError, ExpressionError, etree.XPathSyntaxError, + NameError, # thrown on OS X instead of SelectorSyntaxError + SelectorSyntaxError): + continue + try: + matches = selector(tree) + except etree.XPathEvalError: + continue + + if not matches: + ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text) + if ntext != text: + self.logger.warn('Transformed CSS selector', text, 'to', + ntext) + selector = CSSSelector(ntext) + matches = selector(tree) + + if not matches and class_sel_pat.match(text) and text.lower() != text: + found = False + ltext = text.lower() + for x in tree.xpath('//*[@class]'): + if ltext.endswith('.'+x.get('class').lower()): + matches.append(x) + found = True + if found: + self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s' + %(text, item.href)) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) From 6a4bfa920c78019f0fd420e0252daf06423865da Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 15:26:36 -0600 Subject: [PATCH 37/39] Restore CSS pipeline changes without needing to use a python xpath func --- src/calibre/ebooks/cssselect.py | 1012 ++++++++++++++++++++++++++++ src/calibre/ebooks/oeb/stylizer.py | 93 +-- 2 files changed, 1064 insertions(+), 41 deletions(-) create mode 100644 src/calibre/ebooks/cssselect.py diff --git a/src/calibre/ebooks/cssselect.py b/src/calibre/ebooks/cssselect.py new file mode 100644 index 0000000000..1c2bfcc4fa --- /dev/null +++ b/src/calibre/ebooks/cssselect.py @@ -0,0 +1,1012 @@ +"""CSS Selectors based on XPath. + +This module supports selecting XML/HTML tags based on CSS selectors. +See the `CSSSelector` class for details. +""" + +import re +from lxml import etree + +__all__ = ['SelectorSyntaxError', 'ExpressionError', + 'CSSSelector'] + +try: + _basestring = basestring +except NameError: + _basestring = str + +class SelectorSyntaxError(SyntaxError): + pass + +class ExpressionError(RuntimeError): + pass + +class CSSSelector(etree.XPath): + """A CSS selector. + + Usage:: + + >>> from lxml import etree, cssselect + >>> select = cssselect.CSSSelector("a tag > child") + + >>> root = etree.XML("TEXT") + >>> [ el.tag for el in select(root) ] + ['child'] + + To use CSS namespaces, you need to pass a prefix-to-namespace + mapping as ``namespaces`` keyword argument:: + + >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + >>> select_ns = cssselect.CSSSelector('root > rdf|Description', + ... namespaces={'rdf': rdfns}) + + >>> rdf = etree.XML(( + ... '' + ... 
'blah' + ... '') % rdfns) + >>> [(el.tag, el.text) for el in select_ns(rdf)] + [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] + """ + def __init__(self, css, namespaces=None): + path = css_to_xpath_no_case(css) + etree.XPath.__init__(self, path, namespaces=namespaces) + self.css = css + + def __repr__(self): + return '<%s %s for %r>' % ( + self.__class__.__name__, + hex(abs(id(self)))[2:], + self.css) + +############################## +## Token objects: + +try: + _unicode = unicode + _unichr = unichr +except NameError: + # Python 3 + _unicode = str + _unichr = chr + +class _UniToken(_unicode): + def __new__(cls, contents, pos): + obj = _unicode.__new__(cls, contents) + obj.pos = pos + return obj + + def __repr__(self): + return '%s(%s, %r)' % ( + self.__class__.__name__, + _unicode.__repr__(self), + self.pos) + +class Symbol(_UniToken): + pass + +class String(_UniToken): + pass + +class Token(_UniToken): + pass + +############################################################ +## Parsing +############################################################ + +############################## +## Syntax objects: + +class Class(object): + """ + Represents selector.class_name + """ + + def __init__(self, selector, class_name): + self.selector = selector + # Kovid: Lowercased + self.class_name = class_name.lower() + + def __repr__(self): + return '%s[%r.%s]' % ( + self.__class__.__name__, + self.selector, + self.class_name) + + def xpath(self): + sel_xpath = self.selector.xpath() + # Kovid: Lowercased + sel_xpath.add_condition( + "contains(concat(' ', normalize-space(%s), ' '), %s)" % ( + lower_case('@class'), + xpath_literal(' '+self.class_name+' '))) + return sel_xpath + +class Function(object): + """ + Represents selector:name(expr) + """ + + unsupported = [ + 'target', 'lang', 'enabled', 'disabled',] + + def __init__(self, selector, type, name, expr): + self.selector = selector + self.type = type + self.name = name + self.expr = expr + + def __repr__(self): + return '%s[%r%s%s(%r)]' % ( + self.__class__.__name__, + self.selector, + self.type, self.name, self.expr) + + def xpath(self): + sel_path = self.selector.xpath() + if self.name in self.unsupported: + raise ExpressionError( + "The pseudo-class %r is not supported" % self.name) + method = '_xpath_' + self.name.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The pseudo-class %r is unknown" % self.name) + method = getattr(self, method) + return method(sel_path, self.expr) + + def _xpath_nth_child(self, xpath, expr, last=False, + add_name_test=True): + a, b = parse_series(expr) + if not a and not b and not last: + # a=0 means nothing is returned... 
+ xpath.add_condition('false() and position() = 0') + return xpath + if add_name_test: + xpath.add_name_test() + xpath.add_star_prefix() + if a == 0: + if last: + b = 'last() - %s' % b + xpath.add_condition('position() = %s' % b) + return xpath + if last: + # FIXME: I'm not sure if this is right + a = -a + b = -b + if b > 0: + b_neg = str(-b) + else: + b_neg = '+%s' % (-b) + if a != 1: + expr = ['(position() %s) mod %s = 0' % (b_neg, a)] + else: + expr = [] + if b >= 0: + expr.append('position() >= %s' % b) + elif b < 0 and last: + expr.append('position() < (last() %s)' % b) + expr = ' and '.join(expr) + if expr: + xpath.add_condition(expr) + return xpath + # FIXME: handle an+b, odd, even + # an+b means every-a, plus b, e.g., 2n+1 means odd + # 0n+b means b + # n+0 means a=1, i.e., all elements + # an means every a elements, i.e., 2n means even + # -n means -1n + # -1n+6 means elements 6 and previous + + def _xpath_nth_last_child(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, last=True) + + def _xpath_nth_of_type(self, xpath, expr): + if xpath.element == '*': + raise NotImplementedError( + "*:nth-of-type() is not implemented") + return self._xpath_nth_child(xpath, expr, add_name_test=False) + + def _xpath_nth_last_of_type(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False) + + def _xpath_contains(self, xpath, expr): + # text content, minus tags, must contain expr + if isinstance(expr, Element): + expr = expr._format_element() + # Kovid: Use ASCII lower case that works + xpath.add_condition('contains(%s), %s)' % ( + lower_case('string(.)'), + xpath_literal(expr.lower()))) + return xpath + + def _xpath_not(self, xpath, expr): + # everything for which not expr applies + expr = expr.xpath() + cond = expr.condition + # FIXME: should I do something about element_path? + xpath.add_condition('not(%s)' % cond) + return xpath + +# Kovid: Python functions dont work in lxml, so use translate() +# instead of the python lowercase function +def lower_case(arg): + 'An ASCII lowercase function' + return ("translate(%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " + "'abcdefghijklmnopqrstuvwxyz')")%arg + +class Pseudo(object): + """ + Represents selector:ident + """ + + unsupported = ['indeterminate', 'first-line', 'first-letter', + 'selection', 'before', 'after', 'link', 'visited', + 'active', 'focus', 'hover'] + + def __init__(self, element, type, ident): + self.element = element + assert type in (':', '::') + self.type = type + self.ident = ident + + def __repr__(self): + return '%s[%r%s%s]' % ( + self.__class__.__name__, + self.element, + self.type, self.ident) + + def xpath(self): + el_xpath = self.element.xpath() + if self.ident in self.unsupported: + raise ExpressionError( + "The pseudo-class %r is unsupported" % self.ident) + method = '_xpath_' + self.ident.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The pseudo-class %r is unknown" % self.ident) + method = getattr(self, method) + el_xpath = method(el_xpath) + return el_xpath + + def _xpath_checked(self, xpath): + # FIXME: is this really all the elements? + xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) 
= 'option')") + return xpath + + def _xpath_root(self, xpath): + # if this element is the root element + raise NotImplementedError + + def _xpath_first_child(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_last_child(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + xpath.add_condition('position() = last()') + return xpath + + def _xpath_first_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:first-of-type is not implemented") + xpath.add_star_prefix() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_last_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:last-of-type is not implemented") + xpath.add_star_prefix() + xpath.add_condition('position() = last()') + return xpath + + def _xpath_only_child(self, xpath): + xpath.add_name_test() + xpath.add_star_prefix() + xpath.add_condition('last() = 1') + return xpath + + def _xpath_only_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:only-of-type is not implemented") + xpath.add_condition('last() = 1') + return xpath + + def _xpath_empty(self, xpath): + xpath.add_condition("not(*) and not(normalize-space())") + return xpath + +class Attrib(object): + """ + Represents selector[namespace|attrib operator value] + """ + + def __init__(self, selector, namespace, attrib, operator, value): + self.selector = selector + self.namespace = namespace + self.attrib = attrib + self.operator = operator + self.value = value + + def __repr__(self): + if self.operator == 'exists': + return '%s[%r[%s]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib()) + else: + return '%s[%r[%s %s %r]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib(), + self.operator, + self.value) + + def _format_attrib(self): + if self.namespace == '*': + return self.attrib + else: + return '%s|%s' % (self.namespace, self.attrib) + + def _xpath_attrib(self): + # FIXME: if attrib is *? + if self.namespace == '*': + return '@' + self.attrib + else: + return '@%s:%s' % (self.namespace, self.attrib) + + def xpath(self): + path = self.selector.xpath() + attrib = self._xpath_attrib() + value = self.value + if self.operator == 'exists': + assert not value + path.add_condition(attrib) + elif self.operator == '=': + path.add_condition('%s = %s' % (attrib, + xpath_literal(value))) + elif self.operator == '!=': + # FIXME: this seems like a weird hack... + if value: + path.add_condition('not(%s) or %s != %s' + % (attrib, attrib, xpath_literal(value))) + else: + path.add_condition('%s != %s' + % (attrib, xpath_literal(value))) + #path.add_condition('%s != %s' % (attrib, xpath_literal(value))) + elif self.operator == '~=': + path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' '))) + elif self.operator == '|=': + # Weird, but true... + path.add_condition('%s = %s or starts-with(%s, %s)' % ( + attrib, xpath_literal(value), + attrib, xpath_literal(value + '-'))) + elif self.operator == '^=': + path.add_condition('starts-with(%s, %s)' % ( + attrib, xpath_literal(value))) + elif self.operator == '$=': + # Oddly there is a starts-with in XPath 1.0, but not ends-with + path.add_condition('substring(%s, string-length(%s)-%s) = %s' + % (attrib, attrib, len(value)-1, xpath_literal(value))) + elif self.operator == '*=': + # FIXME: case sensitive? 
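+ # note: XPath contains() is case sensitive, so '*=' substring matches keep
+ # their original case; only element names, class names and :contains() text
+ # are lowercased elsewhere in this module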
+ path.add_condition('contains(%s, %s)' % ( + attrib, xpath_literal(value))) + else: + assert 0, ("Unknown operator: %r" % self.operator) + return path + +class Element(object): + """ + Represents namespace|element + """ + + def __init__(self, namespace, element): + self.namespace = namespace + self.element = element + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, + self._format_element()) + + def _format_element(self): + if self.namespace == '*': + return self.element + else: + return '%s|%s' % (self.namespace, self.element) + + def xpath(self): + if self.namespace == '*': + el = self.element.lower() + else: + # Kovid: Lowercased + el = '%s:%s' % (self.namespace, self.element.lower()) + return XPathExpr(element=el) + +class Hash(object): + """ + Represents selector#id + """ + + def __init__(self, selector, id): + self.selector = selector + self.id = id + + def __repr__(self): + return '%s[%r#%s]' % ( + self.__class__.__name__, + self.selector, self.id) + + def xpath(self): + path = self.selector.xpath() + path.add_condition('@id = %s' % xpath_literal(self.id)) + return path + +class Or(object): + + def __init__(self, items): + self.items = items + def __repr__(self): + return '%s(%r)' % ( + self.__class__.__name__, + self.items) + + def xpath(self): + paths = [item.xpath() for item in self.items] + return XPathExprOr(paths) + +class CombinedSelector(object): + + _method_mapping = { + ' ': 'descendant', + '>': 'child', + '+': 'direct_adjacent', + '~': 'indirect_adjacent', + } + + def __init__(self, selector, combinator, subselector): + assert selector is not None + self.selector = selector + self.combinator = combinator + self.subselector = subselector + + def __repr__(self): + if self.combinator == ' ': + comb = '' + else: + comb = self.combinator + return '%s[%r %s %r]' % ( + self.__class__.__name__, + self.selector, + comb, + self.subselector) + + def xpath(self): + if self.combinator not in self._method_mapping: + raise ExpressionError( + "Unknown combinator: %r" % self.combinator) + method = '_xpath_' + self._method_mapping[self.combinator] + method = getattr(self, method) + path = self.selector.xpath() + return method(path, self.subselector) + + def _xpath_descendant(self, xpath, sub): + # when sub is a descendant in any way of xpath + xpath.join('/descendant::', sub.xpath()) + return xpath + + def _xpath_child(self, xpath, sub): + # when sub is an immediate child of xpath + xpath.join('/', sub.xpath()) + return xpath + + def _xpath_direct_adjacent(self, xpath, sub): + # when sub immediately follows xpath + xpath.join('/following-sibling::', sub.xpath()) + xpath.add_name_test() + xpath.add_condition('position() = 1') + return xpath + + def _xpath_indirect_adjacent(self, xpath, sub): + # when sub comes somewhere after xpath as a sibling + xpath.join('/following-sibling::', sub.xpath()) + return xpath + +############################## +## XPathExpr objects: + +_el_re = re.compile(r'^\w+\s*$', re.UNICODE) +_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE) +_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE) + + +def css_to_xpath_no_case(css_expr, prefix='descendant-or-self::'): + if isinstance(css_expr, _basestring): + match = _el_re.search(css_expr) + if match is not None: + # Kovid: Lowercased + return '%s%s' % (prefix, match.group(0).strip().lower()) + match = _id_re.search(css_expr) + if match is not None: + return "%s%s[@id = '%s']" % ( + prefix, match.group(1) or '*', match.group(2)) + match = _class_re.search(css_expr) + if match is not None: + # 
Kovid: lowercased + return "%s%s[contains(concat(' ', normalize-space(%s), ' '), ' %s ')]" % ( + prefix, match.group(1).lower() or '*', + lower_case('@class'), match.group(2).lower()) + css_expr = parse(css_expr) + expr = css_expr.xpath() + assert expr is not None, ( + "Got None for xpath expression from %s" % repr(css_expr)) + if prefix: + expr.add_prefix(prefix) + return _unicode(expr) + +class XPathExpr(object): + + def __init__(self, prefix=None, path=None, element='*', condition=None, + star_prefix=False): + self.prefix = prefix + self.path = path + self.element = element + self.condition = condition + self.star_prefix = star_prefix + + def __str__(self): + path = '' + if self.prefix is not None: + path += _unicode(self.prefix) + if self.path is not None: + path += _unicode(self.path) + path += _unicode(self.element) + if self.condition: + path += '[%s]' % self.condition + return path + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, self) + + def add_condition(self, condition): + if self.condition: + self.condition = '%s and (%s)' % (self.condition, condition) + else: + self.condition = condition + + def add_path(self, part): + if self.path is None: + self.path = self.element + else: + self.path += self.element + self.element = part + + def add_prefix(self, prefix): + if self.prefix: + self.prefix = prefix + self.prefix + else: + self.prefix = prefix + + def add_name_test(self): + if self.element == '*': + # We weren't doing a test anyway + return + self.add_condition("name() = %s" % xpath_literal(self.element)) + self.element = '*' + + def add_star_prefix(self): + """ + Adds a /* prefix if there is no prefix. This is when you need + to keep context's constrained to a single parent. + """ + if self.path: + self.path += '*/' + else: + self.path = '*/' + self.star_prefix = True + + def join(self, combiner, other): + prefix = _unicode(self) + prefix += combiner + path = (other.prefix or '') + (other.path or '') + # We don't need a star prefix if we are joining to this other + # prefix; so we'll get rid of it + if other.star_prefix and path == '*/': + path = '' + self.prefix = prefix + self.path = path + self.element = other.element + self.condition = other.condition + +class XPathExprOr(XPathExpr): + """ + Represents |'d expressions. Note that unfortunately it isn't + the union, it's the sum, so duplicate elements will appear. + """ + + def __init__(self, items, prefix=None): + for item in items: + assert item is not None + self.items = items + self.prefix = prefix + + def __str__(self): + prefix = self.prefix or '' + return ' | '.join(["%s%s" % (prefix,i) for i in self.items]) + +split_at_single_quotes = re.compile("('+)").split + +def xpath_literal(s): + if isinstance(s, Element): + # This is probably a symbol that looks like an expression... 
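+ # fall back to its printable form, which is then quoted like any other string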
+ s = s._format_element() + else: + s = _unicode(s) + if "'" not in s: + s = "'%s'" % s + elif '"' not in s: + s = '"%s"' % s + else: + s = "concat(%s)" % ','.join([ + (("'" in part) and '"%s"' or "'%s'") % part + for part in split_at_single_quotes(s) if part + ]) + return s + +############################## +## Parsing functions + +def parse(string): + stream = TokenStream(tokenize(string)) + stream.source = string + try: + return parse_selector_group(stream) + except SelectorSyntaxError: + import sys + e = sys.exc_info()[1] + message = "%s at %s -> %r" % ( + e, stream.used, stream.peek()) + e.msg = message + if sys.version_info < (2,6): + e.message = message + e.args = tuple([message]) + raise + +def parse_selector_group(stream): + result = [] + while 1: + result.append(parse_selector(stream)) + if stream.peek() == ',': + stream.next() + else: + break + if len(result) == 1: + return result[0] + else: + return Or(result) + +def parse_selector(stream): + result = parse_simple_selector(stream) + while 1: + peek = stream.peek() + if peek == ',' or peek is None: + return result + elif peek in ('+', '>', '~'): + # A combinator + combinator = stream.next() + else: + combinator = ' ' + consumed = len(stream.used) + next_selector = parse_simple_selector(stream) + if consumed == len(stream.used): + raise SelectorSyntaxError( + "Expected selector, got '%s'" % stream.peek()) + result = CombinedSelector(result, combinator, next_selector) + return result + +def parse_simple_selector(stream): + peek = stream.peek() + if peek != '*' and not isinstance(peek, Symbol): + element = namespace = '*' + else: + next = stream.next() + if next != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % next) + if stream.peek() == '|': + namespace = next + stream.next() + element = stream.next() + if element != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % next) + else: + namespace = '*' + element = next + result = Element(namespace, element) + has_hash = False + while 1: + peek = stream.peek() + if peek == '#': + if has_hash: + # You can't have two hashes + # (FIXME: is there some more general rule I'm missing?) + break + stream.next() + result = Hash(result, stream.next()) + has_hash = True + continue + elif peek == '.': + stream.next() + result = Class(result, stream.next()) + continue + elif peek == '[': + stream.next() + result = parse_attrib(result, stream) + next = stream.next() + if not next == ']': + raise SelectorSyntaxError( + "] expected, got '%s'" % next) + continue + elif peek == ':' or peek == '::': + type = stream.next() + ident = stream.next() + if not isinstance(ident, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got '%s'" % ident) + if stream.peek() == '(': + stream.next() + peek = stream.peek() + if isinstance(peek, String): + selector = stream.next() + elif isinstance(peek, Symbol) and is_int(peek): + selector = int(stream.next()) + else: + # FIXME: parse_simple_selector, or selector, or...? 
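+ # e.g. ':not(div.foo)' -- the argument is itself parsed as a simple selector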
+ selector = parse_simple_selector(stream) + next = stream.next() + if not next == ')': + raise SelectorSyntaxError( + "Expected ')', got '%s' and '%s'" + % (next, selector)) + result = Function(result, type, ident, selector) + else: + result = Pseudo(result, type, ident) + continue + else: + if peek == ' ': + stream.next() + break + # FIXME: not sure what "negation" is + return result + +def is_int(v): + try: + int(v) + except ValueError: + return False + else: + return True + +def parse_attrib(selector, stream): + attrib = stream.next() + if stream.peek() == '|': + namespace = attrib + stream.next() + attrib = stream.next() + else: + namespace = '*' + if stream.peek() == ']': + return Attrib(selector, namespace, attrib, 'exists', None) + op = stream.next() + if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): + raise SelectorSyntaxError( + "Operator expected, got '%s'" % op) + value = stream.next() + if not isinstance(value, (Symbol, String)): + raise SelectorSyntaxError( + "Expected string or symbol, got '%s'" % value) + return Attrib(selector, namespace, attrib, op, value) + +def parse_series(s): + """ + Parses things like '1n+2', or 'an+b' generally, returning (a, b) + """ + if isinstance(s, Element): + s = s._format_element() + if not s or s == '*': + # Happens when there's nothing, which the CSS parser thinks of as * + return (0, 0) + if isinstance(s, int): + # Happens when you just get a number + return (0, s) + if s == 'odd': + return (2, 1) + elif s == 'even': + return (2, 0) + elif s == 'n': + return (1, 0) + if 'n' not in s: + # Just a b + return (0, int(s)) + a, b = s.split('n', 1) + if not a: + a = 1 + elif a == '-' or a == '+': + a = int(a+'1') + else: + a = int(a) + if not b: + b = 0 + elif b == '-' or b == '+': + b = int(b+'1') + else: + b = int(b) + return (a, b) + + +############################################################ +## Tokenizing +############################################################ + +_match_whitespace = re.compile(r'\s+', re.UNICODE).match + +_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub + +_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match + +def tokenize(s): + pos = 0 + s = _replace_comments('', s) + while 1: + match = _match_whitespace(s, pos=pos) + if match: + preceding_whitespace_pos = pos + pos = match.end() + else: + preceding_whitespace_pos = 0 + if pos >= len(s): + return + match = _match_count_number(s, pos=pos) + if match and match.group() != 'n': + sym = s[pos:match.end()] + yield Symbol(sym, pos) + pos = match.end() + continue + c = s[pos] + c2 = s[pos:pos+2] + if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): + yield Token(c2, pos) + pos += 2 + continue + if c in '>+~,.*=[]()|:#': + if c in '.#[' and preceding_whitespace_pos > 0: + yield Token(' ', preceding_whitespace_pos) + yield Token(c, pos) + pos += 1 + continue + if c == '"' or c == "'": + # Quoted string + old_pos = pos + sym, pos = tokenize_escaped_string(s, pos) + yield String(sym, old_pos) + continue + old_pos = pos + sym, pos = tokenize_symbol(s, pos) + yield Symbol(sym, old_pos) + continue + +split_at_string_escapes = re.compile(r'(\\(?:%s))' + % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', + '[^A-Fa-f0-9]'])).split + +def unescape_string_literal(literal): + substrings = [] + for substring in split_at_string_escapes(literal): + if not substring: + continue + elif '\\' in substring: + if substring[0] == '\\' and len(substring) > 1: + substring = substring[1:] + if substring[0] in '0123456789ABCDEFabcdef': + # int() correctly ignores the 
potentially trailing whitespace + substring = _unichr(int(substring, 16)) + else: + raise SelectorSyntaxError( + "Invalid escape sequence %r in string %r" + % (substring.split('\\')[1], literal)) + substrings.append(substring) + return ''.join(substrings) + +def tokenize_escaped_string(s, pos): + quote = s[pos] + assert quote in ('"', "'") + pos = pos+1 + start = pos + while 1: + next = s.find(quote, pos) + if next == -1: + raise SelectorSyntaxError( + "Expected closing %s for string in: %r" + % (quote, s[start:])) + result = s[start:next] + if result.endswith('\\'): + # next quote character is escaped + pos = next+1 + continue + if '\\' in result: + result = unescape_string_literal(result) + return result, next+1 + +_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) + +def tokenize_symbol(s, pos): + start = pos + match = _illegal_symbol.search(s, pos=pos) + if not match: + # Goes to end of s + return s[start:], len(s) + if match.start() == pos: + assert 0, ( + "Unexpected symbol: %r at %s" % (s[pos], pos)) + if not match: + result = s[start:] + pos = len(s) + else: + result = s[start:match.start()] + pos = match.start() + try: + result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') + except UnicodeDecodeError: + import sys + e = sys.exc_info()[1] + raise SelectorSyntaxError( + "Bad symbol %r: %s" % (result, e)) + return result, pos + +class TokenStream(object): + + def __init__(self, tokens, source=None): + self.used = [] + self.tokens = iter(tokens) + self.source = source + self.peeked = None + self._peeking = False + try: + self.next_token = self.tokens.next + except AttributeError: + # Python 3 + self.next_token = self.tokens.__next__ + + def next(self): + if self._peeking: + self._peeking = False + self.used.append(self.peeked) + return self.peeked + else: + try: + next = self.next_token() + self.used.append(next) + return next + except StopIteration: + return None + + def __iter__(self): + return iter(self.next, None) + + def peek(self): + if not self._peeking: + try: + self.peeked = self.next_token() + except StopIteration: + return None + self._peeking = True + return self.peeked diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index f6ff594701..5e4f389262 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -27,6 +27,7 @@ from calibre import force_unicode from calibre.ebooks import unit_convert from calibre.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES from calibre.ebooks.oeb.base import XPNSMAP, xpath, urlnormalize +from calibre.ebooks.cssselect import css_to_xpath_no_case cssutils_log.setLevel(logging.WARN) @@ -98,32 +99,71 @@ FONT_SIZE_NAMES = set(['xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large']) -class CSSSelector(etree.XPath): - MIN_SPACE_RE = re.compile(r' *([>~+]) *') +class CSSSelector(object): + LOCAL_NAME_RE = re.compile(r"(?' 
% ( self.__class__.__name__, hex(abs(id(self)))[2:], self.css) +_selector_cache = {} + +MIN_SPACE_RE = re.compile(r' *([>~+]) *') + +def get_css_selector(raw_selector): + css = MIN_SPACE_RE.sub(r'\1', raw_selector) + if isinstance(css, unicode): + # Workaround for bug in lxml on windows/OS X that causes a massive + # memory leak with non ASCII selectors + css = css.encode('ascii', 'ignore').decode('ascii') + ans = _selector_cache.get(css, None) + if ans is None: + ans = CSSSelector(css) + _selector_cache[css] = ans + return ans class Stylizer(object): STYLESHEETS = WeakKeyDictionary() @@ -223,41 +263,12 @@ class Stylizer(object): rules.sort() self.rules = rules self._styles = {} - class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE) - capital_sel_pat = re.compile(r'h|[A-Z]+') for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') - try: - selector = CSSSelector(text) - except (AssertionError, ExpressionError, etree.XPathSyntaxError, - NameError, # thrown on OS X instead of SelectorSyntaxError - SelectorSyntaxError): - continue - try: - matches = selector(tree) - except etree.XPathEvalError: - continue - - if not matches: - ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text) - if ntext != text: - self.logger.warn('Transformed CSS selector', text, 'to', - ntext) - selector = CSSSelector(ntext) - matches = selector(tree) - - if not matches and class_sel_pat.match(text) and text.lower() != text: - found = False - ltext = text.lower() - for x in tree.xpath('//*[@class]'): - if ltext.endswith('.'+x.get('class').lower()): - matches.append(x) - found = True - if found: - self.logger.warn('Ignoring case mismatches for CSS selector: %s in %s' - %(text, item.href)) + selector = get_css_selector(text) + matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) From b5bc1ef8f7c8ba876a4db93120b8d177c84ed3f8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 15:48:14 -0600 Subject: [PATCH 38/39] ... --- src/calibre/ebooks/mobi/writer2/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index ed0e43a303..7e748aac95 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -590,7 +590,7 @@ class MobiWriter(object): Write the PalmDB header ''' title = ascii_filename(unicode(self.oeb.metadata.title[0])).replace( - ' ', '_') + ' ', '_')[:32] title = title + (b'\0' * (32 - len(title))) now = int(time.time()) nrecords = len(self.records) From 8824104847b17328e67a6e369592ca29658131c2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Aug 2011 16:06:48 -0600 Subject: [PATCH 39/39] ... --- src/calibre/ebooks/mobi/writer2/serializer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index 377b29655c..9bbaa436a7 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -116,6 +116,12 @@ class Serializer(object): buf.write(b'') self.end_offset = buf.tell() self.fixup_links() + if self.start_offset is None: + # If we don't set a start offset, the stupid Kindle will + # open the book at the location of the first IndexEntry, which + # could be anywhere. So ensure the book is always opened at the + # beginning, instead. 
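+ # i.e. fall back to the offset where the book text begins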
+ self.start_offset = self.body_start_offset return buf.getvalue() def serialize_head(self):