Sync to trunk.

John Schember 2013-04-07 08:54:36 -04:00
commit 90f730fbd4
46 changed files with 1259 additions and 612 deletions

View File

@ -40,6 +40,7 @@ recipes/.gitignore
recipes/README.md
recipes/icon_checker.py
recipes/readme_updater.py
recipes/garfield.recipe
recipes/katalog_egazeciarz.recipe
recipes/tv_axnscifi.recipe
recipes/tv_comedycentral.recipe
@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
recipes/tv_tvpuls.recipe
recipes/tv_viasathistory.recipe
recipes/icons/katalog_egazeciarz.png
recipes/icons/garfield.png
recipes/icons/tv_axnscifi.png
recipes/icons/tv_comedycentral.png
recipes/icons/tv_discoveryscience.png

View File

@ -20,6 +20,58 @@
# new recipes:
# - title:
- version: 0.9.26
  date: 2013-04-05

  new features:
    - title: "PDF Output: Allow using templates to create arbitrary headers and footers. Look under PDF Output in the conversion dialog for this feature."

    - title: "ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files."
      tickets: [1163520]

    - title: "ToC Editor: Add buttons to indent/unindent the current entry"

    - title: "ToC Editor: Right-click menu to perform various useful actions on entries in the ToC"

    - title: "Column icons: Allow use of wide images as column icons"

    - title: "Add USB ids for the Palm Pre2 and Samsung Galaxy phone to the device drivers"
      tickets: [1162293,1163115]

  bug fixes:
    - title: "PDF Output: Fix generating page numbers causing links to not work."
      tickets: [1162573]

    - title: "Wrong filename output in error message when 'Guide reference not found'"
      tickets: [1163659]

    - title: "Get Books: Update Amazon, Barnes & Noble, Waterstones and Gutenberg store plugins for website change"

    - title: "PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding."
      tickets: [1162054]

    - title: "ToC Editor: Fix drag and drop of multiple items resulting in the dropped items being in random order sometimes."
      tickets: [1161999]

  improved recipes:
    - Financial Times UK
    - Sing Tao Daily
    - Apple Daily
    - A List Apart
    - Business Week
    - Harpers printed edition
    - Harvard Business Review

  new recipes:
    - title: AM730
      author: Eddie Lau

    - title: Arret sur images
      author: Francois D

    - title: Diario de Noticias
      author: Jose Pinto

- version: 0.9.25
  date: 2013-03-29
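As an illustration of the header/footer template feature listed above, here is a minimal, hypothetical recipe snippet that passes a footer template through its conversion options. The pdf_footer_template option name and the _TITLE_/_PAGENUM_ placeholders are assumptions drawn from calibre's PDF Output documentation, not from this diff.

    from calibre.web.feeds.news import BasicNewsRecipe

    class PDFFooterDemo(BasicNewsRecipe):
        # Hypothetical example only: adds a centered "title, page N" footer
        # when converting to PDF. The option name and placeholders are
        # assumed from the PDF Output documentation, not from this commit.
        title = 'PDF Footer Demo'
        conversion_options = {
            'pdf_footer_template':
                '<p style="text-align:center">_TITLE_, page _PAGENUM_</p>',
        }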

View File

@ -1,3 +1,4 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict
@ -39,7 +40,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
title=self.tag_to_string(div.a).strip()
url=div.a['href']
soup0 = self.index_to_soup(url)
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
@ -56,7 +57,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
title=self.tag_to_string(div.a).strip()
url=div.a['href']
soup0 = self.index_to_soup(url)
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
if articles:

View File

@ -0,0 +1,23 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1365070687(BasicNewsRecipe):
title ='Diário de Notícias'
oldest_article = 7
language = 'pt'
__author__ = 'Jose Pinto'
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'id':'cln-esqmid'}) ]
remove_tags = [ dict(name='table', attrs={'class':'TabFerramentasInf'}) ]
feeds = [(u'Portugal', u'http://feeds.dn.pt/DN-Portugal'),
(u'Globo', u'http://feeds.dn.pt/DN-Globo'),
(u'Economia', u'http://feeds.dn.pt/DN-Economia'),
(u'Ci\xeancia', u'http://feeds.dn.pt/DN-Ciencia'),
(u'Artes', u'http://feeds.dn.pt/DN-Artes'),
(u'TV & Media', u'http://feeds.dn.pt/DN-Media'),
(u'Opini\xe3o', u'http://feeds.dn.pt/DN-Opiniao'),
(u'Pessoas', u'http://feeds.dn.pt/DN-Pessoas')
]

17
recipes/economia.recipe Normal file
View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1314326622(BasicNewsRecipe):
title = u'Economia'
__author__ = 'Manish Bhattarai'
description = 'Economia - Intelligence & Insight for ICAEW Members'
language = 'en_GB'
oldest_article = 7
max_articles_per_feed = 25
masthead_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
cover_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
no_stylesheets = True
remove_empty_feeds = True
remove_tags_before = dict(id='content')
remove_tags_after = dict(id='stars-wrapper')
remove_tags = [dict(attrs={'class':['floatR', 'sharethis', 'rating clearfix']})]
feeds = [(u'News', u'http://feedity.com/icaew-com/VlNTVFRa.rss'),(u'Business', u'http://feedity.com/icaew-com/VlNTVFtS.rss'),(u'People', u'http://feedity.com/icaew-com/VlNTVFtX.rss'),(u'Opinion', u'http://feedity.com/icaew-com/VlNTVFtW.rss'),(u'Finance', u'http://feedity.com/icaew-com/VlNTVFtV.rss')]

View File

@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
language = 'pl'
encoding = 'utf-8'
INDEX = 'http://www.esensja.pl'
extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
.t-author {font-size: x-small; text-align: left}
.t-title2 {font-size: x-small; font-style: italic; text-align: left}
.text {font-size: small; text-align: left}
.annot-ref {font-style: italic; text-align: left}
'''
cover_url = ''
masthead_url = 'http://esensja.pl/img/wrss.gif'
use_embedded_content = False

View File

@ -110,10 +110,12 @@ class FinancialTimes(BasicNewsRecipe):
soup = self.index_to_soup(self.INDEX)
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
#self.timefmt = ' [%s]'%dates
section_title = 'Untitled'
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
for section in column. findAll('div', attrs = {'class':'feedBox'}):
section_title=self.tag_to_string(section.find('h4'))
sectiontitle=self.tag_to_string(section.find('h4'))
if '...' not in sectiontitle: section_title=sectiontitle
for article in section.ul.findAll('li'):
articles = []
title=self.tag_to_string(article.a)

53
recipes/forbes_pl.recipe Normal file
View File

@ -0,0 +1,53 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import re
class forbes_pl(BasicNewsRecipe):
title = u'Forbes.pl'
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
language = 'pl'
description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finasowe i strategiczne.'
oldest_article = 1
index = 'http://www.forbes.pl'
cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
max_articles_per_feed = 100
extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
remove_javascript = True
no_stylesheets = True
now = datetime.datetime.now()
yesterday = now - datetime.timedelta(hours=24)
yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
pages_count = 4
keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
'''def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup
def append_page(self, soup, appendtag):
cleanup = False
nexturl = appendtag.find('a', attrs={'class':'next'})
if nexturl:
cleanup = True
while nexturl:
soup2 = self.index_to_soup(self.index + nexturl['href'])
nexturl = soup2.find('a', attrs={'class':'next'})
pagetext = soup2.findAll(id='article-body-wrapper')
if not pagetext:
pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
comment.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if cleanup:
for r in appendtag.findAll(attrs={'class':'paginator'}):
r.extract()'''
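A note on the forbes_pl.recipe file added above: its preprocess_html/append_page helper ships commented out and, as written, calls findAll() on the result set returned by another findAll() and never imports Comment. A corrected sketch (not part of this commit; same element ids and classes assumed) that could be pasted into the recipe class:

    from calibre.ebooks.BeautifulSoup import Comment  # module-level import used below

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        cleanup = False
        nexturl = appendtag.find('a', attrs={'class': 'next'})
        if nexturl:
            cleanup = True
        while nexturl:
            soup2 = self.index_to_soup(self.index + nexturl['href'])
            nexturl = soup2.find('a', attrs={'class': 'next'})
            pagetext = soup2.findAll(id='article-body-wrapper')
            if not pagetext:
                pagetext = soup2.findAll(attrs={'class': 'Article-Entry Styled'})
            # iterate over the matched tags instead of treating the result
            # set as a single tag
            for part in pagetext:
                # drop HTML comments before appending the page body
                for comment in part.findAll(text=lambda text: isinstance(text, Comment)):
                    comment.extract()
                appendtag.insert(len(appendtag.contents), part)
        if cleanup:
            for r in appendtag.findAll(attrs={'class': 'paginator'}):
                r.extract()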

108
recipes/galaxys_edge.recipe Normal file
View File

@ -0,0 +1,108 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
from calibre.web.feeds.news import BasicNewsRecipe
class GalaxyEdge(BasicNewsRecipe):
title = u'The Galaxy\'s Edge'
language = 'en'
oldest_article = 7
__author__ = 'Krittika Goyal'
no_stylesheets = True
auto_cleanup = True
#keep_only_tags = [dict(id='content')]
#remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
#dict(id=['email-section', 'right-column', 'printfooter', 'topover',
#'slidebox', 'th_footer'])]
extra_css = '.photo-caption { font-size: smaller }'
def parse_index(self):
soup = self.index_to_soup('http://www.galaxysedge.com/')
main = soup.find('table', attrs={'width':'911'})
toc = main.find('td', attrs={'width':'225'})
current_section = None
current_articles = []
feeds = []
c = 0
for x in toc.findAll(['p']):
c = c+1
if c == 5:
if current_articles and current_section:
feeds.append((current_section, current_articles))
edwo = x.find('a')
current_section = self.tag_to_string(edwo)
current_articles = []
self.log('\tFound section:', current_section)
title = self.tag_to_string(edwo)
url = edwo.get('href', True)
url = 'http://www.galaxysedge.com/'+url
print(title)
print(c)
if not url or not title:
continue
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
elif c>5:
current_section = self.tag_to_string(x.find('b'))
current_articles = []
self.log('\tFound section:', current_section)
for y in x.findAll('a'):
title = self.tag_to_string(y)
url = y.get('href', True)
url = 'http://www.galaxysedge.com/'+url
print(title)
if not url or not title:
continue
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
if current_articles and current_section:
feeds.append((current_section, current_articles))
return feeds
#def preprocess_raw_html(self, raw, url):
#return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
#def postprocess_html(self, soup, first_fetch):
#for t in soup.findAll(['table', 'tr', 'td','center']):
#t.name = 'div'
#return soup
#def parse_index(self):
#today = time.strftime('%Y-%m-%d')
#soup = self.index_to_soup(
#'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
#div = soup.find(id='left-column')
#feeds = []
#current_section = None
#current_articles = []
#for x in div.findAll(['h3', 'div']):
#if current_section and x.get('class', '') == 'tpaper':
#a = x.find('a', href=True)
#if a is not None:
#current_articles.append({'url':a['href']+'?css=print',
#'title':self.tag_to_string(a), 'date': '',
#'description':''})
#if x.name == 'h3':
#if current_section and current_articles:
#feeds.append((current_section, current_articles))
#current_section = self.tag_to_string(x)
#current_articles = []
#return feeds

View File

@ -10,7 +10,7 @@ krakow.gazeta.pl
from calibre.web.feeds.news import BasicNewsRecipe
class gw_krakow(BasicNewsRecipe):
title = u'Gazeta.pl Kraków'
title = u'Gazeta Wyborcza Kraków'
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
language = 'pl'
description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'

View File

@ -5,7 +5,7 @@ import string
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaPlSzczecin(BasicNewsRecipe):
title = u'Gazeta.pl Szczecin'
title = u'Gazeta Wyborcza Szczecin'
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'

View File

@ -10,7 +10,7 @@ warszawa.gazeta.pl
from calibre.web.feeds.news import BasicNewsRecipe
class gw_wawa(BasicNewsRecipe):
title = u'Gazeta.pl Warszawa'
title = u'Gazeta Wyborcza Warszawa'
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
language = 'pl'
description ='Wiadomości z Warszawy na portalu Gazeta.pl.'

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta.pl'
title = u'Gazeta Wyborcza'
__author__ = 'fenuks, Artur Stachecki'
language = 'pl'
description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'

View File

@ -20,7 +20,7 @@ class HBR(BasicNewsRecipe):
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
'mailingListTout', 'partnerCenter', 'pageFooter',
'superNavHeadContainer', 'hbrDisqus',
'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
dict(name='iframe')]
extra_css = '''

BIN
recipes/icons/forbes_pl.png Normal file

BIN
recipes/icons/slashdot.png Normal file

View File

@ -1,64 +1,44 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
newyorker.com
'''
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class NewYorker(BasicNewsRecipe):
title = 'The New Yorker'
__author__ = 'Darko Miletic'
description = 'The best of US journalism'
oldest_article = 15
language = 'en'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
publisher = 'Conde Nast Publications'
category = 'news, politics, USA'
encoding = 'cp1252'
publication_type = 'magazine'
masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif'
extra_css = """
body {font-family: "Times New Roman",Times,serif}
.articleauthor{color: #9F9F9F;
font-family: Arial, sans-serif;
font-size: small;
text-transform: uppercase}
.rubric,.dd,h6#credit{color: #CD0021;
font-family: Arial, sans-serif;
font-size: small;
text-transform: uppercase}
.descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
.dd,h6#credit{color: gray}
.c{display: block}
.caption,h2#articleintro{font-style: italic}
.caption{font-size: small}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})]
remove_tags = [
dict(name=['meta','iframe','base','link','embed','object'])
,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] })
,dict(attrs={'id':['show-header','show-footer'] })
]
remove_tags_after = dict(attrs={'class':'entry-content'})
remove_attributes = ['lang']
feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')]
title = u'New Yorker Magazine'
newyorker_prefix = 'http://m.newyorker.com'
description = u'Content from the New Yorker website'
fp_tag = 'CAN_TC'
def print_version(self, url):
return url + '?printable=true&currentPage=all'
masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
def image_url_processor(self, baseurl, url):
return url.strip()
compress_news_images = True
compress_news_images_auto_size = 8
scale_news_images_to_device = False
scale_news_images = (768, 1024)
url_list = []
language = 'en'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = '''
.byline { font-size:xx-small; font-weight: bold;}
h3 { margin-bottom: 6px; }
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
'''
keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
def get_cover_url(self):
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
@ -68,13 +48,233 @@ class NewYorker(BasicNewsRecipe):
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
return cover_url
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
auth = soup.find(attrs={'id':'articleauthor'})
if auth:
alink = auth.find('a')
if alink and alink.string is not None:
txt = alink.string
alink.replaceWith(txt)
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","’",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&#38;'
massaged = re.sub("&","&#38;", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
shortparagraph = ""
## try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
newpara = shortparagraph + refparagraph
article.summary = article.text_summary = newpara.strip()
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
else:
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
## except:
## self.log("Error creating article descriptions")
## return
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self,soup):
dateline = soup.find('div','published')
byline = soup.find('div','byline')
title = soup.find('h1','entry-title')
if title is None:
return self.strip_anchors(soup)
if byline is None:
title.append(dateline)
return self.strip_anchors(soup)
byline.append(dateline)
return self.strip_anchors(soup)
def load_global_nav(self,soup):
seclist = []
ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
if ul is not None:
for li in ul.findAll('li'):
if li.a is not None:
securl = li.a['href']
if securl != '/' and securl != '/magazine' and securl.startswith('/'):
seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
return seclist
def exclude_url(self,url):
if url in self.url_list:
return True
if not url.endswith('html'):
return True
if 'goings-on-about-town-app' in url:
return True
if 'something-to-be-thankful-for' in url:
return True
if '/shouts/' in url:
return True
if 'out-loud' in url:
return True
if '/rss/' in url:
return True
if '/video-' in url:
return True
self.url_list.append(url)
return False
def load_index_page(self,soup):
article_list = []
for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
h2 = div.h2
if h2 is not None:
a = h2.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
byline = h2.span
if byline is not None:
author = self.tag_to_string(byline)
if author.startswith('by '):
author.replace('by ','')
byline.extract()
else:
author = ''
if h2.br is not None:
h2.br.replaceWith(' ')
title = self.tag_to_string(h2)
desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
if desc is not None:
description = self.tag_to_string(desc)
else:
description = ''
article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
ul = div.find('ul','feature-blurb-links')
if ul is not None:
for li in ul.findAll('li'):
a = li.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
if a.br is not None:
a.br.replaceWith(' ')
title = '>>'+self.tag_to_string(a)
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
for h3 in soup.findAll('h3','header'):
a = h3.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
byline = h3.span
if byline is not None:
author = self.tag_to_string(byline)
if author.startswith('by '):
author = author.replace('by ','')
byline.extract()
else:
author = ''
if h3.br is not None:
h3.br.replaceWith(' ')
title = self.tag_to_string(h3).strip()
article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
return article_list
def load_global_section(self,securl):
article_list = []
try:
soup = self.index_to_soup(securl)
except:
return article_list
if '/blogs/' not in securl:
return self.load_index_page(soup)
for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
h3 = div.h3
if h3 is not None:
a = h3.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
if h3.br is not None:
h3.br.replaceWith(' ')
title = self.tag_to_string(h3)
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
return article_list
def filter_ans(self, ans) :
total_article_count = 0
idx = 0
idx_max = len(ans)-1
while idx <= idx_max:
if True: #self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]:
total_article_count += 1
if True: #self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
idx = idx+1
self.log( "Queued %d articles" % total_article_count )
return ans
def parse_index(self):
ans = []
try:
soup = self.index_to_soup(self.newyorker_prefix)
except:
return ans
seclist = self.load_global_nav(soup)
ans.append(('Front Page',self.load_index_page(soup)))
for (sectitle,securl) in seclist:
ans.append((sectitle,self.load_global_section(securl)))
return self.filter_ans(ans)

View File

@ -0,0 +1,70 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class sportowefakty(BasicNewsRecipe):
title = u'SportoweFakty'
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
language = 'pl'
description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
oldest_article = 1
masthead_url='http://www.sportowefakty.pl/images/logo.png'
max_articles_per_feed = 100
simultaneous_downloads = 5
use_embedded_content=False
remove_javascript=True
no_stylesheets=True
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
remove_tags =[]
remove_tags.append(dict(attrs = {'class' : re.compile(r'^newsStream')}))
remove_tags.append(dict(attrs = {'target' : '_blank'}))
feeds = [
(u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
(u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
(u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
(u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
(u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
(u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
(u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
(u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
]
def get_article_url(self, article):
link = article.get('link', None)
if 'utm_source' in link:
return link.split('?utm')[0]
else:
return link
def print_version(self, url):
print_url = url + '/drukuj'
return print_url
def preprocess_html(self, soup):
head = soup.find('h1')
if 'Fotorelacja' in self.tag_to_string(head):
return None
else:
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup

View File

@ -36,47 +36,21 @@ class TheOnion(BasicNewsRecipe):
, 'publisher': publisher
, 'language' : language
}
keep_only_tags = [
dict(name='h2', attrs={'class':['section_title','title']})
,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
,dict(attrs={'id':['entries']})
]
remove_attributes=['lang','rel']
remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
keep_only_tags = [dict(name='article', attrs={'class':'full-article'})]
remove_tags = [
dict(name=['object','link','iframe','base','meta'])
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
]
dict(name=['nav', 'aside', 'section', 'meta']),
{'attrs':{'class':lambda x: x and ('share-tools' in x or 'ad-zone' in x)}},
]
feeds = [
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
]
def get_article_url(self, article):
artl = BasicNewsRecipe.get_article_url(self, article)
if artl.startswith('http://www.theonion.com/audio/'):
artl = None
return artl
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs = []
if not limg.has_key('alt'):
limg['alt'] = 'image'
else:
str = self.tag_to_string(item)
item.replaceWith(str)
def preprocess_html(self, soup, *args):
for img in soup.findAll('img', attrs={'data-src':True}):
if img['data-src']:
img['src'] = img['data-src']
return soup

View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class UniverseToday(BasicNewsRecipe):
title = u'Universe Today'
language = 'en'
description = u'Space and astronomy news.'
__author__ = 'seird'
publisher = u'universetoday.com'
category = 'science, astronomy, news, rss'
oldest_article = 7
max_articles_per_feed = 40
auto_cleanup = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
feeds = [(u'Universe Today', u'http://feeds.feedburner.com/universetoday/pYdq')]

View File

@ -6,17 +6,62 @@ __license__ = 'GPL v3'
www.canada.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
class TimesColonist(BasicNewsRecipe):
# Customization -- remove sections you don't want.
# If your e-reader is an e-ink Kindle and your output profile is
# set properly this recipe will not include images because the
# resulting file is too large. If you have one of these and want
# images you can set kindle_omit_images = False
# and remove sections (typically the e-ink Kindles will
# work with about a dozen of these, but your mileage may vary).
kindle_omit_images = True
section_list = [
('','Web Front Page'),
('news/','News Headlines'),
('news/b-c/','BC News'),
('news/national/','National News'),
('news/world/','World News'),
('opinion/','Opinion'),
('opinion/letters/','Letters'),
('business/','Business'),
('business/money/','Money'),
('business/technology/','Technology'),
('business/working/','Working'),
('sports/','Sports'),
('sports/hockey/','Hockey'),
('sports/football/','Football'),
('sports/basketball/','Basketball'),
('sports/golf/','Golf'),
('entertainment/','entertainment'),
('entertainment/go/','Go!'),
('entertainment/music/','Music'),
('entertainment/books/','Books'),
('entertainment/Movies/','Movies'),
('entertainment/television/','Television'),
('life/','Life'),
('life/health/','Health'),
('life/travel/','Travel'),
('life/driving/','Driving'),
('life/homes/','Homes'),
('life/food-drink/','Food & Drink')
]
title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC'
fp_tag = 'CAN_TC'
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
url_list = []
language = 'en_CA'
__author__ = 'Nick Redding'
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
'''
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
remove_tags = [{'class':'comments'},
{'id':'photocredit'},
dict(name='div', attrs={'class':re.compile('top.controls')}),
dict(name='div', attrs={'class':re.compile('social')}),
dict(name='div', attrs={'class':re.compile('tools')}),
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
dict(name='div', attrs={'class':re.compile('window')}),
dict(name='div', attrs={'class':re.compile('related.news.element')})]
def __init__(self, options, log, progress_reporter):
self.remove_tags = [{'class':'comments'},
{'id':'photocredit'},
dict(name='div', attrs={'class':re.compile('top.controls')}),
dict(name='div', attrs={'class':re.compile('^comments')}),
dict(name='div', attrs={'class':re.compile('social')}),
dict(name='div', attrs={'class':re.compile('tools')}),
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
dict(name='div', attrs={'class':re.compile('window')}),
dict(name='div', attrs={'class':re.compile('related.news.element')})]
print("PROFILE NAME = "+options.output_profile.short_name)
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
def get_cover_url(self):
from datetime import timedelta, date
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
def preprocess_html(self,soup):
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
if byline is not None:
byline.find('a')
authstr = self.tag_to_string(byline,False)
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
atag = htag.a
if atag is not None:
url = atag['href']
#print("Checking "+url)
if atag['href'].startswith('/'):
url = self.url_prefix+atag['href']
url = url.strip()
# print("Checking >>"+url+'<<\n\r')
if url.startswith('/'):
url = self.url_prefix+url
if url in self.url_list:
return
self.url_list.append(url)
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
if dtag is not None:
description = self.tag_to_string(dtag,False)
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
#print(sectitle+title+": description = "+description+" URL="+url)
print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
def add_section_index(self,ans,securl,sectitle):
print("Add section url="+self.url_prefix+'/'+securl)
print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
try:
soup = self.index_to_soup(self.url_prefix+'/'+securl)
except:
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):
def parse_index(self):
ans = []
ans = self.add_section_index(ans,'','Web Front Page')
ans = self.add_section_index(ans,'news/','News Headlines')
ans = self.add_section_index(ans,'news/b-c/','BC News')
ans = self.add_section_index(ans,'news/national/','Natioanl News')
ans = self.add_section_index(ans,'news/world/','World News')
ans = self.add_section_index(ans,'opinion/','Opinion')
ans = self.add_section_index(ans,'opinion/letters/','Letters')
ans = self.add_section_index(ans,'business/','Business')
ans = self.add_section_index(ans,'business/money/','Money')
ans = self.add_section_index(ans,'business/technology/','Technology')
ans = self.add_section_index(ans,'business/working/','Working')
ans = self.add_section_index(ans,'sports/','Sports')
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
ans = self.add_section_index(ans,'sports/football/','Football')
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
ans = self.add_section_index(ans,'sports/golf/','Golf')
ans = self.add_section_index(ans,'entertainment/','entertainment')
ans = self.add_section_index(ans,'entertainment/go/','Go!')
ans = self.add_section_index(ans,'entertainment/music/','Music')
ans = self.add_section_index(ans,'entertainment/books/','Books')
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
ans = self.add_section_index(ans,'entertainment/television/','Television')
ans = self.add_section_index(ans,'life/','Life')
ans = self.add_section_index(ans,'life/health/','Health')
ans = self.add_section_index(ans,'life/travel/','Travel')
ans = self.add_section_index(ans,'life/driving/','Driving')
ans = self.add_section_index(ans,'life/homes/','Homes')
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
for (url,title) in self.section_list:
ans = self.add_section_index(ans,url,title)
return ans

View File

@ -1,144 +0,0 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
class GazetaWyborczaDuzyForma(BasicNewsRecipe):
cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
title = u"Gazeta Wyborcza Duzy Format"
__author__ = 'ravcio - rlelusz[at]gmail.com'
description = u"Articles from Gazeta's website"
language = 'pl'
max_articles_per_feed = 50 #you can increade it event up to maybe 600, should still work
recursions = 0
encoding = 'iso-8859-2'
no_stylesheets = True
remove_javascript = True
use_embedded_content = False
keep_only_tags = [
dict(name='div', attrs={'id':['k1']})
]
remove_tags = [
dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
,dict(name='ul', attrs={'id':['articleToolbar']})
,dict(name='img', attrs={'class':['brand']})
,dict(name='h5', attrs={'class':['author']})
,dict(name='h6', attrs={'class':['date']})
,dict(name='p', attrs={'class':['txt_upl']})
]
remove_tags_after = [
dict(name='div', attrs={'id':['Str']}) #nawigator numerow linii
]
def load_article_links(self, url, count):
print '--- load_article_links', url, count
#page with link to articles
soup = self.index_to_soup(url)
#table with articles
list = soup.find('div', attrs={'class':'GWdalt'})
#single articles (link, title, ...)
links = list.findAll('div', attrs={'class':['GWdaltE']})
if len(links) < count:
#load links to more articles...
#remove new link
pages_nav = list.find('div', attrs={'class':'pages'})
next = pages_nav.find('a', attrs={'class':'next'})
if next:
print 'next=', next['href']
url = 'http://wyborcza.pl' + next['href']
#e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
older_links = self.load_article_links(url, count - len(links))
links.extend(older_links)
return links
#produce list of articles to download
def parse_index(self):
print '--- parse_index'
max_articles = 8000
links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
ans = []
key = None
articles = {}
key = 'Uncategorized'
articles[key] = []
for div_art in links:
div_date = div_art.find('div', attrs={'class':'kL'})
div = div_art.find('div', attrs={'class':'kR'})
a = div.find('a', href=True)
url = a['href']
title = a.string
description = ''
pubdate = div_date.string.rstrip().lstrip()
summary = div.find('span', attrs={'class':'lead'})
desc = summary.find('a', href=True)
if desc:
desc.extract()
description = self.tag_to_string(summary, use_alt=False)
description = description.rstrip().lstrip()
feed = key if key is not None else 'Duzy Format'
if not articles.has_key(feed):
articles[feed] = []
if description != '': # skip just pictures atricle
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = [(key, articles[key])]
return ans
def append_page(self, soup, appendtag, position):
pager = soup.find('div',attrs={'id':'Str'})
if pager:
#seek for 'a' element with nast value (if not found exit)
list = pager.findAll('a')
for elem in list:
if 'nast' in elem.string:
nexturl = elem['href']
soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
texttag = soup2.find('div', attrs={'id':'artykul'})
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3)
# finally remove some tags
pager = soup.find('div',attrs={'id':'Str'})
if pager:
pager.extract()
pager = soup.find('div',attrs={'class':'tylko_int'})
if pager:
pager.extract()
return soup

View File

@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class WysokieObcasyRecipe(BasicNewsRecipe):
__author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
language = 'pl'
version = 1
title = u'Wysokie Obcasy'
publisher = 'Agora SA'
description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
category='magazine'
language = 'pl'
publication_type = 'magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100000
recursions = 0
no_stylesheets = True
remove_javascript = True
simultaneous_downloads = 5
keep_only_tags =[]
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article'}))
remove_tags =[]
remove_tags.append(dict(name = 'img'))
remove_tags.append(dict(name = 'p', attrs = {'class' : 'info'}))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
h1{text-align: left;}
'''
feeds = [
('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
]
def print_version(self,url):
baseURL='http://www.wysokieobcasy.pl/wysokie-obcasy'
segments = url.split(',')
subPath= '/2029020,'
articleURL1 = segments[1]
articleURL2 = segments[2]
printVerString=articleURL1 + ',' + articleURL2
s= baseURL + subPath + printVerString + '.html'
return s
def get_cover_url(self):
soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
self.cover_url = soup.find(attrs={'class':'holder_cr'}).find('img')['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -357,7 +357,7 @@
<xsl:apply-templates/>
</xsl:template>
<xsl:template match="rtf:table">
<xsl:template match="rtf:table">
<xsl:element name="table">
<xsl:attribute name="id">
<xsl:value-of select="generate-id(.)"/>
@ -390,7 +390,6 @@
<xsl:output method = "xml"/>
<xsl:key name="style-types" match="rtf:paragraph-definition" use="@style-number"/>
@ -415,13 +414,11 @@
</xsl:template>
<xsl:template match="rtf:page-break">
<xsl:element name="br">
<xsl:attribute name="style">page-break-after:always</xsl:attribute>
</xsl:element>
<br style = "page-break-after:always"/>
</xsl:template>
<xsl:template match="rtf:hardline-break">
<xsl:element name="br"/>
<br/>
</xsl:template>
<xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/>
@ -445,7 +442,7 @@
</xsl:template>
<xsl:template match = "rtf:field-block">
<xsl:apply-templates/>
<xsl:apply-templates/>
</xsl:template>
<xsl:template match = "rtf:field[@type='hyperlink']">
@ -472,9 +469,7 @@
</xsl:template>
<xsl:template match="rtf:pict">
<xsl:element name="img">
<xsl:attribute name="src"><xsl:value-of select="@num" /></xsl:attribute>
</xsl:element>
<img src = "{@num}"/>
</xsl:template>
<xsl:template match="*">

View File

@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 9, 25)
numeric_version = (0, 9, 26)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -757,9 +757,10 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
# from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch
plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon, BigBookSearch]
# }}}

View File

@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images',
'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', 'Big Book Search',
])
def is_disabled(plugin):

View File

@ -132,7 +132,7 @@ class Worker(Thread): # Get details {{{
text()="Détails sur le produit" or \
text()="Detalles del producto" or \
text()="Detalhes do produto" or \
text()="登録情報"]/../div[@class="content"]
starts-with(text(), "登録情報")]/../div[@class="content"]
'''
# Editor: is for Spanish
self.publisher_xpath = '''
@ -235,6 +235,12 @@ class Worker(Thread): # Get details {{{
msg = 'Failed to parse amazon details page: %r'%self.url
self.log.exception(msg)
return
if self.domain == 'jp':
for a in root.xpath('//a[@href]'):
if 'black-curtain-redirect.html' in a.get('href'):
self.url = 'http://amazon.co.jp'+a.get('href')
self.log('Black curtain redirect found, following')
return self.get_details()
errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg:
@ -252,8 +258,8 @@ class Worker(Thread): # Get details {{{
self.log.exception('Error parsing asin for url: %r'%self.url)
asin = None
if self.testing:
import tempfile
with tempfile.NamedTemporaryFile(prefix=asin + '_',
import tempfile, uuid
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
suffix='.html', delete=False) as f:
f.write(raw)
print ('Downloaded html for', asin, 'saved in', f.name)
@ -499,7 +505,7 @@ class Worker(Thread): # Get details {{{
def parse_language(self, pd):
for x in reversed(pd.xpath(self.language_xpath)):
if x.tail:
raw = x.tail.strip()
raw = x.tail.strip().partition(',')[0].strip()
ans = self.lang_map.get(raw, None)
if ans:
return ans
@ -1004,6 +1010,11 @@ if __name__ == '__main__': # tests {{{
] # }}}
jp_tests = [ # {{{
( # Adult filtering test
{'identifiers':{'isbn':'4799500066'}},
[title_test(u' '),]
),
( # isbn -> title, authors
{'identifiers':{'isbn': '9784101302720' }},
[title_test(u'精霊の守り人',

View File

@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
# Google covers are often poor quality (scans/errors) but they have high
# resolution, so they trump covers from better sources. So make sure they
# are only used if no other covers are found.
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2}
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2, 'Big Book Search':2}
def create_log(ostream=None):
from calibre.utils.logging import ThreadSafeLog, FileStream
@ -429,6 +429,40 @@ class Source(Plugin):
mi.tags = list(map(fixcase, mi.tags))
mi.isbn = check_isbn(mi.isbn)
def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
if not urls:
log('No images found for, title: %r and authors: %r'%(title, authors))
return
from threading import Thread
import time
if prefs_name:
urls = urls[:self.prefs[prefs_name]]
if get_best_cover:
urls = urls[:1]
log('Downloading %d covers'%len(urls))
workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
for w in workers:
w.daemon = True
w.start()
alive = True
start_time = time.time()
while alive and not abort.is_set() and time.time() - start_time < timeout:
alive = False
for w in workers:
if w.is_alive():
alive = True
break
abort.wait(0.1)
def download_image(self, url, timeout, log, result_queue):
try:
ans = self.browser.open_novisit(url, timeout=timeout).read()
result_queue.put((self, ans))
log('Downloaded cover from: %s'%url)
except Exception:
self.log.exception('Failed to download cover from: %r'%url)
# }}}
# Metadata API {{{

View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.metadata.sources.base import Source, Option
def get_urls(br, tokens):
from urllib import quote_plus
from mechanize import Request
from lxml import html
escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
q = b'+'.join(escaped)
url = 'http://bigbooksearch.com/books/'+q
br.open(url).read()
req = Request('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1'%q)
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', url)
raw = br.open(req).read()
root = html.fromstring(raw.decode('utf-8'))
urls = [i.get('src') for i in root.xpath('//img[@src]')]
return urls
class BigBookSearch(Source):
name = 'Big Book Search'
description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')
capabilities = frozenset(['cover'])
config_help_message = _('Configure the Big Book Search plugin')
can_get_multiple_covers = True
options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
_('The maximum number of covers to process from the search result')),
)
supports_gzip_transfer_encoding = True
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
if not title:
return
br = self.browser
tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))
urls = get_urls(br, tokens)
self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)
def test():
from calibre import browser
import pprint
br = browser()
urls = get_urls(br, ['consider', 'phlebas', 'banks'])
pprint.pprint(urls)
if __name__ == '__main__':
test()

View File

@ -18,12 +18,13 @@ from calibre.utils.magick.draw import Image, save_cover_data_to
class Worker(Thread):
def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq, get_best_cover=False):
Thread.__init__(self)
self.daemon = True
self.plugin = plugin
self.abort = abort
self.get_best_cover = get_best_cover
self.buf = BytesIO()
self.log = create_log(self.buf)
self.title, self.authors, self.identifiers = (title, authors,
@ -37,7 +38,7 @@ class Worker(Thread):
try:
if self.plugin.can_get_multiple_covers:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors, get_best_cover=True,
title=self.title, authors=self.authors, get_best_cover=self.get_best_cover,
identifiers=self.identifiers, timeout=self.timeout)
else:
self.plugin.download_cover(self.log, self.rq, self.abort,
@ -72,7 +73,7 @@ def process_result(log, result):
return (plugin, width, height, fmt, data)
def run_download(log, results, abort,
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
'''
Run the cover download, putting results into the queue :param:`results`.
@ -89,7 +90,7 @@ def run_download(log, results, abort,
plugins = [p for p in metadata_plugins(['cover']) if p.is_configured()]
rq = Queue()
workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
workers = [Worker(p, abort, title, authors, identifiers, timeout, rq, get_best_cover=get_best_cover) for p
in plugins]
for w in workers:
w.start()
@ -163,7 +164,7 @@ def download_cover(log,
abort = Event()
run_download(log, rq, abort, title=title, authors=authors,
identifiers=identifiers, timeout=timeout)
identifiers=identifiers, timeout=timeout, get_best_cover=True)
results = []

View File

@ -106,6 +106,8 @@ class Worker(Thread): # {{{
parts = pub.partition(':')[0::2]
pub = parts[1] or parts[0]
try:
if ', Ship Date:' in pub:
pub = pub.partition(', Ship Date:')[0]
q = parse_only_date(pub, assume_utc=True)
if q.year != UNDEFINED_DATE:
mi.pubdate = q

View File

@ -39,39 +39,11 @@ class GoogleImages(Source):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
if not title:
return
from threading import Thread
import time
timeout = max(60, timeout) # Needs at least a minute
title = ' '.join(self.get_title_tokens(title))
author = ' '.join(self.get_author_tokens(authors))
urls = self.get_image_urls(title, author, log, abort, timeout)
if not urls:
log('No images found in Google for, title: %r and authors: %r'%(title, author))
return
urls = urls[:self.prefs['max_covers']]
if get_best_cover:
urls = urls[:1]
workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
for w in workers:
w.daemon = True
w.start()
alive = True
start_time = time.time()
while alive and not abort.is_set() and time.time() - start_time < timeout:
alive = False
for w in workers:
if w.is_alive():
alive = True
break
abort.wait(0.1)
def download_image(self, url, timeout, log, result_queue):
try:
ans = self.browser.open_novisit(url, timeout=timeout).read()
result_queue.put((self, ans))
log('Downloaded cover from: %s'%url)
except Exception:
self.log.exception('Failed to download cover from: %r'%url)
self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)
def get_image_urls(self, title, author, log, abort, timeout):
from calibre.utils.ipc.simple_worker import fork_job, WorkerError

View File

@ -180,5 +180,6 @@ class BorderParse:
elif 'single' in border_style_list:
new_border_dict[att] = 'single'
else:
new_border_dict[att] = border_style_list[0]
if border_style_list:
new_border_dict[att] = border_style_list[0]
return new_border_dict

View File

@ -559,11 +559,11 @@ class TOCView(QWidget): # {{{
b.setToolTip(_('Remove all selected entries'))
b.clicked.connect(self.del_items)
self.left_button = b = QToolButton(self)
self.right_button = b = QToolButton(self)
b.setIcon(QIcon(I('forward.png')))
b.setIconSize(QSize(ICON_SIZE, ICON_SIZE))
l.addWidget(b, 4, 3)
b.setToolTip(_('Unindent the current entry [Ctrl+Left]'))
b.setToolTip(_('Indent the current entry [Ctrl+Right]'))
b.clicked.connect(self.tocw.move_right)
self.down_button = b = QToolButton(self)

View File

@ -54,7 +54,7 @@ def get_parser(usage):
def get_db(dbpath, options):
global do_notify
if options.library_path is not None:
dbpath = options.library_path
dbpath = os.path.expanduser(options.library_path)
if dbpath is None:
raise ValueError('No saved library path, either run the GUI or use the'
' --with-library option')

File diff suppressed because it is too large.

View File

@ -174,7 +174,13 @@ def _extractall(f, path=None, file_info=None):
has_data_descriptors = header.flags & (1 << 3)
seekval = header.compressed_size + (16 if has_data_descriptors else 0)
found = True
parts = header.filename.split('/')
# Sanitize path changing absolute to relative paths and removing .. and
# .
fname = header.filename.replace(os.sep, '/')
fname = os.path.splitdrive(fname)[1]
parts = [x for x in fname.split('/') if x not in {'', os.path.pardir, os.path.curdir}]
if not parts:
continue
if header.uncompressed_size == 0:
# Directory
f.seek(f.tell()+seekval)

View File

@ -17,8 +17,7 @@ class MReplace(UserDict):
def compile_regex(self):
if len(self.data) > 0:
keys = sorted(self.data.keys(), key=len)
keys.reverse()
keys = sorted(self.data.keys(), key=len, reverse=True)
tmp = "(%s)" % "|".join(map(re.escape, keys))
if self.re != tmp:
self.re = tmp

View File

@ -1099,10 +1099,13 @@ class ZipFile:
base_target = targetpath # Added by Kovid
# don't include leading "/" from file name if present
fname = member.filename
if fname.startswith('/'):
fname = fname[1:]
# Sanitize path, changing absolute paths to relative paths
# and removing .. and . (changed by Kovid)
fname = member.filename.replace(os.sep, '/')
fname = os.path.splitdrive(fname)[1]
fname = '/'.join(x for x in fname.split('/') if x not in {'', os.path.curdir, os.path.pardir})
if not fname:
raise BadZipfile('The member %r has an invalid name'%member.filename)
targetpath = os.path.normpath(os.path.join(base_target, fname))
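
Both zip-extraction code paths in this commit (the _extractall helper and ZipFile.extract) sanitize archive member names the same way before joining them to the extraction directory. A standalone sketch of that step, mirroring the hunks above:

    import os

    def sanitize_member_name(name):
        # Make the path relative and forward-slash separated, then drop
        # empty, '.' and '..' components, as the hunks above do.
        name = name.replace(os.sep, '/')
        name = os.path.splitdrive(name)[1]
        parts = [x for x in name.split('/')
                 if x not in ('', os.path.curdir, os.path.pardir)]
        return '/'.join(parts)

    # Example: sanitize_member_name('/etc/../passwd') returns 'etc/passwd'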