Sync to trunk.
@ -40,6 +40,7 @@ recipes/.gitignore
|
|||||||
recipes/README.md
|
recipes/README.md
|
||||||
recipes/icon_checker.py
|
recipes/icon_checker.py
|
||||||
recipes/readme_updater.py
|
recipes/readme_updater.py
|
||||||
|
recipes/garfield.recipe
|
||||||
recipes/katalog_egazeciarz.recipe
|
recipes/katalog_egazeciarz.recipe
|
||||||
recipes/tv_axnscifi.recipe
|
recipes/tv_axnscifi.recipe
|
||||||
recipes/tv_comedycentral.recipe
|
recipes/tv_comedycentral.recipe
|
||||||
@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
|
|||||||
recipes/tv_tvpuls.recipe
|
recipes/tv_tvpuls.recipe
|
||||||
recipes/tv_viasathistory.recipe
|
recipes/tv_viasathistory.recipe
|
||||||
recipes/icons/katalog_egazeciarz.png
|
recipes/icons/katalog_egazeciarz.png
|
||||||
|
recipes/icons/garfield.png
|
||||||
recipes/icons/tv_axnscifi.png
|
recipes/icons/tv_axnscifi.png
|
||||||
recipes/icons/tv_comedycentral.png
|
recipes/icons/tv_comedycentral.png
|
||||||
recipes/icons/tv_discoveryscience.png
|
recipes/icons/tv_discoveryscience.png
|
||||||
|
@ -20,6 +20,58 @@
|
|||||||
# new recipes:
|
# new recipes:
|
||||||
# - title:
|
# - title:
|
||||||
|
|
||||||
|
- version: 0.9.26
|
||||||
|
date: 2013-04-05
|
||||||
|
|
||||||
|
new features:
|
||||||
|
- title: "PDF Output: Allow using templates to create arbitrary headers and footers. Look under PDF Output in the conversion dialog for this feature."
|
||||||
|
|
||||||
|
- title: "ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files."
|
||||||
|
tickets: [1163520]
|
||||||
|
|
||||||
|
- title: "ToC Editor: Add buttons to indent/unindent the current entry"
|
||||||
|
|
||||||
|
- title: "ToC Editor: Right-click menu to perform various useful actions on entries in the ToC"
|
||||||
|
|
||||||
|
- title: "Column icons: Allow use of wide images as column icons"
|
||||||
|
|
||||||
|
- title: "Add USB ids for the Palm Pre2 and Samsung Galaxy phone to the device drivers"
|
||||||
|
tickets: [1162293,1163115]
|
||||||
|
|
||||||
|
bug fixes:
|
||||||
|
- title: "PDF Output: Fix generating page numbers causing links to not work."
|
||||||
|
tickets: [1162573]
|
||||||
|
|
||||||
|
- title: "Wrong filename output in error message when 'Guide reference not found'"
|
||||||
|
tickets: [1163659]
|
||||||
|
|
||||||
|
- title: "Get Books: Update Amazon, Barnes & Noble, Waterstones and Gutenberg store plugins for website change"
|
||||||
|
|
||||||
|
- title: "PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding."
|
||||||
|
tickets: [1162054]
|
||||||
|
|
||||||
|
- title: "ToC Editor: Fix drag and drop of multiple items resulting in the dropped items being in random order sometimes."
|
||||||
|
tickets: [1161999]
|
||||||
|
|
||||||
|
improved recipes:
|
||||||
|
- Financial Times UK
|
||||||
|
- Sing Tao Daily
|
||||||
|
- Apple Daily
|
||||||
|
- A List Apart
|
||||||
|
- Business Week
|
||||||
|
- Harpers printed edition
|
||||||
|
- Harvard Business Review
|
||||||
|
|
||||||
|
new recipes:
|
||||||
|
- title: AM730
|
||||||
|
author: Eddie Lau
|
||||||
|
|
||||||
|
- title: Arret sur images
|
||||||
|
author: Francois D
|
||||||
|
|
||||||
|
- title: Diario de Noticias
|
||||||
|
author: Jose Pinto
|
||||||
|
|
||||||
- version: 0.9.25
|
- version: 0.9.25
|
||||||
date: 2013-03-29
|
date: 2013-03-29
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
@ -39,7 +40,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
|
|||||||
title=self.tag_to_string(div.a).strip()
|
title=self.tag_to_string(div.a).strip()
|
||||||
url=div.a['href']
|
url=div.a['href']
|
||||||
soup0 = self.index_to_soup(url)
|
soup0 = self.index_to_soup(url)
|
||||||
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
|
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
|
||||||
articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
|
articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
|
||||||
|
|
||||||
|
|
||||||
@ -56,7 +57,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
|
|||||||
title=self.tag_to_string(div.a).strip()
|
title=self.tag_to_string(div.a).strip()
|
||||||
url=div.a['href']
|
url=div.a['href']
|
||||||
soup0 = self.index_to_soup(url)
|
soup0 = self.index_to_soup(url)
|
||||||
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
|
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
|
||||||
articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
|
articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
|
||||||
|
|
||||||
if articles:
|
if articles:
|
||||||
|
23
recipes/diario_de_noticias.recipe
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# vim:fileencoding=UTF-8
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class AdvancedUserRecipe1365070687(BasicNewsRecipe):
|
||||||
|
title ='Diário de Notícias'
|
||||||
|
oldest_article = 7
|
||||||
|
language = 'pt'
|
||||||
|
__author__ = 'Jose Pinto'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
keep_only_tags = [dict(name='div', attrs={'id':'cln-esqmid'}) ]
|
||||||
|
remove_tags = [ dict(name='table', attrs={'class':'TabFerramentasInf'}) ]
|
||||||
|
|
||||||
|
feeds = [(u'Portugal', u'http://feeds.dn.pt/DN-Portugal'),
|
||||||
|
(u'Globo', u'http://feeds.dn.pt/DN-Globo'),
|
||||||
|
(u'Economia', u'http://feeds.dn.pt/DN-Economia'),
|
||||||
|
(u'Ci\xeancia', u'http://feeds.dn.pt/DN-Ciencia'),
|
||||||
|
(u'Artes', u'http://feeds.dn.pt/DN-Artes'),
|
||||||
|
(u'TV & Media', u'http://feeds.dn.pt/DN-Media'),
|
||||||
|
(u'Opini\xe3o', u'http://feeds.dn.pt/DN-Opiniao'),
|
||||||
|
(u'Pessoas', u'http://feeds.dn.pt/DN-Pessoas')
|
||||||
|
]
|
17
recipes/economia.recipe
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class AdvancedUserRecipe1314326622(BasicNewsRecipe):
|
||||||
|
title = u'Economia'
|
||||||
|
__author__ = 'Manish Bhattarai'
|
||||||
|
description = 'Economia - Intelligence & Insight for ICAEW Members'
|
||||||
|
language = 'en_GB'
|
||||||
|
oldest_article = 7
|
||||||
|
max_articles_per_feed = 25
|
||||||
|
masthead_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
|
||||||
|
cover_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
|
||||||
|
no_stylesheets = True
|
||||||
|
remove_empty_feeds = True
|
||||||
|
remove_tags_before = dict(id='content')
|
||||||
|
remove_tags_after = dict(id='stars-wrapper')
|
||||||
|
remove_tags = [dict(attrs={'class':['floatR', 'sharethis', 'rating clearfix']})]
|
||||||
|
feeds = [(u'News', u'http://feedity.com/icaew-com/VlNTVFRa.rss'),(u'Business', u'http://feedity.com/icaew-com/VlNTVFtS.rss'),(u'People', u'http://feedity.com/icaew-com/VlNTVFtX.rss'),(u'Opinion', u'http://feedity.com/icaew-com/VlNTVFtW.rss'),(u'Finance', u'http://feedity.com/icaew-com/VlNTVFtV.rss')]
|
@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
|
|||||||
language = 'pl'
|
language = 'pl'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
INDEX = 'http://www.esensja.pl'
|
INDEX = 'http://www.esensja.pl'
|
||||||
extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
|
|
||||||
.t-author {font-size: x-small; text-align: left}
|
|
||||||
.t-title2 {font-size: x-small; font-style: italic; text-align: left}
|
|
||||||
.text {font-size: small; text-align: left}
|
|
||||||
.annot-ref {font-style: italic; text-align: left}
|
|
||||||
'''
|
|
||||||
cover_url = ''
|
cover_url = ''
|
||||||
masthead_url = 'http://esensja.pl/img/wrss.gif'
|
masthead_url = 'http://esensja.pl/img/wrss.gif'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
@ -110,10 +110,12 @@ class FinancialTimes(BasicNewsRecipe):
|
|||||||
soup = self.index_to_soup(self.INDEX)
|
soup = self.index_to_soup(self.INDEX)
|
||||||
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
||||||
#self.timefmt = ' [%s]'%dates
|
#self.timefmt = ' [%s]'%dates
|
||||||
|
section_title = 'Untitled'
|
||||||
|
|
||||||
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
|
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
|
||||||
for section in column. findAll('div', attrs = {'class':'feedBox'}):
|
for section in column. findAll('div', attrs = {'class':'feedBox'}):
|
||||||
section_title=self.tag_to_string(section.find('h4'))
|
sectiontitle=self.tag_to_string(section.find('h4'))
|
||||||
|
if '...' not in sectiontitle: section_title=sectiontitle
|
||||||
for article in section.ul.findAll('li'):
|
for article in section.ul.findAll('li'):
|
||||||
articles = []
|
articles = []
|
||||||
title=self.tag_to_string(article.a)
|
title=self.tag_to_string(article.a)
|
||||||
|
53
recipes/forbes_pl.recipe
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
|
||||||
|
class forbes_pl(BasicNewsRecipe):
|
||||||
|
title = u'Forbes.pl'
|
||||||
|
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
|
||||||
|
language = 'pl'
|
||||||
|
description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finasowe i strategiczne.'
|
||||||
|
oldest_article = 1
|
||||||
|
index = 'http://www.forbes.pl'
|
||||||
|
cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
|
||||||
|
preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
|
||||||
|
remove_javascript = True
|
||||||
|
no_stylesheets = True
|
||||||
|
now = datetime.datetime.now()
|
||||||
|
yesterday = now - datetime.timedelta(hours=24)
|
||||||
|
yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
|
||||||
|
pages_count = 4
|
||||||
|
keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
|
||||||
|
remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
|
||||||
|
|
||||||
|
feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
|
||||||
|
|
||||||
|
'''def preprocess_html(self, soup):
|
||||||
|
self.append_page(soup, soup.body)
|
||||||
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
def append_page(self, soup, appendtag):
|
||||||
|
cleanup = False
|
||||||
|
nexturl = appendtag.find('a', attrs={'class':'next'})
|
||||||
|
if nexturl:
|
||||||
|
cleanup = True
|
||||||
|
while nexturl:
|
||||||
|
soup2 = self.index_to_soup(self.index + nexturl['href'])
|
||||||
|
nexturl = soup2.find('a', attrs={'class':'next'})
|
||||||
|
pagetext = soup2.findAll(id='article-body-wrapper')
|
||||||
|
if not pagetext:
|
||||||
|
pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
|
||||||
|
for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
|
||||||
|
comment.extract()
|
||||||
|
pos = len(appendtag.contents)
|
||||||
|
appendtag.insert(pos, pagetext)
|
||||||
|
if cleanup:
|
||||||
|
for r in appendtag.findAll(attrs={'class':'paginator'}):
|
||||||
|
r.extract()'''
|
108
recipes/galaxys_edge.recipe
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
from __future__ import with_statement
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class GalaxyEdge(BasicNewsRecipe):
|
||||||
|
title = u'The Galaxy\'s Edge'
|
||||||
|
language = 'en'
|
||||||
|
|
||||||
|
oldest_article = 7
|
||||||
|
__author__ = 'Krittika Goyal'
|
||||||
|
no_stylesheets = True
|
||||||
|
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
|
#keep_only_tags = [dict(id='content')]
|
||||||
|
#remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
|
||||||
|
#dict(id=['email-section', 'right-column', 'printfooter', 'topover',
|
||||||
|
#'slidebox', 'th_footer'])]
|
||||||
|
|
||||||
|
extra_css = '.photo-caption { font-size: smaller }'
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
soup = self.index_to_soup('http://www.galaxysedge.com/')
|
||||||
|
main = soup.find('table', attrs={'width':'911'})
|
||||||
|
toc = main.find('td', attrs={'width':'225'})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
current_section = None
|
||||||
|
current_articles = []
|
||||||
|
feeds = []
|
||||||
|
c = 0
|
||||||
|
for x in toc.findAll(['p']):
|
||||||
|
c = c+1
|
||||||
|
if c == 5:
|
||||||
|
if current_articles and current_section:
|
||||||
|
feeds.append((current_section, current_articles))
|
||||||
|
edwo = x.find('a')
|
||||||
|
current_section = self.tag_to_string(edwo)
|
||||||
|
current_articles = []
|
||||||
|
self.log('\tFound section:', current_section)
|
||||||
|
title = self.tag_to_string(edwo)
|
||||||
|
url = edwo.get('href', True)
|
||||||
|
url = 'http://www.galaxysedge.com/'+url
|
||||||
|
print(title)
|
||||||
|
print(c)
|
||||||
|
if not url or not title:
|
||||||
|
continue
|
||||||
|
self.log('\t\tFound article:', title)
|
||||||
|
self.log('\t\t\t', url)
|
||||||
|
current_articles.append({'title': title, 'url':url,
|
||||||
|
'description':'', 'date':''})
|
||||||
|
elif c>5:
|
||||||
|
current_section = self.tag_to_string(x.find('b'))
|
||||||
|
current_articles = []
|
||||||
|
self.log('\tFound section:', current_section)
|
||||||
|
for y in x.findAll('a'):
|
||||||
|
title = self.tag_to_string(y)
|
||||||
|
url = y.get('href', True)
|
||||||
|
url = 'http://www.galaxysedge.com/'+url
|
||||||
|
print(title)
|
||||||
|
if not url or not title:
|
||||||
|
continue
|
||||||
|
self.log('\t\tFound article:', title)
|
||||||
|
self.log('\t\t\t', url)
|
||||||
|
current_articles.append({'title': title, 'url':url,
|
||||||
|
'description':'', 'date':''})
|
||||||
|
if current_articles and current_section:
|
||||||
|
feeds.append((current_section, current_articles))
|
||||||
|
|
||||||
|
return feeds
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#def preprocess_raw_html(self, raw, url):
|
||||||
|
#return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
|
||||||
|
|
||||||
|
#def postprocess_html(self, soup, first_fetch):
|
||||||
|
#for t in soup.findAll(['table', 'tr', 'td','center']):
|
||||||
|
#t.name = 'div'
|
||||||
|
#return soup
|
||||||
|
|
||||||
|
#def parse_index(self):
|
||||||
|
#today = time.strftime('%Y-%m-%d')
|
||||||
|
#soup = self.index_to_soup(
|
||||||
|
#'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
|
||||||
|
#div = soup.find(id='left-column')
|
||||||
|
#feeds = []
|
||||||
|
#current_section = None
|
||||||
|
#current_articles = []
|
||||||
|
#for x in div.findAll(['h3', 'div']):
|
||||||
|
#if current_section and x.get('class', '') == 'tpaper':
|
||||||
|
#a = x.find('a', href=True)
|
||||||
|
#if a is not None:
|
||||||
|
#current_articles.append({'url':a['href']+'?css=print',
|
||||||
|
#'title':self.tag_to_string(a), 'date': '',
|
||||||
|
#'description':''})
|
||||||
|
#if x.name == 'h3':
|
||||||
|
#if current_section and current_articles:
|
||||||
|
#feeds.append((current_section, current_articles))
|
||||||
|
#current_section = self.tag_to_string(x)
|
||||||
|
#current_articles = []
|
||||||
|
#return feeds
|
||||||
|
|
||||||
|
|
@ -10,7 +10,7 @@ krakow.gazeta.pl
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class gw_krakow(BasicNewsRecipe):
|
class gw_krakow(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl Kraków'
|
title = u'Gazeta Wyborcza Kraków'
|
||||||
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
|
description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
|
||||||
|
@ -5,7 +5,7 @@ import string
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class GazetaPlSzczecin(BasicNewsRecipe):
|
class GazetaPlSzczecin(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl Szczecin'
|
title = u'Gazeta Wyborcza Szczecin'
|
||||||
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
|
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
|
||||||
__author__ = u'Michał Szkutnik'
|
__author__ = u'Michał Szkutnik'
|
||||||
__license__ = u'GPL v3'
|
__license__ = u'GPL v3'
|
||||||
|
@ -10,7 +10,7 @@ warszawa.gazeta.pl
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class gw_wawa(BasicNewsRecipe):
|
class gw_wawa(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl Warszawa'
|
title = u'Gazeta Wyborcza Warszawa'
|
||||||
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
|
description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
|
||||||
|
@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
from calibre.ebooks.BeautifulSoup import Comment
|
from calibre.ebooks.BeautifulSoup import Comment
|
||||||
|
|
||||||
class Gazeta_Wyborcza(BasicNewsRecipe):
|
class Gazeta_Wyborcza(BasicNewsRecipe):
|
||||||
title = u'Gazeta.pl'
|
title = u'Gazeta Wyborcza'
|
||||||
__author__ = 'fenuks, Artur Stachecki'
|
__author__ = 'fenuks, Artur Stachecki'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
|
description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
|
||||||
|
@ -20,7 +20,7 @@ class HBR(BasicNewsRecipe):
|
|||||||
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
|
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
|
||||||
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
|
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
|
||||||
'mailingListTout', 'partnerCenter', 'pageFooter',
|
'mailingListTout', 'partnerCenter', 'pageFooter',
|
||||||
'superNavHeadContainer', 'hbrDisqus',
|
'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
|
||||||
'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
|
'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
|
||||||
dict(name='iframe')]
|
dict(name='iframe')]
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
|
BIN
recipes/icons/forbes_pl.png
Normal file
After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
Before Width: | Height: | Size: 802 B After Width: | Height: | Size: 294 B |
BIN
recipes/icons/slashdot.png
Normal file
After Width: | Height: | Size: 250 B |
BIN
recipes/icons/sportowefakty.png
Normal file
After Width: | Height: | Size: 511 B |
BIN
recipes/icons/wysokie_obcasy.png
Normal file
After Width: | Height: | Size: 205 B |
@ -1,64 +1,44 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
|
|
||||||
'''
|
|
||||||
newyorker.com
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
www.canada.com
|
||||||
|
'''
|
||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||||
|
|
||||||
class NewYorker(BasicNewsRecipe):
|
class NewYorker(BasicNewsRecipe):
|
||||||
title = 'The New Yorker'
|
|
||||||
__author__ = 'Darko Miletic'
|
|
||||||
description = 'The best of US journalism'
|
|
||||||
oldest_article = 15
|
|
||||||
language = 'en'
|
|
||||||
max_articles_per_feed = 100
|
|
||||||
no_stylesheets = True
|
|
||||||
use_embedded_content = False
|
|
||||||
publisher = 'Conde Nast Publications'
|
|
||||||
category = 'news, politics, USA'
|
|
||||||
encoding = 'cp1252'
|
|
||||||
publication_type = 'magazine'
|
|
||||||
masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif'
|
|
||||||
extra_css = """
|
|
||||||
body {font-family: "Times New Roman",Times,serif}
|
|
||||||
.articleauthor{color: #9F9F9F;
|
|
||||||
font-family: Arial, sans-serif;
|
|
||||||
font-size: small;
|
|
||||||
text-transform: uppercase}
|
|
||||||
.rubric,.dd,h6#credit{color: #CD0021;
|
|
||||||
font-family: Arial, sans-serif;
|
|
||||||
font-size: small;
|
|
||||||
text-transform: uppercase}
|
|
||||||
.descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
|
|
||||||
.dd,h6#credit{color: gray}
|
|
||||||
.c{display: block}
|
|
||||||
.caption,h2#articleintro{font-style: italic}
|
|
||||||
.caption{font-size: small}
|
|
||||||
"""
|
|
||||||
|
|
||||||
conversion_options = {
|
|
||||||
'comment' : description
|
|
||||||
, 'tags' : category
|
|
||||||
, 'publisher' : publisher
|
|
||||||
, 'language' : language
|
|
||||||
}
|
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})]
|
title = u'New Yorker Magazine'
|
||||||
remove_tags = [
|
newyorker_prefix = 'http://m.newyorker.com'
|
||||||
dict(name=['meta','iframe','base','link','embed','object'])
|
description = u'Content from the New Yorker website'
|
||||||
,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] })
|
fp_tag = 'CAN_TC'
|
||||||
,dict(attrs={'id':['show-header','show-footer'] })
|
|
||||||
]
|
|
||||||
remove_tags_after = dict(attrs={'class':'entry-content'})
|
|
||||||
remove_attributes = ['lang']
|
|
||||||
feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')]
|
|
||||||
|
|
||||||
def print_version(self, url):
|
masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
|
||||||
return url + '?printable=true¤tPage=all'
|
|
||||||
|
|
||||||
def image_url_processor(self, baseurl, url):
|
compress_news_images = True
|
||||||
return url.strip()
|
compress_news_images_auto_size = 8
|
||||||
|
scale_news_images_to_device = False
|
||||||
|
scale_news_images = (768, 1024)
|
||||||
|
|
||||||
|
url_list = []
|
||||||
|
language = 'en'
|
||||||
|
__author__ = 'Nick Redding'
|
||||||
|
no_stylesheets = True
|
||||||
|
timefmt = ' [%b %d]'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
extra_css = '''
|
||||||
|
.byline { font-size:xx-small; font-weight: bold;}
|
||||||
|
h3 { margin-bottom: 6px; }
|
||||||
|
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
|
'''
|
||||||
|
keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
|
||||||
|
|
||||||
|
remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
|
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
|
||||||
@ -68,13 +48,233 @@ class NewYorker(BasicNewsRecipe):
|
|||||||
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
|
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
|
||||||
return cover_url
|
return cover_url
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def fixChars(self,string):
|
||||||
for item in soup.findAll(style=True):
|
# Replace lsquo (\x91)
|
||||||
del item['style']
|
fixed = re.sub("\x91","‘",string)
|
||||||
auth = soup.find(attrs={'id':'articleauthor'})
|
# Replace rsquo (\x92)
|
||||||
if auth:
|
fixed = re.sub("\x92","’",fixed)
|
||||||
alink = auth.find('a')
|
# Replace ldquo (\x93)
|
||||||
if alink and alink.string is not None:
|
fixed = re.sub("\x93","“",fixed)
|
||||||
txt = alink.string
|
# Replace rdquo (\x94)
|
||||||
alink.replaceWith(txt)
|
fixed = re.sub("\x94","”",fixed)
|
||||||
|
# Replace ndash (\x96)
|
||||||
|
fixed = re.sub("\x96","–",fixed)
|
||||||
|
# Replace mdash (\x97)
|
||||||
|
fixed = re.sub("\x97","—",fixed)
|
||||||
|
fixed = re.sub("’","’",fixed)
|
||||||
|
return fixed
|
||||||
|
|
||||||
|
def massageNCXText(self, description):
|
||||||
|
# Kindle TOC descriptions won't render certain characters
|
||||||
|
if description:
|
||||||
|
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||||
|
# Replace '&' with '&'
|
||||||
|
massaged = re.sub("&","&", massaged)
|
||||||
|
return self.fixChars(massaged)
|
||||||
|
else:
|
||||||
|
return description
|
||||||
|
|
||||||
|
def populate_article_metadata(self, article, soup, first):
|
||||||
|
if first:
|
||||||
|
picdiv = soup.find('body').find('img')
|
||||||
|
if picdiv is not None:
|
||||||
|
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
|
||||||
|
xtitle = article.text_summary.strip()
|
||||||
|
if len(xtitle) == 0:
|
||||||
|
desc = soup.find('meta',attrs={'property':'og:description'})
|
||||||
|
if desc is not None:
|
||||||
|
article.summary = article.text_summary = desc['content']
|
||||||
|
shortparagraph = ""
|
||||||
|
## try:
|
||||||
|
if len(article.text_summary.strip()) == 0:
|
||||||
|
articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
|
||||||
|
if articlebodies:
|
||||||
|
for articlebody in articlebodies:
|
||||||
|
if articlebody:
|
||||||
|
paras = articlebody.findAll('p')
|
||||||
|
for p in paras:
|
||||||
|
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
|
||||||
|
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
||||||
|
if len(refparagraph) > 0:
|
||||||
|
if len(refparagraph) > 70: #approximately one line of text
|
||||||
|
newpara = shortparagraph + refparagraph
|
||||||
|
article.summary = article.text_summary = newpara.strip()
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
shortparagraph = refparagraph + " "
|
||||||
|
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
||||||
|
shortparagraph = shortparagraph + "- "
|
||||||
|
else:
|
||||||
|
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
|
||||||
|
## except:
|
||||||
|
## self.log("Error creating article descriptions")
|
||||||
|
## return
|
||||||
|
|
||||||
|
|
||||||
|
def strip_anchors(self,soup):
|
||||||
|
paras = soup.findAll(True)
|
||||||
|
for para in paras:
|
||||||
|
aTags = para.findAll('a')
|
||||||
|
for a in aTags:
|
||||||
|
if a.img is None:
|
||||||
|
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
def preprocess_html(self,soup):
|
||||||
|
dateline = soup.find('div','published')
|
||||||
|
byline = soup.find('div','byline')
|
||||||
|
title = soup.find('h1','entry-title')
|
||||||
|
if title is None:
|
||||||
|
return self.strip_anchors(soup)
|
||||||
|
if byline is None:
|
||||||
|
title.append(dateline)
|
||||||
|
return self.strip_anchors(soup)
|
||||||
|
byline.append(dateline)
|
||||||
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
|
def load_global_nav(self,soup):
|
||||||
|
seclist = []
|
||||||
|
ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
|
||||||
|
if ul is not None:
|
||||||
|
for li in ul.findAll('li'):
|
||||||
|
if li.a is not None:
|
||||||
|
securl = li.a['href']
|
||||||
|
if securl != '/' and securl != '/magazine' and securl.startswith('/'):
|
||||||
|
seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
|
||||||
|
return seclist
|
||||||
|
|
||||||
|
def exclude_url(self,url):
|
||||||
|
if url in self.url_list:
|
||||||
|
return True
|
||||||
|
if not url.endswith('html'):
|
||||||
|
return True
|
||||||
|
if 'goings-on-about-town-app' in url:
|
||||||
|
return True
|
||||||
|
if 'something-to-be-thankful-for' in url:
|
||||||
|
return True
|
||||||
|
if '/shouts/' in url:
|
||||||
|
return True
|
||||||
|
if 'out-loud' in url:
|
||||||
|
return True
|
||||||
|
if '/rss/' in url:
|
||||||
|
return True
|
||||||
|
if '/video-' in url:
|
||||||
|
return True
|
||||||
|
self.url_list.append(url)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_index_page(self,soup):
|
||||||
|
article_list = []
|
||||||
|
for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
|
||||||
|
h2 = div.h2
|
||||||
|
if h2 is not None:
|
||||||
|
a = h2.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
byline = h2.span
|
||||||
|
if byline is not None:
|
||||||
|
author = self.tag_to_string(byline)
|
||||||
|
if author.startswith('by '):
|
||||||
|
author.replace('by ','')
|
||||||
|
byline.extract()
|
||||||
|
else:
|
||||||
|
author = ''
|
||||||
|
if h2.br is not None:
|
||||||
|
h2.br.replaceWith(' ')
|
||||||
|
title = self.tag_to_string(h2)
|
||||||
|
desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
|
||||||
|
if desc is not None:
|
||||||
|
description = self.tag_to_string(desc)
|
||||||
|
else:
|
||||||
|
description = ''
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
|
||||||
|
ul = div.find('ul','feature-blurb-links')
|
||||||
|
if ul is not None:
|
||||||
|
for li in ul.findAll('li'):
|
||||||
|
a = li.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
if a.br is not None:
|
||||||
|
a.br.replaceWith(' ')
|
||||||
|
title = '>>'+self.tag_to_string(a)
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||||
|
for h3 in soup.findAll('h3','header'):
|
||||||
|
a = h3.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
byline = h3.span
|
||||||
|
if byline is not None:
|
||||||
|
author = self.tag_to_string(byline)
|
||||||
|
if author.startswith('by '):
|
||||||
|
author = author.replace('by ','')
|
||||||
|
byline.extract()
|
||||||
|
else:
|
||||||
|
author = ''
|
||||||
|
if h3.br is not None:
|
||||||
|
h3.br.replaceWith(' ')
|
||||||
|
title = self.tag_to_string(h3).strip()
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
|
||||||
|
return article_list
|
||||||
|
|
||||||
|
def load_global_section(self,securl):
|
||||||
|
article_list = []
|
||||||
|
try:
|
||||||
|
soup = self.index_to_soup(securl)
|
||||||
|
except:
|
||||||
|
return article_list
|
||||||
|
if '/blogs/' not in securl:
|
||||||
|
return self.load_index_page(soup)
|
||||||
|
for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
|
||||||
|
h3 = div.h3
|
||||||
|
if h3 is not None:
|
||||||
|
a = h3.a
|
||||||
|
if a is not None:
|
||||||
|
url = a['href']
|
||||||
|
if not self.exclude_url(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = self.newyorker_prefix+url
|
||||||
|
if h3.br is not None:
|
||||||
|
h3.br.replaceWith(' ')
|
||||||
|
title = self.tag_to_string(h3)
|
||||||
|
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
|
||||||
|
return article_list
|
||||||
|
|
||||||
|
def filter_ans(self, ans) :
|
||||||
|
total_article_count = 0
|
||||||
|
idx = 0
|
||||||
|
idx_max = len(ans)-1
|
||||||
|
while idx <= idx_max:
|
||||||
|
if True: #self.verbose
|
||||||
|
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
||||||
|
for article in ans[idx][1]:
|
||||||
|
total_article_count += 1
|
||||||
|
if True: #self.verbose
|
||||||
|
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
||||||
|
article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
|
||||||
|
idx = idx+1
|
||||||
|
self.log( "Queued %d articles" % total_article_count )
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
ans = []
|
||||||
|
try:
|
||||||
|
soup = self.index_to_soup(self.newyorker_prefix)
|
||||||
|
except:
|
||||||
|
return ans
|
||||||
|
seclist = self.load_global_nav(soup)
|
||||||
|
ans.append(('Front Page',self.load_index_page(soup)))
|
||||||
|
for (sectitle,securl) in seclist:
|
||||||
|
ans.append((sectitle,self.load_global_section(securl)))
|
||||||
|
return self.filter_ans(ans)
|
||||||
|
|
||||||
|
70
recipes/sportowefakty.recipe
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
|
import re
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.utils.magick import Image
|
||||||
|
|
||||||
|
class sportowefakty(BasicNewsRecipe):
|
||||||
|
title = u'SportoweFakty'
|
||||||
|
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
|
||||||
|
language = 'pl'
|
||||||
|
description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
|
||||||
|
oldest_article = 1
|
||||||
|
masthead_url='http://www.sportowefakty.pl/images/logo.png'
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
simultaneous_downloads = 5
|
||||||
|
use_embedded_content=False
|
||||||
|
remove_javascript=True
|
||||||
|
no_stylesheets=True
|
||||||
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
|
|
||||||
|
keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
|
||||||
|
remove_tags =[]
|
||||||
|
remove_tags.append(dict(attrs = {'class' : re.compile(r'^newsStream')}))
|
||||||
|
remove_tags.append(dict(attrs = {'target' : '_blank'}))
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
(u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
|
||||||
|
(u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
|
||||||
|
(u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
|
||||||
|
(u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
|
||||||
|
(u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
|
||||||
|
(u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
|
||||||
|
(u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
|
||||||
|
(u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_article_url(self, article):
|
||||||
|
link = article.get('link', None)
|
||||||
|
if 'utm_source' in link:
|
||||||
|
return link.split('?utm')[0]
|
||||||
|
else:
|
||||||
|
return link
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
print_url = url + '/drukuj'
|
||||||
|
return print_url
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
head = soup.find('h1')
|
||||||
|
if 'Fotorelacja' in self.tag_to_string(head):
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
for alink in soup.findAll('a'):
|
||||||
|
if alink.string is not None:
|
||||||
|
tstr = alink.string
|
||||||
|
alink.replaceWith(tstr)
|
||||||
|
return soup
|
||||||
|
|
||||||
|
def postprocess_html(self, soup, first):
|
||||||
|
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
||||||
|
iurl = tag['src']
|
||||||
|
img = Image()
|
||||||
|
img.open(iurl)
|
||||||
|
if img < 0:
|
||||||
|
raise RuntimeError('Out of memory')
|
||||||
|
img.type = "GrayscaleType"
|
||||||
|
img.save(iurl)
|
||||||
|
return soup
|
@ -36,47 +36,21 @@ class TheOnion(BasicNewsRecipe):
|
|||||||
, 'publisher': publisher
|
, 'publisher': publisher
|
||||||
, 'language' : language
|
, 'language' : language
|
||||||
}
|
}
|
||||||
|
keep_only_tags = [dict(name='article', attrs={'class':'full-article'})]
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='h2', attrs={'class':['section_title','title']})
|
|
||||||
,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
|
|
||||||
,dict(attrs={'id':['entries']})
|
|
||||||
]
|
|
||||||
remove_attributes=['lang','rel']
|
|
||||||
remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['object','link','iframe','base','meta'])
|
dict(name=['nav', 'aside', 'section', 'meta']),
|
||||||
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
|
{'attrs':{'class':lambda x: x and ('share-tools' in x or 'ad-zone' in x)}},
|
||||||
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
|
]
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
||||||
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_article_url(self, article):
|
def preprocess_html(self, soup, *args):
|
||||||
artl = BasicNewsRecipe.get_article_url(self, article)
|
for img in soup.findAll('img', attrs={'data-src':True}):
|
||||||
if artl.startswith('http://www.theonion.com/audio/'):
|
if img['data-src']:
|
||||||
artl = None
|
img['src'] = img['data-src']
|
||||||
return artl
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
for item in soup.findAll(style=True):
|
|
||||||
del item['style']
|
|
||||||
for item in soup.findAll('a'):
|
|
||||||
limg = item.find('img')
|
|
||||||
if item.string is not None:
|
|
||||||
str = item.string
|
|
||||||
item.replaceWith(str)
|
|
||||||
else:
|
|
||||||
if limg:
|
|
||||||
item.name = 'div'
|
|
||||||
item.attrs = []
|
|
||||||
if not limg.has_key('alt'):
|
|
||||||
limg['alt'] = 'image'
|
|
||||||
else:
|
|
||||||
str = self.tag_to_string(item)
|
|
||||||
item.replaceWith(str)
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
17
recipes/universe_today.recipe
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class UniverseToday(BasicNewsRecipe):
|
||||||
|
title = u'Universe Today'
|
||||||
|
language = 'en'
|
||||||
|
description = u'Space and astronomy news.'
|
||||||
|
__author__ = 'seird'
|
||||||
|
publisher = u'universetoday.com'
|
||||||
|
category = 'science, astronomy, news, rss'
|
||||||
|
oldest_article = 7
|
||||||
|
max_articles_per_feed = 40
|
||||||
|
auto_cleanup = True
|
||||||
|
no_stylesheets = True
|
||||||
|
use_embedded_content = False
|
||||||
|
remove_empty_feeds = True
|
||||||
|
|
||||||
|
feeds = [(u'Universe Today', u'http://feeds.feedburner.com/universetoday/pYdq')]
|
@ -6,17 +6,62 @@ __license__ = 'GPL v3'
|
|||||||
www.canada.com
|
www.canada.com
|
||||||
'''
|
'''
|
||||||
import re
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
|
||||||
|
|
||||||
|
|
||||||
class TimesColonist(BasicNewsRecipe):
|
class TimesColonist(BasicNewsRecipe):
|
||||||
|
|
||||||
|
# Customization -- remove sections you don't want.
|
||||||
|
# If your e-reader is an e-ink Kindle and your output profile is
|
||||||
|
# set properly this recipe will not include images because the
|
||||||
|
# resulting file is too large. If you have one of these and want
|
||||||
|
# images you can set kindle_omit_images = False
|
||||||
|
# and remove sections (typically the e-ink Kindles will
|
||||||
|
# work with about a dozen of these, but your mileage may vary).
|
||||||
|
|
||||||
|
kindle_omit_images = True
|
||||||
|
|
||||||
|
section_list = [
|
||||||
|
('','Web Front Page'),
|
||||||
|
('news/','News Headlines'),
|
||||||
|
('news/b-c/','BC News'),
|
||||||
|
('news/national/','National News'),
|
||||||
|
('news/world/','World News'),
|
||||||
|
('opinion/','Opinion'),
|
||||||
|
('opinion/letters/','Letters'),
|
||||||
|
('business/','Business'),
|
||||||
|
('business/money/','Money'),
|
||||||
|
('business/technology/','Technology'),
|
||||||
|
('business/working/','Working'),
|
||||||
|
('sports/','Sports'),
|
||||||
|
('sports/hockey/','Hockey'),
|
||||||
|
('sports/football/','Football'),
|
||||||
|
('sports/basketball/','Basketball'),
|
||||||
|
('sports/golf/','Golf'),
|
||||||
|
('entertainment/','entertainment'),
|
||||||
|
('entertainment/go/','Go!'),
|
||||||
|
('entertainment/music/','Music'),
|
||||||
|
('entertainment/books/','Books'),
|
||||||
|
('entertainment/Movies/','Movies'),
|
||||||
|
('entertainment/television/','Television'),
|
||||||
|
('life/','Life'),
|
||||||
|
('life/health/','Health'),
|
||||||
|
('life/travel/','Travel'),
|
||||||
|
('life/driving/','Driving'),
|
||||||
|
('life/homes/','Homes'),
|
||||||
|
('life/food-drink/','Food & Drink')
|
||||||
|
]
|
||||||
|
|
||||||
title = u'Victoria Times Colonist'
|
title = u'Victoria Times Colonist'
|
||||||
url_prefix = 'http://www.timescolonist.com'
|
url_prefix = 'http://www.timescolonist.com'
|
||||||
description = u'News from Victoria, BC'
|
description = u'News from Victoria, BC'
|
||||||
fp_tag = 'CAN_TC'
|
fp_tag = 'CAN_TC'
|
||||||
|
|
||||||
|
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
|
||||||
|
|
||||||
|
|
||||||
url_list = []
|
url_list = []
|
||||||
language = 'en_CA'
|
language = 'en_CA'
|
||||||
__author__ = 'Nick Redding'
|
__author__ = 'Nick Redding'
|
||||||
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
.caption { font-size: xx-small; font-style: italic; font-weight: normal; }
|
||||||
'''
|
'''
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
|
||||||
remove_tags = [{'class':'comments'},
|
|
||||||
{'id':'photocredit'},
|
|
||||||
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('social')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('tools')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('window')}),
|
|
||||||
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
|
||||||
|
|
||||||
|
def __init__(self, options, log, progress_reporter):
|
||||||
|
self.remove_tags = [{'class':'comments'},
|
||||||
|
{'id':'photocredit'},
|
||||||
|
dict(name='div', attrs={'class':re.compile('top.controls')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('^comments')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('social')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('tools')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('bottom.tools')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('window')}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('related.news.element')})]
|
||||||
|
print("PROFILE NAME = "+options.output_profile.short_name)
|
||||||
|
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
|
||||||
|
self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
|
||||||
|
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
from datetime import timedelta, date
|
from datetime import timedelta, date
|
||||||
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
def preprocess_html(self,soup):
|
def preprocess_html(self,soup):
|
||||||
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
byline = soup.find('p',attrs={'class':re.compile('ancillary')})
|
||||||
if byline is not None:
|
if byline is not None:
|
||||||
byline.find('a')
|
|
||||||
authstr = self.tag_to_string(byline,False)
|
authstr = self.tag_to_string(byline,False)
|
||||||
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
|
||||||
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
|
||||||
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
atag = htag.a
|
atag = htag.a
|
||||||
if atag is not None:
|
if atag is not None:
|
||||||
url = atag['href']
|
url = atag['href']
|
||||||
#print("Checking "+url)
|
url = url.strip()
|
||||||
if atag['href'].startswith('/'):
|
# print("Checking >>"+url+'<<\n\r')
|
||||||
url = self.url_prefix+atag['href']
|
if url.startswith('/'):
|
||||||
|
url = self.url_prefix+url
|
||||||
if url in self.url_list:
|
if url in self.url_list:
|
||||||
return
|
return
|
||||||
self.url_list.append(url)
|
self.url_list.append(url)
|
||||||
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
if dtag is not None:
|
if dtag is not None:
|
||||||
description = self.tag_to_string(dtag,False)
|
description = self.tag_to_string(dtag,False)
|
||||||
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
|
||||||
#print(sectitle+title+": description = "+description+" URL="+url)
|
print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
|
||||||
|
|
||||||
def add_section_index(self,ans,securl,sectitle):
|
def add_section_index(self,ans,securl,sectitle):
|
||||||
print("Add section url="+self.url_prefix+'/'+securl)
|
print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
|
||||||
try:
|
try:
|
||||||
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
soup = self.index_to_soup(self.url_prefix+'/'+securl)
|
||||||
except:
|
except:
|
||||||
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
ans = []
|
ans = []
|
||||||
ans = self.add_section_index(ans,'','Web Front Page')
|
for (url,title) in self.section_list:
|
||||||
ans = self.add_section_index(ans,'news/','News Headlines')
|
ans = self.add_section_index(ans,url,title)
|
||||||
ans = self.add_section_index(ans,'news/b-c/','BC News')
|
|
||||||
ans = self.add_section_index(ans,'news/national/','Natioanl News')
|
|
||||||
ans = self.add_section_index(ans,'news/world/','World News')
|
|
||||||
ans = self.add_section_index(ans,'opinion/','Opinion')
|
|
||||||
ans = self.add_section_index(ans,'opinion/letters/','Letters')
|
|
||||||
ans = self.add_section_index(ans,'business/','Business')
|
|
||||||
ans = self.add_section_index(ans,'business/money/','Money')
|
|
||||||
ans = self.add_section_index(ans,'business/technology/','Technology')
|
|
||||||
ans = self.add_section_index(ans,'business/working/','Working')
|
|
||||||
ans = self.add_section_index(ans,'sports/','Sports')
|
|
||||||
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
|
|
||||||
ans = self.add_section_index(ans,'sports/football/','Football')
|
|
||||||
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
|
|
||||||
ans = self.add_section_index(ans,'sports/golf/','Golf')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/','entertainment')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/go/','Go!')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/music/','Music')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/books/','Books')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
|
|
||||||
ans = self.add_section_index(ans,'entertainment/television/','Television')
|
|
||||||
ans = self.add_section_index(ans,'life/','Life')
|
|
||||||
ans = self.add_section_index(ans,'life/health/','Health')
|
|
||||||
ans = self.add_section_index(ans,'life/travel/','Travel')
|
|
||||||
ans = self.add_section_index(ans,'life/driving/','Driving')
|
|
||||||
ans = self.add_section_index(ans,'life/homes/','Homes')
|
|
||||||
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
|
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
@ -1,144 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
||||||
|
|
||||||
class GazetaWyborczaDuzyForma(BasicNewsRecipe):
|
|
||||||
cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
|
|
||||||
title = u"Gazeta Wyborcza Duzy Format"
|
|
||||||
__author__ = 'ravcio - rlelusz[at]gmail.com'
|
|
||||||
description = u"Articles from Gazeta's website"
|
|
||||||
language = 'pl'
|
|
||||||
max_articles_per_feed = 50 #you can increade it event up to maybe 600, should still work
|
|
||||||
recursions = 0
|
|
||||||
encoding = 'iso-8859-2'
|
|
||||||
no_stylesheets = True
|
|
||||||
remove_javascript = True
|
|
||||||
use_embedded_content = False
|
|
||||||
|
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='div', attrs={'id':['k1']})
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_tags = [
|
|
||||||
dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
|
|
||||||
,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
|
|
||||||
,dict(name='ul', attrs={'id':['articleToolbar']})
|
|
||||||
,dict(name='img', attrs={'class':['brand']})
|
|
||||||
,dict(name='h5', attrs={'class':['author']})
|
|
||||||
,dict(name='h6', attrs={'class':['date']})
|
|
||||||
,dict(name='p', attrs={'class':['txt_upl']})
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_tags_after = [
|
|
||||||
dict(name='div', attrs={'id':['Str']}) #nawigator numerow linii
|
|
||||||
]
|
|
||||||
|
|
||||||
def load_article_links(self, url, count):
|
|
||||||
print '--- load_article_links', url, count
|
|
||||||
|
|
||||||
#page with link to articles
|
|
||||||
soup = self.index_to_soup(url)
|
|
||||||
|
|
||||||
#table with articles
|
|
||||||
list = soup.find('div', attrs={'class':'GWdalt'})
|
|
||||||
|
|
||||||
#single articles (link, title, ...)
|
|
||||||
links = list.findAll('div', attrs={'class':['GWdaltE']})
|
|
||||||
|
|
||||||
if len(links) < count:
|
|
||||||
#load links to more articles...
|
|
||||||
|
|
||||||
#remove new link
|
|
||||||
pages_nav = list.find('div', attrs={'class':'pages'})
|
|
||||||
next = pages_nav.find('a', attrs={'class':'next'})
|
|
||||||
if next:
|
|
||||||
print 'next=', next['href']
|
|
||||||
url = 'http://wyborcza.pl' + next['href']
|
|
||||||
#e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
|
|
||||||
|
|
||||||
older_links = self.load_article_links(url, count - len(links))
|
|
||||||
links.extend(older_links)
|
|
||||||
|
|
||||||
return links
|
|
||||||
|
|
||||||
|
|
||||||
#produce list of articles to download
|
|
||||||
def parse_index(self):
|
|
||||||
print '--- parse_index'
|
|
||||||
|
|
||||||
max_articles = 8000
|
|
||||||
links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
|
|
||||||
|
|
||||||
ans = []
|
|
||||||
key = None
|
|
||||||
articles = {}
|
|
||||||
|
|
||||||
key = 'Uncategorized'
|
|
||||||
articles[key] = []
|
|
||||||
|
|
||||||
for div_art in links:
|
|
||||||
div_date = div_art.find('div', attrs={'class':'kL'})
|
|
||||||
div = div_art.find('div', attrs={'class':'kR'})
|
|
||||||
|
|
||||||
a = div.find('a', href=True)
|
|
||||||
|
|
||||||
url = a['href']
|
|
||||||
title = a.string
|
|
||||||
description = ''
|
|
||||||
pubdate = div_date.string.rstrip().lstrip()
|
|
||||||
summary = div.find('span', attrs={'class':'lead'})
|
|
||||||
|
|
||||||
desc = summary.find('a', href=True)
|
|
||||||
if desc:
|
|
||||||
desc.extract()
|
|
||||||
|
|
||||||
description = self.tag_to_string(summary, use_alt=False)
|
|
||||||
description = description.rstrip().lstrip()
|
|
||||||
|
|
||||||
feed = key if key is not None else 'Duzy Format'
|
|
||||||
|
|
||||||
if not articles.has_key(feed):
|
|
||||||
articles[feed] = []
|
|
||||||
|
|
||||||
if description != '': # skip just pictures atricle
|
|
||||||
articles[feed].append(
|
|
||||||
dict(title=title, url=url, date=pubdate,
|
|
||||||
description=description,
|
|
||||||
content=''))
|
|
||||||
|
|
||||||
ans = [(key, articles[key])]
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def append_page(self, soup, appendtag, position):
|
|
||||||
pager = soup.find('div',attrs={'id':'Str'})
|
|
||||||
if pager:
|
|
||||||
#seek for 'a' element with nast value (if not found exit)
|
|
||||||
list = pager.findAll('a')
|
|
||||||
|
|
||||||
for elem in list:
|
|
||||||
if 'nast' in elem.string:
|
|
||||||
nexturl = elem['href']
|
|
||||||
|
|
||||||
soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
|
|
||||||
|
|
||||||
texttag = soup2.find('div', attrs={'id':'artykul'})
|
|
||||||
|
|
||||||
newpos = len(texttag.contents)
|
|
||||||
self.append_page(soup2,texttag,newpos)
|
|
||||||
texttag.extract()
|
|
||||||
appendtag.insert(position,texttag)
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
self.append_page(soup, soup.body, 3)
|
|
||||||
|
|
||||||
# finally remove some tags
|
|
||||||
pager = soup.find('div',attrs={'id':'Str'})
|
|
||||||
if pager:
|
|
||||||
pager.extract()
|
|
||||||
|
|
||||||
pager = soup.find('div',attrs={'class':'tylko_int'})
|
|
||||||
if pager:
|
|
||||||
pager.extract()
|
|
||||||
|
|
||||||
return soup
|
|
57
recipes/wysokie_obcasy.recipe
Normal file
@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe


class WysokieObcasyRecipe(BasicNewsRecipe):
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Wysokie Obcasy'
    publisher = 'Agora SA'
    description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
    category = 'magazine'
    language = 'pl'
    publication_type = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0

    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 5

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'article'}))

    remove_tags = []
    remove_tags.append(dict(name='img'))
    remove_tags.append(dict(name='p', attrs={'class': 'info'}))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        h1 {text-align: left;}
    '''

    feeds = [
        ('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
    ]

    def print_version(self, url):
        baseURL = 'http://www.wysokieobcasy.pl/wysokie-obcasy'
        segments = url.split(',')
        subPath = '/2029020,'
        articleURL1 = segments[1]
        articleURL2 = segments[2]
        printVerString = articleURL1 + ',' + articleURL2
        s = baseURL + subPath + printVerString + '.html'
        return s

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
        self.cover_url = soup.find(attrs={'class': 'holder_cr'}).find('img')['src']
        return getattr(self, 'cover_url', self.cover_url)
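A quick sanity check of the print_version() rewrite above; the article URL below is made up and only the comma-separated segments matter:

    url = 'http://www.wysokieobcasy.pl/wysokie-obcasy/1,96856,13693318,Tytul_artykulu.html'
    segments = url.split(',')
    # print_version() keeps segments[1] and segments[2] and rebuilds the print URL
    print('http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,'
          + segments[1] + ',' + segments[2] + '.html')
    # http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,96856,13693318.html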
@ -357,7 +357,7 @@
 <xsl:apply-templates/>
 </xsl:template>

 <xsl:template match="rtf:table">
 <xsl:element name="table">
 <xsl:attribute name="id">
 <xsl:value-of select="generate-id(.)"/>
@ -390,7 +390,6 @@


 <xsl:output method = "xml"/>

 <xsl:key name="style-types" match="rtf:paragraph-definition" use="@style-number"/>


@ -415,13 +414,11 @@
 </xsl:template>

 <xsl:template match="rtf:page-break">
-<xsl:element name="br">
-<xsl:attribute name="style">page-break-after:always</xsl:attribute>
-</xsl:element>
+<br style = "page-break-after:always"/>
 </xsl:template>

 <xsl:template match="rtf:hardline-break">
-<xsl:element name="br"/>
+<br/>
 </xsl:template>

 <xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/>
@ -445,7 +442,7 @@
 </xsl:template>

 <xsl:template match = "rtf:field-block">
 <xsl:apply-templates/>
 </xsl:template>

 <xsl:template match = "rtf:field[@type='hyperlink']">
@ -472,9 +469,7 @@
 </xsl:template>

 <xsl:template match="rtf:pict">
-<xsl:element name="img">
-<xsl:attribute name="src"><xsl:value-of select="@num" /></xsl:attribute>
-</xsl:element>
+<img src = "{@num}"/>
 </xsl:template>

 <xsl:template match="*">
@ -4,7 +4,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__ = u'calibre'
-numeric_version = (0, 9, 25)
+numeric_version = (0, 9, 26)
 __version__ = u'.'.join(map(unicode, numeric_version))
 __author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"

@ -757,9 +757,10 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
 from calibre.ebooks.metadata.sources.overdrive import OverDrive
 from calibre.ebooks.metadata.sources.douban import Douban
 from calibre.ebooks.metadata.sources.ozon import Ozon
-# from calibre.ebooks.metadata.sources.google_images import GoogleImages
+from calibre.ebooks.metadata.sources.google_images import GoogleImages
+from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch

-plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
+plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon, BigBookSearch]

 # }}}

@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
     config['enabled_plugins'] = ep

 default_disabled_plugins = set([
-        'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images',
+        'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', 'Big Book Search',
 ])

 def is_disabled(plugin):
@ -132,7 +132,7 @@ class Worker(Thread): # Get details {{{
                 text()="Détails sur le produit" or \
                 text()="Detalles del producto" or \
                 text()="Detalhes do produto" or \
-                text()="登録情報"]/../div[@class="content"]
+                starts-with(text(), "登録情報")]/../div[@class="content"]
                 '''
         # Editor: is for Spanish
         self.publisher_xpath = '''
@ -235,6 +235,12 @@ class Worker(Thread): # Get details {{{
             msg = 'Failed to parse amazon details page: %r'%self.url
             self.log.exception(msg)
             return
+        if self.domain == 'jp':
+            for a in root.xpath('//a[@href]'):
+                if 'black-curtain-redirect.html' in a.get('href'):
+                    self.url = 'http://amazon.co.jp'+a.get('href')
+                    self.log('Black curtain redirect found, following')
+                    return self.get_details()

         errmsg = root.xpath('//*[@id="errorMessage"]')
         if errmsg:
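The block added above follows Amazon JP's adult-content interstitial (the "black curtain") by re-requesting the real product page. A standalone sketch of just the link detection, using lxml; the HTML fragment is made up:

    from lxml import html

    # Hypothetical fragment of an interstitial page returned by amazon.co.jp
    root = html.fromstring('<a href="/gp/black-curtain-redirect.html?location=/dp/4799500066">Continue</a>')
    for a in root.xpath('//a[@href]'):
        if 'black-curtain-redirect.html' in a.get('href'):
            print('http://amazon.co.jp' + a.get('href'))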
@ -252,8 +258,8 @@ class Worker(Thread): # Get details {{{
             self.log.exception('Error parsing asin for url: %r'%self.url)
             asin = None
         if self.testing:
-            import tempfile
-            with tempfile.NamedTemporaryFile(prefix=asin + '_',
+            import tempfile, uuid
+            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                     suffix='.html', delete=False) as f:
                 f.write(raw)
                 print ('Downloaded html for', asin, 'saved in', f.name)
@ -499,7 +505,7 @@ class Worker(Thread): # Get details {{{
     def parse_language(self, pd):
         for x in reversed(pd.xpath(self.language_xpath)):
             if x.tail:
-                raw = x.tail.strip()
+                raw = x.tail.strip().partition(',')[0].strip()
                 ans = self.lang_map.get(raw, None)
                 if ans:
                     return ans
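The partition() change above keeps only the text before the first comma, so a field that lists several languages can still be looked up in lang_map. A quick illustration with a made-up value:

    raw = 'English, French'
    raw.strip()                               # 'English, French'  (old behaviour)
    raw.strip().partition(',')[0].strip()     # 'English'          (new behaviour)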
@ -1004,6 +1010,11 @@ if __name__ == '__main__': # tests {{{
     ] # }}}

     jp_tests = [ # {{{
+            ( # Adult filtering test
+            {'identifiers':{'isbn':'4799500066'}},
+            [title_test(u'Bitch Trap'),]
+            ),
+
             ( # isbn -> title, authors
             {'identifiers':{'isbn': '9784101302720' }},
             [title_test(u'精霊の守り人',
@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
 # Google covers are often poor quality (scans/errors) but they have high
 # resolution, so they trump covers from better sources. So make sure they
 # are only used if no other covers are found.
-msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2}
+msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2, 'Big Book Search':2}

 def create_log(ostream=None):
     from calibre.utils.logging import ThreadSafeLog, FileStream
@ -429,6 +429,40 @@ class Source(Plugin):
         mi.tags = list(map(fixcase, mi.tags))
         mi.isbn = check_isbn(mi.isbn)

+    def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
+        if not urls:
+            log('No images found for, title: %r and authors: %r'%(title, authors))
+            return
+        from threading import Thread
+        import time
+        if prefs_name:
+            urls = urls[:self.prefs[prefs_name]]
+        if get_best_cover:
+            urls = urls[:1]
+        log('Downloading %d covers'%len(urls))
+        workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
+        for w in workers:
+            w.daemon = True
+            w.start()
+        alive = True
+        start_time = time.time()
+        while alive and not abort.is_set() and time.time() - start_time < timeout:
+            alive = False
+            for w in workers:
+                if w.is_alive():
+                    alive = True
+                    break
+            abort.wait(0.1)
+
+    def download_image(self, url, timeout, log, result_queue):
+        try:
+            ans = self.browser.open_novisit(url, timeout=timeout).read()
+            result_queue.put((self, ans))
+            log('Downloaded cover from: %s'%url)
+        except Exception:
+            self.log.exception('Failed to download cover from: %r'%url)
+
+
 # }}}

 # Metadata API {{{
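The new download_multiple_covers() helper reports each fetched cover by putting a (plugin, raw_bytes) tuple on result_queue, as download_image() above shows. A minimal sketch of draining such a queue after a run; the queue name and surrounding usage are illustrative only:

    from Queue import Queue, Empty   # Python 2 stdlib, as used by this code

    rq = Queue()
    # ... a cover-capable Source subclass calls
    # self.download_multiple_covers(title, authors, urls, get_best_cover,
    #                               timeout, rq, abort, log) ...
    covers = []
    while True:
        try:
            plugin, raw = rq.get_nowait()   # tuples put by download_image()
        except Empty:
            break
        covers.append(raw)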
src/calibre/ebooks/metadata/sources/big_book_search.py (new file, 58 lines)
@ -0,0 +1,58 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.ebooks.metadata.sources.base import Source, Option

def get_urls(br, tokens):
    from urllib import quote_plus
    from mechanize import Request
    from lxml import html
    escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
    q = b'+'.join(escaped)
    url = 'http://bigbooksearch.com/books/'+q
    br.open(url).read()
    req = Request('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1'%q)
    req.add_header('X-Requested-With', 'XMLHttpRequest')
    req.add_header('Referer', url)
    raw = br.open(req).read()
    root = html.fromstring(raw.decode('utf-8'))
    urls = [i.get('src') for i in root.xpath('//img[@src]')]
    return urls

class BigBookSearch(Source):

    name = 'Big Book Search'
    description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Big Book Search plugin')
    can_get_multiple_covers = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the search result')),
              )
    supports_gzip_transfer_encoding = True

    def download_cover(self, log, result_queue, abort,
                       title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        br = self.browser
        tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))
        urls = get_urls(br, tokens)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

def test():
    from calibre import browser
    import pprint
    br = browser()
    urls = get_urls(br, ['consider', 'phlebas', 'banks'])
    pprint.pprint(urls)

if __name__ == '__main__':
    test()
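For orientation, these are the search URL and the XHR query URL that get_urls() above builds for the tokens used in test(); same quoting logic, no network access:

    from urllib import quote_plus   # Python 2, as in the module above

    tokens = ['consider', 'phlebas', 'banks']
    escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
    q = b'+'.join(escaped)
    print('http://bigbooksearch.com/books/' + q)
    # http://bigbooksearch.com/books/consider+phlebas+banks
    print('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1' % q)
    # http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=consider+phlebas+banks&ItemPage=1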
@ -18,12 +18,13 @@ from calibre.utils.magick.draw import Image, save_cover_data_to

 class Worker(Thread):

-    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
+    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq, get_best_cover=False):
         Thread.__init__(self)
         self.daemon = True

         self.plugin = plugin
         self.abort = abort
+        self.get_best_cover = get_best_cover
         self.buf = BytesIO()
         self.log = create_log(self.buf)
         self.title, self.authors, self.identifiers = (title, authors,
@ -37,7 +38,7 @@ class Worker(Thread):
         try:
             if self.plugin.can_get_multiple_covers:
                 self.plugin.download_cover(self.log, self.rq, self.abort,
-                    title=self.title, authors=self.authors, get_best_cover=True,
+                    title=self.title, authors=self.authors, get_best_cover=self.get_best_cover,
                     identifiers=self.identifiers, timeout=self.timeout)
             else:
                 self.plugin.download_cover(self.log, self.rq, self.abort,
@ -72,7 +73,7 @@ def process_result(log, result):
     return (plugin, width, height, fmt, data)

 def run_download(log, results, abort,
-        title=None, authors=None, identifiers={}, timeout=30):
+        title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
     '''
     Run the cover download, putting results into the queue :param:`results`.

@ -89,7 +90,7 @@ def run_download(log, results, abort,
     plugins = [p for p in metadata_plugins(['cover']) if p.is_configured()]

     rq = Queue()
-    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
+    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq, get_best_cover=get_best_cover) for p
             in plugins]
     for w in workers:
         w.start()
@ -163,7 +164,7 @@ def download_cover(log,
     abort = Event()

     run_download(log, rq, abort, title=title, authors=authors,
-            identifiers=identifiers, timeout=timeout)
+            identifiers=identifiers, timeout=timeout, get_best_cover=True)

     results = []

@ -106,6 +106,8 @@ class Worker(Thread): # {{{
             parts = pub.partition(':')[0::2]
             pub = parts[1] or parts[0]
             try:
+                if ', Ship Date:' in pub:
+                    pub = pub.partition(', Ship Date:')[0]
                 q = parse_only_date(pub, assume_utc=True)
                 if q.year != UNDEFINED_DATE:
                     mi.pubdate = q
@ -39,39 +39,11 @@ class GoogleImages(Source):
             title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
         if not title:
             return
-        from threading import Thread
-        import time
         timeout = max(60, timeout) # Needs at least a minute
         title = ' '.join(self.get_title_tokens(title))
         author = ' '.join(self.get_author_tokens(authors))
         urls = self.get_image_urls(title, author, log, abort, timeout)
-        if not urls:
-            log('No images found in Google for, title: %r and authors: %r'%(title, author))
-            return
-        urls = urls[:self.prefs['max_covers']]
-        if get_best_cover:
-            urls = urls[:1]
-        workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
-        for w in workers:
-            w.daemon = True
-            w.start()
-        alive = True
-        start_time = time.time()
-        while alive and not abort.is_set() and time.time() - start_time < timeout:
-            alive = False
-            for w in workers:
-                if w.is_alive():
-                    alive = True
-                    break
-            abort.wait(0.1)
-
-    def download_image(self, url, timeout, log, result_queue):
-        try:
-            ans = self.browser.open_novisit(url, timeout=timeout).read()
-            result_queue.put((self, ans))
-            log('Downloaded cover from: %s'%url)
-        except Exception:
-            self.log.exception('Failed to download cover from: %r'%url)
+        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

     def get_image_urls(self, title, author, log, abort, timeout):
         from calibre.utils.ipc.simple_worker import fork_job, WorkerError
@ -180,5 +180,6 @@ class BorderParse:
             elif 'single' in border_style_list:
                 new_border_dict[att] = 'single'
             else:
-                new_border_dict[att] = border_style_list[0]
+                if border_style_list:
+                    new_border_dict[att] = border_style_list[0]
         return new_border_dict
@ -559,11 +559,11 @@ class TOCView(QWidget): # {{{
         b.setToolTip(_('Remove all selected entries'))
         b.clicked.connect(self.del_items)

-        self.left_button = b = QToolButton(self)
+        self.right_button = b = QToolButton(self)
         b.setIcon(QIcon(I('forward.png')))
         b.setIconSize(QSize(ICON_SIZE, ICON_SIZE))
         l.addWidget(b, 4, 3)
-        b.setToolTip(_('Unindent the current entry [Ctrl+Left]'))
+        b.setToolTip(_('Indent the current entry [Ctrl+Right]'))
         b.clicked.connect(self.tocw.move_right)

         self.down_button = b = QToolButton(self)
@ -54,7 +54,7 @@ def get_parser(usage):
 def get_db(dbpath, options):
     global do_notify
     if options.library_path is not None:
-        dbpath = options.library_path
+        dbpath = os.path.expanduser(options.library_path)
     if dbpath is None:
         raise ValueError('No saved library path, either run the GUI or use the'
                          ' --with-library option')
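The expanduser() change above means a --with-library value written with a leading tilde now resolves to the real home directory; the path below is hypothetical:

    import os
    print(os.path.expanduser('~/Books'))   # e.g. /home/user/Books instead of the literal '~/Books'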
@ -174,7 +174,13 @@ def _extractall(f, path=None, file_info=None):
         has_data_descriptors = header.flags & (1 << 3)
         seekval = header.compressed_size + (16 if has_data_descriptors else 0)
         found = True
-        parts = header.filename.split('/')
+        # Sanitize path, changing absolute to relative paths and removing
+        # .. and .
+        fname = header.filename.replace(os.sep, '/')
+        fname = os.path.splitdrive(fname)[1]
+        parts = [x for x in fname.split('/') if x not in {'', os.path.pardir, os.path.curdir}]
+        if not parts:
+            continue
         if header.uncompressed_size == 0:
             # Directory
             f.seek(f.tell()+seekval)
@ -17,8 +17,7 @@ class MReplace(UserDict):

     def compile_regex(self):
         if len(self.data) > 0:
-            keys = sorted(self.data.keys(), key=len)
-            keys.reverse()
+            keys = sorted(self.data.keys(), key=len, reverse=True)
             tmp = "(%s)" % "|".join(map(re.escape, keys))
             if self.re != tmp:
                 self.re = tmp
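The reverse=True sort above replaces the separate keys.reverse() call; either way the keys must be ordered longest-first, because re alternation takes the first branch that matches at a given position. A standalone illustration with a made-up replacement table:

    import re

    data = {'cat': 'feline', 'catalog': 'list'}
    shortest_first = "(%s)" % "|".join(map(re.escape, sorted(data, key=len)))
    longest_first = "(%s)" % "|".join(map(re.escape, sorted(data, key=len, reverse=True)))
    re.sub(shortest_first, lambda m: data[m.group(1)], 'catalog')  # 'felinealog' (wrong)
    re.sub(longest_first, lambda m: data[m.group(1)], 'catalog')   # 'list'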
@ -1099,10 +1099,13 @@ class ZipFile:

         base_target = targetpath # Added by Kovid

-        # don't include leading "/" from file name if present
-        fname = member.filename
-        if fname.startswith('/'):
-            fname = fname[1:]
+        # Sanitize path, changing absolute paths to relative paths
+        # and removing .. and . (changed by Kovid)
+        fname = member.filename.replace(os.sep, '/')
+        fname = os.path.splitdrive(fname)[1]
+        fname = '/'.join(x for x in fname.split('/') if x not in {'', os.path.curdir, os.path.pardir})
+        if not fname:
+            raise BadZipfile('The member %r has an invalid name'%member.filename)

         targetpath = os.path.normpath(os.path.join(base_target, fname))

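A quick illustration of what the sanitization above does to a hostile member name (the classic "zip slip" case); the entry name and extraction directory are hypothetical:

    import os

    member_filename = '../../etc/passwd'          # hypothetical malicious entry
    fname = member_filename.replace(os.sep, '/')
    fname = os.path.splitdrive(fname)[1]
    fname = '/'.join(x for x in fname.split('/')
                     if x not in {'', os.path.curdir, os.path.pardir})
    # fname is now 'etc/passwd', so the join below can no longer escape base_target
    print(os.path.normpath(os.path.join('/tmp/extract', fname)))  # /tmp/extract/etc/passwd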