start experiment with chm

James Ralston 2010-01-31 12:05:56 -08:00
commit f348c6235e
32 changed files with 1837 additions and 116 deletions

Binary file not shown (new image, 569 B).

Binary file not shown (new image, 253 B).

View File

@@ -0,0 +1,86 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ADRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'nl'
country = 'NL'
version = 1
title = u'AD'
publisher = u'de Persgroep Publishing Nederland NV'
category = u'News, Sports, the Netherlands'
description = u'News and Sports from the Netherlands'
oldest_article = 1.2
max_articles_per_feed = 100
use_embedded_content = False
remove_empty_feeds = True
no_stylesheets = True
remove_javascript = True
keep_only_tags = []
keep_only_tags.append(dict(name = 'div', attrs = {'id': 'art_box2'}))
keep_only_tags.append(dict(name = 'p', attrs = {'class': 'gen_footnote3'}))
remove_tags = []
remove_tags.append(dict(name = 'div', attrs = {'class': 'gen_clear'}))
remove_tags.append(dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')}))
remove_attributes = ['style']
# feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
feeds = []
feeds.append((u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'))
feeds.append((u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'))
feeds.append((u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml'))
feeds.append((u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'))
feeds.append((u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml'))
feeds.append((u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'))
feeds.append((u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'))
feeds.append((u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'))
feeds.append((u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'))
feeds.append((u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml'))
feeds.append((u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'))
feeds.append((u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'))
feeds.append((u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml'))
feeds.append((u'Film', u'http://www.ad.nl/showbizz/film/rss.xml'))
feeds.append((u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml'))
feeds.append((u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml'))
feeds.append((u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'))
feeds.append((u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml'))
feeds.append((u'Consument', u'http://www.ad.nl/consument/rss.xml'))
feeds.append((u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml'))
feeds.append((u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml'))
feeds.append((u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml'))
feeds.append((u'Games', u'http://www.ad.nl/digitaal/games/rss.xml'))
feeds.append((u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'))
feeds.append((u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml'))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
div.captionEmbeddedMasterObject {font-size: x-small; font-style: italic; color: #696969;}
.gen_footnote3 {font-size: small; color: #666666; margin-top: 0.6em;}
'''
conversion_options = {'comments': description, 'tags': category, 'language': language,
'publisher': publisher}
def print_version(self, url):
parts = url.split('/')
print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \
+ parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13]
return print_url
def preprocess_html(self, soup):
for br in soup.findAll('br'):
prev = br.findPreviousSibling(True)
if hasattr(prev, 'name') and prev.name == 'br':
next = br.findNextSibling(True)
if hasattr(next, 'name') and next.name == 'br':
br.extract()
return soup
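A worked example of the print_version shuffle above. The method assumes an article path with at least 14 slash-separated segments; the URL below is hypothetical and only shows how the split('/') indices map around the inserted 'print' segment:

url = 'http://www.ad.nl/ad/nl/1401/home/article/1234/some-title/seg10/seg11/seg12/article1234.dhtml'
parts = url.split('/')   # parts[2] = 'www.ad.nl', parts[7] = 'article', parts[13] = 'article1234.dhtml'
print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \
            + parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13]
# -> 'http://www.ad.nl/ad/nl/1401/seg10/article/print/1234/some-title/article1234.dhtml'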

View File

@@ -1,7 +1,5 @@
-#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
spectator.org
'''

@@ -11,20 +9,22 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheAmericanSpectator(BasicNewsRecipe):
    title = 'The American Spectator'
    __author__ = 'Darko Miletic'
+   language = 'en'
    description = 'News from USA'
+   category = 'news, politics, USA, world'
+   publisher = 'The American Spectator'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
-   language = 'en'
    INDEX = 'http://spectator.org'

-   html2lrf_options = [
-                        '--comment' , description
-                      , '--category' , 'news, politics, USA'
-                      , '--publisher' , title
-                      ]
+   conversion_options = {
+                          'comments' : description
+                         ,'tags' : category
+                         ,'language' : language
+                         ,'publisher' : publisher
+                         }

    keep_only_tags = [
                       dict(name='div', attrs={'class':'post inner'})

@@ -33,13 +33,11 @@ class TheAmericanSpectator(BasicNewsRecipe):
    remove_tags = [
                    dict(name='object')
-                  ,dict(name='div', attrs={'class':'col3' })
-                  ,dict(name='div', attrs={'class':'post-options' })
-                  ,dict(name='p' , attrs={'class':'letter-editor'})
-                  ,dict(name='div', attrs={'class':'social' })
+                  ,dict(name='div', attrs={'class':['col3','post-options','social']})
+                  ,dict(name='p' , attrs={'class':['letter-editor','meta']})
                   ]

-   feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')]
+   feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')]

    def get_cover_url(self):
        cover_url = None

@@ -53,3 +51,7 @@ class TheAmericanSpectator(BasicNewsRecipe):
    def print_version(self, url):
        return url + '/print'
+
+   def get_article_url(self, article):
+       return article.get('guid', None)

View File

@@ -0,0 +1,60 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
news.bbc.co.uk
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class BBC(BasicNewsRecipe):
title = 'BBC News (fast)'
__author__ = 'Darko Miletic'
description = 'News from UK. A much faster version that does not download pictures'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf8'
publisher = 'BBC'
category = 'news, UK, world'
language = 'en'
extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } '
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
remove_tags_before = dict(name='div',attrs={'class':'headline'})
remove_tags_after = dict(name='div', attrs={'class':'footer'})
remove_tags = [
dict(name=['object','link','script','iframe'])
,dict(name='div', attrs={'class':'footer'})
]
feeds = [
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
]
def print_version(self, url):
emp,sep,rstrip = url.partition('http://')
return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + rstrip
def get_article_url(self, article):
return article.get('guid', None)
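The print_version above is easier to read with a concrete input; the article URL below is hypothetical (the recipe names the pieces emp, sep, rstrip):

url = 'http://news.bbc.co.uk/2/hi/technology/1234567.stm'
head, sep, tail = url.partition('http://')   # ('', 'http://', 'news.bbc.co.uk/2/hi/technology/1234567.stm')
print_url = 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + tail
# -> 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk/2/hi/technology/1234567.stm'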

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Calgary Herald
title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
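Every CanWest recipe in this commit reuses this parse_index verbatim. For reference, its return value has the shape calibre expects from parse_index: a list of (section, article-list) tuples. A sketch with illustrative values, not a real article:

ans = [
    ('News', [
        {'title': 'Sample headline',
         'url': 'http://www.calgaryherald.com/news/todays-paper/story.html',
         'date': '',
         'description': 'One-line teaser',
         'author': 'A. Reporter',
         'content': ''},
    ]),
]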

View File

@@ -0,0 +1,52 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
digitaljournal.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DigitalJournal(BasicNewsRecipe):
title = 'Digital Journal'
__author__ = 'Darko Miletic'
description = 'A Global Citizen Journalism News Network'
category = 'news, politics, USA, world'
publisher = 'Digital Journal'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
language = 'en'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [dict(name='div', attrs={'class':['article','body']})]
remove_tags = [dict(name=['object','table'])]
feeds = [
(u'Latest News' , u'http://digitaljournal.com/rss/?feed=latest_news' )
,(u'Business' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Business' )
,(u'Entertainment', u'http://digitaljournal.com/rss/?feed=top_news&depname=Entertainment')
,(u'Environment' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Environment' )
,(u'Food' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Food' )
,(u'Health' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Health' )
,(u'Internet' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Internet' )
,(u'Politics' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Politics' )
,(u'Religion' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Religion' )
,(u'Science' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Science' )
,(u'Sports' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Sports' )
,(u'Technology' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Technology' )
,(u'World' , u'http://digitaljournal.com/rss/?feed=top_news&depname=World' )
,(u'Arts' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Arts' )
]
def print_version(self, url):
return url.replace('digitaljournal.com/','digitaljournal.com/print/')
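print_version here is a plain substring replacement; with a hypothetical article URL:

url = 'http://digitaljournal.com/article/123456'
print_url = url.replace('digitaljournal.com/', 'digitaljournal.com/print/')
# -> 'http://digitaljournal.com/print/article/123456'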

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Edmonton Journal
title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Montreal Gazette
title = u'Montreal Gazette'
url_prefix = 'http://www.montrealgazette.com'
description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,101 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Ottawa Citizen
title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,48 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class PajamasMedia(BasicNewsRecipe):
title = u'Pajamas Media'
description = u'Provides exclusive news and opinion for forty countries.'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
recursions = 1
match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$']
#encoding = 'latin1'
remove_stylesheets = True
#remove_tags_before = dict(name='h1', attrs={'class':'heading'})
remove_tags_after = dict(name='div', attrs={'class':'paged-nav'})
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':['pages']}),
#dict(name='div', attrs={'id':['bookmark']}),
#dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
#dict(name='ul', attrs={'class':'articleTools'}),
]
feeds = [
('pajamas Media',
'http://feeds.feedburner.com/PajamasMedia'),
]
def preprocess_html(self, soup):
story = soup.find(name='div', attrs={'id':'innerpage-content'})
#td = heading.findParent(name='td')
#td.extract()
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
body = soup.find(name='body')
body.insert(0, story)
return soup
def postprocess_html(self, soup, first):
if not first:
h = soup.find(attrs={'class':'innerpage-header'})
if h: h.extract()
auth = soup.find(attrs={'class':'author'})
if auth: auth.extract()
return soup
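preprocess_html above uses an extract-and-rewrap pattern: lift the story div out of the downloaded page and graft it into an empty HTML shell, so everything else is dropped. A minimal sketch of the same idea on a toy document (the markup is hypothetical):

from calibre.ebooks.BeautifulSoup import BeautifulSoup
page = BeautifulSoup('<html><body><div id="innerpage-content">story</div><div id="sidebar">ads</div></body></html>')
story = page.find(name='div', attrs={'id': 'innerpage-content'})
shell = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
shell.find(name='body').insert(0, story)
# str(shell) now contains only the story div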

View File

@@ -8,8 +8,7 @@ class Physicstoday(BasicNewsRecipe):
    description = u'Physics Today magazine'
    publisher = 'American Institute of Physics'
    category = 'Physics'
    language = 'en'
    cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg')
    oldest_article = 30
    max_articles_per_feed = 100

@@ -30,11 +29,11 @@ class Physicstoday(BasicNewsRecipe):
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
-           br.open('http://www.physicstoday.org/pt/sso_login.jsp')
-           br.select_form(name='login')
+           br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f')
+           br.select_form(name='login_form')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')]

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
class ReadersDigest(BasicNewsRecipe):
title = 'Readers Digest'
__author__ = 'BrianG'
language = 'en'
description = 'Readers Digest Feeds'
no_stylesheets = True
use_embedded_content = False
oldest_article = 60
max_articles_per_feed = 200
remove_javascript = True
extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
.mainHd{font-family:georgia,serif;color:#000000;}
h2 {font-family:Arial,Sans-serif;}
.name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
.date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
.byline{font-family:Arial,Sans-serif; font-size:x-small ;}
.photoBkt{ font-size:x-small ;}
.vertPhoto{font-size:x-small ;}
.credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.artTxt{font-family:georgia,serif;}
.caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
a:link{color:#CC0000;}
.breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
'''
remove_tags = [
dict(name='h4', attrs={'class':'close'}),
dict(name='div', attrs={'class':'fromLine'}),
dict(name='img', attrs={'class':'colorTag'}),
dict(name='div', attrs={'id':'sponsorArticleHeader'}),
dict(name='div', attrs={'class':'horizontalAd'}),
dict(name='div', attrs={'id':'imageCounterLeft'}),
dict(name='div', attrs={'id':'commentsPrint'})
]
feeds = [
('New in RD', 'http://feeds.rd.com/ReadersDigest'),
('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
]
cover_url = 'http://www.rd.com/images/logo-main-rd.gif'
#-------------------------------------------------------------------------------------------------
def print_version(self, url):
# Get the identity number of the current article and append it to the root print URL
if url.find('/article') > 0:
ident = url[url.find('/article')+8:url.find('.html?')-4]
url = 'http://www.rd.com/content/printContent.do?contentId=' + ident
elif url.find('/post') > 0:
# in this case, have to get the page itself to derive the Print page.
soup = self.index_to_soup(url)
newsoup = soup.find('ul',attrs={'class':'printBlock'})
url = 'http://www.rd.com' + newsoup('a')[0]['href']
url = url[0:url.find('&Keep')]
return url
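# Worked example of the slicing above (the URL shape is an assumption for
# illustration, not taken from rd.com): for a URL of the form
#   http://www.rd.com/advice/article<ID>xxxx.html?trkid=rss
# url.find('/article') + 8 points just past the literal '/article', and
# url.find('.html?') - 4 stops four characters short of '.html?', so ident is
# the text between those two points and the print page is fetched from
#   http://www.rd.com/content/printContent.do?contentId=<ident>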
#-------------------------------------------------------------------------------------------------
def parse_index(self):
pages = [
('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
# useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
]
feeds = []
for page in pages:
section, url, divider, attrList = page
newArticles = self.page_parse(url, divider, attrList)
feeds.append((section,newArticles))
# after the pages of the site have been processed, parse several RSS feeds for additional sections
newfeeds = self.parse_rss()
# The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable
# for this module (parse_index).
for feed in newfeeds:
newArticles = []
for article in feed.articles:
newArt = {
'title' : article.title,
'url' : article.url,
'date' : article.date,
'description' : article.text_summary
}
newArticles.append(newArt)
# New and Blogs should be the first two feeds.
if feed.title == 'New in RD':
feeds.insert(0,(feed.title,newArticles))
elif feed.title == 'Blogs':
feeds.insert(1,(feed.title,newArticles))
else:
feeds.append((feed.title,newArticles))
return feeds
#-------------------------------------------------------------------------------------------------
def page_parse(self, mainurl, divider, attrList):
articles = []
mainsoup = self.index_to_soup(mainurl)
for item in mainsoup.findAll(attrs=attrList):
newArticle = {
'title' : item('img')[0]['alt'],
'url' : 'http://www.rd.com'+item('a')[0]['href'],
'date' : '',
'description' : ''
}
articles.append(newArticle)
return articles
#-------------------------------------------------------------------------------------------------
def parse_rss (self):
# Do the "official" parse_feeds first
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop thru the articles in all feeds to find articles with "recipe" in it
recipeArticles = []
for curfeed in feeds:
delList = []
for a,curarticle in enumerate(curfeed.articles):
if curarticle.title.upper().find('RECIPE') >= 0:
recipeArticles.append(curarticle)
delList.append(curarticle)
if len(delList)>0:
for d in delList:
index = curfeed.articles.index(d)
curfeed.articles[index:index+1] = []
# If there are any recipes found, create a new Feed object and append.
if len(recipeArticles) > 0:
pfeed = Feed()
pfeed.title = 'Recipes'
pfeed.description = 'Recipe Feed (Virtual)'
pfeed.image_url = None
pfeed.oldest_article = 30
pfeed.id_counter = len(recipeArticles)
# Create a new Feed, add the recipe articles, and then append
# to "official" list of feeds
pfeed.articles = recipeArticles[:]
feeds.append(pfeed)
return feeds
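The slice assignment used in parse_rss above is an old in-place deletion idiom; a self-contained illustration:

items = ['a', 'b', 'c']
index = 1
items[index:index+1] = []   # same effect as: del items[index]
# items == ['a', 'c']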

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Regina Leader-Post
title = u'Regina Leader-Post'
url_prefix = 'http://www.leaderpost.com'
description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Saskatoon Star-Phoenix
title = u'Saskatoon Star-Phoenix'
url_prefix = 'http://www.thestarphoenix.com'
description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Province
title = u'Vancouver Province'
url_prefix = 'http://www.theprovince.com'
description = u'News from Vancouver, BC'
# un-comment the following three lines for the Vancouver Sun
#title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Sun
title = u'Vancouver Sun'
url_prefix = 'http://www.vancouversun.com'
description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Victoria Times Colonist
title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC'
# un-comment the following three lines for the Vancouver Province
#title = u'Vancouver Province'
#url_prefix = 'http://www.theprovince.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Vancouver Sun
#title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Windsor Star
title = u'Windsor Star'
url_prefix = 'http://www.windsorstar.com'
description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
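As the header comments note, pointing this recipe at a different CanWest paper is just a matter of which block is un-commented; for example, for the Ottawa Citizen:

#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON'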

View File

@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
# http://online.wsj.com/page/us_in_todays_paper.html
@ -67,6 +68,13 @@ class WallStreetJournal(BasicNewsRecipe):
def parse_index(self):
soup = self.wsj_get_index()
year = strftime('%Y')
for x in soup.findAll('td', attrs={'class':'b14'}):
txt = self.tag_to_string(x).strip()
if year in txt:
self.timefmt = ' [%s]'%txt
break
left_column = soup.find(
text=lambda t: 'begin ITP Left Column' in str(t))
@ -91,7 +99,7 @@ class WallStreetJournal(BasicNewsRecipe):
url = url.partition('#')[0]
desc = ''
d = x.findNextSibling(True)
if d is not None and d.get('class', None) == 'arialResize':
desc = self.tag_to_string(d)
desc = desc.partition(u'\u2022')[0]
self.log('\t\tFound article:', title)

View File

@ -3,47 +3,122 @@
__license__ = 'GPL v3'
'''
online.wsj.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from datetime import timedelta, datetime, date

class WSJ(BasicNewsRecipe):
# formatting adapted from original recipe by Kovid Goyal and Sujata Raman
title = u'Wall Street Journal (free)'
__author__ = 'Nick Redding'
language = 'en'
description = ('All the free content from the Wall Street Journal (business, financial and political news)')
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;}
.targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
.article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.tagline { ont-size:xx-small;}
.dateStamp {font-family:Arial,Helvetica,sans-serif;}
h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;}
.metadataType-articleCredits {list-style-type: none;}
h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;}
.paperLocation{font-size:xx-small;}'''

# customization notes: delete sections you are not interested in
# set omit_paid_content to False if you want the paid content article snippets
# set oldest_article to the maximum number of days back from today to include articles
sectionlist = [
['/home-page','Front Page'],
['/public/page/news-opinion-commentary.html','Commentary'],
['/public/page/news-global-world.html','World News'],
['/public/page/news-world-business.html','US News'],
['/public/page/news-business-us.html','Business'],
['/public/page/news-financial-markets-stock.html','Markets'],
['/public/page/news-tech-technology.html','Technology'],
['/public/page/news-personal-finance.html','Personal Finance'],
['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
['/public/page/news-real-estate-homes.html','Real Estate'],
['/public/page/news-career-jobs.html','Careers'],
['/public/page/news-small-business-marketing.html','Small Business']
]
oldest_article = 2
omit_paid_content = True
extra_css = '''h1{font-size:large; font-family:Times,serif;}
h2{font-family:Times,serif; font-size:small; font-style:italic;}
.subhead{font-family:Times,serif; font-size:small; font-style:italic;}
.insettipUnit {font-family:Times,serif;font-size:xx-small;}
.targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
.article{font-family:Times,serif; font-size:x-small;}
.tagline { font-size:xx-small;}
.dateStamp {font-family:Times,serif;}
h3{font-family:Times,serif; font-size:xx-small;}
.byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
.metadataType-articleCredits {list-style-type: none;}
h6{font-family:Times,serif; font-size:small; font-style:italic;}
.paperLocation{font-size:xx-small;}'''
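Per the customization notes above, trimming coverage means editing these class attributes; a hypothetical slimmed-down setup:

# keep two sections, look back a week, and include paid-content snippets
sectionlist = [
['/home-page','Front Page'],
['/public/page/news-tech-technology.html','Technology']
]
oldest_article = 7
omit_paid_content = False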
remove_tags_before = dict(name='h1')
remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
"articleTabs_tab_interactive","articleTabs_tab_video",
"articleTabs_tab_map","articleTabs_tab_slideshow"]),
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip',
'adSummary', 'nav-inline','insetFullBracket']},
dict(rel='shortcut icon'),
]

remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')})
remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}),
#dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
# "articleTabs_tab_interactive","articleTabs_tab_video",
# "articleTabs_tab_map","articleTabs_tab_slideshow"]),
{'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
'insettip','insetClose','more_in', "insetContent",
# 'articleTools_bottom','articleTools_bottom mjArticleTools',
'aTools', 'tooltip',
'adSummary', 'nav-inline','insetFullBracket']},
dict({'class':re.compile('^articleTools_bottom')}),
dict(rel='shortcut icon')
]
remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
return br
def preprocess_html(self,soup):
# check if article is too old
datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
if datetag:
dateline_string = self.tag_to_string(datetag,False)
date_items = dateline_string.split(',')
datestring = date_items[0]+date_items[1]
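# e.g. an assumed dateline "JANUARY 31, 2010, 7:04 P.M. ET" yields
# datestring "JANUARY 31 2010", which .title() turns into "January 31 2010"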
article_date = datetime.strptime(datestring.title(),"%B %d %Y")
earliest_date = date.today() - timedelta(days=self.oldest_article)
if article_date.date() < earliest_date:
self.log("Skipping article dated %s" % datestring)
return None
datetag.parent.extract()
# place dateline in article heading
bylinetag = soup.find('h3','byline')
if bylinetag:
h3bylinetag = bylinetag
else:
bylinetag = soup.find('li','byline')
if bylinetag:
h3bylinetag = bylinetag.h3
if not h3bylinetag:
h3bylinetag = bylinetag
bylinetag = bylinetag.parent
if bylinetag:
if h3bylinetag.a:
bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
else:
bylinetext = self.tag_to_string(h3bylinetag,False)
h3byline = Tag(soup,'h3',[('class','byline')])
if bylinetext.isspace() or (bylinetext == ''):
h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
else:
h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
bylinetag.replaceWith(h3byline)
else:
headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
if headlinetag:
dateline = Tag(soup,'h3', [('class','byline')])
dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
headlinetag.insert(len(headlinetag),dateline)
else: # if no date tag, don't process this page--it's not a news item
return None
# This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
if ultag:
@ -58,7 +133,7 @@ class WSJ(BasicNewsRecipe):
key = None
ans = []
def parse_index_page(page_name,page_title):
def article_title(tag):
atag = tag.find('h2') # title is usually in an h2 tag
@ -119,7 +194,6 @@ class WSJ(BasicNewsRecipe):
soup = self.index_to_soup(pageurl)
# Find each instance of div with class including "headlineSummary"
for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
# divtag contains all article data as ul's and li's
# first, check if there is an h3 tag which provides a section name
stag = divtag.find('h3')
@ -162,7 +236,7 @@ class WSJ(BasicNewsRecipe):
# now skip paid subscriber articles if desired
subscriber_tag = litag.find(text="Subscriber Content")
if subscriber_tag:
if self.omit_paid_content:
continue
# delete the tip div so it doesn't get in the way
tiptag = litag.find("div", { "class" : "tipTargetBox" })
@ -185,7 +259,7 @@ class WSJ(BasicNewsRecipe):
continue
if url.startswith("/article"):
url = mainurl+url
if not url.startswith("http://online.wsj.com"):
continue
if not url.endswith(".html"):
continue
@ -214,48 +288,10 @@ class WSJ(BasicNewsRecipe):
articles[page_title] = []
articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
# customization notes: delete sections you are not interested in
# set omit_paid_content to False if you want the paid content article previews
sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets',
'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business']
omit_paid_content = True
if 'Front Page' in sectionlist:
parse_index_page('/home-page','Front Page',omit_paid_content)
ans.append('Front Page')
if 'Commentary' in sectionlist:
parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content)
ans.append('Commentary')
if 'World News' in sectionlist:
parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content)
ans.append('World News')
if 'US News' in sectionlist:
parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content)
ans.append('US News')
if 'Business' in sectionlist:
parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content)
ans.append('Business')
if 'Markets' in sectionlist:
parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content)
ans.append('Markets')
if 'Technology' in sectionlist:
parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content)
ans.append('Technology')
if 'Personal Finance' in sectionlist:
parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content)
ans.append('Personal Finance')
if 'Life & Style' in sectionlist:
parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content)
ans.append('Life & Style')
if 'Real Estate' in sectionlist:
parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content)
ans.append('Real Estate')
if 'Careers' in sectionlist:
parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content)
ans.append('Careers')
if 'Small Business' in sectionlist:
parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content)
ans.append('Small Business')
for page_name,page_title in self.sectionlist:
parse_index_page(page_name,page_title)
ans.append(page_title)

ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@ -0,0 +1,23 @@
/*
* images management
* Copyright 2008 Kovid Goyal
* License: GNU GPL v3
*/
function scale_images() {
$("img:visible").each(function() {
var offset = $(this).offset();
//window.py_bridge.debug(window.getComputedStyle(this, '').getPropertyValue('max-width'));
$(this).css("max-width", (window.innerWidth-offset.left-5)+"px");
$(this).css("max-height", (window.innerHeight-5)+"px");
});
}
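// the viewer's Document.fit_images() invokes setup_image_scaling_handlers()
// below, so images are rescaled on load and again on every window resize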
function setup_image_scaling_handlers() {
scale_images();
$(window).resize(function(){
scale_images();
});
}

View File

@ -382,6 +382,7 @@ from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.tcr.input import TCRInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lrf.input import LRFInput
from calibre.ebooks.chm.input import CHMInput # XXMODIFIED
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.fb2.output import FB2Output
@ -440,6 +441,7 @@ plugins += [
TCRInput,
TXTInput,
LRFInput,
CHMInput, # XXMODIFIED
]
plugins += [
EPUBOutput,

View File

@ -563,6 +563,16 @@ class MobiReader(object):
recindex = attrib.pop(attr, None) or recindex
if recindex is not None:
attrib['src'] = 'images/%s.jpg' % recindex
for attr in ('width', 'height'):
if attr in attrib:
val = attrib[attr]
if val.lower().endswith('em'):
try:
nval = float(val[:-2])
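# assumed reading: 1em == 16px at the 72 dpi base, so scale by 16 * (profile dpi / 72)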
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
attrib[attr] = "%dpx"%int(nval)
except:
del attrib[attr]
elif tag.tag == 'pre':
if not tag.text:
tag.tag = 'div'

View File

@ -411,6 +411,7 @@ class Style(object):
return result
def _unit_convert(self, value, base=None, font=None):
' Return value in pts'
if isinstance(value, (int, long, float)):
return value
try:
@ -447,6 +448,9 @@ class Style(object):
result = value * 0.40
return result
def pt_to_px(self, value):
return (self._profile.dpi / 72.0) * value
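# e.g. with the Kindle profile's 168.451 dpi, pt_to_px(12) ~= 28.1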
@property
def fontSize(self):
def normalize_fontsize(value, base):

View File

@ -20,6 +20,10 @@ class Font(object):
class Column(object):
# A column contains an element if the element bulges out to
# the left or the right by at most HFUZZ*col width.
HFUZZ = 0.2
def __init__(self):
self.left = self.right = self.top = self.bottom = 0
self.width = self.height = 0
@ -41,6 +45,10 @@ class Column(object):
for x in self.elements:
yield x
def contains(self, elem):
return elem.left > self.left - self.HFUZZ*self.width and \
elem.right < self.right + self.HFUZZ*self.width
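# e.g. with HFUZZ = 0.2, a column 100 units wide still "contains" an
# element that protrudes up to 20 units past either edge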
class Element(object):
def __eq__(self, other):
@ -238,11 +246,10 @@ class Page(object):
return columns
def find_elements_in_row_of(self, x):
interval = Interval(x.top,
x.top + self.YFUZZ*(1+self.average_text_height))
h_interval = Interval(x.left, x.right)
for y in self.elements[x.idx:x.idx+15]:
if y is not x:
y_interval = Interval(y.top, y.bottom)
x_interval = Interval(y.left, y.right)

View File

@ -532,7 +532,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
if self.cover_fetcher.exception is not None:
err = self.cover_fetcher.exception
error_dialog(self, _('Cannot fetch cover'),
_('<b>Could not fetch cover.</b><br/>')+unicode(err)).exec_()
return
pix = QPixmap()

View File

@ -1361,7 +1361,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
def generate_catalog(self):
rows = self.library_view.selectionModel().selectedRows()
if not rows or len(rows) < 3:
rows = xrange(self.library_view.model().rowCount(QModelIndex()))
ids = map(self.library_view.model().id, rows)
dbspec = None

View File

@ -7,14 +7,14 @@
<x>0</x>
<y>0</y>
<width>479</width>
<height>606</height>
</rect>
</property>
<property name="windowTitle">
<string>Configure Ebook viewer</string>
</property>
<property name="windowIcon">
<iconset>
<normaloff>:/images/config.svg</normaloff>:/images/config.svg</iconset>
</property>
<layout class="QGridLayout" name="gridLayout_4">
@ -164,7 +164,7 @@
</item>
</widget>
</item>
<item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remember_window_size">
<property name="text">
<string>Remember last used &amp;window size</string>
@ -218,6 +218,13 @@
</property>
</widget>
</item>
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_fit_images">
<property name="text">
<string>&amp;Resize images larger than the viewer window (needs restart)</string>
</property>
</widget>
</item>
</layout>
</item>
<item row="3" column="0">

View File

@ -10,7 +10,7 @@ from base64 import b64encode
from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \
QPainter, QPalette, QBrush, QFontDatabase, QDialog, \
QColor, QPoint, QImage, QRegion, QVariant, QIcon, \
QFont, pyqtSignature, QAction
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings
from calibre.utils.config import Config, StringConfig
@ -21,7 +21,7 @@ from calibre.constants import iswindows
from calibre import prints, guess_type
from calibre.gui2.viewer.keys import SHORTCUTS
bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = images = None
def load_builtin_fonts():
base = P('fonts/liberation/*.ttf')
@ -42,6 +42,8 @@ def config(defaults=None):
help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
c.add_opt('max_view_width', default=6000,
help=_('Maximum width of the viewer window, in pixels.'))
c.add_opt('fit_images', default=True,
help=_('Resize images larger than the viewer window to fit inside it'))
c.add_opt('hyphenate', default=False, help=_('Hyphenate text'))
c.add_opt('hyphenate_default_lang', default='en',
help=_('Default language for hyphenation rules'))
@ -59,20 +61,6 @@ def config(defaults=None):
return c
class PythonJS(QObject):
def __init__(self, callback):
QObject.__init__(self, QApplication.instance())
self.setObjectName("py_bridge")
self._callback = callback
@pyqtSignature("QString")
def callback(self, msg):
print "callback called"
self._callback(msg)
class ConfigDialog(QDialog, Ui_Dialog):
def __init__(self, shortcuts, parent=None):
@ -110,6 +98,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
self.shortcut_config = ShortcutConfig(shortcuts, parent=self)
p = self.tabs.widget(1)
p.layout().addWidget(self.shortcut_config)
self.opt_fit_images.setChecked(opts.fit_images)
def accept(self, *args):
@ -122,6 +111,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()])
c.set('user_css', unicode(self.css.toPlainText()))
c.set('remember_window_size', self.opt_remember_window_size.isChecked())
c.set('fit_images', self.opt_fit_images.isChecked())
c.set('max_view_width', int(self.max_view_width.value()))
c.set('hyphenate', self.hyphenate.isChecked())
idx = self.hyphenate_default_lang.currentIndex()
@ -157,7 +147,6 @@ class Document(QWebPage):
self.setObjectName("py_bridge")
self.debug_javascript = False
self.current_language = None
#self.js_bridge = PythonJS(self.js_callback)
self.setLinkDelegationPolicy(self.DelegateAllLinks)
self.scroll_marks = []
@ -197,9 +186,14 @@ class Document(QWebPage):
opts = config().parse()
self.hyphenate = opts.hyphenate
self.hyphenate_default_lang = opts.hyphenate_default_lang
self.do_fit_images = opts.fit_images
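# fit_images() calls into viewer/images.js (loaded below in
# load_javascript_libraries) to shrink oversized images to the window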
def fit_images(self):
if self.do_fit_images:
self.javascript('setup_image_scaling_handlers()')
def load_javascript_libraries(self):
global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator, images
self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
if jquery is None:
jquery = P('content_server/jquery.js', data=True)
@ -215,6 +209,9 @@ class Document(QWebPage):
if referencing is None:
referencing = P('viewer/referencing.js', data=True)
self.javascript(referencing)
if images is None:
images = P('viewer/images.js', data=True)
self.javascript(images)
if hyphenation is None:
hyphenation = P('viewer/hyphenation.js', data=True)
self.javascript(hyphenation)
@ -541,6 +538,7 @@ class DocumentView(QWebView):
return
self.loading_url = None
self.document.set_bottom_padding(0)
self.document.fit_images()
self._size_hint = self.document.mainFrame().contentsSize()
scrolled = False
if self.to_bottom:

View File

@ -1634,13 +1634,15 @@ class LibraryDatabase2(LibraryDatabase):
for i in iter(self):
yield i[x]
def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
'''
Return all metadata stored in the database as a dict. Includes paths to
the cover and each format.
:param prefix: The prefix for all paths. By default, the prefix is the absolute path
to the library folder.
:param ids: Set of ids to return the data for. If None return data for
all entries in database.
'''
if prefix is None:
prefix = self.library_path
@ -1650,11 +1652,14 @@ class LibraryDatabase2(LibraryDatabase):
data = []
for record in self.data:
if record is None: continue
db_id = record[FIELD_MAP['id']]
if ids is not None and db_id not in ids:
continue
x = {}
for field in FIELDS:
x[field] = record[FIELD_MAP[field]]
data.append(x)
x['id'] = db_id
x['formats'] = []
if not x['authors']:
x['authors'] = _('Unknown')
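A sketch of calling the new ids filter (the database handle name is hypothetical):

# export metadata for just two books, with authors flattened to one string
data = db.get_data_as_dict(authors_as_string=True, ids=set([1, 5]))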

View File

@ -524,6 +524,7 @@ class DynamicConfig(dict):
pass
except:
import traceback
print 'Failed to unpickle stored object:'
traceback.print_exc()
d = {}
self.clear()