Sync to trunk.
@@ -2,6 +2,7 @@
.check-cache.pickle
src/calibre/plugins
resources/images.qrc
src/calibre/ebooks/oeb/display/test/*.js
src/calibre/manual/.build/
src/calibre/manual/cli/
src/calibre/manual/template_ref.rst
@@ -15,6 +16,7 @@ resources/ebook-convert-complete.pickle
resources/builtin_recipes.xml
resources/builtin_recipes.zip
resources/template-functions.json
resources/display/*.js
setup/installer/windows/calibre/build.log
src/calibre/translations/.errors
src/cssutils/.svn/
4210  Changelog.old.yaml
4706  Changelog.yaml
@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Dean Cording'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
abc.net.au/news
'''
@@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe

class ABCNews(BasicNewsRecipe):
    title = 'ABC News'
    __author__ = 'Dean Cording'
    __author__ = 'Pat Stapleton, Dean Cording'
    description = 'News from Australia'
    masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
    cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
@@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe):
    category = 'News, Australia, World'
    language = 'en_AU'
    publication_type = 'newsportal'
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    # Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments'  : description
        ,'tags'     : category
@@ -32,23 +34,23 @@ class ABCNews(BasicNewsRecipe):
        ,'linearize_tables': False
    }

    keep_only_tags = dict(id='article')
    keep_only_tags = [dict(attrs={'class':['article section']})]

    remove_tags = [dict(attrs={'class':['related', 'tags']}),
                   dict(id='statepromo')
                  ]
    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
                   'inline-content story left', 'inline-content map left contracted', 'published',
                   'story-map', 'statepromo', 'topics', ]})]

    remove_attributes = ['width','height']

    feeds = [
        ('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
        ('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
        ('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
        ('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
        ('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
        ('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
        ('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
        ('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
        ('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
        ('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
        ('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
        ('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
        ('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
        ('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
        ('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
        ('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
        ('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
        ('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
        ('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
        ('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
    ]
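A note on the mechanism the hunk above relies on: calibre's preprocess_regexps entries are (compiled pattern, substitution callable) pairs that the framework applies to each article's raw HTML before parsing. A minimal self-contained sketch of that behaviour, using an invented HTML fragment for illustration:

import re

# The (pattern, callable) pair from the hunk above, applied by hand the same
# way the recipe framework applies it to downloaded article HTML.
preprocess_regexps = [
    (re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL),
     lambda m: ''),
]

html = '<p>Story text</p><a class="inline-caption" href="http://maps.google.com/syd">Map</a>'
for pattern, substitute in preprocess_regexps:
    html = pattern.sub(substitute, html)
print(html)  # -> <p>Story text</p>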
@@ -1,19 +1,38 @@
from calibre.web.feeds.news import BasicNewsRecipe

import re
class Adventure_zone(BasicNewsRecipe):
    title = u'Adventure Zone'
    __author__ = 'fenuks'
    description = 'Adventure zone - adventure games from A to Z'
    category = 'games'
    language = 'pl'
    oldest_article = 15
    max_articles_per_feed = 100
    no_stylesheets = True
    oldest_article = 20
    max_articles_per_feed = 100
    use_embedded_content = False
    preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
    remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
    remove_tags_after = dict(name='td', attrs={'class':'main-body middle-border'})
    remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
    remove_tags_after = dict(id='comments')
    extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
    feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        soup = self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
        tag = soup.find(name='channel')
        titles = []
        for r in tag.findAll(name='image'):
            r.extract()
        art = tag.findAll(name='item')
        for i in art:
            titles.append(i.title.string)
        for feed in feeds:
            for article in feed.articles[:]:
                article.title = titles[feed.articles.index(article)]
        return feeds


    def get_cover_url(self):
        soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
        cover = soup.find(id='box_OstatninumerAZ')
@@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):


    def skip_ad_pages(self, soup):
        skip_tag = soup.body.findAll(name='a')
        if skip_tag is not None:
        skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
        skip_tag = skip_tag.findAll(name='a')
            for r in skip_tag:
                if 'articles.php?' in r['href']:
                    if r.strong is not None:
                    if r.strong:
                        word = r.strong.string
                        if ('zapowied' or 'recenzj') in word:
                            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
                        else:
                            None

    def print_version(self, url):
        return url.replace('news.php?readmore', 'print.php?type=N&item_id')

                        if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
                            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
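The skip_ad_pages rewrite above also fixes a classic Python pitfall: `('zapowied' or 'recenzj')` evaluates to just 'zapowied' (`or` returns its first truthy operand), so the old test never searched for 'recenzj' at all. A short illustration, using an invented sample string:

word = 'recenzja gry'                               # hypothetical link text
print(('zapowied' or 'recenzj') in word)            # False: only 'zapowied' is ever tested
print(('zapowied' in word) or ('recenzj' in word))  # True: the corrected form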
@@ -1,5 +1,4 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AstroNEWS(BasicNewsRecipe):
    title = u'AstroNEWS'
    __author__ = 'fenuks'
@@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    auto_cleanup = True
    #extra_css = 'table {text-align: left;}'
    no_stylesheets = True
    cover_url = 'http://news.astronet.pl/img/logo_news.jpg'
    # no_stylesheets = True
    remove_tags = [dict(name='hr')]
    feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]

    def print_version(self, url):
        return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')

    def preprocess_html(self, soup):
        for item in soup.findAll(align=True):
            del item['align']
        return soup
52  recipes/b365realitatea.recipe  (new file)
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
b365.realitatea.net
'''

from calibre.web.feeds.news import BasicNewsRecipe

class b365Realitatea(BasicNewsRecipe):
    title = u'b365 Realitatea'
    __author__ = u'Silviu Cotoar\u0103'
    publisher = u'b365 Realitatea'
    description = u'b365 Realitatea'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Romania,Bucuresti'
    encoding = 'utf-8'
    cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png'

    conversion_options = {
        'comments'  : description
        ,'tags'     : category
        ,'language' : language
        ,'publisher': publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':'newsArticle'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':'date'})
        , dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
        , dict(name='div', attrs={'class':'related_posts'})
        , dict(name='div', attrs={'id':'RelevantiWidget'})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id':'RelevantiWidget'})
    ]
    feeds = [
        (u'\u0218tiri', u'http://b365.realitatea.net/rss-full/')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
@@ -1,61 +1,648 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
##
## Title: BBC News, Sport, and Blog Calibre Recipe
## Contact: mattst - jmstanfield@gmail.com
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: mattst - jmstanfield@gmail.com
##
## Written: November 2011
## Last Edited: 2011-11-19
##

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'mattst - jmstanfield@gmail.com'


'''
news.bbc.co.uk
BBC News, Sport, and Blog Calibre Recipe
'''

# Import the regular expressions module.
import re

# Import the BasicNewsRecipe class which this class extends.
from calibre.web.feeds.recipes import BasicNewsRecipe

class BBC(BasicNewsRecipe):
    title = 'BBC News'
    __author__ = 'Darko Miletic, Starson17'
    description = 'News from UK. '
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'BBC'
    category = 'news, UK, world'
    language = 'en_GB'
    publication_type = 'newsportal'
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments'         : description
        ,'tags'            : category
        ,'language'        : language
        ,'publisher'       : publisher
        ,'linearize_tables': True
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':['layout-block-a layout-block']})
        ,dict(attrs={'class':['story-body','storybody']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
            'story-feature wide ', 'story-feature narrow']}),
        dict(id=['hypertab', 'comment-form']),
    ]

    remove_attributes = ['width','height']
class BBCNewsSportBlog(BasicNewsRecipe):

    #
    #    **** IMPORTANT USERS READ ME ****
    #
    # First select the feeds you want, then scroll down below the feeds list
    # and select the values you want for the other user preferences, like
    # oldest_article and such like.
    #
    #
    # Select the BBC rss feeds which you want in your ebook.
    # Selected feeds have NO '#' at their start; de-selected feeds begin with a '#'.
    #
    # Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
    # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
    #
    # There are 68 feeds below which constitute the bulk of the available rss
    # feeds on the BBC web site. These include 5 blogs by editors and
    # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
    # Wales, Scotland Business), and 7 Welsh language feeds.
    #
    # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click),
    # so if "oldest_article = 1.5" (only articles published in the last 36 hours)
    # you may get some 'empty feeds' which will not then be included in the ebook.
    #
    # The 15 feeds currently selected below are simply my default ones.
    #
    # Note: With all 68 feeds selected, oldest_article set to 2,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
    # the ebook creation took 29 minutes on my speedy 100 mbps net connection,
    # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
    # More realistically, with 15 feeds selected, oldest_article set to 1.5,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
    # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'.
    #
    # Select / de-select the feeds you want in your ebook.
    #
    feeds = [
        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
        ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
        ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
        ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
        ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
        ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
        ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
        ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"),
        ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"),
        ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
        #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"),
        #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"),
        #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"),
        #("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
        #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"),
        #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"),
        #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"),
        #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
        #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
        ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
        ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"),
        ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
        ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"),
        ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"),
        ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
        #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"),
        #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"),
        ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"),
        ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"),
        ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
        #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
        #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
        ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
        #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
        #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
        #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
        ("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
        ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
        #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
        #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
        #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
        #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
        #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
        #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
        #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
        #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
        #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
        #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
        #("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
        #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
        #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
        #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
        #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
        #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
        #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
        #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
        #("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
        #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
        #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
        #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
        #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
        #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
        #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
        #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
        #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
        #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
        #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
        #("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
        #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"),
        #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
        #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
        #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
        #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
        #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
        #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
    ]


    # **** SELECT YOUR USER PREFERENCES ****

    # Title to use for the ebook.
    #
    title = 'BBC News'

    # A brief description for the ebook.
    #
    description = u'BBC web site ebook created using rss feeds.'

    # The max number of articles which may be downloaded from each feed.
    # I've never seen more than about 70 articles in a single feed in the
    # BBC feeds.
    #
    max_articles_per_feed = 100

    # The max age of articles which may be downloaded from each feed. This is
    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
    # half days). My default of 1.5 days is the last 36 hours, the point at
    # which I've decided 'news' becomes 'old news', but be warned this is not
    # so good for the blogs, technology, magazine, etc., and sports feeds.
    # You may wish to extend this to 2-5, but watch out: ebook creation time will
    # increase as well. Setting this to 30 will get everything (AFAICT) as long
    # as max_articles_per_feed remains set high (except for 'Click' which is
    # v. low volume and its currently oldest article is 4th Feb 2011).
    #
    oldest_article = 1.5

    # Number of simultaneous downloads. 20 is consistently working fine on the
    # BBC News feeds with no problems. Speeds things up from the default of 5.
    # If you have a lot of feeds and/or have increased oldest_article above 2
    # then you may wish to try increasing simultaneous_downloads to 25-30,
    # or, of course, if you are in a hurry. [I've not tried beyond 20.]
    #
    simultaneous_downloads = 20

    # Timeout for fetching files from the server in seconds. The default of
    # 120 seconds seems somewhat excessive.
    #
    timeout = 30

    # The format string for the date shown on the ebook's first page.
    # List of all values: http://docs.python.org/library/time.html
    # Default in news.py has a leading space so that's mirrored here.
    # As with 'feeds', select/de-select by adding/removing the initial '#';
    # only one timefmt should be selected. Here are a few to choose from.
    #
    timefmt = ' [%a, %d %b %Y]'            # [Fri, 14 Nov 2011] (Calibre default)
    #timefmt = ' [%a, %d %b %Y %H:%M]'     # [Fri, 14 Nov 2011 18:30]
    #timefmt = ' [%a, %d %b %Y %I:%M %p]'  # [Fri, 14 Nov 2011 06:30 PM]
    #timefmt = ' [%d %b %Y]'               # [14 Nov 2011]
    #timefmt = ' [%d %b %Y %H:%M]'         # [14 Nov 2011 18:30]
    #timefmt = ' [%Y-%m-%d]'               # [2011-11-14]
    #timefmt = ' [%Y-%m-%d-%H-%M]'         # [2011-11-14-18-30]



    #
    #    **** IMPORTANT ****
    #
    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    #    I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
    #
    #    **** IMPORTANT ****
    #



    # Author of this recipe.
    __author__ = 'mattst'

    # Specify English as the language of the RSS feeds (ISO-639 code).
    language = 'en_GB'

    # Set tags.
    tags = 'news, sport, blog'

    # Set publisher and publication type.
    publisher = 'BBC'
    publication_type = 'newspaper'

    # Disable stylesheets from site.
    no_stylesheets = True

    # Specifies an override encoding for sites that have an incorrect charset
    # specified. Default of 'None' says to auto-detect. Some other BBC recipes
    # use 'utf8', which works fine (so use that if necessary), but auto-detecting
    # with None is working fine, so stick with that for robustness.
    encoding = None

    # Sets whether a feed has full articles embedded in it. The BBC feeds do not.
    use_embedded_content = False

    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True

    # Create a custom title which fits nicely in the Kindle title list.
    # Requires "import time" above the class declaration, and replacing
    # title with custom_title in conversion_options (right column only).
    # Example of the string below: "BBC News - 14 Nov 2011"
    #
    # custom_title = "BBC News - " + time.strftime('%d %b %Y')

    '''
    # Conversion options for advanced users, but don't forget to comment out the
    # current conversion_options below. Avoid setting 'linearize_tables' as that
    # plays havoc with the 'old style' table based pages.
    #
    conversion_options = { 'title'       : title,
                           'comments'    : description,
                           'tags'        : tags,
                           'language'    : language,
                           'publisher'   : publisher,
                           'authors'     : publisher,
                           'smarten_punctuation' : True
                         }
    '''

    conversion_options = { 'smarten_punctuation' : True }

    # Specify extra CSS - overrides ALL other CSS (IE. added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                 .introduction, .first { font-weight: bold; } \
                 .cross-head { font-weight: bold; font-size: 125%; } \
                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                 .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                    .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
                    text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
                 .story-date, .published { font-size: 80%; } \
                 table { width: 100%; } \
                 td img { display: block; margin: 5px auto; } \
                 ul { padding-top: 10px; } \
                 ol { padding-top: 10px; } \
                 li { padding-top: 5px; padding-bottom: 5px; } \
                 h1 { text-align: center; font-size: 175%; font-weight: bold; } \
                 h2 { text-align: center; font-size: 150%; font-weight: bold; } \
                 h3 { text-align: center; font-size: 125%; font-weight: bold; } \
                 h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'

    # Remove various tag attributes to improve the look of the ebook pages.
    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]

    # Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
    # cause a section of the ebook to start in an unsightly fashion or, more
    # frequently, a "<br />" will muck up the formatting of a correspondent's byline.
    # "<br />" and "<br clear/>" are far more frequently used on the table formatted
    # style of pages, and really spoil the look of the ebook pages.
    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]


    # Create regular expressions for tag keeping and removal to make the matches more
    # robust against minor changes and errors in the HTML, Eg. double spaces, leading
    # and trailing spaces, missing hyphens, and such like.
    # Python regular expression ('re' class) page: http://docs.python.org/library/re.html

    # ***************************************
    # Regular expressions for keep_only_tags:
    # ***************************************

    # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
    # page which contains the main text of the article. Match storybody variants: 'storybody',
    # 'story-body', 'story body', 'storybody ', etc.
    storybody_reg_exp = '^.*story[_ -]*body.*$'

    # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
    # and published date. This is one level above the usual news pages which have the title
    # and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
    # resulting in a lot of extra things to be removed by remove_tags.
    blq_content_reg_exp = '^.*blq[_ -]*content.*$'

    # The BBC has an alternative page design structure, which I suspect is an out-of-date
    # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
    # (travel), and in some sport pages. These alternative pages are table based (which is
    # why I think they are an out-of-date design) and account for -I'm guesstimating- less
    # than 1% of all articles. They use a table class 'storycontent' to hold the article
    # and, like blq_content (above), have required lots of extra removal by remove_tags.
    story_content_reg_exp = '^.*story[_ -]*content.*$'

    # Keep the sections of the HTML which match the list below. The HTML page created by
    # Calibre will fill <body> with those sections which are matched. Note that
    # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
    # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
    # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
    # all). If they are the other way around in keep_only_tags then blq_content_reg_exp
    # will end up being discarded.
    keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]

    # ************************************
    # Regular expressions for remove_tags:
    # ************************************

    # Regular expression to remove share-help and variant tags. The share-help class
    # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
    # twitter, email. Removed to avoid page clutter.
    share_help_reg_exp = '^.*share[_ -]*help.*$'

    # Regular expression to remove embedded-hyper and variant tags. This class is used to
    # display links to other BBC News articles on the same/similar subject.
    embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'

    # Regular expression to remove hypertabs and variant tags. This class is used to
    # display a tab bar at the top of an article which allows the user to switch to
    # an article (viewed on the same page) providing further info., 'in depth' analysis,
    # an editorial, a correspondent's blog entry, and such like. The ability to handle
    # a tab bar of this nature is currently beyond the scope of this recipe and
    # possibly of Calibre itself (not sure about that - TO DO - check!).
    hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'

    # Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
    # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
    # This class is used to add additional info. boxes, or small lists, outside of
    # the main story. TO DO: Work out a way to incorporate these neatly.
    story_feature_reg_exp = '^.*story[_ -]*feature.*$'

    # Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
    # 'videoInStoryC'. This class is used to embed video.
    video_reg_exp = '^.*video.*$'

    # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
    # This class is used to embed audio.
    audio_reg_exp = '^.*audio.*$'

    # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
    # This class is used to embed a photo slideshow. See also 'slideshow' below.
    picture_gallery_reg_exp = '^.*picture.*$'

    # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
    # This class is used to embed a slideshow (not necessarily photo) but both
    # 'slideshow' and 'pictureGallery' are used for slideshows.
    slideshow_reg_exp = '^.*slide[_ -]*show.*$'

    # Regular expression to remove social-links and variant tags. This class is used to
    # display links to a BBC bloggers main page, used in various columnists' blogs
    # (Eg. Nick Robinson, Robert Peston).
    social_links_reg_exp = '^.*social[_ -]*links.*$'

    # Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
    # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
    # removed by 'story-feature' removal (as they are usually within them), but
    # not always. The quotation removed is always (AFAICT) in the article text
    # as well, but a 2nd copy is placed in a quote tag to draw attention to it.
    # The quote class tags may or may not appear in div's.
    quote_reg_exp = '^.*quote.*$'

    # Regular expression to remove hidden and variant tags, Eg. 'hidden'.
    # The purpose of these is unclear; they seem to be an internal link to a
    # section within the article, but the text of the link (Eg. 'Continue reading
    # the main story') never seems to be displayed anyway. Removed to avoid clutter.
    # The hidden class tags may or may not appear in div's.
    hidden_reg_exp = '^.*hidden.*$'

    # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
    # Used on the site to display text about registered users entering comments.
    comment_reg_exp = '^.*comment.*$'

    # Regular expression to remove form and variant tags, Eg. 'comment-form'.
    # Used on the site to allow registered BBC users to fill in forms, typically
    # for entering comments about an article.
    form_reg_exp = '^.*form.*$'

    # Extra things to remove due to the addition of 'blq_content' in keep_only_tags.

    # <div class="story-actions"> Used on sports pages for 'email' and 'print'.
    story_actions_reg_exp = '^.*story[_ -]*actions.*$'

    # <div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
    # social networking links).
    bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'

    # <div id="secondary-content" class="content-group">
    # NOTE: Don't remove class="content-group"; that is needed.
    # Used on sports pages to link to 'similar stories'.
    secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'

    # <div id="featured-content" class="content-group">
    # NOTE: Don't remove class="content-group"; that is needed.
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    featured_content_reg_exp = '^.*featured[_ -]*content.*$'

    # <div id="navigation">
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    # Used sometimes instead of "featured-content" above.
    navigation_reg_exp = '^.*navigation.*$'

    # <a class="skip" href="#blq-container-inner">Skip to top</a>
    # Used on sports pages to link to the top of the page.
    skip_reg_exp = '^.*skip.*$'

    # Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
    # which are the alternative table design based pages. The purpose of some of these
    # is not entirely clear from the pages (which are a total mess!).

    # Remove mapping based tags, Eg. <map id="world_map">
    # The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
    map_reg_exp = '^.*map.*$'

    # Remove social bookmarking variation, called 'socialBookMarks'.
    social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'

    # Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
    blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'

    # Remove 'sharesb'; I think this is a generic 'sharing' class. It seems to appear
    # alongside 'socialBookMarks' whenever that appears. I am removing it as well
    # under the assumption that it can appear alone too.
    sharesb_reg_exp = '^.*sharesb.*$'

    # Remove class 'o'. The worst named user created css class of all time. The creator
    # should immediately be fired. I've seen it used to hold nothing at all but with
    # 20 or so empty lines in it. Also to hold a single link to another article.
    # Whatever it was designed to do, it is not wanted by this recipe. Exact match only.
    o_reg_exp = '^o$'

    # Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
    # use two reg expressions to make removing this (and variants) robust.
    promo_top_reg_exp = '^.*promotopbg.*$'
    promo_bottom_reg_exp = '^.*promobottombg.*$'

    # Remove 'nlp', provides heading for link lists. Requires an exact match due to
    # risk of matching those letters in something needed, unless I see a variation
    # of 'nlp' used at a later date.
    nlp_reg_exp = '^nlp$'

    # Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
    # has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
    # matching those letters in something needed.
    mva_or_mvb_reg_exp = '^mv[ab]$'

    # Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
    mvtb_reg_exp = '^mvtb$'

    # Remove 'blq-toplink', class to provide a link to the top of the page.
    blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'

    # Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
    # Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
    # use two reg expressions to make removing this (and variants) robust.
    prods_services_01_reg_exp = '^.*servicev4.*$'
    prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'

    # Remove -what I think is- some kind of navigation tools helper class, though I am
    # not sure; it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
    # frequently and it is not wanted. Have decided to use two reg expressions to make
    # removing this (and variants) robust.
    blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
    blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'

    # Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
    # need removing - I have no clue what it does other than it contains links.
    # Whatever it is - it is not part of the article and is not wanted.
    puffbox_reg_exp = '^.*puffbox.*$'

    # Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
    sibtbg_reg_exp = '^.*sibtbg.*$'

    # Remove 'storyextra' - links to relevant articles and external sites.
    storyextra_reg_exp = '^.*story[_ -]*extra.*$'


    remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
                    dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
                    dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
                    dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)})
                  ]

    # Uses url to create and return the 'printer friendly' version of the url.
    # In other words, the 'print this page' address of the page.
    #
    # There are 3 types of urls used in the BBC site's rss feeds. There is just
    # 1 type for the standard news, while there are 2 used for sports feed urls.
    # Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
    # there is a major story of interest to 'everyone'. So even if no BBC sports
    # feeds are added to 'feeds' the logic of this method is still needed to avoid
    # blank / missing / empty articles which have an index title and then no body.
    def print_version(self, url):

        # Handle sports page urls type 01:
        if (url.find("go/rss/-/sport1/") != -1):
            temp_url = url.replace("go/rss/-/", "")

        # Handle sports page urls type 02:
        elif (url.find("go/rss/int/news/-/sport1/") != -1):
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Handle regular news page urls:
        else:
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Always add "?print=true" to the end of the url.
        print_url = temp_url + "?print=true"

        return print_url


    # Remove articles in feeds based on a string in the article title or url.
    #
    # Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
    # thread, in post with title: "Remove articles from feed", see url:
    # http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
    # Many thanks and all credit to Starson17.
    #
    # Starson17's code has obviously been altered to suit my requirements.
    def parse_feeds(self):

        # Call parent's method.
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop through all feeds.
        for feed in feeds:

            # Loop through all articles in feed.
            for article in feed.articles[:]:

                # Match key words and remove article if there's a match.

                # Most BBC rss feed video only 'articles' use upper case 'VIDEO'
                # as a title prefix. Just match upper case 'VIDEO', so that
                # articles like 'Video game banned' won't be matched and removed.
                if 'VIDEO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
                # as a title prefix. Just match upper case 'AUDIO', so that
                # articles like 'Hi-Def audio...' won't be matched and removed.
                elif 'AUDIO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
                # 'In pictures', and 'in pictures', somewhere in their title.
                # Match any case of that phrase.
                elif 'IN PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # As above, but user contributed pictures. Match any case.
                elif 'YOUR PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # 'Sportsday Live' are articles which contain a constantly and
                # dynamically updated 'running commentary' during a live sporting
                # event. Match any case.
                elif 'SPORTSDAY LIVE' in article.title.upper():
                    feed.articles.remove(article)

                # Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
                # These are being matched below using 'Live - ' because removing all
                # articles with 'live' in their titles would remove some articles
                # that are in fact not live sports pages. Match any case.
                elif 'LIVE - ' in article.title.upper():
                    feed.articles.remove(article)

                # 'Quiz of the week' is a Flash player weekly news quiz. Match only
                # the 'Quiz of the' part in anticipation of monthly and yearly
                # variants. Match any case.
                elif 'QUIZ OF THE' in article.title.upper():
                    feed.articles.remove(article)

                # Remove articles with 'scorecards' in the url. These are BBC sports
                # pages which just display a cricket scorecard. The pages have a mass
                # of table and css entries to display the scorecards nicely. Probably
                # could make them work with this recipe, but might take a whole day
                # of work to sort out all the css - basically a formatting nightmare.
                elif 'scorecards' in article.url:
                    feed.articles.remove(article)

        return feeds

# End of class and file.
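One detail worth noting in the parse_feeds override above: it iterates over feed.articles[:] rather than feed.articles. The slice takes a shallow copy, so the remove() calls can mutate the real list without the iterator skipping elements. A standalone sketch with invented stand-in titles:

titles = ['VIDEO: clip one', 'VIDEO: clip two', 'Plain story']

broken = list(titles)
for t in broken:          # iterating the same list that is being mutated
    if 'VIDEO' in t:
        broken.remove(t)
print(broken)             # ['VIDEO: clip two', 'Plain story'] - one VIDEO item survives

fixed = list(titles)
for t in fixed[:]:        # iterate over a copy, mutate the original
    if 'VIDEO' in t:
        fixed.remove(t)
print(fixed)              # ['Plain story']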
@@ -1,61 +1,44 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''

class SportsIllustratedRecipe(BasicNewsRecipe) :
    __author__ = 'ape'
    __copyright__ = 'ape'
    __author__ = 'a.peter'
    __copyright__ = 'a.peter'
    __license__ = 'GPL v3'
    language = 'de'
    description = 'Berliner Zeitung'
    version = 2
    description = 'Berliner Zeitung RSS'
    version = 4
    title = u'Berliner Zeitung'
    timefmt = ' [%d.%m.%Y]'

    #oldest_article = 7.0
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    publication_type = 'newspaper'

    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
    remove_tags_after = [dict(id='article_text')]

    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'

    def parse_index(self):
        base = 'http://www.berlinonline.de'
        answer = []
        articles = {}
        more = 1

        soup = self.index_to_soup(self.INDEX)

        # Get list of links to ressorts from index page
        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
        for ressort in ressort_list[0].findAll('a'):
            feed_title = ressort.string
            print 'Analyzing', feed_title
            if not articles.has_key(feed_title):
                articles[feed_title] = []
                answer.append(feed_title)
            # Load ressort page.
            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
            # find mainbar div which contains the list of all articles
            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
                # iterate over all articles
                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
                    # extract title of article
                    if article_teaser.h3 != None:
                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
                        articles[feed_title].append(article)
                    else:
                        # Skip teasers for missing photos
                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
                            continue
                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
                        articles[feed_title].append(article)
                        more += 1
        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
        return answer
    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]

    def get_masthead_url(self):
        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'

    def print_version(self, url):
        return url.replace('.html', ',view,printVersion.html')
@@ -1,4 +1,3 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
@@ -18,11 +17,17 @@ class Berlingske_dk(BasicNewsRecipe):
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = False
    remove_javascript = True
    publication_type = 'newspaper'
    encoding = 'utf8'
    language = 'da'
    masthead_url = 'http://www.berlingske.dk/sites/all/themes/bm/img/layout/masthead_bg.gif'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1,.manchet,.byline{font-family: Cambria,Georgia,Times,"Times New Roman",serif } '
    auto_cleanup = True
    extra_css = '''
                .manchet {color:#888888;}
                .dateline {font-size: x-small; color:#444444;}
                .manchet,.dateline { font-family: Cambria,Georgia,Times,"Times New Roman",serif }
                .body {font-family: Arial,Helvetica,sans-serif }
                '''

    conversion_options = {
        'comment' : description
@@ -32,18 +37,14 @@ class Berlingske_dk(BasicNewsRecipe):
    }

    feeds = [
        (u'Breaking news' , u'http://www.berlingske.dk/breaking/rss' )
        ,(u'Seneste nyt'  , u'http://www.berlingske.dk/seneste/rss' )
        ,(u'Topnyheder'   , u'http://www.berlingske.dk/top/rss' )
        ,(u'Danmark'      , u'http://www.berlingske.dk/danmark/seneste/rss' )
        ,(u'Verden'       , u'http://www.berlingske.dk/verden/seneste/rss' )
        ,(u'Klima'        , u'http://www.berlingske.dk/klima/seneste/rss' )
        ,(u'Debat'        , u'http://www.berlingske.dk/debat/seneste/rss' )
        ,(u'Koebenhavn'   , u'http://www.berlingske.dk/koebenhavn/seneste/rss')
        ,(u'Politik'      , u'http://www.berlingske.dk/politik/seneste/rss' )
        ,(u'Kultur'       , u'http://www.berlingske.dk/kultur/seneste/rss' )
        (u'Breaking news' , u'http://www.b.dk/breaking/rss' )
        ,(u'Seneste nyt'  , u'http://www.b.dk/seneste/rss' )
        ,(u'Topnyheder'   , u'http://www.b.dk/top/rss' )
        ,(u'Danmark'      , u'http://www.b.dk/danmark/seneste/rss' )
        ,(u'Verden'       , u'http://www.b.dk/verden/seneste/rss' )
        ,(u'Klima'        , u'http://www.b.dk/klima/seneste/rss' )
        ,(u'Debat'        , u'http://www.b.dk/debat/seneste/rss' )
        ,(u'Koebenhavn'   , u'http://www.b.dk/koebenhavn/seneste/rss')
        ,(u'Politik'      , u'http://www.b.dk/politik/seneste/rss' )
        ,(u'Kultur'       , u'http://www.b.dk/kultur/seneste/rss' )
    ]

    keep_only_tags = [dict(attrs={'class':['first','pt-article']})]
    remove_tags = [dict(name=['object','link','base','iframe','embed'])]
38  recipes/biamag.recipe  (new file)
@@ -0,0 +1,38 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Radikal_tr(BasicNewsRecipe):
    title = 'BiaMag'
    __author__ = 'Osman Kaysan'
    description = 'Independent News from Turkey'
    publisher = 'BiaMag'
    category = 'news, politics, Turkey'
    oldest_article = 15
    max_articles_per_feed = 120
    masthead_url = 'http://bianet.org/images/biamag_logo.gif'
    language = 'tr'
    no_stylesheets = True

    conversion_options = {
        'comments'  : description
        ,'tags'     : category
        ,'language' : language
        ,'publisher': publisher
        ,'linearize_tables': True
        ,'remove_paragraph_spacing': True,
    }

    remove_tags_before = dict(name='div', attrs={'class':'manset'})
    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})

    feeds = [(u'BiaMag', u'http://www.bianet.org/biamag.rss')]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
38
recipes/biamag_en.recipe
Normal file
@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Radikal_tr(BasicNewsRecipe):
title = 'Bianet-English'
__author__ = 'Osman Kaysan'
description = 'Independent News Network from Turkey(English)'
publisher = 'Bianet'
category = 'news, politics, Turkey'
oldest_article = 7
max_articles_per_feed = 150
masthead_url = 'http://bianet.org/images/english_logo.gif'
language = 'en_TR'
no_stylesheets = True

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}

remove_tags_before = dict(name='div', attrs={'class':'manset'})
remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
remove_tags_after = dict(name='div', attrs={'id':'habermenu'})

feeds = [(u'Bianet-English', u'http://www.bianet.org/english.rss')]

def preprocess_html(self, soup):
return self.adeify_images(soup)
38
recipes/bianet.recipe
Normal file
@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Radikal_tr(BasicNewsRecipe):
title = 'Bianet'
__author__ = 'Osman Kaysan'
description = 'Independent News from Turkey'
publisher = 'Bianet'
category = 'news, politics, Turkey'
oldest_article = 7
max_articles_per_feed = 120
masthead_url = 'http://bianet.org/images/bianet_logo.gif'
language = 'tr'
no_stylesheets = True

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}

remove_tags_before = dict(name='div', attrs={'class':'manset'})
remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
remove_tags_after = dict(name='div', attrs={'id':'habermenu'})

feeds = [(u'Bianet', u'http://bianet.org/bianet.rss')]

def preprocess_html(self, soup):
return self.adeify_images(soup)
19
recipes/biolog_pl.recipe
Normal file
@ -0,0 +1,19 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Biolog_pl(BasicNewsRecipe):
title = u'Biolog.pl'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds=True
__author__ = 'fenuks'
description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
category = 'biology'
language = 'pl'
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
no_stylesheets = True
#keeps_only_tags=[dict(id='main')]
remove_tags_before=dict(id='main')
remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})]
feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]
50
recipes/birgun_gazetesi.recipe
Normal file
@ -0,0 +1,50 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe

class Birgun (BasicNewsRecipe):

title = u'Birgün Gazetesi'
__author__ = u'Osman Kaysan'
oldest_article = 7
max_articles_per_feed =150
use_embedded_content = False
description = 'Birgun gazatesi haberleri, kose yazarlari'
publisher = 'Birgün'
category = 'news,haberler,turkce,gazete,birgun'
language = 'tr'
no_stylesheets = True
publication_type = 'newspaper'

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}

cover_img_url = 'http://www.birgun.net/i/birgun.png'
masthead_url = 'http://www.birgun.net/i/birgun.png'

remove_attributes = ['width','height']

remove_tags_before = dict(name='h2', attrs={'class':'storyHeadline'})
#remove_tags_after = dict(name='div', attrs={'class':'toollinks'})
remove_tags_after = dict(name='tr', attrs={'valign':'top'})
remove_tags = [ dict(name='div', attrs={'id':'byLine'}), dict(name='div', attrs={'class':'toollinks'})
, dict(name='div', attrs={'class':'main-lead'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
, dict(name='a', attrs={'class':'addthis_button'})]

remove_empty_feeds= True

feeds = [
( u'Güncel', u'http://www.birgun.net/actuels.xml')
,( u'Köşe Yazarları', u'http://www.birgun.net/writer.xml')
,( u'Politika', u'http://www.birgun.net/politics.xml')
,( u'Ekonomi', u'http://www.birgun.net/economic.xml')
,( u'Çalışma Yaşamı', u'http://www.birgun.net/workers.xml')
,( u'Dünya', u'http://www.birgun.net/worlds.xml')
,( u'Yaşam', u'http://www.birgun.net/lifes.xml')
]
44
recipes/birmingham_post.recipe
Normal file
@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Birmingham post'
description = 'News for Birmingham UK'
timefmt = ''
__author__ = 'Dave Asbury'
cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
oldest_article = 1
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
language = 'en_GB'

masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'

keep_only_tags = [
#dict(name='h1',attrs={'id' : 'article-headline'}),
#dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
#dict(name='p')
#dict(attrs={'id' : 'three-col'})
]
remove_tags = [
# dict(name='div',attrs={'class' : 'span-33 last header-links'})
]
feeds = [
#(u'News',u'http://www.birminghampost.net/news/rss.xml'),
(u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
(u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
(u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
(u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
]
extra_css = '''
body {font: sans-serif medium;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
26
recipes/blues.recipe
Normal file
@ -0,0 +1,26 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Oskar Kunicki <rakso at interia.pl>'
'''
Changelog:
2011-11-27
News from BluesRSS.info
'''

from calibre.web.feeds.news import BasicNewsRecipe

class BluesRSS(BasicNewsRecipe):
title = 'Blues News'
__author__ = 'Oskar Kunicki'
description ='Blues news from around the world'
publisher = 'BluesRSS.info'
category = 'news, blues, USA,UK'
oldest_article = 5
max_articles_per_feed = 100
language = 'en'
cover_url = 'http://bluesrss.info/cover.jpg'
masthead_url = 'http://bluesrss.info/cover.jpg'
no_stylesheets = True

remove_tags = [dict(name='div', attrs={'class':'wp-pagenavi'})]

feeds = [(u'News', u'http://bluesrss.info/feed/')]
@ -10,30 +10,19 @@ http://www.buffalonews.com/RSS/
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1298680852(BasicNewsRecipe):
class BuffaloNews(BasicNewsRecipe):
title = u'Buffalo News'
oldest_article = 2
language = 'en'
__author__ = 'ChappyOnIce'
__author__ = 'ChappyOnIce, Krittika Goyal'
max_articles_per_feed = 20
encoding = 'utf-8'
masthead_url = 'http://www.buffalonews.com/buffalonews/skins/buffalonews/images/masthead/the_buffalo_news_logo.png'
remove_javascript = True
extra_css = 'body {text-align: justify;}\n \
p {text-indent: 20px;}'
auto_cleanup = True
remove_empty_feeds = True

keep_only_tags = [
dict(name='div', attrs={'class':['main-content-left']})
]

remove_tags = [
dict(name='div', attrs={'id':['commentCount']}),
dict(name='div', attrs={'class':['story-list-links']})
]

remove_tags_after = dict(name='div', attrs={'class':['body storyContent']})

feeds = [(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
feeds = [
(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
@ -56,3 +45,4 @@ class AdvancedUserRecipe1298680852(BasicNewsRecipe):
(u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
(u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
]
51
recipes/catavencii.recipe
Normal file
@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
catavencii.ro
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Catavencii(BasicNewsRecipe):
title = u'Ca\u0163avencii'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Ca\u0163avencii'
description = u'Ca\u0163avencii'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.simonatache.ro/wp-content/uploads/2011/06/catavencii-logo.png'

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}

keep_only_tags = [
dict(name='div', attrs={'id':'content'})
]

remove_tags = [
dict(name='div', attrs={'id':'breadcrumbs'})
, dict(name='span', attrs={'class':'info'})
, dict(name='div', attrs={'id':'social-media-article'})
]

remove_tags_after = [
dict(name='div', attrs={'id':'social-media-article'})
]
feeds = [
(u'\u0218tiri', u'http://www.catavencii.ro/rss')
]

def preprocess_html(self, soup):
return self.adeify_images(soup)
@ -4,16 +4,16 @@
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
catavencu.ro
academiacatavencu.info
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Catavencu(BasicNewsRecipe):
class AcademiaCatavencu(BasicNewsRecipe):
title = u'Academia Ca\u0163avencu'
__author__ = u'Silviu Cotoar\u0103'
description = 'Tagma cum laude'
publisher = 'Catavencu'
publisher = u'Ca\u0163avencu'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
@ -21,7 +21,7 @@ class Catavencu(BasicNewsRecipe):
use_embedded_content = False
category = 'Ziare'
encoding = 'utf-8'
cover_url = 'http://upload.wikimedia.org/wikipedia/en/1/1e/Academia_Catavencu.jpg'
cover_url = 'http://www.academiacatavencu.info/images/logo.png'

conversion_options = {
'comments' : description
@ -31,22 +31,21 @@ class Catavencu(BasicNewsRecipe):
}

keep_only_tags = [
dict(name='ul', attrs={'class':'articles'})
dict(name='h1', attrs={'class':'art_title'}),
dict(name='div', attrs={'class':'art_text'})
]

remove_tags = [
dict(name='div', attrs={'class':['tools']})
, dict(name='div', attrs={'class':['share']})
, dict(name='div', attrs={'class':['category']})
, dict(name='div', attrs={'id':['comments']})
dict(name='div', attrs={'class':['desp_m']})
, dict(name='div', attrs={'id':['tags']})
]

remove_tags_after = [
dict(name='div', attrs={'id':'comments'})
dict(name='div', attrs={'class':['desp_m']})
]

feeds = [
(u'Feeds', u'http://catavencu.ro/feed/rss')
(u'Feeds', u'http://www.academiacatavencu.info/rss.xml')
]

def preprocess_html(self, soup):
@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe):
del item['style']
ad=soup.findAll('a')
for r in ad:
if 'http://www.hustla.pl' in r['href']:
if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
r.extract()
gallery=soup.find('div', attrs={'class':'galleryFlash'})
if gallery:
@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True

preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]

def print_version(self, url):
if url.find('news/article.php') >= 0:
@ -46,13 +48,15 @@ class TheCND(BasicNewsRecipe):
title = self.tag_to_string(a)
self.log('\tFound article: ', title, 'at', url)
date = a.nextSibling
if re.search('cm', date):
continue
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)

self.log('log articles', articles)
#self.log('log articles', articles)
mostCurrent = sorted(articles).pop()
self.title = 'CND ' + mostCurrent
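# The daily CND recipe above now skips index entries whose trailing date string
# matches 'cm', while the weekly recipe below keeps only those entries, so the
# two recipes split the same cnd.org index page between them.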
72
recipes/cnd_weekly.recipe
Normal file
@ -0,0 +1,72 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

class TheCND(BasicNewsRecipe):

title = 'CND Weekly'
__author__ = 'Derek Liang'
description = ''
INDEX = 'http://cnd.org'
language = 'zh'
conversion_options = {'linearize_tables':True}

remove_tags_before = dict(name='div', id='articleHead')
remove_tags_after = dict(id='copyright')
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True

preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]

def print_version(self, url):
if url.find('news/article.php') >= 0:
return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
else:
return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)

def parse_index(self):
soup = self.index_to_soup(self.INDEX)

feeds = []
articles = {}

for a in soup.findAll('a', attrs={'target':'_cnd'}):
url = a['href']
if url.find('article.php') < 0 :
continue
if url.startswith('/'):
url = 'http://cnd.org'+url
title = self.tag_to_string(a)
date = a.nextSibling
if not re.search('cm', date):
continue
self.log('\tFound article: ', title, 'at', url, '@', date)
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)

sorted_articles = sorted(articles)
while sorted_articles:
mostCurrent = sorted_articles.pop()
self.title = 'CND ' + mostCurrent
feeds.append((self.title, articles[mostCurrent]))

return feeds

def populate_article_metadata(self, article, soup, first):
header = soup.find('h3')
self.log('header: ' + self.tag_to_string(header))
pass
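# parse_index() above follows the standard BasicNewsRecipe contract: it returns
# a list of (feed_title, article_list) tuples, where each article is a dict with
# at least 'title' and 'url' keys. A minimal sketch of that shape (the URL is
# illustrative only):
#
#     def parse_index(self):
#         return [('Section', [{'title': 'Headline',
#                               'url': 'http://cnd.org/my/article.php?id=1',
#                               'description': '', 'date': ''}])]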
22
recipes/computerworld_pl.recipe
Normal file
@ -0,0 +1,22 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Computerworld_pl(BasicNewsRecipe):
title = u'Computerworld.pl'
__author__ = 'fenuks'
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT'
language = 'pl'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'s'})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]

def get_cover_url(self):
soup = self.index_to_soup('http://www.computerworld.pl/')
cover=soup.find(name='img', attrs={'class':'prawo'})
self.cover_url=cover['src']
return getattr(self, 'cover_url', self.cover_url)
52
recipes/cosmopolitan_uk.recipe
Normal file
@ -0,0 +1,52 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
#from calibre import __appname__
from calibre.utils.magick import Image
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Cosmopolitan UK'
description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'

__author__ = 'Dave Asbury'
#last update 21/12/11
# greyscale code by Starson
cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True

preprocess_regexps = [
(re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'

masthead_url = 'http://www.cosmopolitan.co.uk/cm/cosmopolitanuk/site_images/header/cosmouk_logo_home.gif'

keep_only_tags = [
dict(attrs={'class' : ['dateAuthor', 'publishDate']}),
dict(name='div',attrs ={'id' : ['main_content']})
]
remove_tags = [
dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
dict(name='li',attrs={'class' : 'thumb'})
]

feeds = [
(u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'), (u'Men', u'http://cosmopolitan.co.uk/men/rss/'), (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'), (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'), (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'), (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'), (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]

def postprocess_html(self, soup, first):
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup
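# postprocess_html() runs after calibre has downloaded the images and rewritten
# each tag['src'] to a local file path, which is why img.save(iurl) can simply
# overwrite the file in place with its greyscale version.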
18
recipes/daily_writing_tips.recipe
Normal file
@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe

class DailyWritingTips(BasicNewsRecipe):
title = u'Daily Writing Tips'
language = 'en_GB'
__author__ = 'NotTaken'
oldest_article = 7 #days
max_articles_per_feed = 40
use_embedded_content = True
no_stylesheets = True
auto_cleanup = False
encoding = 'utf-8'

feeds = [
('Latest tips',
'http://feeds2.feedburner.com/DailyWritingTips'),
]
15
recipes/datasport.recipe
Normal file
@ -0,0 +1,15 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Italian soccer news website - v1.00 (17, December 2011)'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1324114272(BasicNewsRecipe):
title = u'Datasport'
language = 'it'
__author__ = 'faber1971'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = True

feeds = [(u'Datasport', u'http://www.datasport.it/calcio/rss.xml')]
27
recipes/descopera_org.recipe
Normal file
@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
'''
descopera.org
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Descopera(BasicNewsRecipe):
title = u'Descoperă.org'
__author__ = 'Marius Ignătescu'
description = 'Descoperă. Placerea de a cunoaște'
publisher = 'descopera.org'
category = 'science, technology, culture, history, earth'
language = 'ro'
oldest_article = 14
max_articles_per_feed = 100
encoding = 'utf8'
no_stylesheets = True
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
keep_only_tags = [dict(name='div', attrs={'class':['post']})]
remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
remove_attributes = ['width','height']
cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
feeds = [(u'Articles', u'http://www.descopera.org/feed/')]

def preprocess_html(self, soup):
return self.adeify_images(soup)
@ -46,7 +46,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
dict(name = 'div', attrs = {'class' : 'uniBox'}),
dict(name = 'object', attrs = {}),
dict(name = 'h3', attrs = {})
dict(name = 'h3', attrs = {}),
dict(attrs={'class':'twitter-share-button'})
]

preprocess_regexps = [
@ -58,3 +59,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
(r'\s*</', lambda match: '</'),
]
]

def skip_ad_pages(self, soup):
if 'Advertisement' in soup.title:
nexturl=soup.find('a')['href']
return self.index_to_soup(nexturl, raw=True)
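# skip_ad_pages() is a BasicNewsRecipe hook: returning a non-None value makes
# calibre use it in place of the downloaded page, so the recipe follows the
# first link on the interstitial ad page and returns that target's raw HTML
# instead.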
58
recipes/dziennik_pl.recipe
Normal file
@ -0,0 +1,58 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
import re
class Dziennik_pl(BasicNewsRecipe):
title = u'Dziennik.pl'
__author__ = 'fenuks'
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
category = 'newspaper'
language = 'pl'
cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
remove_javascript=True
remove_empty_feeds=True
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
keep_only_tags=[dict(id='article')]
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
(u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
(u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
(u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
(u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
(u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
(u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
(u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]

def append_page(self, soup, appendtag):
tag=soup.find('a', attrs={'class':'page_next'})
if tag:
appendtag.find('div', attrs={'class':'article_paginator'}).extract()
while tag:
soup2= self.index_to_soup(tag['href'])
tag=soup2.find('a', attrs={'class':'page_next'})
if not tag:
for r in appendtag.findAll('div', attrs={'class':'art_src'}):
r.extract()
pagetext = soup2.find(name='div', attrs={'class':'article_body'})
for dictionary in self.remove_tags:
v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
for delete in v:
delete.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if appendtag.find('div', attrs={'class':'article_paginator'}):
appendtag.find('div', attrs={'class':'article_paginator'}).extract()

def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup
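# append_page() above is the usual multi-page stitching pattern: follow the
# 'page_next' link, pull the 'article_body' div from each page, strip anything
# matched by remove_tags, and graft it onto the first page. The core loop,
# condensed (names taken from the recipe above):
#
#     while tag:
#         soup2 = self.index_to_soup(tag['href'])
#         tag = soup2.find('a', attrs={'class':'page_next'})
#         pagetext = soup2.find('div', attrs={'class':'article_body'})
#         appendtag.insert(len(appendtag.contents), pagetext)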
47
recipes/echo_online.recipe
Normal file
@ -0,0 +1,47 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch echo-online.de
'''

from calibre.web.feeds.recipes import BasicNewsRecipe
class Echo_Online(BasicNewsRecipe):
title = u' Echo Online'
description = '-Echo Online-'
publisher = 'Echo Online GmbH'
category = 'News, Germany'
__author__ = 'Armin Geller' # 2011-12-17
language = 'de'
lang = 'de-DE'
encoding = 'iso-8859-1'
timefmt = ' [%a, %d %b %Y]'

oldest_article = 7
max_articles_per_feed = 2
no_stylesheets = True
auto_cleanup = True
remove_javascript = True

feeds = [
(u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
(u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
(u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
(u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
(u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
(u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
(u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
(u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
(u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
(u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
(u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
]

def print_version(self, url):
return self.browser.open_novisit(url).geturl() + '?_FRAME=33&_FORMAT=PRINT'

remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
auto_cleanup_keep = '//div[@class="bild_gross w270"]'

# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-ash2/41801_145340745513489_893927_n.jpg' # 2011-12-16 AGe
cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif' # 2011-12-16 AGe
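# print_version() first resolves the final story URL (open_novisit follows any
# redirects) and then appends the site's print-view switches, producing URLs of
# the form <story-url>?_FRAME=33&_FORMAT=PRINT.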
@ -55,12 +55,17 @@ class Economist(BasicNewsRecipe):
'''

def get_cover_url(self):
br = self.browser
br.open(self.INDEX)
issue = br.geturl().split('/')[4]
self.log('Fetching cover for issue: %s'%issue)
cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
return cover_url
soup = self.index_to_soup('http://www.economist.com/printedition/covers')
div = soup.find('div', attrs={'class':lambda x: x and
'print-cover-links' in x})
a = div.find('a', href=True)
url = a.get('href')
if url.startswith('/'):
url = 'http://www.economist.com' + url
soup = self.index_to_soup(url)
div = soup.find('div', attrs={'class':'cover-content'})
img = div.find('img', src=True)
return img.get('src')

def parse_index(self):
return self.economist_parse_index()
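# The replacement get_cover_url() walks two pages: the covers index (the div
# whose class contains 'print-cover-links') to locate the current issue, then
# that issue page's 'cover-content' div for the actual <img src>. The lambda in
# attrs is the BeautifulSoup idiom for "class attribute contains this token".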
@ -39,13 +39,17 @@ class Economist(BasicNewsRecipe):
delay = 1

def get_cover_url(self):
br = self.browser
br.open(self.INDEX)
issue = br.geturl().split('/')[4]
self.log('Fetching cover for issue: %s'%issue)
cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
return cover_url

soup = self.index_to_soup('http://www.economist.com/printedition/covers')
div = soup.find('div', attrs={'class':lambda x: x and
'print-cover-links' in x})
a = div.find('a', href=True)
url = a.get('href')
if url.startswith('/'):
url = 'http://www.economist.com' + url
soup = self.index_to_soup(url)
div = soup.find('div', attrs={'class':'cover-content'})
img = div.find('img', src=True)
return img.get('src')

def parse_index(self):
try:
@ -5,12 +5,11 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.07'
__date__ = '06, February 2011'
__version__ = 'v0.08'
__date__ = '13, November 2011'
'''
elperiodicodearagon.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

@ -20,13 +19,13 @@ class elperiodicodearagon(BasicNewsRecipe):
description = u'Noticias desde Aragon'
publisher = u'elperiodicodearagon.com'
category = u'news, politics, Spain, Aragon'
oldest_article = 2
oldest_article = 1
delay = 0
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'es'
encoding = 'utf8'
encoding = 'iso-8859-1'
remove_empty_feeds = True
remove_javascript = True

@ -39,61 +38,30 @@ class elperiodicodearagon(BasicNewsRecipe):
}

feeds = [
(u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
(u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
(u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
(u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
(u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
(u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
(u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
(u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
(u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
(u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
(u'Portada', u'http://zetaestaticos.com/aragon/rss/portada_es.xml'),
(u'Arag\xf3n', u'http://zetaestaticos.com/aragon/rss/2_es.xml'),
(u'Internacional', u'http://zetaestaticos.com/aragon/rss/4_es.xml'),
(u'Espa\xf1a', u'http://zetaestaticos.com/aragon/rss/3_es.xml'),
(u'Econom\xeda', u'http://zetaestaticos.com/aragon/rss/5_es.xml'),
(u'Deportes', u'http://zetaestaticos.com/aragon/rss/7_es.xml'),
(u'Real Zaragoza', u'http://zetaestaticos.com/aragon/rss/10_es.xml'),
(u'CAI Zaragoza', u'http://zetaestaticos.com/aragon/rss/91_es.xml'),
(u'Monta\xf1ismo', u'http://zetaestaticos.com/aragon/rss/354_es.xml'),
(u'Opini\xf3n', u'http://zetaestaticos.com/aragon/rss/103_es.xml'),
(u'Tema del d\xeda', u'http://zetaestaticos.com/aragon/rss/102_es.xml'),
(u'Escenarios', u'http://zetaestaticos.com/aragon/rss/105_es.xml'),
(u'Sociedad', u'http://zetaestaticos.com/aragon/rss/104_es.xml'),
(u'Gente', u'http://zetaestaticos.com/aragon/rss/330_es.xml'),
(u'Espacio 3', u'http://zetaestaticos.com/aragon/rss/328_es.xml'),
(u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
]

extra_css = '''
h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
.columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
img{margin-bottom: 0.4em}
'''

remove_attributes = ['height','width']

keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]

# Strip out all the page clutter

remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
dict(name='span', attrs={'class':'MasInformacion '}),
dict(name='span', attrs={'class':'MasInformacion'}),
dict(name='div', attrs={'class':'Middle'}),
dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
dict(name='div', attrs={'class':'MenuEquipo'}),
dict(name='div', attrs={'class':'TemasRelacionados'}),
dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
dict(name='div', attrs={'class':'Recorte'}),
dict(name='div', attrs={'id':'NoticiasenRecursos'}),
dict(name='div', attrs={'id':'NoticiaEnPapel'}),
dict(name='p', attrs={'class':'RecorteEnNoticias'}),
dict(name='div', attrs={'id':'Comparte'}),
dict(name='div', attrs={'id':'CajaComparte'}),
dict(name='a', attrs={'class':'EscribirComentario'}),
dict(name='a', attrs={'class':'AvisoComentario'}),
dict(name='div', attrs={'class':'CajaAvisoComentario'}),
dict(name='div', attrs={'class':'navegaNoticias'}),
dict(name='div', attrs={'class':'Mensaje'}),
dict(name='div', attrs={'id':'PaginadorDiCom'}),
dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
dict(name='div', attrs={'id':'CintilloComentario'}),
dict(name='div', attrs={'id':'EscribeComentario'}),
dict(name='div', attrs={'id':'FormularioComentario'}),
dict(name='div', attrs={'id':'FormularioNormas'})]

# Fetch the print edition front page (the format=1 image has the higher resolution)

def get_cover_url(self):
@ -104,23 +72,7 @@ class elperiodicodearagon(BasicNewsRecipe):
return image['src'].rstrip('format=2') + 'format=1'
return None

# Remove the blank space between the story and the comments (lines 1 and 2)
# The index did not point correctly at the start of the story (line 3)
# Use the mobile version of the site

preprocess_regexps = [
(re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
]

# Replace embedded YouTube videos with a still image

def preprocess_html(self, soup):
for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
if video_yt:
video_yt.name = 'img'
fuente = video_yt['src']
fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')
video_yt['src'] = fuente2 + '/0.jpg'

return soup
def print_version(self, url):
return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')
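# print_version() above rewrites story URLs onto the mobile site
# (elperiodicodearagon.com/m/...), whose pages carry far less markup, so the
# long remove_tags list has much less left to clean up.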
48
recipes/elet_es_irodalom.recipe
Normal file
@ -0,0 +1,48 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2010.12.01. - V1.0
################################################################################

from calibre.web.feeds.recipes import BasicNewsRecipe

class elet_es_irodalom(BasicNewsRecipe):
title = u'Elet es Irodalom'
__author__ = 'Bigpapa'
oldest_article = 7
max_articles_per_feed = 20 # Maximum number of articles per feed to store in the generated e-book.
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'iso-8859-2'
category = 'Cikkek'
language = 'hu'
publication_type = 'newsportal'
extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '

keep_only_tags = [
dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
]

remove_tags = [
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
]

feeds = [
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]
@ -4,7 +4,8 @@ __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
elmundo.es
'''

import re
import time
from calibre.web.feeds.news import BasicNewsRecipe

class ElMundo(BasicNewsRecipe):
@ -18,12 +19,15 @@ class ElMundo(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
encoding = 'iso8859_15'
remove_javascript = True
remove_empty_feeds = True
language = 'es'
masthead_url = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
publication_type = 'newspaper'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.metadata_noticia{font-size: small}
.pestana_GDP{font-size: small; font-weight:bold}
h1,h2,h3,h4,h5,h6,.subtitulo {color: #3F5974}
.hora{color: red}
.update{color: gray}
@ -41,8 +45,11 @@ class ElMundo(BasicNewsRecipe):
remove_tags_after = dict(name='div' , attrs={'id':['desarrollo_noticia','tamano']})
remove_attributes = ['lang','border']
remove_tags = [
dict(name='div', attrs={'class':['herramientas','publicidad_google']})
,dict(name='div', attrs={'id':'modulo_multimedia' })
dict(name='div', attrs={'class':['herramientas','publicidad_google','comenta','col col-2b','apoyos','no-te-pierdas']})
,dict(name='div', attrs={'class':['publicidad publicidad_cuerpo_noticia','comentarios_nav','mensaje_privado','interact']})
,dict(name='div', attrs={'class':['num_comentarios estirar']})
,dict(name='span', attrs={'class':['links_comentar']})
,dict(name='div', attrs={'id':['comentar']})
,dict(name='ul', attrs={'class':'herramientas' })
,dict(name=['object','link','embed','iframe','base','meta'])
]
@ -50,13 +57,31 @@ class ElMundo(BasicNewsRecipe):
feeds = [
(u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
,(u'Deportes' , u'http://estaticos.elmundo.es/elmundodeporte/rss/portada.xml')
,(u'Economia' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
,(u'Espana' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
,(u'Econom\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
,(u'Espa\xf1a' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
,(u'Internacional' , u'http://estaticos.elmundo.es/elmundo/rss/internacional.xml' )
,(u'Cultura' , u'http://estaticos.elmundo.es/elmundo/rss/cultura.xml' )
,(u'Ciencia/Ecologia', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
,(u'Comunicacion' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
,(u'Television' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
,(u'Ciencia/Ecolog\xeda', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
,(u'Comunicaci\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
,(u'Televisi\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )

,(u'Salud' , u'http://estaticos.elmundo.es/elmundosalud/rss/portada.xml' )
,(u'Solidaridad' , u'http://estaticos.elmundo.es/elmundo/rss/solidaridad.xml' )
,(u'Su vivienda' , u'http://estaticos.elmundo.es/elmundo/rss/suvivienda.xml' )
,(u'Motor' , u'http://estaticos.elmundo.es/elmundomotor/rss/portada.xml' )

,(u'Madrid' , u'http://estaticos.elmundo.es/elmundo/rss/madrid.xml' )
,(u'Barcelona' , u'http://estaticos.elmundo.es/elmundo/rss/barcelona.xml' )
,(u'Pa\xeds Vasco' , u'http://estaticos.elmundo.es/elmundo/rss/paisvasco.xml' )
,(u'Baleares' , u'http://estaticos.elmundo.es/elmundo/rss/baleares.xml' )
,(u'Castilla y Le\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castillayleon.xml' )
,(u'Valladolid' , u'http://estaticos.elmundo.es/elmundo/rss/valladolid.xml' )
,(u'Valencia' , u'http://estaticos.elmundo.es/elmundo/rss/valencia.xml' )
,(u'Alicante' , u'http://estaticos.elmundo.es/elmundo/rss/alicante.xml' )
,(u'Castell\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castellon.xml' )
,(u'Andaluc\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia.xml' )
,(u'Sevilla' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_sevilla.xml' )
,(u'M\xe1laga' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_malaga.xml' )
]

def preprocess_html(self, soup):
@ -67,3 +92,34 @@ class ElMundo(BasicNewsRecipe):
def get_article_url(self, article):
return article.get('guid', None)

preprocess_regexps = [
# Show a still image for embedded videos

(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var video=', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),

# Suppress the comment numbering: 1, 2, 3 ...

(re.compile(r'<ol>\n<li style="z-index:', re.DOTALL|re.IGNORECASE), lambda match: '<ul><li style="z-index:'),
(re.compile(r'</ol>\n<div class="num_comentarios estirar">', re.DOTALL|re.IGNORECASE), lambda match: '</ul><div class="num_comentarios estirar">'),
]

# Fetch the front-page cover image

def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg
cover='http://img.kiosko.net/'+ year + '/' + month + '/' + day +'/es/elmundo.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
return cover
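# The cover comes from kiosko.net, which archives scanned front pages under
# /YYYY/MM/DD/<country>/<paper>.750.jpg (e.g. the 2011/11/19 URL in the comment
# above); the try/except falls back to the masthead image when the current
# day's scan is not available yet.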
16
recipes/emuzica_pl.recipe
Normal file
@ -0,0 +1,16 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class eMuzyka(BasicNewsRecipe):
title = u'eMuzyka'
__author__ = 'fenuks'
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
@ -1,35 +1,43 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Gerardo Diez'
|
||||
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
|
||||
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
__copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
|
||||
__author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
|
||||
__version__ = 'v1.01'
|
||||
__date__ = '13, November 2011'
|
||||
|
||||
'''
|
||||
expansion.es
|
||||
[url]http://www.expansion.com/[/url]
|
||||
'''
|
||||
|
||||
import time
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class Publico(BasicNewsRecipe):
|
||||
title =u'Expansion.com'
|
||||
__author__ ='Gerardo Diez'
|
||||
publisher =u'Unidad Editorial Información Económica, S.L.'
|
||||
category ='finances, catalunya'
|
||||
oldest_article =1
|
||||
max_articles_per_feed =100
|
||||
|
||||
class expansion_spanish(BasicNewsRecipe):
|
||||
__author__ ='Gerardo Diez & desUBIKado'
|
||||
description ='Financial news from Spain'
|
||||
title =u'Expansion'
|
||||
publisher =u'Unidad Editorial Internet, S.L.'
|
||||
category ='news, finances, Spain'
|
||||
oldest_article = 2
|
||||
simultaneous_downloads = 10
|
||||
cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
|
||||
timefmt ='[%A, %d %B, %Y]'
|
||||
encoding ='latin'
|
||||
max_articles_per_feed =100
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
encoding ='iso-8859-15'
|
||||
language ='es'
|
||||
use_embedded_content = False
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
|
||||
keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
|
||||
|
||||
remove_tags =[
|
||||
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
|
||||
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
|
||||
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
|
||||
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
|
||||
dict(name='span', attrs={'class':['comentarios']}),
|
||||
dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
|
||||
dict(name='div', attrs={'id':['comentarios_lectores_listado']})
|
||||
dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
|
||||
]
|
||||
feeds =[
|
||||
(u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
|
||||
@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
|
||||
(u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
|
||||
(u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
|
||||
(u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
|
||||
|
||||
(u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
|
||||
(u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
|
||||
(u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
|
||||
(u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
|
||||
(u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
|
||||
(u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
|
||||
(u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
|
||||
(u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
|
||||
|
||||
(u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
|
||||
(u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
|
||||
(u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
|
||||
(u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
|
||||
(u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
|
||||
(u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
|
||||
(u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
|
||||
(u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
|
||||
(u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
|
||||
(u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
|
||||
(u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
|
||||
(u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
|
||||
(u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
|
||||
(u'Deporte y Negocio', u' [url]http://estaticos.expansion.com/rss/empresasdeporte.xml[/url]'),
|
||||
(u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
|
||||
(u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
|
||||
(u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
|
||||
|
||||
(u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
|
||||
(u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
|
||||
(u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
|
||||
(u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
|
||||
(u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
|
||||
|
||||
(u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
|
||||
(u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
|
||||
(u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
|
||||
(u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
|
||||
|
||||
(u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
|
||||
(u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
|
||||
(u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
|
||||
(u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
|
||||
(u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
|
||||
(u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
|
||||
|
||||
(u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
|
||||
(u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'),
|
||||
(u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
|
||||
(u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
|
||||
(u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
|
||||
]
|
||||
|
||||
# Obtener la imagen de portada

def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
# e.g. http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg
cover='http://img5.kiosko.net/'+ year + '/' + month + '/' + day +'/es/expansion.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.aproahp.org/enlaces/images/diario_expansion.gif'
return cover

# So that the ad does not pop up when fetching an article, and the live web
# page is always retrieved, send the variable "t" set to the current "linux"
# or "epoch" time, making the website believe the ad has just been viewed

def print_version(self, url):
st = time.time()
segundos = str(int(st))
parametros = '.html?t=' + segundos
return url.replace('.html', parametros)
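
# A minimal standalone sketch of the cache-busting trick above (the article
# URL is hypothetical; the exact epoch value does not matter, only that it is fresh):
import time

def add_cache_buster(url):
    # append ?t=<current epoch seconds> so the site thinks the ad was just shown
    return url.replace('.html', '.html?t=%d' % int(time.time()))

print add_cache_buster('http://www.expansion.com/2011/12/21/portada.html')
# -> http://www.expansion.com/2011/12/21/portada.html?t=1324512000 (for example)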

_processed_links = []

def get_article_url(self, article):

# Obtain the article's original URL from the "feedsportal" one

link = article.get('link', None)
if link is None:
return article
if link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
for i in range(0,len(a)):
link=link.replace(a[i],b[i])
link="http://"+link

# Drop articles duplicated in other feeds

if not (link in self._processed_links):
self._processed_links.append(link)
else:
link = None

return link
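
# A standalone sketch of the feedsportal de-obfuscation above; the sample
# fragment is hypothetical, the token table mirrors the recipe's a/b lists:
def decode_feedsportal(fragment):
    # '0L0S' must be handled as a unit, and '0A' -> '0' is applied last
    for token, plain in [('0B', '.'), ('0C', '/'), ('0D', '?'), ('0E', '-'),
                         ('0F', '='), ('0G', '&'), ('0N', '.com'),
                         ('0L0S', 'www.'), ('0A', '0')]:
        fragment = fragment.replace(token, plain)
    return 'http://' + fragment

print decode_feedsportal('0L0Sexpansion0N0C20110C120C210Cportada0Bhtml')
# -> http://www.expansion.com/2011/12/21/portada.html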
# A little CSS to improve the presentation of the articles

extra_css = '''
.entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
.fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
'''

# To display the image of embedded videos

preprocess_regexps = [
(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
]
18
recipes/fisco_oggi.recipe
Normal file
@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Website of the Italian Government Income Agency (about revenue, taxation, taxes) - v1.00 (17 December 2011)'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1324112023(BasicNewsRecipe):
title = u'Fisco Oggi'
language = 'it'
__author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True

feeds = [(u'Attualit\xe0', u'http://www.fiscooggi.it/taxonomy/term/1/feed'), (u'Normativa', u'http://www.fiscooggi.it/taxonomy/term/5/feed'), (u'Giurisprudenza', u'http://www.fiscooggi.it/taxonomy/term/8/feed'), (u'Dati e statistiche', u'http://www.fiscooggi.it/taxonomy/term/12/feed'), (u'Analisi e commenti', u'http://www.fiscooggi.it/taxonomy/term/13/feed'), (u'Bilancio e contabilit\xe0', u'http://www.fiscooggi.it/taxonomy/term/576/feed'), (u'Dalle regioni', u'http://www.fiscooggi.it/taxonomy/term/16/feed'), (u'Dal mondo', u'http://www.fiscooggi.it/taxonomy/term/17/feed')]
@ -1,57 +1,68 @@
# -*- coding: utf-8 -*-
import re

from calibre.web.feeds.news import BasicNewsRecipe

class Focus_pl(BasicNewsRecipe):
title = u'Focus.pl'
oldest_article = 15
max_articles_per_feed = 100
__author__ = 'fenuks'
class FocusRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = u'intromatyk <intromatyk@gmail.com>'
language = 'pl'
description ='polish scientific monthly magazine'
version = 1

title = u'Focus'
publisher = u'Gruner + Jahr Polska'
category = u'News'
description = u'Newspaper'
category='magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
remove_tags_after=dict(name='div', attrs={'class':'clear'})
feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
(u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
(u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
(u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
(u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
(u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
(u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
(u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
(u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),

oldest_article = 7
max_articles_per_feed = 100000
recursions = 0

no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
# Seems to work best, but YMMV
simultaneous_downloads = 5

r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
keep_only_tags =[]
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'}))

remove_tags =[]
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'}))
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'}))

extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
h1{text-align: left;}
h2{font-size: medium; font-weight: bold;}
p.lead {font-weight: bold; text-align: left;}
.authordate {font-size: small; color: #696969;}
.fot{font-size: x-small; color: #666666;}
'''

feeds = [
('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
]

def skip_ad_pages(self, soup):
tag=soup.find(name='a')
if tag:
new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)
return new_soup

def append_page(self, appendtag):
tag=appendtag.find(name='div', attrs={'class':'arrows'})
if tag:
nexturl='http://www.focus.pl/'+tag.a['href']
for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
while nexturl:
soup2=self.index_to_soup(nexturl)
nexturl=None
pagetext=soup2.find(name='div', attrs={'class':'txt'})
tag=pagetext.find(name='div', attrs={'class':'arrows'})
for r in tag.findAll(name='a'):
if u'Następne' in r.string:
nexturl='http://www.focus.pl/'+r['href']
for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if ('advertisement' in soup.find('title').string.lower()):
href = soup.find('a').get('href')
return self.index_to_soup(href, raw=True)
else:
return None

def get_cover_url(self):
soup=self.index_to_soup('http://www.focus.pl/magazyn/')
@ -60,7 +71,14 @@ class Focus_pl(BasicNewsRecipe):
self.cover_url='http://www.focus.pl/' + tag.a['href']
return getattr(self, 'cover_url', self.cover_url)

def preprocess_html(self, soup):
self.append_page(soup.body)
return soup
def print_version(self, url):
if url.count ('focus.pl.feedsportal.com'):
u = url.find('focus0Bpl')
u = 'http://www.focus.pl/' + url[u + 11:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace ('0E','-')
u = u.replace('/nc/1//story01.htm', '/do-druku/1')
else:
u = url.replace('/nc/1','/do-druku/1')
return u
@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
__author__ = 'fluzao'
description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'

#found this to be the easiest place to find the index page (13-Nov-2011).
# searching for the "Indice Geral" link
HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

language = 'pt'
no_stylesheets = True
max_articles_per_feed = 40
remove_javascript = True
needs_subscription = True
remove_tags_before = dict(name='b')

remove_tags_before = dict(name='p')
remove_tags = [dict(name='td', attrs={'align':'center'})]
remove_attributes = ['height','width']
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

# fixes the problem with the section names
section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}

# this solves the problem with truncated content in Kindle
conversion_options = {'linearize_tables' : True}

# this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
# Indice e Comunicar Erros
preprocess_regexps = [(re.compile(r'<BR><BR>Texto Anterior:.*<!--/NOTICIA-->',
re.DOTALL|re.IGNORECASE), lambda match: r''),
(re.compile(r'<BR><BR>Próximo Texto:.*<!--/NOTICIA-->',
preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
re.DOTALL|re.IGNORECASE), lambda match: r'')]

def get_browser(self):
@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):

def parse_index(self):
soup = self.index_to_soup(self.INDEX)
#Searching for the index page on the HOMEPAGE
hpsoup = self.index_to_soup(self.HOMEPAGE)
indexref = hpsoup.find('a', href=re.compile('^indices.*'))
self.log('--> tag containing the today s index: ', indexref)
INDEX = indexref['href']
INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX
self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
# ... and taking the opportunity to get the cover image link
coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
if coverurl:
self.log('--> tag containing the today s cover: ', coverurl)
coverurl = coverurl.replace('htm', 'jpg')
coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl
self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
self.cover_url = coverurl

#soup = self.index_to_soup(self.INDEX)
soup = self.index_to_soup(INDEX)

feeds = []
articles = []
section_title = "Preambulo"
@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
self.log('--> new section title: ', section_title)
if strpost.startswith('<a href'):
url = post['href']
#this bit is kept if they ever go back to the old format (pre Nov-2011)
if url.startswith('/fsp'):
url = 'http://www1.folha.uol.com.br'+url
#
if url.startswith('http://www1.folha.uol.com.br/fsp'):
#url = 'http://www1.folha.uol.com.br'+url
title = self.tag_to_string(post)
self.log()
self.log('--> post: ', post)
@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
# keeping the front page url
minha_capa = feeds[0][1][1]['url']

# removing the 'Preambulo' section
# removing the first section (now called 'top')
del feeds[0]

# creating the url for the cover image
coverurl = feeds[0][1][0]['url']
coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
coverurl = coverurl.replace('01.htm', '.jpg')
self.cover_url = coverurl

# inserting the cover page as the first article (nicer for kindle users)
feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
return feeds

50
recipes/formulaas.recipe
Normal file
@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
formula-as.ro
'''

from calibre.web.feeds.news import BasicNewsRecipe

class FormulaAS(BasicNewsRecipe):
title = u'Formula AS'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Formula AS'
description = u'Formula AS'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.formula-as.ro/_client/img/header_logo.png'

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}

keep_only_tags = [
dict(name='div', attrs={'class':'item padded'})
]

remove_tags = [
dict(name='ul', attrs={'class':'subtitle lower'})
]

remove_tags_after = [
dict(name='ul', attrs={'class':'subtitle lower'}),
dict(name='div', attrs={'class':'item-brief-options'})
]
feeds = [
(u'\u0218tiri', u'http://www.formula-as.ro/rss/articole.xml')
]

def preprocess_html(self, soup):
return self.adeify_images(soup)
@ -1,35 +1,61 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe(BasicNewsRecipe):
#!/usr/bin/env python

title = u'Frankfurter Rundschau'
__author__ = 'schuster'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'de'
remove_javascript = True
cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823538/-/logo.png'
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {min-width:300px; max-width:600px; min-height:300px; max-height:800px}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Christian Schmitt'

'''
fr-online.de
'''

feeds = [(u'Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'),
(u'Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'),
(u'Meinungen', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'),
(u'Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'),
(u'Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'),
(u'Kultur', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'),
(u'Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'),
(u'Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'),
(u'Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml')
]
from calibre.web.feeds.recipes import BasicNewsRecipe

class FROnlineRecipe(BasicNewsRecipe):
title = 'Frankfurter Rundschau'
__author__ = 'maccs'
description = 'Nachrichten aus D und aller Welt'
encoding = 'utf-8'
masthead_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
publisher = 'Druck- und Verlagshaus Frankfurt am Main GmbH'
category = 'news, germany, world'
language = 'de'
publication_type = 'newspaper'
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
oldest_article = 1 # Increase this number if you're interested in older articles
max_articles_per_feed = 50 # Seems a reasonable number to me
extra_css = '''
body { font-family: "arial", "verdana", "geneva", sans-serif; font-size: 12px; margin: 0px; background-color: #ffffff;}
.imgSubline{background-color: #f4f4f4; font-size: 0.8em;}
.p--heading-1 {font-weight: bold;}
.calibre_navbar {font-size: 0.8em; font-family: "arial", "verdana", "geneva", sans-serif;}
'''
keep_only_tags = [{'class':'ArticleHeadlineH1'}, {'class':'article_text'}]
cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
cover_margins = (100, 150, '#ffffff')

feeds = []
feeds.append(('Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'))
feeds.append(('Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'))
feeds.append(('Meinung', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'))
feeds.append(('Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'))
feeds.append(('Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'))
feeds.append(('Eintracht Frankfurt', u'http://www.fr-online.de/sport/eintracht-frankfurt/-/1473446/1473446/-/view/asFeed/-/index.xml'))
feeds.append(('Kultur und Medien', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'))
feeds.append(('Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'))
feeds.append(('Frankfurt', u'http://www.fr-online.de/frankfurt/-/1472798/1472798/-/view/asFeed/-/index.xml'))
feeds.append(('Rhein-Main', u'http://www.fr-online.de/rhein-main/-/1472796/1472796/-/view/asFeed/-/index.xml'))
feeds.append(('Hanau', u'http://www.fr-online.de/rhein-main/hanau/-/1472866/1472866/-/view/asFeed/-/index.xml'))
feeds.append(('Darmstadt', u'http://www.fr-online.de/rhein-main/darmstadt/-/1472858/1472858/-/view/asFeed/-/index.xml'))
feeds.append(('Wiesbaden', u'http://www.fr-online.de/rhein-main/wiesbaden/-/1472860/1472860/-/view/asFeed/-/index.xml'))
feeds.append(('Offenbach', u'http://www.fr-online.de/rhein-main/offenbach/-/1472856/1472856/-/view/asFeed/-/index.xml'))
feeds.append(('Bad Homburg', u'http://www.fr-online.de/rhein-main/bad-homburg/-/1472864/1472864/-/view/asFeed/-/index.xml'))
feeds.append(('Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'))
feeds.append(('Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml'))

def print_version(self, url):
return url.replace('index.html', 'view/printVersion/-/index.html')

@ -18,7 +18,7 @@ class FrazPC(BasicNewsRecipe):
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True

cover_url='http://www.frazpc.pl/images/logo.png'
feeds = [
(u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),
(u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly')
@ -33,6 +33,7 @@ class FrazPC(BasicNewsRecipe):
dict(name='div', attrs={'class':'comments_box'})
]

remove_tags_after=dict(name='div', attrs={'class':'content'})
preprocess_regexps = [(re.compile(r'\| <a href="#comments">Komentarze \([0-9]*\)</a>'), lambda match: '')]

remove_attributes = [ 'width', 'height' ]
35
recipes/gazeta_pl_szczecin.recipe
Normal file
@ -0,0 +1,35 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

import re
import string
from calibre.web.feeds.news import BasicNewsRecipe

class GazetaPlSzczecin(BasicNewsRecipe):
title = u'Gazeta.pl Szczecin'
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Agora S.A.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}]
cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif"
feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')]

def get_article_url(self, article):
s = re.search("""/0L(szczecin.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s

def print_version(self, url):
s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url)
no1 = s.group(2)
no2 = s.group(3)
return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2)
90
recipes/givemesomethingtoread.recipe
Normal file
@ -0,0 +1,90 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class GiveMeSomethingToRead(BasicNewsRecipe):
title = u'Give Me Something To Read'
description = 'Curation / aggregation of articles on diverse topics'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://givemesomethingtoread.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('The Arts','arts',25),
('Science','science',30),
('Technology','technology',30),
('Politics','politics',20),
('Media','media',30),
('Crime','crime',15),
('Other articles','',10)
]

def parse_index(self):
self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)

for category in self.CATEGORIES:

(cat_name, tag, max_articles) = category

tagurl = '' if tag=='' else '/tagged/'+tag
self.log('Reading category:', cat_name)

articles = []
pageno = 1

while len(articles) < max_articles and pageno < 100:

page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1

self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break

headers = soup.findAll('h2')
if len(headers) == 0:
break

for header in headers:
atag = header.find('a')
url = atag['href']
# skip promotionals and duplicates
if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(header)
self.log('\tFound article:', title)
#self.log('\t', url)
desc = header.parent.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = header.parent.previousSibling
# navigate up to find h3, which contains the date
while p:
if hasattr(p,'name') and p.name == 'h3':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break

if articles:
feeds.append((cat_name, articles))

return feeds
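
# A minimal illustration of the previousSibling walk above that locates the
# date heading; the markup here is a hypothetical stand-in for the real page:
from calibre.ebooks.BeautifulSoup import BeautifulSoup

demo = BeautifulSoup('<h3>Dec 21, 2011</h3><div><h2><a href="#">A story</a></h2></div>')
p = demo.find('h2').parent.previousSibling
while p:
    if hasattr(p, 'name') and p.name == 'h3':
        print p.string  # -> Dec 21, 2011
        break
    p = p.previousSibling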
@ -1,4 +1,3 @@

from calibre.web.feeds.news import BasicNewsRecipe

class GlasgowHerald(BasicNewsRecipe):
@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):
language = 'en_GB'

__author__ = 'Kovid Goyal'
use_embedded_content = False

keep_only_tags = [dict(attrs={'class':'article'})]
remove_tags = [
dict(id=['pic-nav']),
dict(attrs={'class':['comments-top']})
]
no_stylesheets = True
auto_cleanup = True

#keep_only_tags = [dict(attrs={'class':'article'})]
#remove_tags = [
#dict(id=['pic-nav']),
#dict(attrs={'class':['comments-top']})
#]

feeds = [
@ -26,4 +29,3 @@ class GlasgowHerald(BasicNewsRecipe):
u'http://www.heraldscotland.com/cmlink/1.768',),
(u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]

@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
{'class':['articleTools', 'pagination', 'Ads', 'topad',
'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])

#Use the mobile version rather than the web version
def print_version(self, url):
return url.rpartition('?')[0] + '?service=mobile'

@ -12,7 +12,6 @@ class GN(BasicNewsRecipe):
EDITION = 0

__author__ = 'Piotr Kontek'
title = u'Gość niedzielny'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
@ -20,6 +19,8 @@ class GN(BasicNewsRecipe):
remove_javascript = True
temp_files = []
simultaneous_downloads = 1
masthead_url = 'http://gosc.pl/files/11/03/12/949089_top.gif'
title = u'Gość niedzielny'

articles_are_obfuscated = True

@ -64,7 +65,6 @@ class GN(BasicNewsRecipe):
if img != None:
a = img.parent
self.EDITION = a['href']
self.title = img['alt']
self.cover_url = 'http://www.gosc.pl' + img['src']
if not first:
break
96
recipes/grantland.recipe
Normal file
@ -0,0 +1,96 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class GrantLand(BasicNewsRecipe):
title = u"Grantland"
description = 'Writings on Sports & Pop Culture'
language = 'en'
__author__ = 'Barty'
max_articles_per_feed = 100
no_stylesheets = False
# auto_cleanup is too aggressive sometimes and we end up with blank articles
auto_cleanup = False
timefmt = ' [%a, %d %b %Y]'
oldest_article = 365

cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'

INDEX = 'http://www.grantland.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, url suffix, max number of articles to load)
('Today in Grantland','',20),
('In Case You Missed It','incaseyoumissedit',35),
]

remove_tags = [
{'name':['head','style','script']},
{'id':['header']},
{'class':re.compile(r'\bside|\bad\b|floatright|tags')}
]
remove_tags_before = {'class':'wrapper'}
remove_tags_after = [{'id':'content'}]

preprocess_regexps = [
# <header> tags with an img inside are just blog banners, don't need them
# note: there are other useful <header> tags so we don't want to just strip all of them
(re.compile(r'<header class.+?<img .+?>.+?</header>', re.DOTALL|re.IGNORECASE),lambda m: ''),
# delete everything between the *last* <hr class="small" /> and </article>
(re.compile(r'<hr class="small"(?:(?!<hr class="small").)+</article>', re.DOTALL|re.IGNORECASE),lambda m: '<hr class="small" /></article>'),
]
extra_css = """cite, time { font-size: 0.8em !important; margin-right: 1em !important; }
img + cite { display:block; text-align:right}"""

def parse_index(self):
feeds = []
seen_urls = set([])

for category in self.CATEGORIES:

(cat_name, tag, max_articles) = category
self.log('Reading category:', cat_name)
articles = []

page = "%s/%s" % (self.INDEX, tag)
soup = self.index_to_soup(page)
headers = soup.findAll('h2' if tag=='' else 'h3')

for header in headers:
tag = header.find('a')
if tag is None or not hasattr(tag,'href'):
continue
url = tag['href']
if url.startswith('/'):
url = self.INDEX + url
if url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(tag)
if 'Podcast:' in title or 'In Case You Missed It' in title:
continue
desc = dt = ''
par = header.parent
#tag = par.find('cite')
#if tag is not None:
# desc = '['+self.tag_to_string(tag) + '] '
tag = par.find('div')
if tag is not None:
desc = desc + self.tag_to_string(tag)
tag = tag.find('time')
if tag is not None:
dt = self.tag_to_string( tag)

self.log('\tFound article:', title)
self.log('\t', url)
articles.append({'title':title,'url':url,'description':desc,'date':dt})
if len(articles) >= max_articles:
break

if articles:
feeds.append((cat_name, articles))

return feeds

def print_version(self, url):
return url+'?view=print'
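
# A small demonstration of the "up to the *last* occurrence" trick in the
# preprocess_regexps above: the tempered dot (?:(?!X).)+ matches a run that
# contains no further X, so the match starts at the final <hr class="small">
# (the sample HTML is hypothetical):
import re

demo_html = '<hr class="small" />keep<hr class="small" />junk</article>'
pat = re.compile(r'<hr class="small"(?:(?!<hr class="small").)+</article>', re.DOTALL)
print pat.sub('<hr class="small" /></article>', demo_html)
# -> <hr class="small" />keep<hr class="small" /></article>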
43
recipes/gs24_pl.recipe
Normal file
@ -0,0 +1,43 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

import re
import string
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1322322819(BasicNewsRecipe):
title = u'GS24.pl (Głos Szczeciński)'
description = u'Internetowy serwis Głosu Szczecińskiego'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Media Regionalne sp. z o.o.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
cover_url = "http://www.gs24.pl/images/top_logo.png"

feeds = [
# (u'Wszystko', u'http://www.gs24.pl/rss.xml'),
(u'Szczecin', u'http://www.gs24.pl/szczecin.xml'),
(u'Stargard', u'http://www.gs24.pl/stargard.xml'),
(u'Świnoujście', u'http://www.gs24.pl/swinoujscie.xml'),
(u'Goleniów', u'http://www.gs24.pl/goleniow.xml'),
(u'Gryfice', u'http://www.gs24.pl/gryfice.xml'),
(u'Kamień Pomorski', u'http://www.gs24.pl/kamienpomorski.xml'),
(u'Police', u'http://www.gs24.pl/police.xml'),
(u'Region', u'http://www.gs24.pl/region.xml'),
(u'Sport', u'http://www.gs24.pl/sport.xml'),
]

def get_article_url(self, article):
s = re.search("""/0L0S(gs24.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_", "0D" : "?", "0F" : "="}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s

def print_version(self, url):
return url + "&Template=printpicart"
@ -9,6 +9,7 @@ www.guardian.co.uk
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class Guardian(BasicNewsRecipe):

@ -16,9 +17,11 @@ class Guardian(BasicNewsRecipe):
if date.today().weekday() == 6:
base_url = "http://www.guardian.co.uk/theobserver"
cover_pic = 'Observer digital edition'
masthead_url = 'http://static.guim.co.uk/sys-images/Guardian/Pix/site_furniture/2010/10/19/1287478087992/The-Observer-001.gif'
else:
base_url = "http://www.guardian.co.uk/theguardian"
cover_pic = 'Guardian digital edition'
masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif'

__author__ = 'Seabound and Sujata Raman'
language = 'en_GB'
@ -26,6 +29,7 @@ class Guardian(BasicNewsRecipe):
oldest_article = 7
max_articles_per_feed = 100
remove_javascript = True
encoding = 'utf-8'

# List of section titles to ignore
# For example: ['Sport']
@ -41,6 +45,16 @@ class Guardian(BasicNewsRecipe):
dict(name='div', attrs={'class':["guardian-tickets promo-component",]}),
dict(name='ul', attrs={'class':["pagination"]}),
dict(name='ul', attrs={'id':["content-actions"]}),
# article history link
dict(name='a', attrs={'class':["rollover history-link"]}),
# "a version of this article ..." speil
dict(name='div' , attrs = { 'class' : ['section']}),
# "about this article" js dialog
dict(name='div', attrs={'class':["share-top",]}),
# author picture
dict(name='img', attrs={'class':["contributor-pic-small"]}),
# embedded videos/captions
dict(name='span',attrs={'class' : ['inline embed embed-media']}),
#dict(name='img'),
]
use_embedded_content = False
@ -65,8 +79,21 @@ class Guardian(BasicNewsRecipe):
url = None
return url

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])

def preprocess_html(self, soup):

# multiple html sections in soup, useful stuff in the first
html = soup.find('html')
soup2 = BeautifulSoup()
soup2.insert(0,html)

soup = soup2

for item in soup.findAll(style=True):
del item['style']

@ -75,6 +102,17 @@ class Guardian(BasicNewsRecipe):
for tag in soup.findAll(name=['ul','li']):
tag.name = 'div'

# removes number next to rating stars
items_to_remove = []
rating_container = soup.find('div', attrs = {'class': ['rating-container']})
if rating_container:
for item in rating_container:
if isinstance(item, Tag) and str(item.name) == 'span':
items_to_remove.append(item)

for item in items_to_remove:
item.extract()

return soup

def find_sections(self):
@ -9,9 +9,9 @@ from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
import re

class HackerNews(BasicNewsRecipe):
title = 'Hacker News'
__author__ = 'Tom Scholl'
class HNWithCommentsLink(BasicNewsRecipe):
title = 'HN With Comments Link'
__author__ = 'Tom Scholl & David Kerschner'
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
publisher = 'Y Combinator'
category = 'news, programming, it, technology'
@ -80,6 +80,11 @@ class HackerNews(BasicNewsRecipe):
body = body + comments
return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'

def parse_feeds(self):
a = super(HNWithCommentsLink, self).parse_feeds()
self.hn_articles = a[0].articles
return a

def get_obfuscated_article(self, url):
if url.startswith('http://news.ycombinator.com'):
content = self.get_hn_content(url)
@ -97,6 +102,13 @@ class HackerNews(BasicNewsRecipe):
else:
content = self.get_readable_content(url)

article = 0
for a in self.hn_articles:
if a.url == url:
article = a

content = re.sub(r'</body>\s*</html>\s*$', '', content) + article.summary + '</body></html>'

self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(content)
self.temp_files[-1].close()
@ -1,11 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class AdvancedUserRecipe(BasicNewsRecipe):

title = 'heise online'
title = 'Heise-online'
description = 'News vom Heise-Verlag'
__author__ = 'schuster'
masthead_url = 'http://www.heise.de/icons/ho/heise_online_logo.gif'
publisher = 'Heise Zeitschriften Verlag GmbH & Co. KG'
use_embedded_content = False
language = 'de'
oldest_article = 2
@ -14,11 +14,10 @@ class AdvancedUserRecipe(BasicNewsRecipe):
remove_empty_feeds = True
timeout = 5
no_stylesheets = True
encoding = 'utf-8'

remove_tags_after = dict(name ='p', attrs={'class':'editor'})
remove_tags = [{'class':'navi_top_container'},
remove_tags = [dict(id='navi_top_container'),
dict(id='navi_bottom'),
dict(id='mitte_rechts'),
dict(id='navigation'),
@ -29,27 +28,31 @@ class AdvancedUserRecipe(BasicNewsRecipe):
dict(id='seiten_navi'),
dict(id='adbottom'),
dict(id='sitemap'),
dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')),
]
dict(name='div', attrs={'id':'sitemap'}),
dict(name='ul', attrs={'class':'erste_zeile'}),
dict(name='ul', attrs={'class':'zweite_zeile'}),
dict(name='div', attrs={'class':'navi_top_container'})]

feeds = [
('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
('iX', 'http://www.heise.de/ix/news/news.rdf'),
('Technology Review', 'http://www.heise.de/tr/news-atom.xml'),
('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Security', 'http://www.heise.de/security/news/news-atom.xml'),
('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open Source', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
('Autos', 'http://www.heise.de/autos/rss/news.rdf'),
('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
('iX', 'http://www.heise.de/ix/news/news.rdf'),
('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')
]
('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')]

def print_version(self, url):
return url + '?view=print'

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
import urllib, re

class HindustanTimes(BasicNewsRecipe):
title = u'Hindustan Times'
@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):
'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
]

def get_article_url(self, article):
'''
HT uses a variant of the feedsportal RSS ad display mechanism
'''
try:
s = article.summary
return urllib.unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except:
pass
url = BasicNewsRecipe.get_article_url(self, article)
res = self.browser.open_novisit(url)
url = res.geturl().split('/')[-2]
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
'www.'}
for k, v in encoding.iteritems():
url = url.replace(k, v)
return url
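
# A standalone sketch of the first branch above: pull the real target out of
# the bookmark link embedded in the feed summary (the sample markup and the
# ads.example host are hypothetical):
import re, urllib

summary = ('<a href="http://ads.example/bookmark.cfm?view=1'
           '&link=http%3A%2F%2Fwww.hindustantimes.com%2Fstory.aspx">share</a>')
m = re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', summary)
if m:
    print urllib.unquote(m.group(1))
    # -> http://www.hindustantimes.com/story.aspx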
@ -4,56 +4,20 @@ __license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
import re

class Histmag(BasicNewsRecipe):

title = u'Histmag'
oldest_article = 7
max_articles_per_feed = 100
cover_url='http://histmag.org/grafika/loga/histmag-logo-2-300px.png'
__author__ = 'matek09'
description = u"Artykuly historyczne i publicystyczne"
encoding = 'utf-8'
#preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),(re.compile(r'<span>'), lambda match: '<br><br><span>')]
no_stylesheets = True
language = 'pl'
remove_javascript = True
#max_articles_per_feed = 1
remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'article'}))
remove_tags_after = dict(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
#keep_only_tags =[]
#keep_only_tags.append(dict(name = 'h2'))
#keep_only_tags.append(dict(name = 'p'))

remove_tags =[]
remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))

preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),
(re.compile(r'<span>'), lambda match: '<br><br><span>')]
extra_css = '''
.left {font-size: x-small}
.right {font-size: x-small}
'''

def find_articles(self, soup):
articles = []
for div in soup.findAll('div', attrs={'class' : 'text'}):
articles.append({
'title' : self.tag_to_string(div.h3.a),
'url' : 'http://www.histmag.org/' + div.h3.a['href'],
'date' : self.tag_to_string(div.next('p')).split('|')[0],
'description' : self.tag_to_string(div.next('p', podpis=False)),
})
return articles

def parse_index(self):
soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
feeds = []
feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
feeds.append((u"Wydarzenia", self.find_articles(soup)))

return feeds

keep_only_tags=[dict(id='article')]
remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'})]

feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'), (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'), (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'), (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'), (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]
@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
category = 'history'
language = 'pl'
oldest_article = 8
remove_empty_feeds=True
max_articles_per_feed = 100

feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')]
feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
(u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
(u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
(u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
(u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
(u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
(u'Rekonstrukcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
(u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
(u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')]
@ -1,44 +1,58 @@
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
################################################################################
#Description: http://hvg.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.20. - V1.1
################################################################################

class HVG(BasicNewsRecipe):
title = 'HVG.HU'
__author__ = u'István Papp'
description = u'Friss hírek a HVG-től'
timefmt = ' [%Y. %b. %d., %a.]'
oldest_article = 4
from calibre.web.feeds.news import BasicNewsRecipe

class hvg(BasicNewsRecipe):
title = u'HVG'
__author__ = 'Bigpapa'
language = 'hu'

max_articles_per_feed = 100
oldest_article = 5 # How many days old the oldest fetched article may be
max_articles_per_feed = 5 # Maximum number of articles per feed stored in the generated e-book
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publisher = 'HVG Online'
category = u'news, hírek, hvg'
extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
remove_tags_before = dict(id='pg-content')
remove_javascript = True
remove_empty_feeds = True
extra_css = ' h2 { font:bold 28px} '

feeds = [
(u'Itthon', u'http://hvg.hu/rss/itthon')
,(u'Világ', u'http://hvg.hu/rss/vilag')
,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag')
,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany')
,(u'Panoráma', u'http://hvg.hu/rss/Panorama')
,(u'Karrier', u'http://hvg.hu/rss/karrier')
,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia')
,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek')
,(u'Kultúra', u'http://hvg.hu/rss/kultura')
,(u'Cégautó', u'http://hvg.hu/rss/cegauto')
,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv')
,(u'Egészség', u'http://hvg.hu/rss/egeszseg')
,(u'Vélemény', u'http://hvg.hu/rss/velemeny')
,(u'Sport', u'http://hvg.hu/rss/sport')
remove_attributes = ['style','font', 'href']

keep_only_tags = [
dict(name='div', attrs={'id':['pg-content']})
]

def print_version(self, url):
return url.replace ('#rss', '/print')
remove_tags = [
dict(name='div', attrs={'class':['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
dict(name='table', attrs={'class':['banner2', 'monocle']}),
dict(name='div', attrs={'id':['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
dict(name='div', attrs={'style':['float: right; margin-bottom: 5px;', 'display: none;']}),
dict(name='h3', attrs={'class':['hthree']}),
dict(name='ul', attrs={'class':['defaultul']}),
dict(name='form', attrs={'id':['commentForm']}),
dict(name='h6', attrs={'class':['hthree']}),
dict(name='h6', attrs={'class':['more2']}),
dict(name='img', attrs={'class':['framed']}),
dict(name='td', attrs={'class':['greyboxbody','embedvideobody','embedvideofooter','embedvideobottom']}),

]

feeds = [
# (u'\xd6sszes', 'http://hvg.hu/rss'),
(u'Itthon', 'http://hvg.hu/rss/itthon'),
(u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
(u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
(u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
(u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
(u'Karrier', 'http://hvg.hu/rss/karrier'),
(u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
(u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
(u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
(u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
(u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
(u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
(u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
(u'Sport', 'http://hvg.hu/rss/sport')
]
BIN
recipes/icons/b365realitatea.png
Normal file
BIN
recipes/icons/biolog_pl.png
Normal file
BIN
recipes/icons/blues.png
Normal file
BIN
recipes/icons/catavencii.png
Normal file
BIN
recipes/icons/computerworld_pl.png
Normal file
BIN
recipes/icons/descopera_org.png
Normal file
BIN
recipes/icons/dziennik_pl.png
Normal file
BIN
recipes/icons/formulaas.png
Normal file
BIN
recipes/icons/infra_pl.png
Normal file
BIN
recipes/icons/kosmonauta_pl.png
Normal file
BIN
recipes/icons/mlody_technik_pl.png
Normal file
BIN
recipes/icons/moneynews.png
Normal file
BIN
recipes/icons/skylife.png
Normal file
BIN
recipes/icons/zaman.png
Normal file
@ -1,8 +1,8 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__author__ = 'Gambarini, based on Darko Miletic'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
description = 'Italian daily newspaper - 19-04-2010'
description = 'Italian daily newspaper - 09-11-2011'

'''
http://www.ilgiornale.it/
@ -11,7 +11,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

class IlGiornale(BasicNewsRecipe):
__author__ = 'Marini Gabriele'
__author__ = 'GAMBARINI'
description = 'Italian daily newspaper'

cover_url = 'http://www.ilgiornale.it/img_v1/logo.gif'
@ -23,9 +23,8 @@ class IlGiornale(BasicNewsRecipe):
timefmt = '[%a, %d %b, %Y]'

oldest_article = 7
max_articles_per_feed = 50
max_articles_per_feed = 100
use_embedded_content = False
recursion = 100

no_stylesheets = True
conversion_options = {'linearize_tables':True}
@ -38,11 +37,11 @@ class IlGiornale(BasicNewsRecipe):
def print_version(self, url):
raw = self.browser.open(url).read()
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
all_print_tags = soup.find('div', {'style':'float:left; width:35%;'})
print_link = all_print_tags.contents[1]
if all_print_tags is None:
all_print_tags = soup.find('div', {'id':'print_article'})
print_link = all_print_tags.a
if print_link is None:
return url
return print_link['href']
return 'http://www.ilgiornale.it' + print_link['href']

feeds = [
@ -1,33 +1,60 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.independent.co.uk
|
||||
'''
|
||||
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
||||
|
||||
class TheIndependent(BasicNewsRecipe):
|
||||
title = 'The Independent'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Independent News - Breaking news, comment and features from The Independent newspaper'
|
||||
|
||||
class TheIndependentNew(BasicNewsRecipe):
|
||||
|
||||
# flag to enable/disable article graphics on business pages/some others
|
||||
# eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
|
||||
# -max dimensions can be altered using the .pictureContainer img selector in the css
|
||||
_FETCH_ARTICLE_GRAPHICS = True
|
||||
|
||||
#Flag to enable/disable image fetching (not business)
|
||||
_FETCH_IMAGES = True
|
||||
|
||||
|
||||
#used for converting rating to stars
|
||||
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
|
||||
_NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'
|
||||
|
||||
|
||||
title = u'The Independent'
|
||||
__author__ = 'Will'
|
||||
description = 'The latest in UK News and World News from The \
|
||||
Independent. Wide range of international and local news, sports \
|
||||
news, commentary and opinion pieces.Independent News - Breaking news \
|
||||
that matters. Your daily comprehensive news source - The \
|
||||
Independent Newspaper'
|
||||
publisher = 'The Independent'
|
||||
category = 'news, politics, UK'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 200
|
||||
category = 'news, UK'
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
use_embedded_content = False
|
||||
language = 'en_GB'
|
||||
remove_empty_feeds = True
|
||||
language = 'en_GB'
|
||||
publication_type = 'newspaper'
|
||||
masthead_url = 'http://www.independent.co.uk/independent.co.uk/images/logo-london.png'
|
||||
extra_css = """
|
||||
h1{font-family: Georgia,serif }
|
||||
body{font-family: Verdana,Arial,Helvetica,sans-serif}
|
||||
img{margin-bottom: 0.4em; display:block}
|
||||
.info,.caption,.credits{font-size: x-small}
|
||||
"""
|
||||
masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
|
||||
encoding = 'utf-8'
|
||||
remove_tags =[
|
||||
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
|
||||
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
|
||||
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
|
||||
dict(attrs={'style' : re.compile('.*')}),
|
||||
]
|
||||
|
||||
keep_only_tags =[dict(attrs={'id':'main'})]
|
||||
recursions = 0
|
||||
|
||||
# fixes non compliant html nesting and 'marks' article graphics links
|
||||
preprocess_regexps = [
|
||||
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
|
||||
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
|
||||
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
|
||||
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
|
||||
]
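
As a quick illustration of the first rule (the sample HTML is invented), the misnested storyTop span is rewritten into a div with the same class before the parser ever sees it:

    import re

    sample = '<span class="storyTop ">lead paragraph</span>'
    fixed = re.sub(r'<span class="storyTop ">(?P<nested>.*?)</span>',
                   lambda m: '<div class="storyTop">' + m.group('nested') + '</div>',
                   sample, flags=re.DOTALL)
    print fixed  # -> <div class="storyTop">lead paragraph</div>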


conversion_options = {
'comment' : description
@ -36,51 +63,451 @@ class TheIndependent(BasicNewsRecipe):
, 'language' : language
}

remove_tags =[
dict(name=['meta','link','object','embed','iframe','base','style'])
,dict(attrs={'class':['related-articles','share','googleCols','article-tools','paging','googleArt']})
,dict(attrs={'id':['newsVideoPlayer','yahoobook','google-intext']})
]
keep_only_tags =[dict(attrs={'id':'article'})]
remove_attributes=['lang','onclick','width','xmlns:fb']
extra_css = """
h1{font-family: Georgia,serif }
body{font-family: Verdana,Arial,Helvetica,sans-serif}
img{margin-bottom: 0.4em; display:block}
.starRating img {float: left}
.starRating {margin-top:0.4em; display: block}
.image {clear:left; font-size: x-small; color:#888888;}
.articleByTimeLocation {font-size: x-small; color:#888888;
margin-bottom:0.2em ; margin-top:0.2em ; display:block}
.subtitle {clear:left}
.column-1 h1 { color: #191919}
.column-1 h2 { color: #333333}
.column-1 h3 { color: #444444}
.column-1 p { color: #777777}
.column-1 p,a,h1,h2,h3 { margin: 0; }
.column-1 div{color:#888888; margin: 0;}
.articleContent {display: block; clear:left;}
.storyTop{}
.pictureContainer img { max-width: 400px; max-height: 400px;}
"""

oldest_article = 1
max_articles_per_feed = 100

_processed_urls = []

feeds = [
(u'UK' , u'http://www.independent.co.uk/news/uk/rss' )
,(u'World' , u'http://www.independent.co.uk/news/world/rss' )
,(u'Business' , u'http://www.independent.co.uk/news/business/rss' )
,(u'People' , u'http://www.independent.co.uk/news/people/rss' )
,(u'Science' , u'http://www.independent.co.uk/news/science/rss' )
,(u'Media' , u'http://www.independent.co.uk/news/media/rss' )
,(u'Education' , u'http://www.independent.co.uk/news/education/rss' )
,(u'Leading Articles' , u'http://www.independent.co.uk/opinion/leading-articles/rss')
,(u'Commentators' , u'http://www.independent.co.uk/opinion/commentators/rss' )
,(u'Columnists' , u'http://www.independent.co.uk/opinion/columnists/rss' )
,(u'Letters' , u'http://www.independent.co.uk/opinion/letters/rss' )
,(u'Big Question' , u'http://www.independent.co.uk/extras/big-question/rss' )
,(u'Sport' , u'http://www.independent.co.uk/sport/rss' )
,(u'Life&Style' , u'http://www.independent.co.uk/life-style/rss' )
,(u'Arts&Entertainment' , u'http://www.independent.co.uk/arts-entertainment/rss' )
,(u'Travel' , u'http://www.independent.co.uk/travel/rss' )
,(u'Money' , u'http://www.independent.co.uk/money/rss' )
]

def get_article_url(self, article):
return article.get('guid', None)
url = super(self.__class__,self).get_article_url(article)

title = article.get('title', None)
if title and re.search("^Video:",title):
return None

#remove duplicates
if not (url in self._processed_urls):
self._processed_urls.append(url)
else:
url = None
return url
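
The duplicate filter above keeps _processed_urls as a list, so each membership test is a linear scan; a set gives the same behaviour in O(1). A minimal sketch (the class and method names here are my own, not part of the recipe):

    class UrlDeduper(object):
        def __init__(self):
            self._seen = set()

        def first_sighting(self, url):
            # return the url only the first time it is offered
            if url is None or url in self._seen:
                return None
            self._seen.add(url)
            return url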

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])

def preprocess_html(self, soup):
for item in soup.body.findAll(style=True):
del item['style']
for item in soup.body.findAll(['author','preform']):
item.name='span'
for item in soup.body.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
for item in soup.body.findAll('div', attrs={'class':['clear-o','body','photoCaption']}):
item.name = 'p'
for item in soup.body.findAll('div'):
if not item.attrs and not item.contents:
item.extract()

#remove 'advertorial articles'
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
if strapline:
for para in strapline.findAll('p'):
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
and para.contents[0] == 'ADVERTORIAL FEATURE':
return None

items_to_extract = []
slideshow_elements = []

for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True
pattern = re.compile('((articleContent)|(title))$')
if (pattern.search(item['class'])) is not None:
remove = False

# corrections
# story content always good
pattern = re.compile('storyContent')
if (pattern.search(item['class'])) is not None:
remove = False

#images
pattern = re.compile('slideshow')
if (pattern.search(item['class'])) is not None:
if self._FETCH_IMAGES:
remove = False
slideshow_elements.append(item)
else:
remove = True

#social widgets always bad
pattern = re.compile('socialwidget')
if (pattern.search(item['class'])) is not None:
remove = True

if remove:
items_to_extract.append(item)

for item in items_to_extract:
item.extract()
soup2 = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
soup2.body.replaceWith(soup.body)
return soup2

items_to_extract = []

if self._FETCH_IMAGES:
for element in slideshow_elements:
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
if item.img is not None:
#use full size image
img = item.findNext('img')

img['src'] = item['href']

#insert caption if available
if img.get('title') and (len(img['title']) > 1):
tag = Tag(soup,'h3')
text = NavigableString(img['title'])
tag.insert(0,text)

#picture before text
img.extract()
item.insert(0,img)
item.insert(1,tag)

# remove link
item.name = "div"
item["class"]='image'
del item["href"]


#remove empty subtitles
"""
currently the subtitle is located in first paragraph after
sibling <h3 class="subtitle"> tag. This may be 'fixed' at
some point.
"""
subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
if subtitle is not None:
subtitleText = subtitle.findNext('p')
if subtitleText is not None:
if len(subtitleText.contents[0]) <= 1 :
subtitleText.extract()
subtitle.extract()


#replace rating numbers with stars
for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
if item is not None:
soup2 = self._insertRatingStars(soup,item)
if soup2 is not None:
soup = soup2


#remove empty paragraph tags in storyTop which can leave a space
#between first paragraph and rest of story
nested_content = False
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
for item in storyTop.findAll('p'):
for nested in item:
if isinstance(nested, Tag):
nested_content = True
break
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
items_to_extract.append(item)

for item in items_to_extract:
item.extract()

items_to_extract = []


#remove line breaks immediately next to tags with default margins
#to prevent double line spacing and narrow columns of text
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
self._remove_undesired_line_breaks_from_tag(storyTop,soup)


#replace article graphics link with the graphics themselves
if self._FETCH_ARTICLE_GRAPHICS:
items_to_insert = []
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
strong = item.find('strong')
if not strong:
continue
for child in strong:
if isinstance(child,Tag):
if str(child.name) == 'a':
items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))

for item in items_to_insert:
item[0].replaceWith(item[1])

for item in items_to_extract:
item.extract()

return soup


def _get_article_graphic(self,old_item,url,soup):

items_to_insert = []

if re.search('\.jpg$',str(url)):
div = Tag(soup,'div')
div['class'] = 'pictureContainer'
img = Tag(soup,'img')
img['src'] = url
img['alt'] = 'article graphic'
div.insert(0,img)
items_to_insert.append((old_item,div,))
return items_to_insert

soup2 = self.index_to_soup(url)
for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
items_to_insert.append((old_item,item),)
return items_to_insert


def _insertRatingStars(self,soup,item):
if item.contents is None or len(item.contents) < 1:
return
rating = item.contents[0]

try:
rating = float(item.contents[0])
except:
print 'Could not convert decimal rating to star: malformatted float.'
return
for i in range(1,6):
star = Tag(soup,'img')
if i <= rating:
star['src'] = self._STAR_URL
else:
star['src'] = self._NO_STAR_URL
star['alt'] = 'star number ' + str(i)
item.insert(i,star)
#item.contents[0] = NavigableString('(' + str(rating) + ')')
item.contents[0] = ''
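
The star substitution above inserts one <img> per point on a five-point scale; the same i <= rating comparison can be sanity-checked with plain characters (a toy stand-in for the two image URLs, not recipe code):

    def stars_for(rating, total=5):
        # one filled star per whole point at or below the rating
        return ''.join('*' if i <= rating else '.' for i in range(1, total + 1))

    print stars_for(3.5)  # -> '***..', i.e. three _STAR_URL images, two grey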

def postprocess_html(self,soup, first_fetch):
#find broken images and remove captions
items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img')
if img and img.get('src'):
# broken images still point to remote url
pattern = re.compile('http://www.independent.co.uk.*')
if pattern.match(img["src"]) is not None:
caption = img.findNextSibling('h3')
if caption is not None:
items_to_extract.append(caption)
items_to_extract.append(img)

for item in items_to_extract:
item.extract()
return soup

def _recurisvely_linearise_tag_tree(
self,
item,
linearised= None,
count=0,
limit = 100
):
linearised = linearised or []
count = count + 1
if count > limit:
return linearised
if not (isinstance(item,Tag)):
return linearised
for nested in item:
linearised.append(nested)
linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
return linearised
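
The walk above appends each node and then descends into it, so every ancestor appears immediately before its children in the flat list. A toy analogue over nested lists (an invented example, not recipe code) shows the resulting order:

    def linearise(node, out=None):
        # append every child, then recurse into it, as the method above does
        out = out if out is not None else []
        if not isinstance(node, list):
            return out
        for child in node:
            out.append(child)
            linearise(child, out)
        return out

    print linearise(['a', ['b', 'c']])  # -> ['a', ['b', 'c'], 'b', 'c']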


def _get_previous_tag(self,current_index, tag_tree):
if current_index == 0:
return None
else:
return tag_tree[current_index - 1]


def _get_next_tag(self,current_index, tag_tree):
if current_index < len(tag_tree) - 1:
return tag_tree[current_index + 1]
else:
return None


def _list_match(self,test_str, list_regex):
for regex in list_regex:
match = re.match(regex, test_str)
if match is not None:
return True
return False

def _remove_undesired_line_breaks_from_tag(self,parent,soup):

if parent is None:
return


tag_tree = self._recurisvely_linearise_tag_tree(parent)
items_to_remove = []


for item in tag_tree:
if item == u'\n':
items_to_remove.append(item)
continue

for item in items_to_remove:
tag_tree.remove(item)


spaced_tags = [r'p', r'h\d', r'blockquote']
tags_to_extract = []
tags_to_replace = []
for (i, tag) in enumerate(tag_tree):
if isinstance(tag, Tag):
if str(tag) == '<br />':
previous_tag = self._get_previous_tag(i, tag_tree)

if isinstance(previous_tag, Tag):
previous_tag_is_spaced = previous_tag is not None\
and self._list_match(str(previous_tag.name),
spaced_tags)
else:
previous_tag_is_spaced = False

next_tag = self._get_next_tag(i, tag_tree)

if isinstance(next_tag, Tag):
next_tag_is_spaced = next_tag is not None\
and self._list_match(str(next_tag.name), spaced_tags)
else:
next_tag_is_spaced = False

if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
or i == len(tag_tree) - 1:
tags_to_extract.append(tag)
else:
tags_to_replace.append((tag,NavigableString(' '),))


for pair in tags_to_replace:
pair[0].replaceWith(pair[1])
for tag in tags_to_extract:
tag.extract()
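
In short: a <br /> that touches a block-level ("spaced") tag, or sits at either end of the tree, is extracted outright; one between inline runs is collapsed to a single space. A toy model over (name, text) tuples (my own simplification, not the recipe's BeautifulSoup code):

    def drop_redundant_breaks(tags, spaced=('p', 'h3', 'blockquote')):
        out = []
        for i, (name, text) in enumerate(tags):
            if name != 'br':
                out.append((name, text))
                continue
            prev_spaced = i > 0 and tags[i - 1][0] in spaced
            next_spaced = i < len(tags) - 1 and tags[i + 1][0] in spaced
            if prev_spaced or next_spaced or i == 0 or i == len(tags) - 1:
                continue                  # drop the break entirely
            out.append((None, ' '))       # otherwise keep it as a space
        return out

    sample = [('p', 'one'), ('br', ''), ('span', 'two'), ('br', ''), ('span', 'three')]
    # the first break touches a <p> and is dropped; the second becomes a space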

feeds = [
(u'News - UK',
u'http://www.independent.co.uk/news/uk/?service=rss'),
(u'News - World',
u'http://www.independent.co.uk/news/world/?service=rss'),
(u'News - Business',
u'http://www.independent.co.uk/news/business/?service=rss'),
(u'News - People',
u'http://www.independent.co.uk/news/people/?service=rss'),
(u'News - Science',
u'http://www.independent.co.uk/news/science/?service=rss'),
(u'News - Media',
u'http://www.independent.co.uk/news/media/?service=rss'),
(u'News - Education',
u'http://www.independent.co.uk/news/education/?service=rss'),
(u'News - Obituaries',
u'http://www.independent.co.uk/news/obituaries/?service=rss'),
(u'News - Corrections',
u'http://www.independent.co.uk/news/corrections/?service=rss'
),
(u'Opinion',
u'http://www.independent.co.uk/opinion/?service=rss'),
(u'Environment',
u'http://www.independent.co.uk/environment/?service=rss'),
(u'Sport - Athletics',
u'http://www.independent.co.uk/sport/general/athletics/?service=rss'
),
(u'Sport - Cricket',
u'http://www.independent.co.uk/sport/cricket/?service=rss'),
(u'Sport - Football',
u'http://www.independent.co.uk/sport/football/?service=rss'),
(u'Sport - Golf',
u'http://www.independent.co.uk/sport/golf/?service=rss'),
(u'Sport - Motor racing',
u'http://www.independent.co.uk/sport/motor-racing/?service=rss'
),
(u'Sport - Olympics',
u'http://www.independent.co.uk/sport/olympics/?service=rss'),
(u'Sport - Racing',
u'http://www.independent.co.uk/sport/racing/?service=rss'),
(u'Sport - Rugby League',
u'http://www.independent.co.uk/sport/general/rugby-league/?service=rss'),
(u'Sport - Rugby Union',
u'http://www.independent.co.uk/sport/rugby/rugby-union/?service=rss'
),
(u'Sport - Sailing',
u'http://www.independent.co.uk/sport/general/sailing/?service=rss'
),
(u'Sport - Tennis',
u'http://www.independent.co.uk/sport/tennis/?service=rss'),
(u'Sport - Others',
u'http://www.independent.co.uk/sport/general/others/?service=rss'
),
(u'Life & Style - Fashion',
u'http://www.independent.co.uk/life-style/fashion/?service=rss'
),
(u'Life & Style - Food & Drink',
u'http://www.independent.co.uk/life-style/food-and-drink/?service=rss'
),
(u'Life & Style - Health and Families',
u'http://www.independent.co.uk/life-style/health-and-families/?service=rss'
),
(u'Life & Style - House & Home',
u'http://www.independent.co.uk/life-style/house-and-home/'),
(u'Life & Style - History',
u'http://www.independent.co.uk/life-style/history/?service=rss'
),
(u'Life & Style - Gadgets & Tech',
u'http://www.independent.co.uk/life-style/gadgets-and-tech/?service=rss'
),
(u'Life & Style - Motoring',
u'http://www.independent.co.uk/life-style/motoring/?service=rss'
),
(u'Arts & Ents - Art',
u'http://www.independent.co.uk/arts-entertainment/art/?service=rss'
),
(u'Arts & Ents - Architecture',
u'http://www.independent.co.uk/arts-entertainment/architecture/?service=rss'
),
(u'Arts & Ents - Music',
u'http://www.independent.co.uk/arts-entertainment/music/?service=rss'
),
(u'Arts & Ents - Classical',
u'http://www.independent.co.uk/arts-entertainment/classical/?service=rss'
),
(u'Arts & Ents - Films',
u'http://www.independent.co.uk/arts-entertainment/films/?service=rss'
),
(u'Arts & Ents - TV',
u'http://www.independent.co.uk/arts-entertainment/tv/?service=rss'
),
(u'Arts & Ents - Theatre and Dance',
u'http://www.independent.co.uk/arts-entertainment/theatre-dance/?service=rss'
),
(u'Arts & Ents - Comedy',
u'http://www.independent.co.uk/arts-entertainment/comedy/?service=rss'
),
(u'Arts & Ents - Books',
u'http://www.independent.co.uk/arts-entertainment/books/?service=rss'
),
(u'Travel', u'http://www.independent.co.uk/travel/?service=rss'
),
(u'Money', u'http://www.independent.co.uk/money/?service=rss'),
(u'IndyBest',
u'http://www.independent.co.uk/extras/indybest/?service=rss'),
]


17
recipes/infra_pl.recipe
Normal file
@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class INFRA(BasicNewsRecipe):
title = u'INFRA'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
language = 'pl'
max_articles_per_feed = 100
no_stylesheets = True
remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
18
recipes/japan_news.recipe
Normal file
@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe

class NewsOnJapan(BasicNewsRecipe):
title = u'News On Japan'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False

no_stylesheets = True
auto_cleanup = True


feeds = [
('News',
'http://newsonjapan.com/rss/top.xml'),
]
14
recipes/kosmonauta_pl.recipe
Normal file
@ -0,0 +1,14 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Kosmonauta(BasicNewsRecipe):
title = u'Kosmonauta.net'
__author__ = 'fenuks'
description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
category = 'astronomy'
language = 'pl'
cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html')]
@ -11,7 +11,7 @@ __description__ = 'Italian weekly magazine'
from calibre.web.feeds.news import BasicNewsRecipe

class Espresso(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini, Gabriele Marini'
__author__ = 'Lorenzo Vigentini, Gabriele Marini, Krittika Goyal'
description = 'Italian weekly magazine'

cover_url = 'http://espresso.repubblica.it/images/logo_espresso.gif'
@ -26,10 +26,9 @@ class Espresso(BasicNewsRecipe):
oldest_article = 16
max_articles_per_feed = 100
use_embedded_content = False
recursion = 10

remove_javascript = True
no_stylesheets = True
auto_cleanup = True


feeds = [
@ -42,36 +41,3 @@ class Espresso(BasicNewsRecipe):
(u'Chiesa: HomePage', u'http://data.kataweb.it/rss/chiesa/homepage/it'),
(u'Chiesa: Speciali e Focus', u'http://data.kataweb.it/rss/chiesa/speciali_e_focus/it')
]


def print_version(self,url):
print url[7:25]
if url[7:25] == 'temi.repubblica.it':
return url + '/?printpage=undefined'
elif url[7:25] == 'www.chiesa.espress':
return url
return url + '/&print=true'
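
The url[7:25] slice silently assumes an 'http://' prefix and a host of exactly the right length; an equivalent check on the parsed hostname (a sketch of the same logic, not the committed code) is more self-explanatory:

    from urlparse import urlparse  # Python 2, as used by these recipes

    def print_version(url):
        host = urlparse(url).netloc
        if host == 'temi.repubblica.it':
            return url + '/?printpage=undefined'
        if host.startswith('www.chiesa.espress'):
            return url
        return url + '/&print=true'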


keep_only_tags = [
dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
dict(name='div', attrs={'id':['content-second-right','content2']})
]

remove_tags = [
dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
dict(name='ul',attrs={'id':'user-utility'}),
dict(name=['script','noscript','iframe'])
]
# extra_css = '''
# h1 {font-family:Times New Roman,"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:24px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:18px;}
# h2 {font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:18px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:16px; }
# h3 {color:#333333;font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
# h4 {color:#333333; font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif;font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
# h5 {color:#333333; font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
# .firma {color:#333333;font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif;font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:italic; font-variant:normal; font-weight:bold; line-height:15px; text-decoration:none;}
# .testo {font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:10px;}
# '''


@ -1,13 +1,12 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

'''
http://www.repubblica.it/
'''

import re
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

@ -33,12 +32,6 @@ class LaRepubblica(BasicNewsRecipe):

remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']

preprocess_regexps = [
(re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
(re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
]

def get_article_url(self, article):
link = BasicNewsRecipe.get_article_url(self, article)
if link and not '.repubblica.it/' in link:
@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
remove_tags = [
dict(name=['object','link','meta','iframe','embed']),
dict(name='span',attrs={'class':'linkindice'}),
dict(name='div', attrs={'class':'bottom-mobile'}),
dict(name='div', attrs={'id':['rssdiv','blocco']}),
dict(name='div', attrs={'class':'utility'}),
dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
dict(name='div', attrs={'class':'generalbox'}),
dict(name='ul', attrs={'id':'hystory'})
]

feeds = [
(u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
(u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
(u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
(u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
(u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
@ -110,3 +103,5 @@ class LaRepubblica(BasicNewsRecipe):
del item['style']
return soup

def preprocess_raw_html(self, raw, url):
return '<html><head>'+raw[raw.find('</head>'):]

94
recipes/letsgetcritical.recipe
Normal file
@ -0,0 +1,94 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class LetsGetCritical(BasicNewsRecipe):
title = u"Let's Get Critical"
description = 'Curation / aggregation of criticisms of the arts and culture '
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://www.letsgetcritical.org'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('Architecture','architecture',30),
('Art','art',30),
('Books','books',30),
('Design','design',30),
('Digital','digital',30),
('Food','food',30),
('Movies','movies',30),
('Music','music',30),
('Television','television',30),
('Other articles','',10)
]

def parse_index(self):
self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)

for category in self.CATEGORIES:

(cat_name, tag, max_articles) = category

tagurl = '' if tag=='' else '/category/'+tag.lower()
self.log('Reading category:', cat_name)

articles = []
pageno = 1

while len(articles) < max_articles and pageno < 100:

page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1

self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break

posts = soup.findAll('div',attrs={'class':'post_multi'})
if len(posts) == 0:
break

for post in posts:
dt = post.find('div',attrs={'class':'title'})
atag = dt.find('a')
url = atag['href']
# skip promotionals and duplicates
if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(atag)
self.log('\tFound article:', title)
self.log('\t', url)
desc = post.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = post.previousSibling
# navigate up sibling to find date
while p:
if hasattr(p,'class') and p['class'] == 'singledate':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break

if articles:
feeds.append((cat_name, articles))

return feeds
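
For reference, each entry appended above follows the dictionary shape calibre expects back from parse_index (the concrete values here are invented):

    article = {
        'title': 'Sample review',
        'url': 'http://example.com/sample-review',
        'description': '[example.com] one-line teaser from the blockquote',
        'date': 'January 1, 2012',
    }
    feeds_shape = [('Movies', [article])]  # list of (category, articles) pairs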

@ -1,95 +1,117 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
try:
from calibre_plugins.drMerry.debug import debuglogger as mlog
print 'drMerry debuglogger found, debug options can be used'
from calibre_plugins.drMerry.stats import statslogger as mstat
print 'drMerry stats tracker found, stats can be tracked'
mlog.setLoglevel(1) #-1 == no log; 0 for normal output
mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0)
KEEPSTATS = mstat.keepmystats()
SHOWDEBUG0 = mlog.showdebuglevel(0)
SHOWDEBUG1 = mlog.showdebuglevel(1)
SHOWDEBUG2 = mlog.showdebuglevel(2)
except:
#print 'drMerry debuglogger not found, skipping debug options'
SHOWDEBUG0 = False
SHOWDEBUG1 = False
SHOWDEBUG2 = False
KEEPSTATS = False

#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))

''' Version 1.2, updated cover image to match the changed website.
added info date on title
version 1.4 Updated tags, delay and added autoclean 22-09-2011
version 1.5 Changes due to changes in site
version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
Added som processing on pictures
Added some processing on pictures
Removed links in html
Removed extra white characters
changed handling of self closing span
Version 1.7 11-11-2011 Changed oldest_article back to 1.5
changed &egrave; into è
updated remove tags
removed keep_only tags
Version 1.8 26-11-2011
added remove tag: article-slideshow
'''

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 2
max_articles_per_feed = 100
oldest_article = 10
max_articles_per_feed = 15
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 5
#delay = 1
#auto_cleanup = True
#auto_cleanup_keep = '//div[@class="article-image-caption-2column"]/*|//div[@id="date"]/*|//div[@class="article-image-caption-3column"]/*'
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 2
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper'
remove_tags_before = dict(name='div', attrs={'id':'date'})
remove_tags_after = dict(name='div', attrs={'class':'article-body'})
encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height']
use_embedded_content = False
conversion_options = {
'authors' : 'Metro Nederland & calibre & DrMerry',
'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland'
}
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
#date {clear: both;margin-left: 19px;font-size: 11px;font-weight: 300;color: #616262;height: 15px;}\
.article-box-fact.module-title {clear:both;border-top:1px solid black;border-bottom:4px solid black;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;line-height: 1.15;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{float: left;display: inline;width: 567px;margin-left: 19px;border-right: 1px solid #CACACA;padding-right: 9px;}\
div.column-1-2 {float: left;display: inline;width: 373px;padding-right: 7px;border-right: 1px solid #CACACA;}\
p.article-image-caption {font-size: 12px;font-weight: 300;line-height: 1.4;color: #616262;margin-top: 5px;} \
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
.article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
div.column-1-2 {display: inline;padding-right: 7px;}\
p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
img {border:0px;} .img-mask {position:absolute;top:0px;left:0px;}'
img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'

keep_only_tags = [dict(name='div', attrs={'class':[ 'article-image-caption-2column', 'article-image-caption-3column', 'article-body', 'article-box-fact']}),
dict(name='div', attrs={'id':['date']}),
dict(name='h1', attrs={'class':['title']}),
dict(name='h2', attrs={'class':['subtitle']})]

remove_tags = [dict(name='div', attrs={'class':[ 'metroCommentFormWrap',
'commentForm', 'metroCommentInnerWrap', 'article-slideshow-counter-container', 'article-slideshow-control', 'ad', 'header-links',
'art-rgt','pluck-app pluck-comm', 'share-and-byline', 'article-tools-below-title', 'col-179 ', 'related-links', 'clear padding-top-15', 'share-tools', 'article-page-auto-pushes', 'footer-edit']}),
dict(name='div', attrs={'id':['article-2', 'article-4', 'article-1', 'navigation', 'footer', 'header', 'comments', 'sidebar', 'share-and-byline']}),
dict(name='iframe')]

preprocess_regexps = [(re.compile(r'(<p>(&nbsp;|\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>|<!--.*?-->)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'([\s>])([^\s>]+)(<span[^>]+) />', re.DOTALL|re.IGNORECASE),
lambda match: match.group(1) + match.group(3) + '>' + match.group(2) + '</span>'),
preprocess_regexps = [
(re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: '<hr class="merryhr" />'),
(re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
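
The first new rule swaps the site's 'top-line' spacer image for the green <hr class="merryhr"> styled in extra_css above. A one-line check (the sample markup is invented):

    import re

    sample = '<img src="http://www.metronieuws.nl/img/top-line.gif" />'
    print re.sub(r'<img[^>]+top-line[^>]+>', '<hr class="merryhr" />',
                 sample, flags=re.DOTALL | re.IGNORECASE)
    # -> <hr class="merryhr" />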

def preprocess_html(self, soup):
if SHOWDEBUG0 == True:
mlog.setdefaults()
mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
if KEEPSTATS == True:
mlog.addDebug('Stats will be calculated')
else:
mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
mlog.showDebug()
myProcess = MerryProcess()
myProcess.removeUnwantedTags(soup)
return soup

def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
#width, height = img.size
#print '***img is: ', iurl, '\n****width is: ', width, 'height is: ', height
img.trim(0)
img.save(iurl)
'''
#width, height = img.size
#print '***TRIMMED img width is: ', width, 'height is: ', height
left=0
top=0
border_color='#ffffff'
width, height = img.size
#print '***retrieved img width is: ', width, 'height is: ', height
height_correction = 1.17
canvas = create_canvas(width, height*height_correction,border_color)
canvas.compose(img, left, top)
#img = canvas
canvas.save(iurl)
#width, height = canvas.size
#print '***NEW img width is: ', width, 'height is: ', height
'''
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
if SHOWDEBUG0 == True:
if KEEPSTATS == True:
statinfo = 'generated stats:'
statinfo += str(mstat.stats(mstat.statslist))
print statinfo
statinfo = 'generated stats (for removed tags):'
statinfo += str(mstat.stats(mstat.removedtagslist))
print statinfo
#show all Debug info we forgot to report
#Using print to be sure that this text will not be added at the end of the log.
print '\n!!!!!unreported messages:\n(should be empty)\n'
mlog.showDebug()
return soup

feeds = [
@ -105,6 +127,291 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carri&egrave;re', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]

class MerryPreProcess():
def replacePictures(self, soup):
#to be implemented
return soup

def optimizePicture(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
if SHOWDEBUG0 == True:
mlog.addDebug('Images optimized')
mlog.showDebug()
return soup

class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
if SHOWDEBUG1 == True:
mlog.addDebug('tag extracted')
mlog.showDebug()
if KEEPSTATS == True:
try:
mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
except:
mstat.addstat(mstat.removedtagslist,'unknown')
except:
if SHOWDEBUG1 == True:
mlog.addDebug('tag extraction failed')
mlog.showDebug()
if KEEPSTATS == True:
mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
return killingSoup

class MerryReplace():
myKiller = MerryExtract()
def replaceATag(self, soup):
anchors = []
anchors = soup.findAll('a')
if anchors and not (anchors == None or anchors == []):
try:
for link in anchors:
# print str(link)
if link and not link == None:
# print ('type: %s'%(str(type(link))))
# print ('link: %s' % (link))
myParent = link.parent
# print str('parent: %s'%(myParent))
try:
myIndex = link.parent.index(link)
hasIndex = True
except:
myIndex = 0
hasIndex = False
# print str('index %s'%(myIndex))
if not link.string == None:
# print 'link=notnone'
if hasIndex == True:
myParent.insert(myIndex, link.string)
else:
myParent.append(link.string)
else:
# print 'link=none'
myParent.insert(myIndex, link.contents)
self.myKiller.safeRemovePart(link, False)
else:
notshown = 'tag received is empty' # print
except:
notshown = 'tag received is empty' # print
notshown
return soup

class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
myReplacer = MerryReplace()
myPrepare = MerryPreProcess()

def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
if SHOWDEBUG0 == True:
mlog.addDebug('End of Optimize Layout')
mlog.showDebug()
return soup

def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfacts'],[allfacts])
mlog.showDebug()
if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
mlog.showDebug()
for part in allfactsparent:
if not part in allfacts:
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['FOUND A non-fact'],[part])
mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['New All Facts'],[allfacts])
mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['curcontag'],[contenttag])
mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
if SHOWDEBUG0 == True:
if errorOccured == False:
mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
else:
mlog.addDebug('Could not find right parent tag. Error Occured')
mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['added parent'],[soup.prettify()])
mlog.showDebug()
except:
errorOccured=True
mlog.addTrace()
else:
errorOccured=True
if SHOWDEBUG0 == True and errorOccured == True:
mlog.addTextAndTag(['no articlefacts'],[articlefacts])
mlog.showDebug()
return soup

def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
findsibsof = soup
firstpart = previous
if findsibsof and not findsibsof == None:
if soupIsArray == True:
for foundsib in findsibsof:
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
else:
if firstpart == True and soupIsArray == False:
sibs = findsibsof.previousSiblingGenerator()
else:
sibs = findsibsof.nextSiblingGenerator()
for sib in sibs:
self.myKiller.safeRemovePart(sib, True)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('Not any sib found')
return

def removeUnwantedTags(self,soup):
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
mlog.showDebug()
self.removeTagsByName(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
mlog.showDebug()
self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedParts(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.removeEmptyTags(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.myReplacer.replaceATag(soup)
return soup

def removeUnwantedParts(self, soup):
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByID(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByClass(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByStyle(soup)
return soup

def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
if SHOWDEBUG0 == True:
mlog.addDebug('end remove by style')
return soup

def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)

def removeUnwantedTagsByClass(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start remove by class')
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
return soup

def removeUnwantedTagsByID(self,soup):
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
for removeid in defaultids:
if SHOWDEBUG1 == True:
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid))
return soup

# def safeRemoveTag(self, subtree):
# return self.myKiller.safeRemovePart(subtree, True)


def removeTagsByName(self, soup):
self.myKiller.safeRemovePart(soup.script, True)
self.myKiller.safeRemovePart(soup.iframe, True)
self.myKiller.safeRemovePart(soup.style, True)
self.myKiller.safeRemovePart(soup.noscript, True)
return soup

def removeEmptyTags(self,soup,run=0):
if SHOWDEBUG0 == True:
mlog.addDebug('starting removeEmptyTags')
if SHOWDEBUG1 == True:
run += 1
mlog.addDebug(run)
if SHOWDEBUG2 == True:
mlog.addDebug(str(soup.prettify()))
mlog.showDebug()
emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or emptymatches.match(tag.string.strip()) is not None) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
if SHOWDEBUG1 == True:
mlog.addDebug('tags found')
mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('no empty tags found')
mlog.showDebug()
if SHOWDEBUG0 == True:
if SHOWDEBUG2 == True:
mlog.addDebug('new soup:')
mlog.addDebug(str(soup.prettify()))
mlog.addDebug('RemoveEmptyTags Completed')
mlog.showDebug()
return soup
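
The recursion matters because stripping an empty tag can leave its parent empty in turn; a flat regex version of the same idea (an illustration only, not the recipe's BeautifulSoup logic) needs a second pass for exactly that reason:

    import re

    html = '<div><span></span></div>'
    empty = re.compile(r'<(\w+)(\s[^>]*)?>\s*</\1>')
    while empty.search(html):
        html = empty.sub('', html)  # pass 1 removes <span>, pass 2 the now-empty <div>
    print repr(html)  # -> ''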

def removeFirstAndLastPart(self,soup):
def findparenttag(lookuptag):
if lookuptag and not lookuptag == None:
return lookuptag.findParents()
findtag = soup.find(id="date")
self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
return soup

@ -5,8 +5,8 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
description = 'News as provided by The Metro - UK'

__author__ = 'Dave Asbury'
#last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'

no_stylesheets = True
oldest_article = 1
max_articles_per_feed = 20
@ -32,9 +32,11 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
dict(name='div', attrs={'class':'art-lft'}),
dict(name='p')
]
remove_tags = [dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r' ]}),
dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
remove_tags = [
dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
]
feeds = [

@ -10,6 +10,10 @@ __MakePeriodical__ = True
__UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
@ -24,6 +28,10 @@ __Date__ = ''

'''
Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
@ -52,6 +60,7 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''

from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
@ -59,10 +68,14 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
@ -108,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
@ -126,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
@ -160,9 +179,9 @@ class MPRecipe(BasicNewsRecipe):
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
|
||||
@ -185,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[0:4]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[4:6]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[6:8]
|
||||
@ -533,12 +564,22 @@ class MPRecipe(BasicNewsRecipe):
|
||||
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
if item.startswith(u'\u3010'):
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
@ -643,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup

def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return

# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
# change 1: allow our own flag to tell if a periodical is to be generated
# also use custom date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))

#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))

language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')

opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
@ -710,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'

entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}


def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
@ -728,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -751,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -787,3 +907,5 @@ class MPRecipe(BasicNewsRecipe):

with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


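[Editor's note, not part of the patch: the hunks above keep adjusting one mechanism. The recipe picks which day's edition to fetch by shifting UTC to the region's local time and then stepping back past the hour at which a complete edition is available (moved from 5.30am to 4.30am HKT for Hong Kong), with an optional __Date__ string overriding the result. A minimal standalone sketch of that arithmetic, assuming only the offsets shown in the hunks; the helper name and the rules dict are illustrative, not names from the recipes.]

import datetime

# Region rules taken from the hunks above: (tz offset in hours, cutoff in hours).
# Hong Kong is UTC+8 with a 4:30am cutoff; Vancouver is UTC-8 (PST) with 5:30am.
EDITION_RULES = {'Hong Kong': (8.0, 4.5), 'Vancouver': (-8.0, 5.5)}

def edition_date(region, override=''):
    # An explicit YYYYMMDD string (the recipes' __Date__) wins outright.
    if override != '':
        return override
    tz_hours, cutoff_hours = EDITION_RULES[region]
    dt_utc = datetime.datetime.utcnow()
    # Shift to local time, then subtract the cutoff: before the cutoff the
    # shifted datetime still falls on the previous day, so yesterday's
    # (complete) edition is the one fetched.
    dt_local = dt_utc + datetime.timedelta(tz_hours / 24) - datetime.timedelta(cutoff_hours / 24)
    return dt_local.strftime('%Y%m%d')

print(edition_date('Hong Kong'))              # today's or yesterday's date
print(edition_date('Vancouver', '20111201'))  # override wins: '20111201'
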
@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Toronto'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set it to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you want hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''


'''
Change Log:
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does not start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
remove_tags = [dict(name='style'),
@ -90,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
@ -108,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True}
timefmt = ''

def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurrence of a digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url

def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,12 +193,33 @@ class MPRecipe(BasicNewsRecipe):
return dt_local

def get_fetchdate(self):
if __Date__ <> '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")

def get_fetchformatteddate(self):
if __Date__ <> '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")

def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")

def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")

def get_fetchday(self):
if __Date__ <> '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")

def get_cover_url(self):
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))

# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):

# special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
#if fin_articles:
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))

# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
#if ent_articles:
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))


# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url with the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url)
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):

# parse from life.mingpao.com
def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
@ -350,9 +409,31 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles

# parse from text file of life.mingpao.com
def parse_section2_txt(self, url, keystr):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles

@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles

# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
else:
# .txt based file
splitter = re.compile(r'\n') # split on newlines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
if item <> '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html

def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@ -447,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup

def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return

# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
# change 1: allow our own flag to tell if a periodical is to be generated
# also use custom date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))

#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))

language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')

opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
@ -514,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'

entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}


def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


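[Editor's note, not part of the patch: both file diffs above add the same populate_article_metadata() override. Its summary logic has two behaviors: extract the first non-empty paragraph as the Kindle article-view summary, stripping the boilerplate lead-in 【明報專訊】 once, or display a character count rendered with fullwidth punctuation as （N字）. A condensed sketch of the two behaviors, assuming plain strings instead of BeautifulSoup tags; the helper name is illustrative, and the recipe actually chooses between the branches on its __IncludeSummary__ flag and whether a summary already exists, whereas here the count is simply a fallback when no paragraph has text.]

# -*- coding: utf-8 -*-

LEAD_IN = u'\u3010\u660e\u5831\u5c08\u8a0a\u3011'  # boilerplate prefix the recipe strips

def summarize(paragraphs):
    # First behavior: the first non-empty paragraph becomes the summary.
    for p in paragraphs:
        text = p.strip().replace(LEAD_IN, '', 1)
        if len(text) > 0:
            return text
    # Second behavior: report a character count, rendered as
    # u'\uff08' + count + u'\u5b57\uff09', i.e. fullwidth parentheses around N + '字'.
    counts = sum(len(p.strip()) for p in paragraphs)
    return u'\uff08' + str(counts) + u'\u5b57\uff09'

print(summarize([u'', LEAD_IN + u'\u65b0\u805e\u5167\u5bb9']))  # -> 新聞內容
print(summarize([u'', u'  ']))                                  # -> （0字）
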
@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set it to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you want hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''


'''
Change Log:
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does not start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
remove_tags = [dict(name='style'),
@ -90,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
@ -108,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True}
timefmt = ''

def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurrence of a digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url

def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,12 +193,33 @@ class MPRecipe(BasicNewsRecipe):
return dt_local

def get_fetchdate(self):
if __Date__ <> '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")

def get_fetchformatteddate(self):
if __Date__ <> '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")

def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")

def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")

def get_fetchday(self):
if __Date__ <> '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")

def get_cover_url(self):
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))

# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):

# special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
#if fin_articles:
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))

# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
#if ent_articles:
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
# replace the url to the print-friendly version
|
||||
if __ParsePFF__ == True:
|
||||
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
|
||||
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
||||
url = re.sub('%2F.*%2F', '/', url)
|
||||
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
||||
url = url.replace('%2Etxt', '_print.htm')
|
||||
url = url.replace('%5F', '_')
|
||||
else:
|
||||
url = url.replace('.htm', '_print.htm')
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2(self, url, keystr):
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
@ -350,9 +409,31 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
try:
|
||||
br.open_novisit(url)
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
except:
|
||||
print 'skipping a premium article'
|
||||
current_articles.reverse()
|
||||
return current_articles
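
The premium filter in parse_section2 above hinges on one trick: with HTTP redirects disabled, opening a paywalled Ming Pao URL (which answers with a redirect) raises an exception, so only freely readable articles survive the try block. A minimal standalone sketch of that probe, assuming calibre's bundled mechanize (which provides open_novisit); the URL list is hypothetical:

import mechanize

def free_articles_only(urls):
    # With redirects disabled, the 3xx a premium page returns
    # becomes an exception, so only free URLs are kept.
    br = mechanize.Browser()
    br.set_handle_redirect(False)
    free = []
    for url in urls:
        try:
            br.open_novisit(url)
            free.append(url)
        except Exception:
            print 'skipping a premium article'
    return free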

# parse from text file of life.mingpao.com
def parse_section2_txt(self, url, keystr):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles

@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles

# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
else:
# .txt based file
splitter = re.compile(r'\n') # split on line breaks
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
if item <> '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html
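
The hi-res image handling above applies the same fallback three times: for each <img src="...jpg"> found by regex, first try the .gif sibling on the server, and if that fails request the high-resolution .jpg, which on these servers is the same file name with an extra leading underscore. A condensed sketch of that fallback; fetch_ok is a hypothetical stand-in for the br.open_novisit probe used above:

import re

def use_hires_images(html, fetch_ok):
    for img in re.findall(r'src="?.*?jpg"', html):
        gifimg = img.replace('jpg"', 'gif"')
        if fetch_ok(gifimg[5:len(gifimg)-1]):  # strip src=" and trailing quote
            # a .gif sibling exists; prefer it
            html = html.replace(img, gifimg)
        else:
            # otherwise ask for the hi-res jpg: same name prefixed with _
            pos = img.find('"')
            html = html.replace(img, img[0:pos+1] + '_' + img[pos+1:])
    return html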

def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@ -447,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup

def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
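
When no text summary is produced, the fallback branch above publishes a character count instead, rendered as a full-width Chinese parenthetical such as (123\u5b57). A trimmed sketch of just that counting step, under the assumption that soup is the parsed article page and tag_to_string is the usual BasicNewsRecipe helper passed in by the caller:

def char_count_summary(soup, tag_to_string):
    counts = 0
    # same candidate containers the recipe probes, shortened to two
    bodies = soup.findAll('div', attrs={'id': 'newscontent'}) or \
             soup.findAll('div', attrs={'class': 'content'})
    for body in bodies:
        # text may or may not be wrapped in <p> tags
        paras = body.findAll('p') or [body]
        for p in paras:
            counts += len(tag_to_string(p).strip())
    return u'\uff08' + str(counts) + u'\u5b57\uff09'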

# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
# change 1: allow our own flag to tell if a periodical is to be generated
# also use customed date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))

#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))

language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')

opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
@ -514,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'

entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}


def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


15
recipes/mlody_technik_pl.recipe
Normal file
@ -0,0 +1,15 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Mlody_technik(BasicNewsRecipe):
title = u'Mlody technik'
__author__ = 'fenuks'
description = u'Młody technik'
category = 'science'
language = 'pl'
cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
#keep_only_tags=[dict(id='container')]
feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')]
@ -1,9 +1,7 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
moneynews.newsmax.com
www.moneynews.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
@ -12,40 +10,40 @@ class MoneyNews(BasicNewsRecipe):
title = 'Moneynews.com'
__author__ = 'Darko Miletic'
description = 'Financial news worldwide'
publisher = 'moneynews.com'
publisher = 'Newsmax.com'
language = 'en'

category = 'news, finances, USA, business'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
encoding = 'utf8'
extra_css = 'img{display: block} body{font-family: Arial, Helvetica, sans-serif}'

html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]

html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}

feeds = [
(u'Street Talk' , u'http://moneynews.newsmax.com/xml/streettalk.xml' )
,(u'Finance News' , u'http://moneynews.newsmax.com/xml/FinanceNews.xml' )
,(u'Economy' , u'http://moneynews.newsmax.com/xml/economy.xml' )
,(u'Companies' , u'http://moneynews.newsmax.com/xml/companies.xml' )
,(u'Markets' , u'http://moneynews.newsmax.com/xml/Markets.xml' )
,(u'Investing & Analysis' , u'http://moneynews.newsmax.com/xml/investing.xml' )
(u'Street Talk' , u'http://www.moneynews.com/rss/StreetTalk/8.xml' )
,(u'Finance News' , u'http://www.moneynews.com/rss/FinanceNews/4.xml' )
,(u'Economy' , u'http://www.moneynews.com/rss/Economy/2.xml' )
,(u'Companies' , u'http://www.moneynews.com/rss/Companies/6.xml' )
,(u'Markets' , u'http://www.moneynews.com/rss/Markets/7.xml' )
,(u'Investing & Analysis' , u'http://www.moneynews.com/rss/InvestingAnalysis/17.xml')
]


keep_only_tags = [dict(name='table', attrs={'class':'copy'})]
keep_only_tags = [dict(name='div', attrs={'class':'copy'})]

remove_tags = [
dict(name='td' , attrs={'id':'article_fontsize'})
,dict(name='table', attrs={'id':'toolbox' })
,dict(name='tr' , attrs={'id':'noprint3' })
dict(attrs={'class':['MsoNormal', 'MsoNoSpacing']}),
dict(name=['object','link','embed','form','meta'])
]

def print_version(self, url):
nodeid = url.rpartition('/')[2]
return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid
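
The new print_version above just splices the trailing node id of an article URL into Newsmax's print template. A quick check with a made-up URL (the id 421530 is hypothetical):

def print_version(url):
    nodeid = url.rpartition('/')[2]  # text after the last '/'
    return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid

# print_version('http://www.moneynews.com/StreetTalk/example-story/id/421530')
# -> 'http://www.moneynews.com/PrintTemplate?nodeid=421530'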

@ -7,6 +7,7 @@ class naczytniki(BasicNewsRecipe):
language = 'pl'
description ='everything about e-readers'
category='readers'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
remove_tags_after= dict(name='div', attrs={'class':'sociable'})

@ -6,11 +6,7 @@ www.nin.co.rs
'''

import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre import entity_to_unicode

class Nin(BasicNewsRecipe):
title = 'NIN online'
@ -80,59 +76,11 @@ class Nin(BasicNewsRecipe):
return self.PREFIX + item.img['src']
return cover_url

def parse_index(self):
articles = []
count = 0
soup = self.index_to_soup(self.INDEX)
for item in soup.findAll('a',attrs={'class':'lmeninavFont'}):
count = count +1
if self.test and count > 2:
return articles
section = self.tag_to_string(item)
feedlink = self.PREFIX + item['href']
feedpage = self.index_to_soup(feedlink)
self.report_progress(0, _('Fetching feed')+' %s...'%(section))
inarts = []
for art in feedpage.findAll('span',attrs={'class':'artTitle'}):
alink = art.parent
url = self.PREFIX + alink['href']
title = self.tag_to_string(art)
sparent = alink.parent
alink.extract()
description = self.tag_to_string(sparent)
date = strftime(self.timefmt)
inarts.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
articles.append((section,inarts))
return articles
feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]

def index_to_soup(self, url_or_raw, raw=False):
if re.match(r'\w+://', url_or_raw):
open_func = getattr(self.browser, 'open_novisit', self.browser.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
if callable(self.encoding):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
massage.append((re.compile(r'&(\S+?);'), lambda match:
entity_to_unicode(match, encoding=enc)))
massage.append((re.compile(r'[\x00-\x08]+'), lambda match:
''))
return BeautifulSoup(_raw, markupMassage=massage)
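
Context for the override above: BeautifulSoup 3 accepts markupMassage, a list of (regex, substitution) pairs applied to the raw markup before parsing. The override extends calibre's default massage so stray entities are resolved through entity_to_unicode and control bytes are stripped. The same idea in isolation, using only the calls that appear in the code above:

import re
from calibre import entity_to_unicode
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def soup_with_entity_fixes(raw, enc='cp1252'):
    massage = list(BeautifulSoup.MARKUP_MASSAGE)
    # resolve &entity; sequences using the page encoding
    massage.append((re.compile(r'&(\S+?);'),
        lambda match: entity_to_unicode(match, encoding=enc)))
    # drop control bytes that would choke the parser
    massage.append((re.compile(r'[\x00-\x08]+'), lambda match: ''))
    return BeautifulSoup(raw, markupMassage=massage)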
def get_article_url(self, article):
url = BasicNewsRecipe.get_article_url(self, article)
return url.replace('.co.yu', '.co.rs')

def preprocess_html(self, soup):
for item in soup.findAll(style=True):

54
recipes/nol.recipe
Normal file
@ -0,0 +1,54 @@
################################################################################
#Description: http://nol.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.18. - V1.1
################################################################################

from calibre.web.feeds.recipes import BasicNewsRecipe

class NOL(BasicNewsRecipe):
title = u'NOL'
__author__ = 'Bigpapa'
oldest_article = 5
max_articles_per_feed = 5 # maximum number of articles per feed to keep in the generated e-book
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf8'
language = 'hu'
publication_type = 'newsportal'

conversion_options ={
'linearize_tables' : True,
}

keep_only_tags = [
dict(name='table', attrs={'class':['article-box']})
]

remove_tags = [

dict(name='div', attrs={'class':['h','ad-container-outer','tags noborder','ad-container-inner','image-container-lead','tags','related-container']}),
dict(name='h4'),
dict(name='tfoot'),
dict(name='td', attrs={'class':['foot']}),
dict(name='span', attrs={'class':['image-container-caption']}),
]


feeds = [
# (u'V\xe1logat\xe1s', 'http://nol.hu/feed/valogatas.rss'),
(u'Belf\xf6ld', 'http://nol.hu/feed/belfold.rss'),
(u'K\xfclf\xf6ld', 'http://nol.hu/feed/kulfold.rss'),
(u'Gazdas\xe1g', 'http://nol.hu/feed/gazdasag.rss'),
(u'V\xe9lem\xe9ny', 'http://nol.hu/feed/velemeny.rss'),
(u'Kult\xfara', 'http://nol.hu/feed/kult.rss'),
(u'Tud/Tech', 'http://nol.hu/feed/tud-tech.rss'),
(u'Sport', 'http://nol.hu/feed/sport.rss'),
(u'Noller', 'http://nol.hu/feed/noller.rss'),
(u'Mozaik', 'http://nol.hu/feed/mozaik.rss'),
(u'Utaz\xe1s', 'http://nol.hu/feed/utazas.rss'),
(u'Aut\xf3', 'http://nol.hu/feed/auto.rss'),
(u'Voks', 'http://nol.hu/feed/voks.rss'),

]
@ -1,20 +1,21 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

class Nowa_Fantastyka(BasicNewsRecipe):
title = u'Nowa Fantastyka'
oldest_article = 7
__author__ = 'fenuks'
language = 'pl'
encoding='latin2'
description ='site for fantasy readers'
category='fantasy'
max_articles_per_feed = 100
INDEX='http://www.fantastyka.pl/'
no_stylesheets=True
needs_subscription = 'optional'
remove_tags_before=dict(attrs={'class':'belka1-tlo-md'})
#remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'})
remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'})
remove_tags=[dict(attrs={'class':'avatar2'})]
feeds = []
remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})]

def find_articles(self, url):
articles = []
@ -45,3 +46,13 @@ class Nowa_Fantastyka(BasicNewsRecipe):
cover=soup.find(name='img', attrs={'class':'okladka'})
self.cover_url=self.INDEX+ cover['src']
return getattr(self, 'cover_url', self.cover_url)

def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.fantastyka.pl/')
br.select_form(nr=0)
br['login'] = self.username
br['pass'] = self.password
br.submit()
return br
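
The optional-subscription login added above is the stock mechanize pattern: fetch the front page, select the first form, fill the credential fields and submit. Shown standalone (the 'login' and 'pass' field names are the site's own, taken from the code above; get_browser is called off the class exactly as the recipe does):

from calibre.web.feeds.news import BasicNewsRecipe

def fantastyka_browser(username, password):
    br = BasicNewsRecipe.get_browser()
    br.open('http://www.fantastyka.pl/')
    br.select_form(nr=0)   # the login form is the first on the page
    br['login'] = username
    br['pass'] = password
    br.submit()
    return br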

@ -1,5 +1,5 @@
#!/usr/bin/env python

# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
@ -707,6 +707,16 @@ class NYTimes(BasicNewsRecipe):
return soup

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
if idxdiv is not None:
if idxdiv.img:
self.add_toc_thumbnail(article, idxdiv.img['src'])
else:
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:

@ -855,6 +855,16 @@ class NYTimes(BasicNewsRecipe):

return soup
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
if idxdiv is not None:
if idxdiv.img:
self.add_toc_thumbnail(article, idxdiv.img['src'])
else:
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:

@ -23,7 +23,7 @@ class OSNewsRecipe(BasicNewsRecipe):

oldest_article = 7
max_articles_per_feed = 100

cover_url='http://osnews.pl/wp-content/themes/osnews/img/logo.png'
extra_css = '''
.news-heading {font-size:150%}
.newsinformations li {display:inline;}
@ -44,7 +44,9 @@ class OSNewsRecipe(BasicNewsRecipe):
dict(name = 'div', attrs = {'class' : 'sociable'}),
dict(name = 'div', attrs = {'class' : 'post_prev'}),
dict(name = 'div', attrs = {'class' : 'post_next'}),
dict(name = 'div', attrs = {'class' : 'clr'})
dict(name = 'div', attrs = {'class' : 'clr'}),
dict(name = 'div', attrs = {'class' : 'tw_button'}),
dict(name = 'div', attrs = {'style' : 'width:56px;height:60px;float:left;margin-right:10px'})
]

preprocess_regexps = [(re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span'), lambda match: '</span><span')]

79
recipes/prospectmaguk.recipe
Normal file
@ -0,0 +1,79 @@
#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

__license__ = 'GPL v3'

'''
calibre recipe for prospectmagazine.co.uk (subscription)
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class ProspectMagUK(BasicNewsRecipe):
title = u'Prospect Magazine'
description = 'A general-interest publication offering analysis and commentary about politics, news and business.'
__author__ = 'barty, duluoz'
timefmt = ' [%d %B %Y]'
no_stylesheets = True
publication_type = 'magazine'
masthead_url = 'http://www.prospectmagazine.co.uk/wp-content/themes/prospect/images/titleMain.jpg'
category = 'news, UK'
language = 'en_GB'
max_articles_per_feed = 100
auto_cleanup = True
needs_subscription = True

auto_cleanup_keep = '//div[@class="lead_image"]'
remove_tags = [{'class':['shareinpost','postutils','postinfo']}]

INDEX = 'http://www.prospectmagazine.co.uk/current-issue'

def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.prospectmagazine.co.uk/wp-login.php')
br.select_form(name='loginform')
br['log'] = self.username
br['pwd'] = self.password
br.submit()
return br

def parse_index(self):
soup = self.index_to_soup(self.INDEX)
#div = soup.find('h1',text=re.compile(r'Issue \d+'))
#fname = self.tag_to_string( div) if div is not None else 'Current Issue'
div = soup.find('div', id='cover_image')
if div is not None:
img = div.find('img', src=True)
if img is not None:
src = img['src']
if src.startswith('/'):
src = 'http://www.prospectmagazine.co.uk' + src
self.cover_url = src
feeds = []
# loop through sections
for sect in soup.findAll('div',attrs={'class':'sectionheading'}):
fname = self.tag_to_string( sect).replace('>','').strip()
self.log('Found section', fname)
articles = []

# note: can't just find siblings with class='post' because that will also
# grab all the articles belonging to the sections that follow.
for item in sect.findNextSiblings('div',attrs={'class':True}):
if not 'post' in item['class']: break
a = item.find('a', href=True)
if a is None: continue
url = a['href']
title = self.tag_to_string(a)
p = item.find('p')
desc = self.tag_to_string( p) if p is not None else ''
art = {'title':title, 'description':desc,'date':' ', 'url':url}
p = item.find(attrs={'class':re.compile('author')})
self.log('\tFound article:', title, '::', url)
if p is not None:
art['author'] = self.tag_to_string( p).strip()
articles.append(art)

feeds.append((fname, articles))
return feeds
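The sibling walk is the delicate part of the parse_index above: Prospect's article divs are not nested under their section heading, they follow it, so the loop must stop at the first sibling whose class is not 'post' or it would swallow the next section's articles too. Reduced to its core (helper name is illustrative, not from the recipe):

def posts_for_section(sect):
    posts = []
    for item in sect.findNextSiblings('div', attrs={'class': True}):
        if 'post' not in item['class']:
            break  # reached the next section heading
        posts.append(item)
    return posts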
@ -42,6 +42,9 @@ class Radikal_tr(BasicNewsRecipe):
,(u'Politika' , u'http://www.radikal.com.tr/d/rss/Rss_98.xml' )
,(u'Dis Haberler', u'http://www.radikal.com.tr/d/rss/Rss_100.xml' )
,(u'Ekonomi' , u'http://www.radikal.com.tr/d/rss/Rss_101.xml' )
,(u'Radikal Iki' , u'http://www.radikal.com.tr/d/rss/Rss_42.xml')
,(u'Radikal Hayat' , u'http://www.radikal.com.tr/d/rss/Rss_41.xml' )
,(u'Radikal Kitap' , u'http://www.radikal.com.tr/d/rss/Rss_40.xml' )
]

def print_version(self, url):

@ -29,22 +29,7 @@ class RollingStones(BasicNewsRecipe):
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True

remove_javascript = True
#####################################################################################
# cleanup section #
#####################################################################################
keep_only_tags = [
dict(name='div', attrs={'class':['c65l']}),
dict(name='div', attrs={'id':['col1']}),


]
remove_tags = [
dict(name='div', attrs={'class': ['storyActions upper','storyActions lowerArticleNav']}),
dict(name='div', attrs={'id': ['comments','related']}),
]

auto_cleanup = True

feeds = [
(u'News', u'http://www.rollingstone.com/siteServices/rss/allNews'),
@ -58,25 +43,7 @@ class RollingStones(BasicNewsRecipe):



def get_article_url(self, article):
return article.get('guid', None)


def append_page(self, soup, appendtag, position):
'''
Some of the articles are multipage so the below function
will get the articles that have <next>
'''
pager = soup.find('li',attrs={'class':'next'})
if pager:
nexturl = pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'id':'storyTextContainer'})
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def print_version(self, url):
return url +'?print=true'



21
recipes/rynek_zdrowia.recipe
Normal file
@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class rynekzdrowia(BasicNewsRecipe):
title = u'Rynek Zdrowia'
__author__ = u'spi630'
language = 'pl'
masthead_url = 'http://k.rynekzdrowia.pl/images/headerLogo.png'
cover_url = 'http://k.rynekzdrowia.pl/images/headerLogo.png'
oldest_article = 3
max_articles_per_feed = 25
no_stylesheets = True
auto_cleanup = True
remove_empty_feeds=True

remove_tags_before = dict(name='h3')

feeds = [(u'Finanse i Zarz\u0105dzanie', u'http://www.rynekzdrowia.pl/Kanal/finanse.html'), (u'Inwestycje', u'http://www.rynekzdrowia.pl/Kanal/inwestycje.html'), (u'Aparatura i wyposa\u017cenie', u'http://www.rynekzdrowia.pl/Kanal/aparatura.html'), (u'Informatyka', u'http://www.rynekzdrowia.pl/Kanal/informatyka.html'), (u'Prawo', u'http://www.rynekzdrowia.pl/Kanal/prawo.html'), (u'Polityka zdrowotna', u'http://www.rynekzdrowia.pl/Kanal/polityka_zdrowotna.html'), (u'Ubezpieczenia Zdrowotne', u'http://www.rynekzdrowia.pl/Kanal/ubezpieczenia.html'), (u'Farmacja', u'http://www.rynekzdrowia.pl/Kanal/farmacja.html'), (u'Badania i rozw\xf3j', u'http://www.rynekzdrowia.pl/Kanal/badania.html'), (u'Nauka', u'http://www.rynekzdrowia.pl/Kanal/nauka.html'), (u'Po godzinach', u'http://www.rynekzdrowia.pl/Kanal/godziny.html'), (u'Us\u0142ugi medyczne', u'http://www.rynekzdrowia.pl/Kanal/uslugi.html')]

def print_version(self, url):
url = url.replace('.html', ',drukuj.html')
return url

@ -11,17 +11,16 @@ from calibre.web.feeds.news import BasicNewsRecipe

class Salon_com(BasicNewsRecipe):
title = 'Salon.com'
__author__ = 'cix3'
__author__ = 'Kovid Goyal'
description = 'Salon.com - Breaking news, opinion, politics, entertainment, sports and culture.'
timefmt = ' [%b %d, %Y]'
language = 'en'

oldest_article = 7
max_articles_per_feed = 100

remove_tags = [dict(name='div', attrs={'class':['ad_content', 'clearfix']}), dict(name='hr'), dict(name='img')]

remove_tags_before = dict(name='h2')
auto_cleanup = True
auto_cleanup_keep = '//div[@class="art"]'
remove_empty_feeds = True

feeds = [
('News & Politics', 'http://feeds.salon.com/salon/news'),
@ -40,5 +39,5 @@ class Salon_com(BasicNewsRecipe):
]

def print_version(self, url):
return url.replace('/index.html', '/print.html')
return url + '/print/'


17
recipes/salonica_press_news.recipe
Normal file
@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class spn(BasicNewsRecipe):
title = u'Salonica Press News'
language = 'gr'
__author__ = "SteliosGero"
oldest_article = 3
max_articles_per_feed = 100
auto_cleanup = True
category = 'news, GR'
language = 'el'


feeds = [(u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae', u'http://www.spnews.gr/politiki?format=feed&type=rss'), (u'\u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1', u'http://www.spnews.gr/oikonomia?format=feed&type=rss'), (u'\u0391\u03c5\u03c4\u03bf\u03b4\u03b9\u03bf\u03af\u03ba\u03b7\u03c3\u03b7', u'http://www.spnews.gr/aftodioikisi?format=feed&type=rss'), (u'\u039a\u03bf\u03b9\u03bd\u03c9\u03bd\u03af\u03b1', u'http://www.spnews.gr/koinonia?format=feed&type=rss'), (u'\u0391\u03b8\u03bb\u03b7\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/sports?format=feed&type=rss'), (u'\u0394\u03b9\u03b5\u03b8\u03bd\u03ae', u'http://www.spnews.gr/diethni?format=feed&type=rss'), (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/politismos?format=feed&type=rss'), (u'Media', u'http://www.spnews.gr/media-news?format=feed&type=rss'), (u'\u0396\u03c9\u03ae', u'http://www.spnews.gr/zoi?format=feed&type=rss'), (u'\u03a4\u03b5\u03c7\u03bd\u03bf\u03bb\u03bf\u03b3\u03af\u03b1', u'http://spnews.gr/texnologia?format=feed&type=rss'), (u'\u03a0\u03b5\u03c1\u03b9\u03b2\u03ac\u03bb\u03bb\u03bf\u03bd', u'http://spnews.gr/periballon?format=feed&type=rss'), (u'\u03a0\u03b1\u03c1\u03b1\u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parapolitika?format=feed&type=rss'), (u'\u03a0\u03b1\u03c1\u03b1\u03b4\u03b7\u03bc\u03bf\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/paradimotika?format=feed&type=rss'), (u'\u03a0\u03b1\u03c1\u03b1\u03b1\u03b8\u03bb\u03b7\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parathlitika?format=feed&type=rss'), (u'\u0391\u03c0\u03cc\u03c8\u03b5\u03b9\u03c2', u'http://spnews.gr/apopseis?format=feed&type=rss'), (u'\u03a3\u03c5\u03bd\u03b5\u03cd\u03be\u03b5\u03b9\u03c2', u'http://spnews.gr/synenteykseis?format=feed&type=rss'), (u'Alert!', u'http://spnews.gr/alert?format=feed&type=rss')]

def print_version(self, url):
return url+'?tmpl=component&print=1&layout=default&page='