Merge from trunk

2025-08-30 23:00:21 -04:00 · 2012-05-17 06:26:35 +02:00 · 2012-05-17 06:26:35 +02:00 · 99c57ac10d
commit 99c57ac10d
parent 49d1385ea9 b4684cc1c2
201 changed files with 62463 additions and 36984 deletions
--- a/.bzrignore
+++ b/.bzrignore
@ -16,7 +16,6 @@ resources/ebook-convert-complete.pickle
 resources/builtin_recipes.xml
 resources/builtin_recipes.zip
 resources/template-functions.json
-resources/display/*.js
 setup/installer/windows/calibre/build.log
 src/calibre/translations/.errors
 src/cssutils/.svn/
--- a/Changelog.yaml
+++ b/Changelog.yaml
@ -19,6 +19,67 @@
 #   new recipes:
 #     - title: 

+- version: 0.8.51
+  date: 2012-05-11
+
+  new features:
+    - title: "When switching libraries preserve the position and selected books if you switch back to a previously opened library."
+      tickets: [994514]
+
+    - title: "Conversion pipeline: Filter out the useless font-face rules inserted by Microsoft Word for every font on the system"
+
+    - title: "Driver for Motorola XT875 and Pandigital SuperNova"
+      tickets: [996890]
+
+    - title: "Add a colour swatch to the dialog for creating column coloring rules, to ease selection of colors"
+      tickets: [994811] 
+
+    - title: "EPUB Output: Consolidate internal CSS generated by calibre into external stylesheets for ease of editing the EPUB"
+
+    - title: "List EPUB and MOBI at the top of the dropdown list of formats to convert to, as they are the most common choices"
+      tickets: [994838] 
+
+  bug fixes:
+    - title: "E-book viewer: Improve performance when switching between normal and fullscreen views."
+      tickets: [996102]
+
+    - title: "Edit metadata dialog: When running download metadata do not insert duplicate tags into the list of tags"
+
+    - title: "KF8 Input: Do not error out if the file has a few invalidly encoded bytes."
+      tickets: [997034]
+
+    - title: "Fix download of news in AZW3 format not working"
+      tickets: [996439]
+
+    - title: "Pocketbook driver: Update for new PB 611 firmware."
+      tickets: [903079]
+
+    - title: "ebook-convert: Error out if the user provides extra command line args instead of silently ignoring them"
+      tickets: [994939] 
+
+    - title: "EPUB Output: Do not self close any container tags to prevent artifacts when EPUBs are viewed using buggy browser based viewers."
+      tickets: [994861]
+
+    - title: "Fix regression in 0.8.50 that broke the conversion of HTML files that contained non-ascii font-face declarations, typically produced by Microsoft Word"
+
+  improved recipes:
+    - Mainichi news
+    - derStandard
+    - Endgadget Japan
+
+  new recipes:
+    - title: Mainichi English
+      author: Hiroshi Miura
+
+    - title: The Grid TO
+      author: Yusuf W
+
+    - title: National Geographic (Italy)
+      author: faber1971
+
+    - title: Rebelion 
+      author: Marc Busque
+
 - version: 0.8.50
  date: 2012-05-04

--- a/recipes/ads_of_the_world.recipe
+++ b/recipes/ads_of_the_world.recipe
@ -0,0 +1,26 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1336986047(BasicNewsRecipe):
+    title          = u'Ads of the World'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    auto_cleanup = False
+    description   = 'The best international advertising campaigns'
+    language = 'en'
+    __author__ = 'faber1971'
+
+    no_stylesheets = True
+    keep_only_tags = [
+                       dict(name='div', attrs={'id':'primary'})
+                     ]
+
+    remove_tags = [
+                       dict(name='ul', attrs={'class':'links inline'})
+                      ,dict(name='div', attrs={'class':'form-item'})
+                      ,dict(name='div', attrs={'id':['options', 'comments']})
+                      ,dict(name='ul', attrs={'id':'nodePager'})
+                     ]
+
+    reverse_article_order = True
+    masthead_url            = 'http://bigcatgroup.co.uk/files/2011/01/05-ads-of-the-world.png'
+    feeds          = [(u'Ads of the world', u'http://feeds.feedburner.com/adsoftheworld-latest')]
--- a/recipes/air_force_times.recipe
+++ b/recipes/air_force_times.recipe
@ -0,0 +1,43 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AirForceTimes(BasicNewsRecipe):
+    title = 'Air Force Times'
+    __author__ = 'jde'
+    __date__ = '16 May 2012'
+    __version__ = '1.0'
+    description = 'News of the U.S. Air Force'
+    language = 'en'
+    publisher = 'AirForceTimes.com'
+    category = 'news, U.S. Air Force'
+    tags = 'news, U.S. Air Force'
+    cover_url = 'http://www.airforcetimes.com/images/logo_airforcetimes_alert.jpg'
+    masthead_url = 'http://www.airforcetimes.com/images/logo_airforcetimes_alert.jpg'
+    oldest_article = 7 #days
+    max_articles_per_feed = 25
+    publication_type = 'newspaper'
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = None
+    recursions = 0
+    needs_subscription = False
+    remove_javascript = True
+    remove_empty_feeds = True
+    auto_cleanup = True
+
+
+
+    feeds = [
+
+	('News', 		'http://www.airforcetimes.com/rss_news.php'),
+	('Benefits', 		'http://www.airforcetimes.com/rss_benefits.php'),
+	('Money', 		'http://www.airforcetimes.com/rss_money.php'),
+	('Careers & Education', 	'http://www.airforcetimes.com/rss_careers.php'),
+	('Community', 	'http://www.airforcetimes.com/rss_community.php'),
+	('Off Duty', 		'http://www.airforcetimes.com/rss_off_duty.php'),
+	('Entertainment', 	'http://www.airforcetimes.com/rss_entertainment.php'),
+	('Guard & Reserve', 	'http://www.airforcetimes.com/rss_guard.php'),
+              ]
+
+
+
+
--- a/recipes/army_times.recipe
+++ b/recipes/army_times.recipe
@ -0,0 +1,42 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+class ArmyTimes(BasicNewsRecipe):
+    title                  	= 'Army Times'
+    __author__             	= 'jde'
+    __date__		= '16 May 2012'
+    __version__	= '1.0'
+    description            	= 'News of the U.S. Army'
+    language               	= 'en'
+    publisher              	= 'ArmyTimes.com'
+    category               	= 'news, U.S. Army'
+    tags 		= 'news, U.S. Army'
+    cover_url        	= 'http://www.armytimes.com/images/logo_armytimes_alert.jpg'
+    masthead_url        	= 'http://www.armytimes.com/images/logo_armytimes_alert.jpg'
+    oldest_article 	= 7 #days
+    max_articles_per_feed  	= 25
+    publication_type 	= 'newspaper'
+    no_stylesheets         	= True
+    use_embedded_content  = False
+    encoding        	= None
+    recursions      	= 0
+    needs_subscription 	= False
+    remove_javascript 	= True
+    remove_empty_feeds    	= True
+    auto_cleanup 	= True
+
+
+
+    feeds          =   [
+
+('News', 		'http://www.armytimes.com/rss_news.php'),
+('Benefits', 		'http://www.armytimes.com/rss_benefits.php'),
+('Money', 		'http://www.armytimes.com/rss_money.php'),
+('Careers & Education', 	'http://www.armytimes.com/rss_careers.php'),
+('Community', 	'http://www.armytimes.com/rss_community.php'),
+('Off Duty', 		'http://www.armytimes.com/rss_off_duty.php'),
+('Entertainment', 	'http://www.armytimes.com/rss_entertainment.php'),
+('Guard & Reserve', 	'http://www.armytimes.com/rss_guard.php'),
+
+    ]
+
+
+
--- a/recipes/ars_technica.recipe
+++ b/recipes/ars_technica.recipe
@ -1,5 +1,5 @@
 __license__   = 'GPL v3'
-__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 arstechnica.com
 '''
@ -12,22 +12,24 @@ class ArsTechnica(BasicNewsRecipe):
    title                 = u'Ars Technica'
    language              = 'en'
    __author__            = 'Darko Miletic, Sujata Raman, Alexis Rohou'
-    description           = 'The art of technology'
-    publisher             = 'Ars Technica'
+    description           = 'Ars Technica: Serving the technologist for 1.2 decades'
+    publisher             = 'Conde Nast Publications'
    category              = 'news, IT, technology'
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
-    extra_css             = 	'''
-				body {font-family: Arial,Helvetica,sans-serif}
-				.title{text-align: left}
-				.byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
-				.news-item-figure-caption-text{font-size:small; font-style:italic}
-				.news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
-				'''
-    ignoreEtcArticles     = True	# Etc feed items can be ignored, as they're not real stories
+    remove_empty_feeds    = True
+    publication_type      = 'newsportal'    
+    extra_css             = '''
+                            body {font-family: Arial,sans-serif}
+                            .heading{font-family: "Times New Roman",serif}
+                            .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
+                            img{display: block}
+                            .caption-text{font-size:small; font-style:italic}
+                            .caption-byline{font-size:small; font-style:italic; font-weight:bold}
+				            '''

    conversion_options = {
                             'comments'  : description
@ -36,93 +38,64 @@ class ArsTechnica(BasicNewsRecipe):
                            ,'publisher' : publisher
                         }

-
-    #preprocess_regexps = [
-    #            (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
-    #           ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
-    #                     ]
-
-    keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]
+    keep_only_tags = [
+                       dict(attrs={'class':'standalone'})
+                      ,dict(attrs={'id':'article-guts'})
+                     ]

    remove_tags = [
-                     dict(name=['object','link','embed'])
-                    ,dict(name='div', attrs={'class':'read-more-link'})
+                     dict(name=['object','link','embed','iframe','meta'])
+                    ,dict(attrs={'class':'corner-info'})
                  ]
-    #remove_attributes=['width','height']
+    remove_attributes = ['lang']

+                  
    feeds = [
              (u'Infinite Loop (Apple content)'        , u'http://feeds.arstechnica.com/arstechnica/apple/'      )
             ,(u'Opposable Thumbs (Gaming content)'    , u'http://feeds.arstechnica.com/arstechnica/gaming/'     )
             ,(u'Gear and Gadgets'                     , u'http://feeds.arstechnica.com/arstechnica/gadgets/'    )
-             ,(u'Chipster (Hardware content)'          , u'http://feeds.arstechnica.com/arstechnica/hardware/'   )
             ,(u'Uptime (IT content)'                  , u'http://feeds.arstechnica.com/arstechnica/business/'   )
             ,(u'Open Ended (Open Source content)'     , u'http://feeds.arstechnica.com/arstechnica/open-source/')
             ,(u'One Microsoft Way'                    , u'http://feeds.arstechnica.com/arstechnica/microsoft/'  )
-             ,(u'Nobel Intent (Science content)'       , u'http://feeds.arstechnica.com/arstechnica/science/'    )
+             ,(u'Scientific method (Science content)'       , u'http://feeds.arstechnica.com/arstechnica/science/'    )
             ,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
            ]

-    # This deals with multi-page stories
    def append_page(self, soup, appendtag, position):
-        pager = soup.find('div',attrs={'class':'pager'})
+        pager = soup.find(attrs={'class':'numbers'})
        if pager:
-           for atag in pager.findAll('a',href=True):
-               str = self.tag_to_string(atag)
-               if str.startswith('Next'):
-                  nurl = 'http://arstechnica.com' + atag['href']
-                  rawc = self.index_to_soup(nurl,True)
-                  soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
-
-                  readmoretag = soup2.find('div', attrs={'class':'read-more-link'})
-                  if readmoretag:
-                     readmoretag.extract()
-                  texttag = soup2.find('div', attrs={'class':'body'})
-                  for it in texttag.findAll(style=True):
-                      del it['style']
-
-                  newpos = len(texttag.contents)
-                  self.append_page(soup2,texttag,newpos)
-                  texttag.extract()
-                  pager.extract()
-                  appendtag.insert(position,texttag)
+           nexttag = pager.find(attrs={'class':'next'})
+           if nexttag:
+              nurl = nexttag.parent['href']
+              rawc = self.index_to_soup(nurl,True)
+              soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
+              texttag = soup2.find(attrs={'id':'article-guts'})
+              newpos = len(texttag.contents)
+              self.append_page(soup2,texttag,newpos)
+              texttag.extract()
+              pager.extract()
+              appendtag.insert(position,texttag)


    def preprocess_html(self, soup):
-	# Adds line breaks near the byline (not sure why this is needed)
-        ftag = soup.find('div', attrs={'class':'byline'})
-        if ftag:
-           brtag = Tag(soup,'br')
-           brtag2 = Tag(soup,'br')
-           ftag.insert(4,brtag)
-           ftag.insert(5,brtag2)
-
-	# Remove style items
-        for item in soup.findAll(style=True):
-           del item['style']
-
-	# Remove id
-	for item in soup.findAll(id=True):
-		del item['id']
-
-	# For some reason, links to authors don't have the domainname
-	a_author = soup.find('a',{'href':re.compile("^/author")})
-	if a_author:
-		a_author['href'] = 'http://arstechnica.com'+a_author['href']
-
-	# within div class news-item-figure, we need to grab images
-
-	# Deal with multi-page stories
        self.append_page(soup, soup.body, 3)
-
+        for item in soup.findAll('a'):
+            limg = item.find('img')
+            if item.string is not None:
+               str = item.string
+               item.replaceWith(str)
+            else:
+               if limg:
+                  item.name = 'div'
+                  item.attrs = []
+               else:
+                   str = self.tag_to_string(item)
+                   item.replaceWith(str)
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+               item['alt'] = 'image'        
        return soup

-    def get_article_url(self, article):
-	# If the article title starts with Etc:, don't return it
-	if self.ignoreEtcArticles:
-		article_title = article.get('title',None)
-		if re.match('Etc: ',article_title) is not None:
-			return None
-
-	# The actual article is in a guid tag
-        return article.get('guid',  None).rpartition('?')[0]
-
+    def preprocess_raw_html(self, raw, url):
+       return '<html><head>'+raw[raw.find('</head>'):]
+        
--- a/recipes/der_standard.recipe
+++ b/recipes/der_standard.recipe
@ -7,10 +7,11 @@ __copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
 ''' http://www.derstandard.at - Austrian Newspaper '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from time import strftime

 class DerStandardRecipe(BasicNewsRecipe):
    title = u'derStandard'
-    __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira'
+    __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira and Peter Reschenhofer'
    description = u'Nachrichten aus Österreich'
    publisher ='derStandard.at'
    category = 'news, politics, nachrichten, Austria'
@ -88,3 +89,41 @@ class DerStandardRecipe(BasicNewsRecipe):
        for t in soup.findAll(['ul', 'li']):
            t.name = 'div'
        return soup
+
+    def get_cover_url(self):
+        highResolution = True
+
+        date    = strftime("%Y/%Y%m%d")
+        # it is also possible for the past
+        #date    = '2012/20120503'
+
+        urlP1   = 'http://epaper.derstandarddigital.at/'
+        urlP2   = 'data_ep/STAN/' + date
+        urlP3   = '/V.B1/'
+        urlP4   = 'paper.htm'
+        urlHTML = urlP1 + urlP2 + urlP3 + urlP4
+
+        br = self.clone_browser(self.browser)
+        htmlF  = br.open_novisit(urlHTML)
+        htmlC  = htmlF.read()
+
+
+        # URL EXAMPLE: data_ep/STAN/2012/20120504/V.B1/pages/A3B6798F-2751-4D8D-A103-C5EF22F7ACBE.htm
+        # consists of part2 + part3 + 'pages/' + code
+        # 'pages/' has length 6, code has length 36
+
+        index   = htmlC.find(urlP2) + len(urlP2 + urlP3) + 6
+        code    = htmlC[index:index + 36]
+
+
+        # URL EXAMPLE HIGH RESOLUTION: http://epaper.derstandarddigital.at/data_ep/STAN/2012/20120504/pagejpg/A3B6798F-2751-4D8D-A103-C5EF22F7ACBE_b.png
+        # URL EXAMPLE LOW RESOLUTION: http://epaper.derstandarddigital.at/data_ep/STAN/2012/20120504/pagejpg/2AB52F71-11C1-4859-9114-CDCD79BEFDCB.png
+
+        urlPic  = urlP1 + urlP2 + '/pagejpg/' + code
+
+        if highResolution:
+            urlPic  = urlPic + '_b'
+
+        urlPic  = urlPic + '.png'
+
+        return urlPic
--- a/recipes/economico.recipe
+++ b/recipes/economico.recipe
@ -0,0 +1,30 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Economico(BasicNewsRecipe):
+    title          = u'Economico'
+    language       = 'pt'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    encoding = 'utf-8'
+    use_embedded_content = False
+
+    no_stylesheets = True
+    auto_cleanup = True
+
+
+    feeds          = [
+('Ultima Hora',
+ 'http://economico.sapo.pt/rss/ultimas'),
+ ('Em Foco',
+ 'http://economico.sapo.pt/rss/emfoco'),
+ ('Mercados',
+ 'http://economico.sapo.pt/rss/mercados'),
+ ('Empresas',
+ 'http://economico.sapo.pt/rss/empresas'),
+ ('Economia',
+ 'http://economico.sapo.pt/rss/economia'),
+ ('Politica',
+ 'http://economico.sapo.pt/rss/politica'),
+]
+
--- a/recipes/endgadget_ja.recipe
+++ b/recipes/endgadget_ja.recipe
@ -17,7 +17,25 @@ class EndgadgetJapan(BasicNewsRecipe):
    no_stylesheets = True
    language = 'ja'
    encoding = 'utf-8'
-    feeds          = [(u'engadget', u'http://japanese.engadget.com/rss.xml')]
+    index = 'http://japanese.engadget.com/'
+    remove_javascript = True
+
+    remove_tags_before = dict(name="h1", attrs={'class':"post_title"})
+    remove_tags_after = dict(name='div', attrs={'class':'post_body'})
+
+    def parse_index(self):
+        feeds = []
+        newsarticles = []
+        soup   = self.index_to_soup(self.index)
+        for topstories in soup.findAll('div',attrs={'class':'post_content'}):
+           itt = topstories.find('h4')
+           itema = itt.find('a',href=True)
+           newsarticles.append({
+                                      'title'      :itema.string
+                                     ,'date'       :''
+                                     ,'url'        :itema['href']
+                                     ,'description':''
+                                    })
+        feeds.append(('Latest Posts', newsarticles))
+        return feeds

-    remove_tags_before = dict(name="div", attrs={'id':"content_wrap"})
-    remove_tags_after = dict(name='h3', attrs={'id':'addcomments'})
--- a/recipes/folha.recipe
+++ b/recipes/folha.recipe
@ -0,0 +1,82 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.folha.uol.com.br
+'''
+import urllib
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Folha_de_s_paulo(BasicNewsRecipe):
+    title                 = u'Folha de São Paulo - portal'
+    __author__            = 'Darko Miletic'
+    description           = 'Um Jornala a servicao do Brasil'
+    publisher             = 'Folhapress'
+    category              = 'news, politics, Brasil'
+    oldest_article        = 2
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'cp1252'
+    use_embedded_content  = False
+    language              = 'pt_BR'
+    remove_empty_feeds    = True
+    publication_type      = 'newspaper'
+    masthead_url          = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
+    extra_css             = """
+                               body{font-family: Arial,Helvetica,sans-serif }
+                               img{margin-bottom: 0.4em; display:block}
+                            """
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    remove_tags = [dict(name=['meta','link','base','iframe','embed','object'])]
+    keep_only_tags = [dict(attrs={'id':'articleNew'})]
+
+
+    feeds = [
+              (u'Poder'          , u'http://feeds.folha.uol.com.br/poder/rss091.xml'               )
+             ,(u'Mundo'          , u'http://feeds.folha.uol.com.br/mundo/rss091.xml'               )
+             ,(u'Mercado'        , u'http://feeds.folha.uol.com.br/mercado/rss091.xml'             )
+             ,(u'Cotidiano'      , u'http://feeds.folha.uol.com.br/cotidiano/rss091.xml'           )
+             ,(u'Esporte'        , u'http://feeds.folha.uol.com.br/esporte/rss091.xml'             )
+             ,(u'Ilustrada'      , u'http://feeds.folha.uol.com.br/ilustrada/rss091.xml'           )
+             ,(u'F5'             , u'http://feeds.folha.uol.com.br/f5/rss091.xml'                  )
+             ,(u'Ciência'        , u'http://feeds.folha.uol.com.br/ciencia/rss091.xml'             )
+             ,(u'Tec'            , u'http://feeds.folha.uol.com.br/tec/rss091.xml'                 )
+             ,(u'Ambiente'       , u'http://feeds.folha.uol.com.br/ambiente/rss091.xml'            )
+             ,(u'Bichos'         , u'http://feeds.folha.uol.com.br/bichos/rss091.xml'              )
+             ,(u'Celebridades'   , u'http://feeds.folha.uol.com.br/celebridades/rss091.xml'        )
+             ,(u'Comida'         , u'http://feeds.folha.uol.com.br/comida/rss091.xml'              )
+             ,(u'Equilibrio'     , u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml'    )
+             ,(u'Folhateen'      , u'http://feeds.folha.uol.com.br/folhateen/rss091.xml'           )
+             ,(u'Folhinha'       , u'http://feeds.folha.uol.com.br/folhinha/rss091.xml'            )
+             ,(u'Ilustrissima'   , u'http://feeds.folha.uol.com.br/ilustrissima/rss091.xml'        )
+             ,(u'Saber'          , u'http://feeds.folha.uol.com.br/saber/rss091.xml'               )
+             ,(u'Turismo'        , u'http://feeds.folha.uol.com.br/turismo/rss091.xml'             )
+             ,(u'Panel do Leitor', u'http://feeds.folha.uol.com.br/folha/paineldoleitor/rss091.xml')
+             ,(u'Publifolha'     , u'http://feeds.folha.uol.com.br/folha/publifolha/rss091.xml'    )
+             ,(u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml'        )
+            ]
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        curl = url.partition('/*')[2]
+        return curl
+
+    def print_version(self, url):
+        return 'http://tools.folha.com.br/print?site=emcimadahora&url=' + urllib.quote_plus(url)
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.folha.uol.com.br/')
+        cont = soup.find('div', attrs={'id':'newspaper'})
+        if cont:
+           ai = cont.find('a', href='http://www1.folha.uol.com.br/fsp/')
+           if ai:
+              return ai.img['src']
+        return None
--- a/recipes/folhadesaopaulo.recipe
+++ b/recipes/folhadesaopaulo.recipe
@ -8,7 +8,7 @@ from urllib2 import Request, urlopen, URLError
 class FolhaOnline(BasicNewsRecipe):
    THUMBALIZR_API        = '' # ---->Get your at http://www.thumbalizr.com/ and put here
    LANGUAGE              = 'pt_br'
-    language = 'pt'
+    language = 'pt_BR'
    LANGHTM               = 'pt-br'
    ENCODING              = 'cp1252'
    ENCHTM                = 'iso-8859-1'
--- a/recipes/folhadesaopaulo_sub.recipe
+++ b/recipes/folhadesaopaulo_sub.recipe
@ -14,7 +14,7 @@ class FSP(BasicNewsRecipe):
    HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

-    language = 'pt'
+    language = 'pt_BR'
    no_stylesheets = True
    max_articles_per_feed  = 40
    remove_javascript     = True
--- a/recipes/gosc_niedzielny.recipe
+++ b/recipes/gosc_niedzielny.recipe
@ -6,21 +6,20 @@ __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com'

 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
+from datetime import date
 import re

 class GN(BasicNewsRecipe):
        EDITION = 0

        __author__ = 'Piotr Kontek'
+        title = u'Gość niedzielny'
        description = 'Weekly magazine'
        encoding = 'utf-8'
        no_stylesheets = True
        language = 'pl'
        remove_javascript = True
        temp_files = []
-        simultaneous_downloads = 1
-        masthead_url = 'http://gosc.pl/files/11/03/12/949089_top.gif'
-        title = u'Gość niedzielny'

        articles_are_obfuscated = True

@ -56,22 +55,28 @@ class GN(BasicNewsRecipe):
            self.temp_files[-1].close()
            return self.temp_files[-1].name

-        def find_last_issue(self):
-                soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny')
-                #szukam zdjęcia i linka do porzedniego pełnego numeru
+        def find_last_issue(self, year):
+                soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/rok/' + str(year))
+
+                #szukam zdjęcia i linka do poprzedniego pełnego numeru
                first = True
                for d in soup.findAll('div', attrs={'class':'l release_preview_l'}):
                    img = d.find('img')
                    if img != None:
                        a = img.parent
                        self.EDITION = a['href']
+                        self.title = img['alt']
                        self.cover_url = 'http://www.gosc.pl' + img['src']
-                        if not first:
+                        if year != date.today().year or not first:
                            break
                        first = False

        def parse_index(self):
-                self.find_last_issue()
+                year = date.today().year
+                self.find_last_issue(year)
+                ##jeśli to pierwszy numer w roku trzeba pobrać poprzedni rok
+                if self.EDITION == 0:
+                	self.find_last_issue(year-1)
                soup = self.index_to_soup('http://www.gosc.pl' + self.EDITION)
                feeds = []
                #wstepniak
--- a/recipes/grid_to.recipe
+++ b/recipes/grid_to.recipe
@ -0,0 +1,79 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheGridTO(BasicNewsRecipe):
+    #: The title to use for the ebook
+    title               = u'The Grid TO'
+
+    #: A couple of lines that describe the content this recipe downloads.
+    #: This will be used primarily in a GUI that presents a list of recipes.
+    description         = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
+                    'accessible voice for Toronto.')
+
+    #: The author of this recipe
+    __author__          = u'Yusuf W'
+
+    #: The language that the news is in. Must be an ISO-639 code either
+    #: two or three characters long
+    language            = 'en_CA'
+
+    #: Publication type
+    #: Set to newspaper, magazine or blog
+    publication_type    = 'newspaper'
+
+    #: Convenient flag to disable loading of stylesheets for websites
+    #: that have overly complex stylesheets unsuitable for conversion
+    #: to ebooks formats
+    #: If True stylesheets are not downloaded and processed
+    no_stylesheets      = True
+
+    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
+    remove_tags_before  = dict(name='div', id='content')
+    remove_tags_after   = dict(name='div', id='content')
+    remove_tags         =  [
+                                dict(name='div', attrs={'class':'right-content pull-right'}),
+                                dict(name='div', attrs={'class':'right-content'}),
+                                dict(name='div', attrs={'class':'ftr-line'}),
+                                dict(name='div', attrs={'class':'pull-right'}),
+                                dict(name='div', id='comments'),
+                                dict(name='div', id='tags')
+                            ]
+
+    #: Keep only the specified tags and their children.
+    #keep_only_tags        = [dict(name='div', id='content')]
+
+    cover_margins       = (0, 0, '#ffffff')
+
+    INDEX               = 'http://www.thegridto.com'
+
+    def get_cover_url(self):
+        soup = self.index_to_soup(self.INDEX)
+        cover_url = soup.find(attrs={'class':'article-block latest-issue'}).find('img')['src']
+
+        return cover_url
+
+    def parse_index(self):
+
+        # Get the latest issue
+        soup = self.index_to_soup(self.INDEX)
+        a = soup.find('div', attrs={'class': 'full-content stuff-ftr'}).findAll('a')[2]
+
+        # Parse the index of the latest issue
+        self.INDEX = self.INDEX + a['href']
+        soup = self.index_to_soup(self.INDEX)
+
+        feeds = []
+        for section in ['city', 'life', 'culture']:
+            section_class = 'left-content article-listing ' + section + ' pull-left'
+            div = soup.find(attrs={'class': section_class})
+
+            articles = []
+            for tag in div.findAllNext(attrs={'class':'search-block'}):
+                a = tag.findAll('a', href=True)[1]
+
+                title = self.tag_to_string(a)
+                url = a['href']
+
+                articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+
+            feeds.append((section, articles))
+        return feeds
--- a/recipes/heavy_metal_it.recipe
+++ b/recipes/heavy_metal_it.recipe
@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class AdvancedUserRecipe1336289226(BasicNewsRecipe):
+    title          = u'Heavy Metal'
+    oldest_article = 15
+    max_articles_per_feed = 100
+    auto_cleanup = False
+    masthead_url            = 'http://net-static2.tccstatic.com/template/tmw/img/tj.gif'
+    feeds          = [(u'Heavy Metal', u'http://www.heavy-metal.it/feed/')]
+    keep_only_tags = [
+                       dict(name='div', attrs={'class':'entry'})
+                     ]
+    remove_tags_after = [
+                            dict(name='div', attrs={'class':'sociable'})
+                            ]
+    description = 'An Heavy metal Italian magazine'
+    __author__      = 'faber1971'
+    language = 'it'
+
+__version__     = 'v1.0'
+__date__        = '6, May 2012'
--- a/recipes/icons/folha.png
+++ b/recipes/icons/folha.png
--- a/recipes/icons/strategic_culture.png
+++ b/recipes/icons/strategic_culture.png
--- a/recipes/jijinews.recipe
+++ b/recipes/jijinews.recipe
@ -20,6 +20,8 @@ class JijiDotCom(BasicNewsRecipe):
    top_url        = 'http://www.jiji.com/'

    feeds          = [(u'\u30cb\u30e5\u30fc\u30b9', u'http://www.jiji.com/rss/ranking.rdf')]
+
+    remove_tags_before = dict(id="article-area")
    remove_tags_after = dict(id="ad_google")

    def get_cover_url(self):
--- a/recipes/juve_la_stampa.recipe
+++ b/recipes/juve_la_stampa.recipe
@ -0,0 +1,24 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1336504510(BasicNewsRecipe):
+    # Feed-based recipe for the "Qui Juve" section of La Stampa.
+
+    # --- Metadata ---
+    title = u'Juve - La Stampa'
+    __author__ = 'faber1971'
+    description = 'News about Juventus from La Stampa'
+    language = 'it'
+    masthead_url = 'http://www3.lastampa.it/fileadmin/media/sport/quijuve/top_quijuve.jpg'
+
+    # --- Download limits ---
+    oldest_article = 1
+    max_articles_per_feed = 100
+
+    # --- Sources ---
+    feeds = [(u'Qui Juve - La Stampa', u'http://feed43.com/2352784107537677.xml')]
+
+    # --- Content extraction ---
+    auto_cleanup = True
+    remove_tags = [
+        {
+            'name': 'div',
+            'attrs': {'class': ['article-toolbar', 'sezione sezione-news', 'intestazione']},
+        },
+    ]
+
+    # --- Styling: every rule shares the same serif stack; only sizes differ ---
+    extra_css = '''
+                div.dettaglio div.immagine_girata p.news-single-imgcaption {color: #000000; font-family: "Georgia", "Times", serif; font-size: 7px; font-weight: 400;line-height: 1.2; padding-bottom: 12px; text-transform: none; }
+                .sezione {color: #000000; font-family: "Georgia", "Times", serif; font-size: 7px; font-weight: 400;line-height: 1.2; padding-bottom: 12px; text-transform: none; }
+                body {color: #000000; font-family: "Georgia", "Times", serif; font-size: 7px; font-weight: 400;line-height: 1.2; padding-bottom: 12px; text-transform: none; }
+                h3 {color: #000000; font-family: "Georgia", "Times", serif; font-size: 22px; font-weight: 400;line-height: 1.2; padding-bottom: 12px; text-transform: none; }
+                div.dettaglio h2.catenaccio {color: #000000; font-family: "Georgia", "Times", serif; font-size: 18px; font-weight: 400;line-height: 1.2; padding-bottom: 12px; text-transform: none; }
+                '''
+
+__version__     = 'v1.0'
+__date__        = '8, May 2012'
--- a/recipes/la_republica.recipe
+++ b/recipes/la_republica.recipe
@ -1,7 +1,7 @@
 __license__   = 'GPL v3'
-__author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
-__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
-description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'
+__author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini; minor fixes by faber1971'
+__copyright__ = '2009-2012, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>, faber1971'
+description   = 'Italian daily newspaper - v1.02 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version; 11.05.2012 new version'

 '''
 http://www.repubblica.it/
@ -12,14 +12,14 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class LaRepubblica(BasicNewsRecipe):
    title                   = 'La Repubblica'
-    __author__              = 'Lorenzo Vigentini, Gabriele Marini, Darko Miletic'
+    __author__              = 'Lorenzo Vigentini, Gabriele Marini, Darko Miletic, faber1971'
    description             = 'il quotidiano online con tutte le notizie in tempo reale. News e ultime notizie. Tutti i settori: politica, cronaca, economia, sport, esteri, scienza, tecnologia, internet, spettacoli, musica, cultura, arte, mostre, libri, dvd, vhs, concerti, cinema, attori, attrici, recensioni, chat, cucina, mappe. Le citta di Repubblica: Roma, Milano, Bologna, Firenze, Palermo, Napoli, Bari, Torino.'
    masthead_url            = 'http://www.repubblica.it/static/images/homepage/2010/la-repubblica-logo-home-payoff.png'
    publisher               = 'Gruppo editoriale L\'Espresso'
    category                = 'News, politics, culture, economy, general interest'
    language                = 'it'
    timefmt                 = '[%a, %d %b, %Y]'
-    oldest_article          = 5
+    oldest_article          = 1
    encoding                = 'utf8'
    use_embedded_content    = False
    no_stylesheets          = True
@ -59,6 +59,7 @@ class LaRepubblica(BasicNewsRecipe):
                          dict(attrs={'class':'articolo'}),
                          dict(attrs={'class':'body-text'}),
                          dict(name='p', attrs={'class':'disclaimer clearfix'}),
+                          dict(name='div', attrs={'id':'main'}),
                          dict(attrs={'id':'contA'})
                         ]

@ -67,7 +68,7 @@ class LaRepubblica(BasicNewsRecipe):
                            dict(name=['object','link','meta','iframe','embed']),
                            dict(name='span',attrs={'class':'linkindice'}),
                            dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
-                            dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
+                            dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head', 'sidebar']}),
                            dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
                            dict(name='div', attrs={'class':'generalbox'}),
                            dict(name='ul', attrs={'id':'hystory'})
@ -88,11 +89,12 @@ class LaRepubblica(BasicNewsRecipe):
                       (u'Sport', u'http://www.repubblica.it/rss/sport/rss2.0.xml'),
                       (u'Calcio', u'http://www.repubblica.it/rss/sport/calcio/rss2.0.xml'),
                       (u'Motori', u'http://www.repubblica.it/rss/motori/rss2.0.xml'),
-                       (u'Edizione Roma', u'http://roma.repubblica.it/rss/rss2.0.xml'),
-                       (u'Edizione Torino', u'http://torino.repubblica.it/rss/rss2.0.xml'),
-                       (u'Edizione Milano', u'feed://milano.repubblica.it/rss/rss2.0.xml'),
-                       (u'Edizione Napoli', u'feed://napoli.repubblica.it/rss/rss2.0.xml'),
-                       (u'Edizione Palermo', u'feed://palermo.repubblica.it/rss/rss2.0.xml')
+                       (u'Roma', u'http://roma.repubblica.it/rss/rss2.0.xml'),
+                       (u'Torino', u'http://torino.repubblica.it/rss/rss2.0.xml'),
+                       (u'Milano', u'feed://milano.repubblica.it/rss/rss2.0.xml'),
+                       (u'Napoli', u'feed://napoli.repubblica.it/rss/rss2.0.xml'),
+                       (u'Bari', u'http://bari.repubblica.it/rss/rss2.0.xml'),
+                       (u'Palermo', u'feed://palermo.repubblica.it/rss/rss2.0.xml')
                      ]

    def preprocess_html(self, soup):
--- a/recipes/mainichi.recipe
+++ b/recipes/mainichi.recipe
@ -16,12 +16,12 @@ class MainichiDailyNews(BasicNewsRecipe):
    publisher      = 'Mainichi Daily News'
    category       = 'news, japan'
    language       = 'ja'
-
-    feeds          = [(u'daily news', u'http://mainichi.jp/rss/etc/flash.rss')]
+    index          = 'http://mainichi.jp/select/'
+    remove_javascript = True
+    masthead_title = u'MAINICHI DAILY NEWS'

    remove_tags_before = {'class':"NewsTitle"}
-    remove_tags = [{'class':"RelatedArticle"}]
-    remove_tags_after = {'class':"Credit"}
+    remove_tags_after = {'class':"NewsBody clr"}

    def parse_feeds(self):

@ -32,9 +32,30 @@ class MainichiDailyNews(BasicNewsRecipe):
            for a,curarticle in enumerate(curfeed.articles):
                if re.search(r'pheedo.jp', curarticle.url):
                    delList.append(curarticle)
+                if re.search(r'rssad.jp', curarticle.url):
+                    delList.append(curarticle)
            if len(delList)>0:
                for d in delList:
                    index = curfeed.articles.index(d)
                    curfeed.articles[index:index+1] = []

        return feeds
+
+    def parse_index(self):
+        """Build the article index from the page at ``self.index``.
+
+        Every ``<li>`` of the first ``<ul class="MaiLink">`` that contains a
+        link with an ``href`` becomes one article. All articles are returned
+        in a single feed named 'latest'.
+
+        Returns a list of ``(feed_title, articles)`` tuples; the list is empty
+        when the page has no such ``<ul>``.
+        """
+        feeds = []
+        soup = self.index_to_soup(self.index)
+        topstories = soup.find('ul', attrs={'class': 'MaiLink'})
+        if topstories is None:
+            return feeds
+
+        newsarticles = []
+        for entry in topstories.findAll('li'):
+            link = entry.find('a', href=True)
+            if link is None:
+                continue
+            newsarticles.append({
+                # tag_to_string() flattens nested markup; ``link.string`` is
+                # None whenever the anchor holds more than one child node,
+                # which would yield an article without a title.
+                'title': self.tag_to_string(link),
+                'date': '',
+                'url': link['href'],
+                'description': '',
+            })
+        feeds.append(('latest', newsarticles))
+        return feeds
+
--- a/recipes/mainichi_en.recipe
+++ b/recipes/mainichi_en.recipe
@ -0,0 +1,67 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+'''
+www.mainichi.jp
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MainichiEnglishNews(BasicNewsRecipe):
+    # Recipe for the English edition of the Mainichi newspaper. Articles are
+    # collected by scraping the index page at ``index`` (see parse_index).
+    title          = u'The Mainichi'
+    __author__     = 'Hiroshi Miura'
+    oldest_article = 2
+    max_articles_per_feed = 40
+    description    = 'Japanese traditional newspaper Mainichi news in English'
+    publisher      = 'Mainichi News'
+    category       = 'news, japan'
+    language       = 'en_JP'
+    index          = 'http://mainichi.jp/english/english/index.html'
+    remove_javascript = True
+    masthead_url = 'http://mainichi.jp/english/images/themainichi.png'
+
+    # Keep only the part of each article page between these two elements.
+    remove_tags_before = {'class':"NewsTitle"}
+    remove_tags_after = {'class':"NewsBody clr"}
+
+    def parse_feeds(self):
+        """Return the parent class's feeds with advertisement entries removed.
+
+        An article is dropped when its URL mentions ``pheedo.jp`` or
+        ``rssad.jp``. Each feed's ``articles`` list is modified in place.
+
+        NOTE(review): calibre appears to prefer parse_index() when a recipe
+        implements it, so this method may never be called -- confirm.
+        """
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        # A single combined pattern: with two separate checks an article
+        # matching both hosts was queued for removal twice, and the second
+        # ``articles.index()`` lookup raised ValueError.
+        ad_url = re.compile(r'pheedo\.jp|rssad\.jp')
+        for curfeed in feeds:
+            curfeed.articles[:] = [
+                article for article in curfeed.articles
+                if not ad_url.search(article.url)
+            ]
+
+        return feeds
+
+    def parse_index(self):
+        """Build one feed per ``<section>`` of the index page.
+
+        A section is used only when it contains a ``div.CategoryHead`` header
+        and a ``ul.MaiLink`` list. The feed title is the text of the link in
+        the header's ``<h1>``, falling back to 'news'. Every list item with a
+        link that has an ``href`` becomes one article.
+
+        Returns a list of ``(feed_title, articles)`` tuples.
+        """
+        feeds = []
+        soup = self.index_to_soup(self.index)
+        for section in soup.findAll('section'):
+            header = section.find('div', attrs={'class': 'CategoryHead clr'})
+            if header is None:
+                continue
+
+            section_name = 'news'
+            heading = header.find('h1')
+            if heading is not None:
+                heading_link = heading.find('a')
+                # Keep the default name when the heading has no (or an empty)
+                # link instead of failing on a missing element.
+                if heading_link is not None:
+                    section_name = self.tag_to_string(heading_link) or section_name
+
+            items = section.find('ul', attrs={'class': 'MaiLink'})
+            if items is None:
+                # Header without an article list: nothing to collect here.
+                continue
+
+            newsarticles = []
+            for item in items.findAll('li'):
+                link = item.find('a', href=True)
+                if link is None:
+                    continue
+                newsarticles.append({
+                    # tag_to_string() also copes with anchors that wrap
+                    # nested markup, where ``link.string`` would be None.
+                    'title': self.tag_to_string(link),
+                    'date': '',
+                    'url': link['href'],
+                    'description': '',
+                })
+            feeds.append((section_name, newsarticles))
+        return feeds
+
--- a/recipes/mainichi_it_news.recipe
+++ b/recipes/mainichi_it_news.recipe
@ -1,34 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-import re
-
-class MainichiDailyITNews(BasicNewsRecipe):
-    title          = u'\u6bce\u65e5\u65b0\u805e(IT&\u5bb6\u96fb)'
-    __author__     = 'Hiroshi Miura'
-    oldest_article = 2
-    max_articles_per_feed = 100
-    description    = 'Japanese traditional newspaper Mainichi Daily News - IT and electronics'
-    publisher      = 'Mainichi Daily News'
-    category       = 'news, Japan, IT, Electronics'
-    language       = 'ja'
-
-    feeds          = [(u'IT News', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')]
-
-    remove_tags_before = {'class':"NewsTitle"}
-    remove_tags = [{'class':"RelatedArticle"}]
-    remove_tags_after = {'class':"Credit"}
-
-    def parse_feeds(self):
-
-        feeds = BasicNewsRecipe.parse_feeds(self)
-
-        for curfeed in feeds:
-            delList = []
-            for a,curarticle in enumerate(curfeed.articles):
-                if re.search(r'pheedo.jp', curarticle.url):
-                    delList.append(curarticle)
-            if len(delList)>0:
-                for d in delList:
-                    index = curfeed.articles.index(d)
-                    curfeed.articles[index:index+1] = []
-
-        return feeds
--- a/recipes/mainichi_science_news.recipe
+++ b/recipes/mainichi_science_news.recipe
@ -0,0 +1,59 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+'''
+www.mainichi.jp
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MainichiDailyScienceNews(BasicNewsRecipe):
+    # Recipe for the science section of the Mainichi newspaper. Articles are
+    # collected by scraping the index page at ``index`` (see parse_index).
+    title          = u'\u6bce\u65e5\u65b0\u805e(Science)'
+    __author__     = 'Hiroshi Miura'
+    oldest_article = 2
+    max_articles_per_feed = 20
+    description    = 'Japanese traditional newspaper Mainichi Daily News - science'
+    publisher      = 'Mainichi Daily News'
+    category       = 'news, japan'
+    language       = 'ja'
+    index          = 'http://mainichi.jp/select/science'
+    remove_javascript = True
+    masthead_title = u'MAINICHI DAILY NEWS'
+
+    # Keep only the part of each article page between these two elements.
+    remove_tags_before = {'class':"NewsTitle"}
+    remove_tags_after = {'class':"NewsBody clr"}
+
+    def parse_feeds(self):
+        """Return the parent class's feeds with advertisement entries removed.
+
+        An article is dropped when its URL mentions ``rssad.jp``. Each feed's
+        ``articles`` list is modified in place.
+        """
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        ad_url = re.compile(r'rssad\.jp')
+        for curfeed in feeds:
+            # Filter in one pass instead of collecting matches and deleting
+            # them by index afterwards.
+            curfeed.articles[:] = [
+                article for article in curfeed.articles
+                if not ad_url.search(article.url)
+            ]
+
+        return feeds
+
+    def parse_index(self):
+        """Build the article index from the page at ``self.index``.
+
+        Every ``<li>`` of the first ``<ul class="MaiLink">`` that contains a
+        link with an ``href`` becomes one article. All articles are returned
+        in a single feed named 'Science'.
+
+        Returns a list of ``(feed_title, articles)`` tuples; the list is empty
+        when the page has no such ``<ul>``.
+        """
+        feeds = []
+        soup = self.index_to_soup(self.index)
+        topstories = soup.find('ul', attrs={'class': 'MaiLink'})
+        if topstories is None:
+            return feeds
+
+        newsarticles = []
+        for entry in topstories.findAll('li'):
+            link = entry.find('a', href=True)
+            if link is None:
+                continue
+            newsarticles.append({
+                # tag_to_string() flattens nested markup; ``link.string`` is
+                # None whenever the anchor holds more than one child node,
+                # which would yield an article without a title.
+                'title': self.tag_to_string(link),
+                'date': '',
+                'url': link['href'],
+                'description': '',
+            })
+        feeds.append(('Science', newsarticles))
+        return feeds
+
--- a/recipes/marine_corps_times.recipe
+++ b/recipes/marine_corps_times.recipe
@ -0,0 +1,42 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+class MarineCorpsTimes(BasicNewsRecipe):
+    # Feed-based recipe for marinecorpstimes.com.
+
+    # --- Metadata ---
+    title = 'Marine Corps Times'
+    description = 'News of the U.S. Marine Corps'
+    publisher = 'MarineCorpsTimes.com'
+    publication_type = 'newspaper'
+    category = 'news, U.S. Marine Corps'
+    tags = 'news, U.S. Marine Corps'
+    language = 'en'
+    __author__ = 'jde'
+    __version__ = '1.0'
+    __date__ = '16 May 2012'
+
+    # --- Artwork: the site logo doubles as cover and masthead ---
+    cover_url = 'http://www.marinecorpstimes.com/images/logo_marinetimes-alert.jpg'
+    masthead_url = cover_url
+
+    # --- Download limits ---
+    oldest_article = 7  # days
+    max_articles_per_feed = 25
+    recursions = 0
+    needs_subscription = False
+
+    # --- Content handling ---
+    encoding = None
+    use_embedded_content = False
+    auto_cleanup = True
+    no_stylesheets = True
+    remove_javascript = True
+    remove_empty_feeds = True
+
+    # --- Sources: (section title, RSS URL) ---
+    feeds = [
+        ('News', 'http://www.MarineCorpstimes.com/rss_news.php'),
+        ('Benefits', 'http://www.MarineCorpstimes.com/rss_benefits.php'),
+        ('Money', 'http://www.MarineCorpstimes.com/rss_money.php'),
+        ('Careers & Education', 'http://www.MarineCorpstimes.com/rss_careers.php'),
+        ('Community', 'http://www.MarineCorpstimes.com/rss_community.php'),
+        ('Off Duty', 'http://www.MarineCorpstimes.com/rss_off_duty.php'),
+        ('Entertainment', 'http://www.MarineCorpstimes.com/rss_entertainment.php'),
+        ('Guard & Reserve', 'http://www.MarineCorpstimes.com/rss_guard.php'),
+    ]
+
+
+
+
--- a/recipes/military_times.recipe
+++ b/recipes/military_times.recipe
@ -0,0 +1,41 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MilitaryTimes(BasicNewsRecipe):
+    # Feed-based recipe for militarytimes.com.
+
+    # --- Metadata ---
+    title = 'Military Times'
+    description = 'News of the U.S. Military'
+    publisher = 'MilitaryTimes.com'
+    publication_type = 'newspaper'
+    category = 'news, U.S. Military'
+    tags = 'news, U.S. Military'
+    language = 'en'
+    __author__ = 'jde'
+    __version__ = '1.0'
+    __date__ = '16 May 2012'
+
+    # --- Artwork: the site logo doubles as cover and masthead ---
+    cover_url = 'http://www.militarytimes.com/images/logo_militarytimes_landing-s.gif'
+    masthead_url = cover_url
+
+    # --- Download limits ---
+    oldest_article = 7  # days
+    max_articles_per_feed = 25
+    recursions = 0
+    needs_subscription = False
+
+    # --- Content handling ---
+    encoding = None
+    use_embedded_content = False
+    auto_cleanup = True
+    no_stylesheets = True
+    remove_javascript = True
+    remove_empty_feeds = True
+
+    # --- Sources: (section title, RSS URL) ---
+    feeds = [
+        ('News', 'http://www.militarytimes.com/rss_news.php'),
+        ('Benefits', 'http://www.militarytimes.com/rss_benefits.php'),
+        ('Money', 'http://www.militarytimes.com/rss_money.php'),
+        ('Careers & Education', 'http://www.militarytimes.com/rss_careers.php'),
+        ('Community', 'http://www.militarytimes.com/rss_community.php'),
+        ('Off Duty', 'http://www.militarytimes.com/rss_off_duty.php'),
+        ('Entertainment', 'http://www.militarytimes.com/rss_entertainment.php'),
+        ('Guard & Reserve', 'http://www.militarytimes.com/rss_guard.php'),
+    ]
+
--- a/recipes/montreal_gazette.recipe
+++ b/recipes/montreal_gazette.recipe
@ -1,5 +1,4 @@
 #!/usr/bin/env  python
-# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@ -7,77 +6,21 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from calibre.web.feeds.recipes import BasicNewsRecipe


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following four lines for the Victoria Times Colonist
-##    title = u'Victoria Times Colonist'
-##    url_prefix = 'http://www.timescolonist.com'
-##    description = u'News from Victoria, BC'
-##    fp_tag = 'CAN_TC'
-
-    # un-comment the following four lines for the Vancouver Province
-##    title = u'Vancouver Province'
-##    url_prefix = 'http://www.theprovince.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VP'
-
-    # un-comment the following four lines for the Vancouver Sun
-##    title = u'Vancouver Sun'
-##    url_prefix = 'http://www.vancouversun.com'
-##    description = u'News from Vancouver, BC'
-##    fp_tag = 'CAN_VS'
-
-    # un-comment the following four lines for the Edmonton Journal
-##    title = u'Edmonton Journal'
-##    url_prefix = 'http://www.edmontonjournal.com'
-##    description = u'News from Edmonton, AB'
-##    fp_tag = 'CAN_EJ'
-
-    # un-comment the following four lines for the Calgary Herald
-##    title = u'Calgary Herald'
-##    url_prefix = 'http://www.calgaryherald.com'
-##    description = u'News from Calgary, AB'
-##    fp_tag = 'CAN_CH'
-
-    # un-comment the following four lines for the Regina Leader-Post
-##    title = u'Regina Leader-Post'
-##    url_prefix = 'http://www.leaderpost.com'
-##    description = u'News from Regina, SK'
-##    fp_tag = ''
-
-    # un-comment the following four lines for the Saskatoon Star-Phoenix
-##    title = u'Saskatoon Star-Phoenix'
-##    url_prefix = 'http://www.thestarphoenix.com'
-##    description = u'News from Saskatoon, SK'
-##    fp_tag = ''
-
-    # un-comment the following four lines for the Windsor Star
-##    title = u'Windsor Star'
-##    url_prefix = 'http://www.windsorstar.com'
-##    description = u'News from Windsor, ON'
-##    fp_tag = 'CAN_'
-
-    # un-comment the following four lines for the Ottawa Citizen
-##    title = u'Ottawa Citizen'
-##    url_prefix = 'http://www.ottawacitizen.com'
-##    description = u'News from Ottawa, ON'
-##    fp_tag = 'CAN_OC'
-
-    # un-comment the following four lines for the Montreal Gazette
+    # Recipe metadata for the Montreal Gazette
    title = u'Montreal Gazette'
-    url_prefix = 'http://www.montrealgazette.com'
    description = u'News from Montreal, QC'
-    fp_tag = 'CAN_MG'


    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
+    auto_cleanup = True
+    auto_cleanup_keep = '//*[@id="imageBox"]'
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
@ -87,135 +30,19 @@ class CanWestPaper(BasicNewsRecipe):
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
-    remove_tags = [{'class':'comments'},
-                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
-                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
-                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
-                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
-                   dict(name='div', attrs={'class':'rule_grey_solid'}),
-                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+    


-    def get_cover_url(self):
-        from datetime import timedelta, date
-        if self.fp_tag=='':
-            return None
-        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
-        br = BasicNewsRecipe.get_browser()
-        daysback=1
-        try:
-            br.open(cover)
-        except:
-            while daysback<7:
-                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
-                br = BasicNewsRecipe.get_browser()
-                try:
-                    br.open(cover)
-                except:
-                    daysback = daysback+1
-                    continue
-                break
-        if daysback==7:
-            self.log("\nCover unavailable")
-            cover = None
-        return cover
-
-    def fixChars(self,string):
-        # Replace lsquo (\x91)
-        fixed = re.sub("\x91","‘",string)
-        # Replace rsquo (\x92)
-        fixed = re.sub("\x92","’",fixed)
-        # Replace ldquo (\x93)
-        fixed = re.sub("\x93","“",fixed)
-        # Replace rdquo (\x94)
-        fixed = re.sub("\x94","”",fixed)
-        # Replace ndash (\x96)
-        fixed = re.sub("\x96","–",fixed)
-        # Replace mdash (\x97)
-        fixed = re.sub("\x97","—",fixed)
-        fixed = re.sub("&#x2019;","’",fixed)
-        return fixed
-
-    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&'
-            massaged = re.sub("&","&", massaged)
-            return self.fixChars(massaged)
-        else:
-            return description
-
-    def populate_article_metadata(self, article, soup, first):
-        if first:
-            picdiv = soup.find('body').find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
-        xtitle = article.text_summary.strip()
-        if len(xtitle) == 0:
-            desc = soup.find('meta',attrs={'property':'og:description'})
-            if desc is not None:
-                article.summary = article.text_summary = desc['content']
-
-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
-
-    def preprocess_html(self, soup):
-        return self.strip_anchors(soup)
+    feeds          = [
+('News', 
+ 'http://rss.canada.com/get/?F297'),
+ ('Sports', 
+ 'http://rss.canada.com/get/?F299'),
+ ('Entertainment', 
+ 'http://rss.canada.com/get/?F7366'),
+ ('Business', 
+ 'http://rss.canada.com/get/?F6939'),
+]


-
-    def parse_index(self):
-        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
-
-        articles = {}
-        key = 'News'
-        ans = ['News']
-
-        # Find each instance of class="sectiontitle", class="featurecontent"
-        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
-                #self.log(" div class = %s" % divtag['class'])
-                if divtag['class'].startswith('section_title'):
-                    # div contains section title
-                    if not divtag.h3:
-                        continue
-                    key = self.tag_to_string(divtag.h3,False)
-                    ans.append(key)
-                    self.log("Section name %s" % key)
-                    continue
-                # div contains article data
-                h1tag = divtag.find('h1')
-                if not h1tag:
-                    continue
-                atag = h1tag.find('a',href=True)
-                if not atag:
-                    continue
-                url = self.url_prefix+'/news/todays-paper/'+atag['href']
-                #self.log("Section %s" % key)
-                #self.log("url %s" % url)
-                title = self.tag_to_string(atag,False)
-                #self.log("title %s" % title)
-                pubdate = ''
-                description = ''
-                ptag = divtag.find('p');
-                if ptag:
-                    description = self.tag_to_string(ptag,False)
-                    #self.log("description %s" % description)
-                author = ''
-                autag = divtag.find('h4')
-                if autag:
-                    author = self.tag_to_string(autag,False)
-                    #self.log("author %s" % author)
-                if not articles.has_key(key):
-                    articles[key] = []
-                articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+ 
--- a/recipes/nachdenkseiten.recipe
+++ b/recipes/nachdenkseiten.recipe
@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Nachdenkseiten(BasicNewsRecipe):
+    # Feed-based recipe for the German site nachdenkseiten.de.
+
+    # --- Metadata ---
+    title = u'Nachdenkseiten'
+    description = 'NachDenkSeiten - Die kritische Website'
+    publisher = 'www.nachdenkseiten.de Albrecht Mueller und Dr. Wolfgang Lieb'
+    category = 'news'
+    language = 'de'
+    __author__ = 'jrda'
+    timefmt = ''
+
+    # --- Download limits ---
+    oldest_article = 7
+    max_articles_per_feed = 6
+
+    # --- Content handling: keep only the element with id "content" ---
+    encoding = 'utf-8'
+    use_embedded_content = False
+    no_stylesheets = True
+    remove_javascript = True
+    keep_only_tags = [dict(id='content')]
+
+    # --- Sources ---
+    feeds = [('News', 'http://www.nachdenkseiten.de/?feed=rss2')]
--- a/recipes/national_geographic_it.recipe
+++ b/recipes/national_geographic_it.recipe
@ -0,0 +1,16 @@
+__version__     = 'v1.0'
+__date__        = '5, May 2012'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1336226255(BasicNewsRecipe):
+    # Feed-based recipe for the Italian edition of National Geographic.
+
+    # --- Metadata ---
+    title = u'National Geographic'
+    description = 'Science magazine'
+    language = 'it'
+    __author__ = 'faber1971'
+
+    # --- Sources ---
+    feeds = [(u'National Geographic', u'http://www.nationalgeographic.it/rss/all/rss2.0.xml')]
+
+    # --- Download limits ---
+    max_articles_per_feed = 100
+    oldest_article = 15
+
+    # --- Content extraction: automatic cleanup plus removal of one banner div ---
+    auto_cleanup = True
+    remove_tags = [{'name': 'div', 'attrs': {'class': 'banner-abbonamenti'}}]
--- a/recipes/navy_times.recipe
+++ b/recipes/navy_times.recipe
@ -0,0 +1,42 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+class NavyTimes(BasicNewsRecipe):
+    # Feed-based recipe for navytimes.com.
+
+    # --- Metadata ---
+    title = 'Navy Times'
+    description = 'News of the U.S. Navy'
+    publisher = 'NavyTimes.com'
+    publication_type = 'newspaper'
+    category = 'news, U.S. Navy'
+    tags = 'news, U.S. Navy'
+    language = 'en'
+    __author__ = 'jde'
+    __version__ = '1.0'
+    __date__ = '16 May 2012'
+
+    # --- Artwork: the site logo doubles as cover and masthead ---
+    cover_url = 'http://www.navytimes.com/images/logo_navytimes_alert.jpg'
+    masthead_url = cover_url
+
+    # --- Download limits ---
+    oldest_article = 7  # days
+    max_articles_per_feed = 25
+    recursions = 0
+    needs_subscription = False
+
+    # --- Content handling ---
+    encoding = None
+    use_embedded_content = False
+    auto_cleanup = True
+    no_stylesheets = True
+    remove_javascript = True
+    remove_empty_feeds = True
+
+    # --- Sources: (section title, RSS URL) ---
+    feeds = [
+        ('News', 'http://www.navytimes.com/rss_news.php'),
+        ('Benefits', 'http://www.navytimes.com/rss_benefits.php'),
+        ('Money', 'http://www.navytimes.com/rss_money.php'),
+        ('Careers & Education', 'http://www.navytimes.com/rss_careers.php'),
+        ('Community', 'http://www.navytimes.com/rss_community.php'),
+        ('Off Duty', 'http://www.navytimes.com/rss_off_duty.php'),
+        ('Entertainment', 'http://www.navytimes.com/rss_entertainment.php'),
+        ('Guard & Reserve', 'http://www.navytimes.com/rss_guard.php'),
+    ]
+
+
+
--- a/recipes/news_busters.recipe
+++ b/recipes/news_busters.recipe
@ -0,0 +1,20 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class NewsBusters(BasicNewsRecipe):
+    # Feed-based recipe for the newsbusters.org blog.
+
+    # --- Metadata ---
+    title = u'News Busters'
+    description = 'Exposing and Combating Liberal Media Bias'
+    language = 'en'
+    __author__ = 'jde'
+    cover_url = "http://newsbusters.org/sites/all/themes/genesis_nb/images/nb-mrc.png"
+
+    # --- Download limits ---
+    oldest_article = 1  # day
+    max_articles_per_feed = 100
+    recursions = 0
+    needs_subscription = False
+
+    # --- Content handling ---
+    encoding = 'utf8'
+    use_embedded_content = False
+    auto_cleanup = True
+    no_stylesheets = True
+    remove_javascript = True
+
+    # --- Sources ---
+    feeds = [(u'Blog', u'http://www.newsbusters.org/rss.xml')]
+
--- a/recipes/pescanik.recipe
+++ b/recipes/pescanik.recipe
@ -9,10 +9,10 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class Pescanik(BasicNewsRecipe):
-    title                 = 'Peščanik'
+    title                 = u'Peščanik'
    __author__            = 'Darko Miletic'
-    description           = 'Peščanik je udruženje građana osnovano 2006. godine. Glavni proizvod Peščanika je radio emisija koja je emitovana na Radiju B92 od 02.02.2000. do 16.06.2011, a od septembra 2011. se emituje na osam radio stanica u Srbiji, Crnoj Gori i BiH'
-    publisher             = 'Peščanik'
+    description           = u'Peščanik je udruženje građana osnovano 2006. godine. Glavni proizvod Peščanika je radio emisija koja je emitovana na Radiju B92 od 02.02.2000. do 16.06.2011, a od septembra 2011. se emituje na osam radio stanica u Srbiji, Crnoj Gori i BiH'
+    publisher             = u'Peščanik'
    category              = 'news, politics, Serbia'
    oldest_article        = 10
    max_articles_per_feed = 100
@ -45,4 +45,4 @@ class Pescanik(BasicNewsRecipe):
                  ]

    def print_version(self, url):
-        return url + 'print/'
+        return url + 'print/'
--- a/recipes/readitlater.recipe
+++ b/recipes/readitlater.recipe
@ -1,5 +1,5 @@
 """
-Pocket Calibre Recipe v1.0
+Pocket Calibre Recipe v1.2
 """
 __license__   = 'GPL v3'
 __copyright__ = '''
@ -73,6 +73,9 @@ class Pocket(BasicNewsRecipe):
            articles = []
            soup = self.index_to_soup(feedurl)
            ritem = soup.find('ul', attrs={'id':'list'})
+            if ritem is None:
+                self.log.exception("Page %s skipped: invalid HTML" % (feedtitle if feedtitle else feedurl))
+                continue
            for item in reversed(ritem.findAll('li')):
                if articlesToGrab < 1:
                    break
@ -94,7 +97,12 @@ class Pocket(BasicNewsRecipe):
                    self.readList.append(readLink)
            totalfeeds.append((feedtitle, articles))
        if len(self.readList) < self.minimum_articles:
-            raise Exception("Not enough articles in RIL! Change minimum_articles or add more.")
+            self.mark_as_read_after_dl = False
+            if hasattr(self, 'abort_recipe_processing'):  # presumably missing on older calibre versions -- confirm
+                self.abort_recipe_processing("Only %d articles retrieved, minimum_articles not reached" % len(self.readList))
+            else:
+                self.log.exception("Only %d articles retrieved, minimum_articles not reached" % len(self.readList))
+                return []
        return totalfeeds

    def mark_as_read(self, markList):
--- a/recipes/revista_summa.recipe
+++ b/recipes/revista_summa.recipe
@ -0,0 +1,22 @@
+__license__   = 'GPL v3'
+__author__    = 'Vakya'
+__version__     = 'v1.0'
+__date__        = '14, May 2012'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1336226255(BasicNewsRecipe):
+    # Recipe for the Revista Summa RSS feed (language 'es').
+    title = u'Revista Summa'
+    __author__ = 'Vakya'
+    publisher = u'Summa'
+    language = 'es'
+    description = 'Informacion regional sobre economia y negocios'
+
+    feeds = [(u'Revista Summa', u'http://www.revistasumma.com/rss/rss-v2.0.rss')]
+    oldest_article = 15
+    max_articles_per_feed = 100
+    auto_cleanup = True
+    remove_tags_before = {'name': 'h1'}
+    remove_tags_after = {'name': 'label'}
+
--- a/recipes/spiegel_int.recipe
+++ b/recipes/spiegel_int.recipe
@ -1,3 +1,4 @@
+ 
 __license__   = 'GPL v3'
 __copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
@ -15,6 +16,8 @@ class Spiegel_int(BasicNewsRecipe):
    language              = 'en_DE'
    no_stylesheets        = True
    use_embedded_content  = False
+    auto_cleanup = True
+    auto_cleanup_keep = '//*[@id="spArticleTopAsset"]'
    encoding              = 'cp1252'
    publisher             = 'SPIEGEL ONLINE GmbH'
    category              = 'news, politics, Germany'
@ -43,25 +46,25 @@ class Spiegel_int(BasicNewsRecipe):
                   .spPhotoGallery{font-size:x-small; color:#990000 ;}
                '''

-    keep_only_tags    = [dict(attrs={'id':'spArticleContent'})]
-    remove_tags_after = dict(attrs={'id':'spArticleBody'})
-    remove_tags       = [dict(name=['meta','base','iframe','embed','object'])]
-    remove_attributes = ['clear']
+    #keep_only_tags    = [dict(attrs={'id':'spArticleContent'})]
+    #remove_tags_after = dict(attrs={'id':'spArticleBody'})
+    #remove_tags       = [dict(name=['meta','base','iframe','embed','object'])]
+    #remove_attributes = ['clear']
    feeds             = [(u'Spiegel Online', u'http://www.spiegel.de/international/index.rss')]

-    def print_version(self, url):
-        main, sep, rest = url.rpartition(',')
-        rmain, rsep, rrest = main.rpartition(',')
-        return rmain + ',druck-' + rrest + ',' + rest
+    #def print_version(self, url):
+        #main, sep, rest = url.rpartition(',')
+        #rmain, rsep, rrest = main.rpartition(',')
+        #return rmain + ',druck-' + rrest + ',' + rest

-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('a'):
-            if item.string is not None:
-               str = item.string
-               item.replaceWith(str)
-            else:
-               str = self.tag_to_string(item)
-               item.replaceWith(str)
-        return soup
+    #def preprocess_html(self, soup):
+        #for item in soup.findAll(style=True):
+            #del item['style']
+        #for item in soup.findAll('a'):
+            #if item.string is not None:
+               #str = item.string
+               #item.replaceWith(str)
+            #else:
+               #str = self.tag_to_string(item)
+               #item.replaceWith(str)
+        #return soup
--- a/recipes/stars_and_stripes.recipe
+++ b/recipes/stars_and_stripes.recipe
@ -0,0 +1,39 @@
+''' Stars and Stripes
+ '''
+
+
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class AdvancedUserRecipe1308791026(BasicNewsRecipe):  # recipe for the stripes.com RSS feeds
+    title          = u'Stars and Stripes'
+    oldest_article = 3
+    max_articles_per_feed = 100
+    __author__             = 'adoucette'
+    description            = "The U.S. military's independent news source, featuring exclusive reports from Iraq, Afghanistan, Europe and the Far East."
+    no_stylesheets         = True
+    #delay                  = 1
+    use_embedded_content   = False
+    encoding               = 'utf8'
+    publisher              = 'stripes.com'
+    category               = 'news, US, world'
+    language               = 'en_US'
+    publication_type       = 'newsportal'
+    preprocess_regexps     = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]  # strip HTML comments, including multi-line ones
+    conversion_options = {
+        'comments'        : description
+        ,'tags'            : category
+        ,'language'        : language
+        ,'publisher'       : publisher
+        ,'linearize_tables': True
+        }
+    keep_only_tags    = [dict(name='div', attrs={'class':['element article']})]
+    remove_tags_after = [dict(name='ul', attrs={'class':'inline-bookmarks'})]
+    feeds          = [
+            (u'News', u'http://feeds.stripes.com/starsandstripes/news'),
+            (u'Sports', u'http://feeds.stripes.com/starsandstripes/sports'),
+            (u'Military Life', u'http://feeds.stripes.com/starsandstripes/militarylife'),
+            (u'Opinion', u'http://feeds.stripes.com/starsandstripes/opinion'),
+            (u'Travel', u'http://feeds.stripes.com/starsandstripes/travel')
+            ]
--- a/recipes/strategic_culture.recipe
+++ b/recipes/strategic_culture.recipe
@ -0,0 +1,92 @@
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
+
+'''
+www.strategic-culture.org
+'''
+
+import time
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class StrategicCulture(BasicNewsRecipe):
+    title                 = 'Strategic Culture Foundation'
+    __author__            = 'Darko Miletic'
+    description           = 'Online Journal'
+    publisher             = 'Strategic Culture Foundation'
+    category              = 'news, politics'
+    oldest_article        = 7
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    encoding              = 'utf-8'
+    use_embedded_content  = False
+    language              = 'en'
+    publication_type      = 'newsportal'
+    masthead_url          = 'http://www.strategic-culture.org/img/logo.jpg'
+    extra_css             = '''
+                             body{font-family: Arial, sans-serif}
+                             h1{font-family: "Times New Roman",Times,serif}                             
+                             img{margin-bottom: 0.8em}
+                            '''
+
+    # Metadata options handed to the e-book conversion.
+    conversion_options = {
+                          'comments'         : description
+                        , 'tags'             : category
+                        , 'publisher'        : publisher
+                        , 'language'         : language
+                        }
+
+    keep_only_tags = [
+                      dict(name=['h1','p'])
+                      ,dict(name='div', attrs={'id':'cke_pastebin'})
+                     ]
+
+    remove_tags = [dict(name=['object','link','base','meta','iframe'])]
+
+    feeds = [
+               (u'News'             , u'http://www.strategic-culture.org/blocks/news.html'                )
+              ,(u'Politics'         , u'http://www.strategic-culture.org/rubrics/politics.html'           )
+              ,(u'Economics'        , u'http://www.strategic-culture.org/rubrics/economics.html'          )
+              ,(u'History & Culture', u'http://www.strategic-culture.org/rubrics/history-and-culture.html')
+              ,(u'Columnists'       , u'http://www.strategic-culture.org/rubrics/columnists.html'         )
+            ]
+
+    def print_version(self, url):
+        '''Return ``url`` with '-culture.org/news/' replaced by '-culture.org/pview/'.'''
+        return url.replace('-culture.org/news/','-culture.org/pview/')
+
+    def parse_index(self):
+        '''Scrape every page listed in ``feeds`` for article links.
+
+        Returns a list of ``(feed title, articles)`` tuples where each article
+        is a dict with ``title``, ``date``, ``url`` and ``description`` keys.
+        '''
+        totalfeeds = []
+        for feedtitle, feedurl in self.get_feeds():
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            soup = self.index_to_soup(feedurl)
+            # The news block and the rubric pages use different link classes.
+            clname = 'sini14' if feedurl.endswith('news.html') else 'h22'
+            articles = []
+            seen = set()  # URLs already collected for this feed
+            for atag in soup.findAll('a', attrs={'class':clname}):
+                url = atag['href']
+                if url in seen:
+                    continue
+                seen.add(url)
+                # The publication date is read from the URL path, which is
+                # expected to end in .../<year>/<month>/<day>/<article>.
+                year, month, day = url.split('/')[-4:-1]
+                published = time.strptime(day + "/" + month + "/" + year, "%d/%m/%Y")
+                articles.append({
+                                      'title'      :self.tag_to_string(atag)
+                                     ,'date'       :strftime("%a, %d %b %Y %H:%M:%S +0000", published)
+                                     ,'url'        :url
+                                     ,'description':''
+                                    })
+            totalfeeds.append((feedtitle, articles))
+        return totalfeeds
+
+
--- a/resources/compiled_coffeescript.zip
+++ b/resources/compiled_coffeescript.zip
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@ -506,3 +506,17 @@ change_book_details_font_size_by = 0
 # No compile: compile_gpm_templates = False
 compile_gpm_templates = True

+#: What format to default to when using the Tweak feature
+# The Tweak feature of calibre allows direct editing of a book format.
+# If multiple formats are available, calibre will offer you a choice
+# of formats, defaulting to your preferred output format if it is available.
+# Set this tweak to a specific value of 'EPUB' or 'AZW3' to always default
+# to that format rather than your output format preference.
+# Set to a value of 'remember' to use whichever format you chose last time you
+# used the Tweak feature.
+# Examples:
+#   default_tweak_format = None       (Use output format)
+#   default_tweak_format = 'EPUB'
+#   default_tweak_format = 'remember'
+default_tweak_format = None
+
--- a/session.vim
+++ b/session.vim
@ -20,7 +20,11 @@ vipy.session.initialize(project_name='calibre', src_dir=src_dir,
            project_dir=project_dir, base_dir=project_dir)

 def recipe_title_callback(raw):
-    return eval(raw.decode('utf-8')).replace(' ', '_')
+    try:
+        return eval(raw.decode('utf-8')).replace(u' ', u'_')  # NOTE(review): eval() of raw text -- presumably a title literal taken from a recipe file; confirm the input is trusted
+    except:
+        print ('Failed to decode recipe title: %r'%raw)  # report the offending input, then re-raise unchanged
+        raise

 vipy.session.add_content_browser('<leader>r', 'Recipe',
    vipy.session.glob_based_iterator(os.path.join(project_dir, 'recipes', '*.recipe')),
--- a/setup/install.py
+++ b/setup/install.py
@ -22,7 +22,8 @@ Do not modify it unless you know what you are doing.
 import sys, os

 path = os.environ.get('CALIBRE_PYTHON_PATH', {path!r})
-sys.path.insert(0, path)
+if path not in sys.path:
+    sys.path.insert(0, path)

 sys.resources_location = os.environ.get('CALIBRE_RESOURCES_PATH', {resources!r})
 sys.extensions_location = os.environ.get('CALIBRE_EXTENSIONS_PATH', {extensions!r})
--- a/setup/installer/linux/freeze2.py
+++ b/setup/installer/linux/freeze2.py
@ -41,8 +41,8 @@ binary_includes = [
                '/usr/lib/libgthread-2.0.so.0',
                '/usr/lib/libpng14.so.14',
                '/usr/lib/libexslt.so.0',
-                MAGICK_PREFIX+'/lib/libMagickWand.so.4',
-                MAGICK_PREFIX+'/lib/libMagickCore.so.4',
+                MAGICK_PREFIX+'/lib/libMagickWand.so.5',
+                MAGICK_PREFIX+'/lib/libMagickCore.so.5',
                '/usr/lib/libgcrypt.so.11',
                '/usr/lib/libgpg-error.so.0',
                '/usr/lib/libphonon.so.4',
--- a/setup/installer/osx/app/main.py
+++ b/setup/installer/osx/app/main.py
@ -429,7 +429,7 @@ class Py2App(object):
    def add_imagemagick(self):
        info('\nAdding ImageMagick')
        for x in ('Wand', 'Core'):
-            self.install_dylib(os.path.join(SW, 'lib', 'libMagick%s.4.dylib'%x))
+            self.install_dylib(os.path.join(SW, 'lib', 'libMagick%s.5.dylib'%x))
        idir = glob.glob(os.path.join(SW, 'lib', 'ImageMagick-*'))[-1]
        dest = os.path.join(self.frameworks_dir, 'ImageMagick')
        if os.path.exists(dest):
--- a/setup/installer/windows/freeze.py
+++ b/setup/installer/windows/freeze.py
@ -18,7 +18,7 @@ QT_DIR = 'Q:\\Qt\\4.8.1'
 QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
 LIBUNRAR         = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
 SW               = r'C:\cygwin\home\kovid\sw'
-IMAGEMAGICK      = os.path.join(SW, 'build', 'ImageMagick-6.6.6',
+IMAGEMAGICK      = os.path.join(SW, 'build', 'ImageMagick-6.7.6',
        'VisualMagick', 'bin')
 CRT = r'C:\Microsoft.VC90.CRT'

--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@ -336,6 +336,8 @@ Index: src/PdfFiltersPrivate.cpp
 ImageMagick
 --------------

+Get the source from: http://www.imagemagick.org/download/windows/ImageMagick-windows.zip
+
 Edit VisualMagick/configure/configure.cpp to set

 int projectType = MULTITHREADEDDLL;
@ -349,7 +351,10 @@ Edit magick/magick-config.h
 Undefine ProvideDllMain and MAGICKCORE_X11_DELEGATE

 Now open VisualMagick/VisualDynamicMT.sln set to Release
-Remove the CORE_xlib and UTIL_Imdisplay project CORE_Magick++
+Remove the CORE_xlib, UTIL_Imdisplay and CORE_Magick++ projects.
+
+Press F7 to build the project. You will get one error due to the removal of
+xlib; ignore it.

 calibre
 ---------
--- a/setup/iso_639/ca.po
+++ b/setup/iso_639/ca.po
@ -12,14 +12,14 @@ msgstr ""
 "Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
 "devel@lists.alioth.debian.org>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2012-04-28 10:42+0000\n"
-"Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
+"PO-Revision-Date: 2012-05-03 16:09+0000\n"
+"Last-Translator: Dídac Rios <didac@niorcs.com>\n"
 "Language-Team: Catalan <linux@softcatala.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2012-04-29 04:45+0000\n"
-"X-Generator: Launchpad (build 15149)\n"
+"X-Launchpad-Export-Date: 2012-05-04 04:47+0000\n"
+"X-Generator: Launchpad (build 15195)\n"
 "Language: ca\n"

 #. name for aaa
@ -9936,11 +9936,11 @@ msgstr "Ibani"

 #. name for ica
 msgid "Ede Ica"
-msgstr ""
+msgstr "Ede Ica"

 #. name for ich
 msgid "Etkywan"
-msgstr ""
+msgstr "Etkywan"

 #. name for icl
 msgid "Icelandic Sign Language"
@ -9952,7 +9952,7 @@ msgstr "Anglès crioll; Islander"

 #. name for ida
 msgid "Idakho-Isukha-Tiriki"
-msgstr ""
+msgstr "Idakho-Isukha-Tiriki"

 #. name for idb
 msgid "Indo-Portuguese"
@ -9960,15 +9960,15 @@ msgstr "Indo-portuguès"

 #. name for idc
 msgid "Idon"
-msgstr ""
+msgstr "Idon"

 #. name for idd
 msgid "Ede Idaca"
-msgstr ""
+msgstr "Ede Idaca"

 #. name for ide
 msgid "Idere"
-msgstr ""
+msgstr "Idere"

 #. name for idi
 msgid "Idi"
@ -9976,43 +9976,43 @@ msgstr ""

 #. name for ido
 msgid "Ido"
-msgstr ""
+msgstr "ido"

 #. name for idr
 msgid "Indri"
-msgstr ""
+msgstr "Indri"

 #. name for ids
 msgid "Idesa"
-msgstr ""
+msgstr "Idesa"

 #. name for idt
 msgid "Idaté"
-msgstr ""
+msgstr "Idaté"

 #. name for idu
 msgid "Idoma"
-msgstr ""
+msgstr "Idoma"

 #. name for ifa
 msgid "Ifugao; Amganad"
-msgstr ""
+msgstr "Ifugao; Amganad"

 #. name for ifb
 msgid "Ifugao; Batad"
-msgstr ""
+msgstr "Ifugao; Batad"

 #. name for ife
 msgid "Ifè"
-msgstr ""
+msgstr "Ifè"

 #. name for iff
 msgid "Ifo"
-msgstr ""
+msgstr "Ifo"

 #. name for ifk
 msgid "Ifugao; Tuwali"
-msgstr ""
+msgstr "Ifugao; Tuwali"

 #. name for ifm
 msgid "Teke-Fuumu"
@ -10020,15 +10020,15 @@ msgstr "Teke; Fuumu"

 #. name for ifu
 msgid "Ifugao; Mayoyao"
-msgstr ""
+msgstr "Ifugao; Mayoyao"

 #. name for ify
 msgid "Kallahan; Keley-I"
-msgstr ""
+msgstr "Kallahan; Keley-I"

 #. name for igb
 msgid "Ebira"
-msgstr ""
+msgstr "Ebira"

 #. name for ige
 msgid "Igede"
--- a/setup/iso_639/sr.po
+++ b/setup/iso_639/sr.po
@ -8,14 +8,14 @@ msgstr ""
 "Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
 "devel@lists.alioth.debian.org>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2012-03-25 12:19+0000\n"
-"Last-Translator: Radan Putnik <srastral@gmail.com>\n"
+"PO-Revision-Date: 2012-05-03 14:49+0000\n"
+"Last-Translator: Иван Старчевић <ivanstar61@gmail.com>\n"
 "Language-Team: Serbian <gnu@prevod.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2012-03-26 04:37+0000\n"
-"X-Generator: Launchpad (build 15008)\n"
+"X-Launchpad-Export-Date: 2012-05-04 04:47+0000\n"
+"X-Generator: Launchpad (build 15195)\n"
 "Language: sr\n"

 #. name for aaa
@ -6152,7 +6152,7 @@ msgstr ""

 #. name for deu
 msgid "German"
-msgstr "немачки"
+msgstr "Немачки"

 #. name for dev
 msgid "Domung"
@ -8416,7 +8416,7 @@ msgstr "ирски"

 #. name for glg
 msgid "Galician"
-msgstr ""
+msgstr "Галицијски"

 #. name for glh
 msgid "Pashayi; Northwest"
@ -8472,11 +8472,11 @@ msgstr ""

 #. name for gmh
 msgid "German; Middle High (ca. 1050-1500)"
-msgstr ""
+msgstr "Немачки; средње високи (ca. 1050-1500)"

 #. name for gml
 msgid "German; Middle Low"
-msgstr ""
+msgstr "Немачки; средње низак"

 #. name for gmm
 msgid "Gbaya-Mbodomo"
@ -8792,7 +8792,7 @@ msgstr ""

 #. name for gsg
 msgid "German Sign Language"
-msgstr ""
+msgstr "Немачки знаковни језик"

 #. name for gsl
 msgid "Gusilay"
@ -8820,7 +8820,7 @@ msgstr ""

 #. name for gsw
 msgid "German; Swiss"
-msgstr ""
+msgstr "Немачки ; Швајцарска"

 #. name for gta
 msgid "Guató"
@ -17954,7 +17954,7 @@ msgstr ""

 #. name for nds
 msgid "German; Low"
-msgstr ""
+msgstr "Немачки; низак"

 #. name for ndt
 msgid "Ndunga"
@ -18778,7 +18778,7 @@ msgstr ""

 #. name for nno
 msgid "Norwegian Nynorsk"
-msgstr "норвешки модерни"
+msgstr "Норвешки модерни"

 #. name for nnp
 msgid "Naga; Wancho"
@ -18830,7 +18830,7 @@ msgstr ""

 #. name for nob
 msgid "Norwegian Bokmål"
-msgstr ""
+msgstr "Норвешки (књижевни)"

 #. name for noc
 msgid "Nuk"
@ -18886,7 +18886,7 @@ msgstr ""

 #. name for nor
 msgid "Norwegian"
-msgstr "норвешки"
+msgstr "Норвешки"

 #. name for nos
 msgid "Nisu; Eastern"
@ -19066,7 +19066,7 @@ msgstr ""

 #. name for nsl
 msgid "Norwegian Sign Language"
-msgstr ""
+msgstr "Норвешки знаковни језик"

 #. name for nsm
 msgid "Naga; Sumi"
@ -20406,7 +20406,7 @@ msgstr ""

 #. name for pdc
 msgid "German; Pennsylvania"
-msgstr ""
+msgstr "Немачки ; Пенсилванија"

 #. name for pdi
 msgid "Pa Di"
@ -22086,7 +22086,7 @@ msgstr ""

 #. name for rmg
 msgid "Norwegian; Traveller"
-msgstr ""
+msgstr "Норвешки; путнички"

 #. name for rmh
 msgid "Murkim"
@ -22871,7 +22871,7 @@ msgstr ""

 #. name for sgg
 msgid "Swiss-German Sign Language"
-msgstr ""
+msgstr "Швајцарско-Немачки знаковни језик"

 #. name for sgh
 msgid "Shughni"
--- a/setup/resources.py
+++ b/setup/resources.py
@ -26,7 +26,7 @@ def get_opts_from_parser(parser):
 class Coffee(Command): # {{{

    description = 'Compile coffeescript files into javascript'
-    COFFEE_DIRS = {'ebooks/oeb/display': 'display'}
+    COFFEE_DIRS = ('ebooks/oeb/display',)

    def add_options(self, parser):
        parser.add_option('--watch', '-w', action='store_true', default=False,
@ -47,49 +47,69 @@ class Coffee(Command): # {{{
            except KeyboardInterrupt:
                pass

-    def show_js(self, jsfile):
+    def show_js(self, raw):
        from pygments.lexers import JavascriptLexer
        from pygments.formatters import TerminalFormatter
        from pygments import highlight
-        with open(jsfile, 'rb') as f:
-            raw = f.read()
        print highlight(raw, JavascriptLexer(), TerminalFormatter())

    def do_coffee_compile(self, opts, timestamp=False, ignore_errors=False):
-        for toplevel, dest in self.COFFEE_DIRS.iteritems():
-            dest = self.j(self.RESOURCES, dest)
-            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
-                js = self.j(dest, os.path.basename(x.rpartition('.')[0]+'.js'))
-                if self.newer(js, x):
-                    print ('\t%sCompiling %s'%(time.strftime('[%H:%M:%S] ') if
-                        timestamp else '', os.path.basename(x)))
-                    try:
-                        cs = subprocess.check_output(self.compiler +
-                                [x]).decode('utf-8')
-                    except Exception as e:
-                        print ('\n\tCompilation of %s failed'%os.path.basename(x))
-                        print (e)
-                        if ignore_errors:
-                            with open(js, 'wb') as f:
-                                f.write('# Compilation from coffeescript failed')
-                        else:
-                            raise SystemExit(1)
-                    else:
-                        with open(js, 'wb') as f:
-                            f.write(cs.encode('utf-8'))
-                        if opts.show_js:
-                            self.show_js(js)
-                            print ('#'*80)
-                            print ('#'*80)
+        src_files = {}  # archive name -> (source path, source mtime)
+        for src in self.COFFEE_DIRS:
+            for f in glob.glob(self.j(self.SRC, __appname__, src,
+                '*.coffee')):
+                bn = os.path.basename(f).rpartition('.')[0]
+                arcname = src.replace('/', '.') + '.' + bn + '.js'  # e.g. ebooks.oeb.display.<name>.js
+                src_files[arcname] = (f, os.stat(f).st_mtime)
+
+        existing = {}  # archive name -> (compiled bytes, ZipInfo) for entries newer than their source
+        dest = self.j(self.RESOURCES, 'compiled_coffeescript.zip')
+        if os.path.exists(dest):
+            with zipfile.ZipFile(dest, 'r') as zf:
+                for info in zf.infolist():
+                    mtime = time.mktime(info.date_time + (0, 0, -1))  # pad to a 9-tuple; tm_isdst=-1 lets mktime decide DST
+                    arcname = info.filename
+                    if (arcname in src_files and src_files[arcname][1] <
+                            mtime):
+                        existing[arcname] = (zf.read(info), info)
+
+        todo = set(src_files) - set(existing)  # sources with no up-to-date entry in the archive
+        updated = {}  # archive name -> (freshly compiled bytes, new ZipInfo)
+        for arcname in todo:
+            name = arcname.rpartition('.')[0]
+            print ('\t%sCompiling %s'%(time.strftime('[%H:%M:%S] ') if
+                        timestamp else '', name))
+            src = src_files[arcname][0]
+            try:
+                js = subprocess.check_output(self.compiler +
+                        [src]).decode('utf-8')
+            except Exception as e:
+                print ('\n\tCompilation of %s failed'%name)
+                print (e)
+                if ignore_errors:
+                    js = u'# Compilation from coffeescript failed'  # placeholder stored in place of the compiled output
+                else:
+                    raise SystemExit(1)
+            else:
+                if opts.show_js:
+                    self.show_js(js)
+                    print ('#'*80)
+                    print ('#'*80)
+            zi = zipfile.ZipInfo()
+            zi.filename = arcname
+            zi.date_time = time.localtime()[:6]  # stamp with "now"; compared against source mtimes on the next run
+            updated[arcname] = (js.encode('utf-8'), zi)
+        if updated:
+            with zipfile.ZipFile(dest, 'w', zipfile.ZIP_STORED) as zf:  # mode 'w': the archive is rewritten from scratch
+                for raw, zi in updated.itervalues():
+                    zf.writestr(zi, raw)
+                for raw, zi in existing.itervalues():  # carry over the still-current entries
+                    zf.writestr(zi, raw)

    def clean(self):
-        for toplevel, dest in self.COFFEE_DIRS.iteritems():
-            dest = self.j(self.RESOURCES, dest)
-            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
-                x = x.rpartition('.')[0] + '.js'
-                x = self.j(dest, os.path.basename(x))
-                if os.path.exists(x):
-                    os.remove(x)
+        archive = self.j(self.RESOURCES, 'compiled_coffeescript.zip')
+        if os.path.exists(archive):
+            os.remove(archive)
 # }}}

 class Kakasi(Command): # {{{
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@ -4,7 +4,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = u'calibre'
-numeric_version = (0, 8, 50)
+numeric_version = (0, 8, 51)
 __version__   = u'.'.join(map(unicode, numeric_version))
 __author__    = u"Kovid Goyal <kovid@kovidgoyal.net>"

--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -302,7 +302,9 @@ class OutputFormatPlugin(Plugin):

        :param item: The item (HTML file) being processed
        :param stylizer: A Stylizer object containing the flattened styles for
-        item. You can get the style for any element by stylizer.style(element).
+                         item. You can get the style for any element by
+                         stylizer.style(element).
+
        '''
        pass

--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@ -57,6 +57,7 @@ class ANDROID(USBMS):
                       0x4316 : [0x216],
                       0x42d6 : [0x216],
                       0x42d7 : [0x216],
+                       0x42f7 : [0x216],
                     },
            # Freescale
            0x15a2 : {
@ -193,7 +194,7 @@ class ANDROID(USBMS):
            'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855',
            'XT910', 'BOOK_A10', 'USB_2.0_DRIVER', 'I9100T', 'P999DW',
            'KTABLET_PC', 'INGENIC', 'GT-I9001_CARD', 'USB_2.0_DRIVER',
-            'GT-S5830L_CARD', 'UNIVERSE']
+            'GT-S5830L_CARD', 'UNIVERSE', 'XT875']
    WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
            'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
            'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
@ -201,7 +202,8 @@ class ANDROID(USBMS):
            'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853',
            'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD',
            'USB_2.0_DRIVER', 'I9100T', 'P999DW_SD_CARD', 'KTABLET_PC',
-            'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0_DRIVER']
+            'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0_DRIVER', 'XT875',
+            'UMS_COMPOSITE']

    OSX_MAIN_MEM = 'Android Device Main Memory'

--- a/src/calibre/devices/eb600/driver.py
+++ b/src/calibre/devices/eb600/driver.py
@ -92,6 +92,10 @@ class POCKETBOOK360(EB600):
    name = 'PocketBook 360 Device Interface'

    gui_name = 'PocketBook 360'
+    VENDOR_ID   = [0x1f85, 0x525]
+    PRODUCT_ID  = [0x1688, 0xa4a5]
+    BCD         = [0x110]
+

    FORMATS = ['epub', 'fb2', 'prc', 'mobi', 'pdf', 'djvu', 'rtf', 'chm', 'txt']

--- a/src/calibre/ebooks/conversion/init.py
+++ b/src/calibre/ebooks/conversion/init.py
@ -1,4 +1,25 @@
-from __future__ import with_statement
-__license__ = 'GPL 3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
+
+
+class ConversionUserFeedBack(Exception):
+
+    def __init__(self, title, msg, level='info', det_msg=''):
+        ''' Show the user a simple message
+
+        :param title: Very short description, used as the title
+        :param msg: Text of the message shown to the user
+        :param level: One of 'info', 'warn' or 'error'
+        :param det_msg: Detailed message to show the user (optional)
+        '''
+        import json
+        self.title, self.msg = title, msg
+        self.level, self.det_msg = level, det_msg
+        payload = {'msg':msg, 'level':level, 'det_msg':det_msg, 'title':title}
+        Exception.__init__(self, json.dumps(payload))
+
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -15,6 +15,7 @@ from calibre.utils.logging import Log
 from calibre.constants import preferred_encoding
 from calibre.customize.conversion import OptionRecommendation
 from calibre import patheq
+from calibre.ebooks.conversion import ConversionUserFeedBack

 USAGE = '%prog ' + _('''\
 input_file output_file [options]
@ -304,7 +305,10 @@ def read_sr_patterns(path, log=None):
 def main(args=sys.argv):
    log = Log()
    parser, plumber = create_option_parser(args, log)
-    opts = parser.parse_args(args)[0]
+    opts, leftover_args = parser.parse_args(args)
+    if len(leftover_args) > 3:
+        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
+        return 1
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, abspath(getattr(opts, x)))
@ -317,7 +321,16 @@ def main(args=sys.argv):
                                        if n.dest]
    plumber.merge_ui_recommendations(recommendations)

-    plumber.run()
+    try:
+        plumber.run()
+    except ConversionUserFeedBack as e:  # user-facing message raised while running the conversion
+        ll = {'info': log.info, 'warn': log.warn,
+                'error':log.error}.get(e.level, log.info)  # unknown levels fall back to info
+        ll(e.title)
+        if e.det_msg:
+            log.debug(e.det_msg)  # the attribute is det_msg; e.detmsg would raise AttributeError
+        ll(e.msg)
+        raise SystemExit(1)

    log(_('Output saved to'), ' ', plumber.output)

--- a/src/calibre/ebooks/conversion/plugins/epub_input.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_input.py
@ -207,7 +207,7 @@ class EPUBInput(InputFormatPlugin):
        if rc:
            cover_toc_item = None
            for item in oeb.toc.iterdescendants():
-                if item.href == rc:
+                if item.href and item.href.partition('#')[0] == rc:
                    cover_toc_item = item
                    break
            spine = {x.href for x in oeb.spine}
--- a/src/calibre/ebooks/conversion/plugins/epub_output.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_output.py
@ -393,8 +393,14 @@ class EPUBOutput(OutputFormatPlugin):
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

+            formchildren = XPath('./h:input|./h:button|./h:textarea|'
+                    './h:label|./h:fieldset|./h:legend')
            for tag in XPath('//h:form')(root):
-                tag.getparent().remove(tag)
+                if formchildren(tag):
+                    tag.getparent().remove(tag)
+                else:
+                    # Not a real form
+                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
--- a/src/calibre/ebooks/conversion/plugins/mobi_input.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_input.py
@ -12,7 +12,7 @@ class MOBIInput(InputFormatPlugin):
    name        = 'MOBI Input'
    author      = 'Kovid Goyal'
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
-    file_types  = set(['mobi', 'prc', 'azw', 'azw3'])
+    file_types  = set(['mobi', 'prc', 'azw', 'azw3', 'pobi'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -343,21 +343,25 @@ OptionRecommendation(name='remove_fake_margins',
 OptionRecommendation(name='margin_top',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the top margin in pts. Default is %default. '
+            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

 OptionRecommendation(name='margin_bottom',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the bottom margin in pts. Default is %default. '
+            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

 OptionRecommendation(name='margin_left',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the left margin in pts. Default is %default. '
+            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

 OptionRecommendation(name='margin_right',
        recommended_value=5.0, level=OptionRecommendation.LOW,
        help=_('Set the right margin in pts. Default is %default. '
+            'Setting this to less than zero will cause no margin to be set. '
            'Note: 72 pts equals 1 inch')),

 OptionRecommendation(name='change_justification',
@ -885,7 +889,10 @@ OptionRecommendation(name='search_replace',
            self.log.debug('Resolved conversion options')
            try:
                self.log.debug('calibre version:', __version__)
-                self.log.debug(pprint.pformat(self.opts.__dict__))
+                odict = dict(self.opts.__dict__)
+                for x in ('username', 'password'):
+                    odict.pop(x, None)
+                self.log.debug(pprint.pformat(odict))
            except:
                self.log.exception('Failed to get resolved conversion options')

--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -5,7 +5,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re, error as re_error
+import re
 from math import ceil
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
@ -184,7 +184,7 @@ class HeuristicProcessor(object):
                except OverflowError:
                    # match.group(0) was too large to be compiled into a regex
                    continue
-                except re_error:
+                except re.error:
                    # the match was not a valid regular expression
                    continue

--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -113,6 +113,11 @@ class HTMLFile(object):
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

+        if not src:
+            if level == 0:
+                raise ValueError('The file %s is empty'%self.path)
+            self.is_binary = True
+
        if not self.is_binary:
            if not encoding:
                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -18,7 +18,7 @@ from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
-from calibre.utils.date import parse_date
+from calibre.utils.date import parse_only_date
 from calibre.utils.localization import canonicalize_lang

 class Worker(Thread): # Get details {{{
@ -471,7 +471,7 @@ class Worker(Thread): # Get details {{{
                ans = x.tail
                date = ans.rpartition('(')[-1].replace(')', '').strip()
                date = self.delocalize_datestr(date)
-                return parse_date(date, assume_utc=True)
+                return parse_only_date(date, assume_utc=True)

    def parse_language(self, pd):
        for x in reversed(pd.xpath(self.language_xpath)):
--- a/src/calibre/ebooks/mobi/debug/headers.py
+++ b/src/calibre/ebooks/mobi/debug/headers.py
@ -306,10 +306,15 @@ class MOBIHeader(object): # {{{
        self.extra_data_flags = 0
        if self.has_extra_data_flags:
            self.unknown4 = self.raw[184:192]
-            self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
-                    self.raw, 192)
-            if self.fdst_count <= 1:
-                self.fdst_idx = NULL_INDEX
+            if self.file_version < 8:  # here offset 192 holds two 16-bit record numbers, not an FDST index
+                self.first_text_record, self.last_text_record = \
+                    struct.unpack_from(b'>HH', self.raw, 192)
+                self.fdst_count, = struct.unpack_from(b'>L', self.raw, 196)  # unpack_from returns a tuple; take its single item
+            else:
+                self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
+                        self.raw, 192)
+                if self.fdst_count <= 1:
+                    self.fdst_idx = NULL_INDEX
            (self.fcis_number, self.fcis_count, self.flis_number,
                    self.flis_count) = struct.unpack(b'>IIII',
                            self.raw[200:216])
@ -409,7 +414,11 @@ class MOBIHeader(object): # {{{
            a('DRM Flags: %r'%self.drm_flags)
        if self.has_extra_data_flags:
            a('Unknown4: %r'%self.unknown4)
-            r('FDST Index', 'fdst_idx')
+            if hasattr(self, 'first_text_record'):
+                a('First content record: %d'%self.first_text_record)
+                a('Last content record: %d'%self.last_text_record)
+            else:
+                r('FDST Index', 'fdst_idx')
            a('FDST Count: %d'% self.fdst_count)
            r('FCIS number', 'fcis_number')
            a('FCIS count: %d'% self.fcis_count)
--- a/src/calibre/ebooks/mobi/reader/markup.py
+++ b/src/calibre/ebooks/mobi/reader/markup.py
@ -111,7 +111,11 @@ def update_flow_links(mobi8_reader, resource_map, log):
            continue

        if not isinstance(flow, unicode):
-            flow = flow.decode(mr.header.codec)
+            try:
+                flow = flow.decode(mr.header.codec)
+            except UnicodeDecodeError:
+                log.error('Flow part has invalid %s encoded bytes'%mr.header.codec)
+                flow = flow.decode(mr.header.codec, 'replace')

        # links to raster image files from image tags
        # image_pattern
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -207,9 +207,9 @@ class Mobi8Reader(object):
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
-                if flowpart.find('[CDATA[') >= 0:
+                if flowpart.find(b'[CDATA[') >= 0:
                    typ = 'css'
-                    flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
+                    flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
                    format = 'inline'
                    dir = None
                    fname = None
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@ -382,6 +382,7 @@ class MobiWriter(object):
            first_image_record  = len(self.records)
            self.resources.serialize(self.records, used_images)
        resource_record_count = len(self.records) - old
+        last_content_record = len(self.records) - 1

        # FCIS/FLIS (Seems to serve no purpose)
        flis_number = len(self.records)
@ -406,7 +407,7 @@ class MobiWriter(object):
        # header
        header_fields['first_resource_record'] = first_image_record
        header_fields['exth_flags'] = 0b100001010000 # Kinglegen uses this
-        header_fields['fdst_record'] = NULL_INDEX
+        header_fields['fdst_record'] = pack(b'>HH', 1, last_content_record)
        header_fields['fdst_count'] = 1 # Why not 0? Kindlegen uses 1
        header_fields['flis_record'] = flis_number
        header_fields['fcis_record'] = fcis_number
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@ -314,9 +314,9 @@ class KF8Writer(object):
            return

        # Flatten the ToC into a depth first list
-        fl = toc.iter() if is_periodical else toc.iterdescendants()
+        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
-            entry = {'id': id(item), 'index': i, 'href':item.href,
+            entry = {'id': id(item), 'index': i, 'href':item.href or '',
                    'label':(item.title or _('Unknown')),
                    'children':[]}
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
--- a/src/calibre/ebooks/mobi/writer8/mobi.py
+++ b/src/calibre/ebooks/mobi/writer8/mobi.py
@ -138,6 +138,8 @@ class MOBIHeader(Header): # {{{
    unknown2 = zeroes(8)

    # 192: FDST
+    # In MOBI 6 the fdst record is instead two two byte fields storing the
+    # index of the first and last content records
    fdst_record = DYN
    fdst_count = DYN

--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -966,7 +966,7 @@ class Manifest(object):
                data = data.cssText
                if isinstance(data, unicode):
                    data = data.encode('utf-8')
-                return data
+                return data + b'\n'
            return str(data)

        def __unicode__(self):
--- a/src/calibre/ebooks/oeb/display/cfi.coffee
+++ b/src/calibre/ebooks/oeb/display/cfi.coffee
@ -389,8 +389,17 @@ class CanonicalFragmentIdentifier
        # Drill down into iframes, etc.
        while true
            target = cdoc.elementFromPoint x, y
-            if not target or target.localName == 'html'
-                log("No element at (#{ x }, #{ y })")
+            if not target or target.localName in ['html', 'body']
+                # We ignore both html and body even though body could
+                # have text nodes under it as performance is very poor if body
+                # has large margins/padding (for e.g. in fullscreen mode)
+                # A possible solution for this is to wrap all text node
+                # children of body in <span> but that is seriously ugly and
+                # might have side effects. Lets do this only if there are lots of
+                # books in the wild that actually have text children of body,
+                # and even in this case it might be better to change the input
+                # plugin to prevent this from happening.
+                # log("No element at (#{ x }, #{ y })")
                return null

            name = target.localName
--- a/src/calibre/ebooks/oeb/display/indexing.coffee
+++ b/src/calibre/ebooks/oeb/display/indexing.coffee
@ -0,0 +1,76 @@
+#!/usr/bin/env coffee
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+###
+ Copyright 2012, Kovid Goyal <kovid@kovidgoyal.net>
+ Released under the GPLv3 License
+###
+
+body_height = () ->
+    db = document.body  # purpose: report the document height as the largest of the six measures below
+    dde = document.documentElement
+    if db? and dde?
+        return Math.max(db.scrollHeight, dde.scrollHeight, db.offsetHeight,
+            dde.offsetHeight, db.clientHeight, dde.clientHeight)
+    return 0  # document.body or document.documentElement is missing
+
+abstop = (elem) ->
+    ans = elem.offsetTop  # running total of offsetTop values, starting with elem itself
+    while elem.offsetParent  # climb the offsetParent chain until there is no further parent
+        elem = elem.offsetParent
+        ans += elem.offsetTop
+    return ans
+
+class BookIndexing
+    ###
+    This class is a namespace to expose indexing functions via the
+    window.book_indexing object. The most important functions are:
+
+    anchor_positions(): Get the absolute (document co-ordinate system) position
+    for elements with the specified id/name attributes.
+
+    ###
+
+    constructor: () ->
+        this.cache = {}  # anchor -> position map produced by the last anchor_positions() call
+        this.body_height_at_last_check = null  # body_height() recorded when the cache was last filled
+
+    cache_valid: (anchors) ->
+        for a in anchors  # every requested anchor must already be in the cache
+            if not Object.prototype.hasOwnProperty.call(this.cache, a)
+                return false
+        for p of this.cache  # and the cache must not hold anchors that were not requested
+            if Object.prototype.hasOwnProperty.call(this.cache, p) and p not in anchors
+                return false
+        return true
+
+    anchor_positions: (anchors, use_cache=false) ->
+        if use_cache and body_height() == this.body_height_at_last_check and this.cache_valid(anchors)
+            return this.cache  # reused only if the body height is unchanged and the anchor set matches
+
+        ans = {}
+        for anchor in anchors
+            elem = document.getElementById(anchor)
+            if elem == null
+                # Look for an <a name="anchor"> element
+                try
+                    result = document.evaluate(
+                        ".//*[local-name() = 'a' and @name='#{ anchor }']",
+                        document.body, null,
+                        XPathResult.FIRST_ORDERED_NODE_TYPE, null)
+                    elem = result.singleNodeValue
+                catch error
+                    # The anchor had a ' or other invalid char
+                    elem = null
+            if elem == null
+                pos = body_height() + 10000  # anchor not found: report a position beyond the end of the document
+            else
+                pos = abstop(elem)
+            ans[anchor] = pos
+        this.cache = ans  # the cache is replaced on every recomputation
+        this.body_height_at_last_check = body_height()
+        return ans
+
+if window?  # only when a window global exists
+    window.book_indexing = new BookIndexing()  # publish one instance as window.book_indexing
+
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@ -1,383 +0,0 @@
-from __future__ import with_statement
-__license__   = 'GPL v3'
-__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
-
-'''
-Iterate over the HTML files in an ebook. Useful for writing viewers.
-'''
-
-import re, os, math
-from cStringIO import StringIO
-
-from PyQt4.Qt import QFontDatabase
-
-from calibre.customize.ui import available_input_formats
-from calibre.ebooks.metadata.opf2 import OPF
-from calibre.ptempfile import TemporaryDirectory
-from calibre.ebooks.chardet import xml_to_unicode
-from calibre.utils.zipfile import safe_replace
-from calibre.utils.config import DynamicConfig
-from calibre.utils.logging import Log
-from calibre import (guess_type, prints, prepare_string_for_xml,
-        xml_replace_entities)
-from calibre.ebooks.oeb.transforms.cover import CoverManager
-from calibre.constants import filesystem_encoding
-
-TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\
-        '__ar__', 'none').replace('__viewbox__', '0 0 600 800'
-        ).replace('__width__', '600').replace('__height__', '800')
-BM_FIELD_SEP = u'*|!|?|*'
-BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
-
-def character_count(html):
-    '''
-    Return the number of "significant" text characters in a HTML string.
-    '''
-    count = 0
-    strip_space = re.compile(r'\s+')
-    for match in re.finditer(r'>[^<]+<', html):
-        count += len(strip_space.sub(' ', match.group()))-2
-    return count
-
-class UnsupportedFormatError(Exception):
-
-    def __init__(self, fmt):
-        Exception.__init__(self, _('%s format books are not supported')%fmt.upper())
-
-class SpineItem(unicode):
-
-    def __new__(cls, path, mime_type=None):
-        ppath = path.partition('#')[0]
-        if not os.path.exists(path) and os.path.exists(ppath):
-            path = ppath
-        obj = super(SpineItem, cls).__new__(cls, path)
-        raw = open(path, 'rb').read()
-        raw, obj.encoding = xml_to_unicode(raw)
-        obj.character_count = character_count(raw)
-        obj.start_page = -1
-        obj.pages      = -1
-        obj.max_page   = -1
-        if mime_type is None:
-            mime_type = guess_type(obj)[0]
-        obj.mime_type = mime_type
-        return obj
-
-class FakeOpts(object):
-    verbose = 0
-    breadth_first = False
-    max_levels = 5
-    input_encoding = None
-
-def is_supported(path):
-    ext = os.path.splitext(path)[1].replace('.', '').lower()
-    ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
-    return ext in available_input_formats()
-
-
-def write_oebbook(oeb, path):
-    from calibre.ebooks.oeb.writer import OEBWriter
-    from calibre import walk
-    w = OEBWriter()
-    w(oeb, path)
-    for f in walk(path):
-        if f.endswith('.opf'):
-            return f
-
-class EbookIterator(object):
-
-    CHARACTERS_PER_PAGE = 1000
-
-    def __init__(self, pathtoebook, log=None):
-        self.log = log
-        if log is None:
-            self.log = Log()
-        pathtoebook = pathtoebook.strip()
-        self.pathtoebook = os.path.abspath(pathtoebook)
-        self.config = DynamicConfig(name='iterator')
-        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
-        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
-        self.ebook_ext = ext.replace('original_', '')
-
-    def search(self, text, index, backwards=False):
-        text = prepare_string_for_xml(text.lower())
-        pmap = [(i, path) for i, path in enumerate(self.spine)]
-        if backwards:
-            pmap.reverse()
-        for i, path in pmap:
-            if (backwards and i < index) or (not backwards and i > index):
-                with open(path, 'rb') as f:
-                    raw = f.read().decode(path.encoding)
-                try:
-                    raw = xml_replace_entities(raw)
-                except:
-                    pass
-                if text in raw.lower():
-                    return i
-
-    def find_missing_css_files(self):
-        for x in os.walk(os.path.dirname(self.pathtoopf)):
-            for f in x[-1]:
-                if f.endswith('.css'):
-                    yield os.path.join(x[0], f)
-
-    def find_declared_css_files(self):
-        for item in self.opf.manifest:
-            if item.mime_type and 'css' in item.mime_type.lower():
-                yield item.path
-
-    def find_embedded_fonts(self):
-        '''
-        This will become unnecessary once Qt WebKit supports the @font-face rule.
-        '''
-        css_files = set(self.find_declared_css_files())
-        if not css_files:
-            css_files = set(self.find_missing_css_files())
-        bad_map = {}
-        font_family_pat = re.compile(r'font-family\s*:\s*([^;]+)')
-        for csspath in css_files:
-            try:
-                css = open(csspath, 'rb').read().decode('utf-8', 'replace')
-            except:
-                continue
-            for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css):
-                block  = match.group(1)
-                family = font_family_pat.search(block)
-                url    = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block)
-                if url:
-                    path = url.group(1).split('/')
-                    path = os.path.join(os.path.dirname(csspath), *path)
-                    if not os.access(path, os.R_OK):
-                        continue
-                    id = QFontDatabase.addApplicationFont(path)
-                    if id != -1:
-                        families = [unicode(f) for f in QFontDatabase.applicationFontFamilies(id)]
-                        if family:
-                            family = family.group(1)
-                            specified_families = [x.strip().replace('"',
-                                '').replace("'", '') for x in family.split(',')]
-                            aliasing_ok = False
-                            for f in specified_families:
-                                bad_map[f] = families[0]
-                                if not aliasing_ok and f in families:
-                                    aliasing_ok = True
-
-                            if not aliasing_ok:
-                                prints('WARNING: Family aliasing not fully supported.')
-                                prints('\tDeclared family: %r not in actual families: %r'
-                                        % (family, families))
-                            else:
-                                prints('Loaded embedded font:', repr(family))
-        if bad_map:
-            def prepend_embedded_font(match):
-                for bad, good in bad_map.items():
-                    if bad in match.group(1):
-                        prints('Substituting font family: %s -> %s'%(bad, good))
-                        return match.group().replace(bad, '"%s"'%good)
-
-            from calibre.ebooks.chardet import force_encoding
-            for csspath in css_files:
-                with open(csspath, 'r+b') as f:
-                    css = f.read()
-                    enc = force_encoding(css, False)
-                    css = css.decode(enc, 'replace')
-                    ncss = font_family_pat.sub(prepend_embedded_font, css)
-                    if ncss != css:
-                        f.seek(0)
-                        f.truncate()
-                        f.write(ncss.encode(enc))
-
-    def __enter__(self, processed=False, only_input_plugin=False):
-        self.delete_on_exit = []
-        self._tdir = TemporaryDirectory('_ebook_iter')
-        self.base  = self._tdir.__enter__()
-        if not isinstance(self.base, unicode):
-            self.base = self.base.decode(filesystem_encoding)
-        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
-        plumber = Plumber(self.pathtoebook, self.base, self.log)
-        plumber.setup_options()
-        if self.pathtoebook.lower().endswith('.opf'):
-            plumber.opts.dont_package = True
-        if hasattr(plumber.opts, 'no_process'):
-            plumber.opts.no_process = True
-
-        plumber.input_plugin.for_viewer = True
-        with plumber.input_plugin:
-            self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
-                plumber.opts, plumber.input_fmt, self.log,
-                {}, self.base)
-
-        if not only_input_plugin:
-            if processed or plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb') and \
-                    not hasattr(self.pathtoopf, 'manifest'):
-                if hasattr(self.pathtoopf, 'manifest'):
-                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
-                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
-                        plumber.opts)
-
-        if hasattr(self.pathtoopf, 'manifest'):
-            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
-
-        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
-        if getattr(plumber.input_plugin, 'is_kf8', False):
-            self.book_format = 'KF8'
-
-        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
-        if self.opf is None:
-            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
-        self.language = self.opf.language
-        if self.language:
-            self.language = self.language.lower()
-        ordered = [i for i in self.opf.spine if i.is_linear] + \
-                  [i for i in self.opf.spine if not i.is_linear]
-        self.spine = []
-        for i in ordered:
-            spath = i.path
-            mt = None
-            if i.idref is not None:
-                mt = self.opf.manifest.type_for_id(i.idref)
-            if mt is None:
-                mt = guess_type(spath)[0]
-            try:
-                self.spine.append(SpineItem(spath, mime_type=mt))
-            except:
-                self.log.warn('Missing spine item:', repr(spath))
-
-        cover = self.opf.cover
-        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
-            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
-            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
-            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
-            open(cfile, 'wb').write(chtml)
-            self.spine[0:0] = [SpineItem(cfile,
-                mime_type='application/xhtml+xml')]
-            self.delete_on_exit.append(cfile)
-
-        if self.opf.path_to_html_toc is not None and \
-           self.opf.path_to_html_toc not in self.spine:
-            try:
-                self.spine.append(SpineItem(self.opf.path_to_html_toc))
-            except:
-                import traceback
-                traceback.print_exc()
-
-
-        sizes = [i.character_count for i in self.spine]
-        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
-        for p, s in zip(self.pages, self.spine):
-            s.pages = p
-        start = 1
-
-        for s in self.spine:
-            s.start_page = start
-            start += s.pages
-            s.max_page = s.start_page + s.pages - 1
-        self.toc = self.opf.toc
-
-        self.read_bookmarks()
-
-        return self
-
-    def parse_bookmarks(self, raw):
-        for line in raw.splitlines():
-            bm = None
-            if line.count('^') > 0:
-                tokens = line.rpartition('^')
-                title, ref = tokens[0], tokens[2]
-                try:
-                    spine, _, pos = ref.partition('#')
-                    spine = int(spine.strip())
-                except:
-                    continue
-                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
-            elif BM_FIELD_SEP in line:
-                try:
-                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
-                    spine = int(spine)
-                except:
-                    continue
-                # Unescape from serialization
-                pos = pos.replace(BM_LEGACY_ESC, u'^')
-                # Check for pos being a scroll fraction
-                try:
-                    pos = float(pos)
-                except:
-                    pass
-                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}
-
-            if bm:
-                self.bookmarks.append(bm)
-
-    def serialize_bookmarks(self, bookmarks):
-        dat = []
-        for bm in bookmarks:
-            if bm['type'] == 'legacy':
-                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
-            else:
-                pos = bm['pos']
-                if isinstance(pos, (int, float)):
-                    pos = unicode(pos)
-                else:
-                    pos = pos.replace(u'^', BM_LEGACY_ESC)
-                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
-            dat.append(rec)
-        return (u'\n'.join(dat) +u'\n')
-
-    def read_bookmarks(self):
-        self.bookmarks = []
-        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
-        raw = ''
-        if os.path.exists(bmfile):
-            with open(bmfile, 'rb') as f:
-                raw = f.read()
-        else:
-            saved = self.config['bookmarks_'+self.pathtoebook]
-            if saved:
-                raw = saved
-        if not isinstance(raw, unicode):
-            raw = raw.decode('utf-8')
-        self.parse_bookmarks(raw)
-
-    def save_bookmarks(self, bookmarks=None):
-        if bookmarks is None:
-            bookmarks = self.bookmarks
-        dat = self.serialize_bookmarks(bookmarks)
-        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
-            os.access(self.pathtoebook, os.R_OK):
-            try:
-                zf = open(self.pathtoebook, 'r+b')
-            except IOError:
-                return
-            safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
-                    StringIO(dat.encode('utf-8')),
-                    add_missing=True)
-        else:
-            self.config['bookmarks_'+self.pathtoebook] = dat
-
-    def add_bookmark(self, bm):
-        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
-                bm['title']]
-        self.bookmarks.append(bm)
-        self.save_bookmarks()
-
-    def set_bookmarks(self, bookmarks):
-        self.bookmarks = bookmarks
-
-    def __exit__(self, *args):
-        self._tdir.__exit__(*args)
-        for x in self.delete_on_exit:
-            if os.path.exists(x):
-                os.remove(x)
-
-def get_preprocess_html(path_to_ebook, output):
-    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
-    iterator = EbookIterator(path_to_ebook)
-    iterator.__enter__(only_input_plugin=True)
-    preprocessor = HTMLPreProcessor(None, False)
-    with open(output, 'wb') as out:
-        for path in iterator.spine:
-            with open(path, 'rb') as f:
-                html = f.read().decode('utf-8', 'replace')
-            html = preprocessor(html, get_preprocess_html=True)
-            out.write(html.encode('utf-8'))
-            out.write(b'\n\n' + b'-'*80 + b'\n\n')
-
--- a/src/calibre/ebooks/oeb/iterator/__init__.py
+++ b/src/calibre/ebooks/oeb/iterator/__init__.py
@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, re
+
+from calibre.customize.ui import available_input_formats
+
+def is_supported(path):  # True if the file extension of `path` is one of the available input formats
+    ext = os.path.splitext(path)[1].replace('.', '').lower()  # bare, lower-cased extension
+    ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)  # fold htm/html/xhtm/xhtml into 'html'
+    return ext in available_input_formats()
+
+class UnsupportedFormatError(Exception):  # raised with a message naming the unsupported format
+
+    def __init__(self, fmt):
+        Exception.__init__(self, _('%s format books are not supported')%fmt.upper())  # fmt is upper-cased; _() is presumably the translation hook -- confirm
+
+def EbookIterator(*args, **kwargs):
+    'For backwards compatibility'
+    from calibre.ebooks.oeb.iterator.book import EbookIterator  # imported at call time; shadows this function's name locally
+    return EbookIterator(*args, **kwargs)  # forwards every argument to the class in iterator.book
+
+def get_preprocess_html(path_to_ebook, output):  # write the preprocessed HTML of every spine item to the file `output`
+    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
+    iterator = EbookIterator(path_to_ebook)
+    iterator.__enter__(only_input_plugin=True, run_char_count=False,
+            read_anchor_map=False)  # NOTE(review): no matching __exit__() call in this function -- confirm cleanup is handled elsewhere
+    preprocessor = HTMLPreProcessor(None, False)
+    with open(output, 'wb') as out:  # one output file receives all spine items
+        for path in iterator.spine:
+            with open(path, 'rb') as f:
+                html = f.read().decode('utf-8', 'replace')  # undecodable bytes are replaced instead of raising
+            html = preprocessor(html, get_preprocess_html=True)
+            out.write(html.encode('utf-8'))
+            out.write(b'\n\n' + b'-'*80 + b'\n\n')  # 80-dash separator written after each item
+
--- a/src/calibre/ebooks/oeb/iterator/book.py
+++ b/src/calibre/ebooks/oeb/iterator/book.py
@ -0,0 +1,187 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+'''
+Iterate over the HTML files in an ebook. Useful for writing viewers.
+'''
+
+import re, os, math
+from functools import partial
+
+from calibre.ebooks.metadata.opf2 import OPF
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.config import DynamicConfig
+from calibre.utils.logging import default_log
+from calibre import (guess_type, prepare_string_for_xml,
+        xml_replace_entities)
+from calibre.ebooks.oeb.transforms.cover import CoverManager
+
+from calibre.ebooks.oeb.iterator.spine import (SpineItem, create_indexing_data)
+from calibre.ebooks.oeb.iterator.bookmarks import BookmarksMixin
+
+# Cover page markup built from CoverManager's SVG template with its
+# __ar__, __viewbox__, __width__ and __height__ placeholders filled in
+TITLEPAGE = (CoverManager.SVG_TEMPLATE.decode('utf-8')
+        .replace('__ar__', 'none')
+        .replace('__viewbox__', '0 0 600 800')
+        .replace('__width__', '600')
+        .replace('__height__', '800'))
+
+class FakeOpts(object):
+    ''' Plain holder of option values, exposed as class attributes. '''
+    # NOTE(review): nothing else in this module references FakeOpts;
+    # presumably it is kept for external importers -- confirm before removing
+    verbose = 0
+    breadth_first = False
+    max_levels = 5
+    input_encoding = None
+
+
+def write_oebbook(oeb, path):
+    ''' Write `oeb` into the directory `path` with OEBWriter and return the
+    first file ending in ``.opf`` found by walking `path`, or None if
+    there is no such file. '''
+    from calibre.ebooks.oeb.writer import OEBWriter
+    from calibre import walk
+    writer = OEBWriter()
+    writer(oeb, path)
+    return next((name for name in walk(path) if name.endswith('.opf')), None)
+
+class EbookIterator(BookmarksMixin):
+    ''' Context manager that explodes an ebook into a temporary directory
+    and exposes its HTML files, in reading order, as ``self.spine``.
+
+    Entering sets, among others: ``base`` (the temporary directory),
+    ``pathtoopf``, ``opf``, ``language``, ``book_format``, ``spine``,
+    ``pages``, ``toc`` and ``bookmarks``. '''
+
+    # Number of characters counted as one page when paginating spine items
+    CHARACTERS_PER_PAGE = 1000
+
+    def __init__(self, pathtoebook, log=None):
+        ''' Record the absolute path of the book and its normalised
+        extension. Nothing is converted until __enter__ is called.
+
+        :param pathtoebook: path to the book, surrounding whitespace is
+                            stripped
+        :param log: log object to use, default_log when not given
+        '''
+        self.log = log or default_log
+        pathtoebook = pathtoebook.strip()
+        self.pathtoebook = os.path.abspath(pathtoebook)
+        self.config = DynamicConfig(name='iterator')
+        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
+        # Treat htm, html, xhtm and xhtml alike
+        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
+        self.ebook_ext = ext.replace('original_', '')
+
+    def search(self, text, index, backwards=False):
+        ''' Case-insensitively look for `text` in the spine items after
+        position `index` (before it, scanning in reverse order, when
+        `backwards` is True).
+
+        :return: the spine index of the first item containing `text`, or
+                 None if no item matches
+        '''
+        # The query is XML-escaped because it is matched against the raw
+        # markup of each file
+        text = prepare_string_for_xml(text.lower())
+        pmap = [(i, path) for i, path in enumerate(self.spine)]
+        if backwards:
+            pmap.reverse()
+        for i, path in pmap:
+            if (backwards and i < index) or (not backwards and i > index):
+                with open(path, 'rb') as f:
+                    raw = f.read().decode(path.encoding)
+                # Best effort: fall back to the unmodified markup if entity
+                # replacement fails
+                try:
+                    raw = xml_replace_entities(raw)
+                except:
+                    pass
+                if text in raw.lower():
+                    return i
+
+    def __enter__(self, processed=False, only_input_plugin=False,
+            run_char_count=True, read_anchor_map=True):
+        ''' Convert an ebook file into an exploded OEB book suitable for
+        display in viewers/preprocessing etc.
+
+        :param processed: if True, also run the HTML preprocessing/parsing
+                          stage of the conversion pipeline
+        :param only_input_plugin: if True, run nothing but the input plugin
+        :param run_char_count: passed to every SpineItem
+        :param read_anchor_map: passed to every SpineItem; when True the
+                                TOC indexing data is also created
+        :return: self
+        '''
+
+        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
+
+        self.delete_on_exit = []
+        # The temporary directory is entered by hand here and released in
+        # __exit__
+        self._tdir = TemporaryDirectory('_ebook_iter')
+        self.base  = self._tdir.__enter__()
+        plumber = Plumber(self.pathtoebook, self.base, self.log)
+        plumber.setup_options()
+        if self.pathtoebook.lower().endswith('.opf'):
+            plumber.opts.dont_package = True
+        if hasattr(plumber.opts, 'no_process'):
+            plumber.opts.no_process = True
+
+        plumber.input_plugin.for_viewer = True
+        with plumber.input_plugin, open(plumber.input, 'rb') as inf:
+            # The plugin result is either a path to an OPF file or an object
+            # with a manifest attribute, which is written out to disk below
+            self.pathtoopf = plumber.input_plugin(inf,
+                plumber.opts, plumber.input_fmt, self.log,
+                {}, self.base)
+
+            if not only_input_plugin:
+                # Run the HTML preprocess/parsing from the conversion pipeline as
+                # well
+                # NOTE(review): `and` binds tighter than `or`, so this reads
+                # processed or (fmt in {...} and not hasattr(...))
+                if (processed or plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
+                        and not hasattr(self.pathtoopf, 'manifest')):
+                    if hasattr(self.pathtoopf, 'manifest'):
+                        self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
+                    self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
+                            plumber.opts)
+
+            if hasattr(self.pathtoopf, 'manifest'):
+                self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
+
+        self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
+        if getattr(plumber.input_plugin, 'is_kf8', False):
+            self.book_format = 'KF8'
+
+        # Use the OPF object offered by the input plugin if there is one,
+        # otherwise parse the OPF file
+        self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
+        if self.opf is None:
+            self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
+        self.language = self.opf.language
+        if self.language:
+            self.language = self.language.lower()
+        # Linear spine items first, non-linear ones after them
+        ordered = [i for i in self.opf.spine if i.is_linear] + \
+                  [i for i in self.opf.spine if not i.is_linear]
+        self.spine = []
+        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
+                run_char_count=run_char_count)
+        for i in ordered:
+            spath = i.path
+            # Prefer the manifest's media type, guess from the path otherwise
+            mt = None
+            if i.idref is not None:
+                mt = self.opf.manifest.type_for_id(i.idref)
+            if mt is None:
+                mt = guess_type(spath)[0]
+            # Any failure to build the item is logged as a missing spine
+            # item and the item is left out
+            try:
+                self.spine.append(Spiny(spath, mime_type=mt))
+            except:
+                self.log.warn('Missing spine item:', repr(spath))
+
+        cover = self.opf.cover
+        if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
+                'azw', 'azw3'}:
+            # For these formats generate an HTML page wrapping the cover and
+            # put it at the front of the spine
+            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
+            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
+            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
+            with open(cfile, 'wb') as f:
+                f.write(chtml)
+            self.spine[0:0] = [Spiny(cfile,
+                mime_type='application/xhtml+xml')]
+            self.delete_on_exit.append(cfile)
+
+        # Append the HTML table of contents if it is not already in the spine
+        if self.opf.path_to_html_toc is not None and \
+           self.opf.path_to_html_toc not in self.spine:
+            try:
+                self.spine.append(Spiny(self.opf.path_to_html_toc))
+            except:
+                import traceback
+                traceback.print_exc()
+
+        # Paginate: every item gets ceil(character_count/CHARACTERS_PER_PAGE)
+        # pages, with page numbers running on from 1 across the spine
+        sizes = [i.character_count for i in self.spine]
+        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
+        for p, s in zip(self.pages, self.spine):
+            s.pages = p
+        start = 1
+
+        for s in self.spine:
+            s.start_page = start
+            start += s.pages
+            s.max_page = s.start_page + s.pages - 1
+        self.toc = self.opf.toc
+        if read_anchor_map:
+            create_indexing_data(self.spine, self.toc)
+
+        self.read_bookmarks()
+
+        return self
+
+    def __exit__(self, *args):
+        ''' Leave the temporary directory context and then try to remove
+        every file listed in delete_on_exit, ignoring failures. '''
+        self._tdir.__exit__(*args)
+        for x in self.delete_on_exit:
+            try:
+                os.remove(x)
+            except:
+                pass
+
+
--- a/src/calibre/ebooks/oeb/iterator/bookmarks.py
+++ b/src/calibre/ebooks/oeb/iterator/bookmarks.py
@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os
+from io import BytesIO
+
+from calibre.utils.zipfile import safe_replace
+
+# Separator between the title, spine index and position fields of a
+# serialized non-legacy bookmark line
+BM_FIELD_SEP = u'*|!|?|*'
+# Stand-in for u'^' inside a serialized position; presumably so the line is
+# not mistaken for a legacy title^spine#pos record when parsed
+BM_LEGACY_ESC = u'esc-text-%&*#%(){}ads19-end-esc'
+
+class BookmarksMixin(object):
+    ''' Reading, writing and (de)serializing of bookmarks.
+
+    The methods below read ``self.base``, ``self.config`` and
+    ``self.pathtoebook``, which the class using this mixin must provide.
+    A bookmark is a dict with the keys ``type`` ('legacy' or 'cfi'),
+    ``title``, ``spine`` (an int) and ``pos``. '''
+
+    def parse_bookmarks(self, raw):
+        ''' Parse the serialized bookmarks in the unicode string `raw`, one
+        per line, appending each to ``self.bookmarks`` (which must already
+        exist). Lines that cannot be parsed are skipped. '''
+        for line in raw.splitlines():
+            bm = None
+            if line.count('^') > 0:
+                # Legacy record: title^spine#pos
+                tokens = line.rpartition('^')
+                title, ref = tokens[0], tokens[2]
+                try:
+                    spine, _, pos = ref.partition('#')
+                    spine = int(spine.strip())
+                except:
+                    continue
+                bm = {'type':'legacy', 'title':title, 'spine':spine, 'pos':pos}
+            elif BM_FIELD_SEP in line:
+                try:
+                    title, spine, pos = line.strip().split(BM_FIELD_SEP)
+                    spine = int(spine)
+                except:
+                    continue
+                # Unescape from serialization
+                pos = pos.replace(BM_LEGACY_ESC, u'^')
+                # Check for pos being a scroll fraction
+                try:
+                    pos = float(pos)
+                except:
+                    pass
+                bm = {'type':'cfi', 'title':title, 'pos':pos, 'spine':spine}
+
+            if bm:
+                self.bookmarks.append(bm)
+
+    def serialize_bookmarks(self, bookmarks):
+        ''' Return `bookmarks` as a unicode string with one record per
+        line, in the format understood by parse_bookmarks. '''
+        dat = []
+        for bm in bookmarks:
+            if bm['type'] == 'legacy':
+                rec = u'%s^%d#%s'%(bm['title'], bm['spine'], bm['pos'])
+            else:
+                pos = bm['pos']
+                if isinstance(pos, (int, float)):
+                    pos = unicode(pos)
+                else:
+                    # A literal ^ would make the line look like a legacy
+                    # record to parse_bookmarks
+                    pos = pos.replace(u'^', BM_LEGACY_ESC)
+                rec = BM_FIELD_SEP.join([bm['title'], unicode(bm['spine']), pos])
+            dat.append(rec)
+        return (u'\n'.join(dat) +u'\n')
+
+    def read_bookmarks(self):
+        ''' Reset ``self.bookmarks`` and fill it from
+        META-INF/calibre_bookmarks.txt under ``self.base`` if that file
+        exists, otherwise from the config entry for this book. '''
+        self.bookmarks = []
+        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
+        raw = ''
+        if os.path.exists(bmfile):
+            with open(bmfile, 'rb') as f:
+                raw = f.read()
+        else:
+            saved = self.config['bookmarks_'+self.pathtoebook]
+            if saved:
+                raw = saved
+        if not isinstance(raw, unicode):
+            raw = raw.decode('utf-8')
+        self.parse_bookmarks(raw)
+
+    def save_bookmarks(self, bookmarks=None):
+        ''' Persist `bookmarks` (``self.bookmarks`` when None).
+
+        For a readable .epub file the bookmarks are stored inside the book
+        as META-INF/calibre_bookmarks.txt; nothing is saved if the file
+        cannot be opened for updating. For every other book they are
+        stored in ``self.config``. '''
+        if bookmarks is None:
+            bookmarks = self.bookmarks
+        dat = self.serialize_bookmarks(bookmarks)
+        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
+            os.access(self.pathtoebook, os.R_OK):
+            try:
+                zf = open(self.pathtoebook, 'r+b')
+            except IOError:
+                return
+            # Close the handle once the replacement is done, instead of
+            # leaving the book open until the file object is collected
+            with zf:
+                safe_replace(zf, 'META-INF/calibre_bookmarks.txt',
+                        BytesIO(dat.encode('utf-8')),
+                        add_missing=True)
+        else:
+            self.config['bookmarks_'+self.pathtoebook] = dat
+
+    def add_bookmark(self, bm):
+        ''' Add `bm`, replacing any existing bookmark with the same title,
+        and save the bookmarks. '''
+        self.bookmarks = [x for x in self.bookmarks if x['title'] !=
+                bm['title']]
+        self.bookmarks.append(bm)
+        self.save_bookmarks()
+
+    def set_bookmarks(self, bookmarks):
+        ''' Replace the in-memory bookmarks with `bookmarks`; nothing is
+        saved. '''
+        self.bookmarks = bookmarks
+
+
--- a/src/calibre/ebooks/oeb/iterator/spine.py
+++ b/src/calibre/ebooks/oeb/iterator/spine.py
@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from future_builtins import map
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re, os
+from functools import partial
+from operator import attrgetter
+from collections import namedtuple
+
+from calibre import guess_type
+from calibre.ebooks.chardet import xml_to_unicode
+
+def character_count(html):
+    ''' Return the number of "significant" text characters in a HTML string:
+    the total length of the text found between a ``>`` and the next ``<``,
+    with every run of whitespace counted as a single character. '''
+    whitespace = re.compile(r'\s+')
+    return sum(
+        len(whitespace.sub(' ', text.group())) - 2 # minus the enclosing > <
+        for text in re.finditer(r'>[^<]+<', html))
+
+def anchor_map(html):
+    ''' Return map of all anchor names to their offsets in the html
+
+    An anchor is the value of any quoted ``id`` or ``name`` attribute. The
+    offset is the position at which the attribute starts; for a name that
+    occurs more than once, the first occurrence wins. '''
+    ans = {}
+    for match in re.finditer(
+        r'''(?:id|name)\s*=\s*['"]([^'"]+)['"]''', html):
+        # group(1) is the bare anchor name. group(0) would be the whole
+        # attribute (e.g. id="x"), which no lookup by anchor name can match
+        anchor = match.group(1)
+        ans[anchor] = ans.get(anchor, match.start())
+    return ans
+
+class SpineItem(unicode):
+    ''' The path to a file of the spine, as a unicode string carrying extra
+    attributes: ``encoding``, ``character_count``, ``anchor_map``,
+    ``mime_type``, ``index_entries`` and the page data ``start_page``,
+    ``pages`` and ``max_page`` (all -1 until set by the user of the item).
+    '''
+
+    def __new__(cls, path, mime_type=None, read_anchor_map=True,
+            run_char_count=True):
+        ''' Read the file at `path` and create the item.
+
+        :param mime_type: guessed from the path when None
+        :param read_anchor_map: if False the anchor map is left empty
+        :param run_char_count: if False the character count is set to the
+                               fixed value 10000 instead of being computed
+        '''
+        if not os.path.exists(path):
+            # Fall back to the path without its #fragment, if that exists
+            without_fragment = path.partition('#')[0]
+            if os.path.exists(without_fragment):
+                path = without_fragment
+        item = super(SpineItem, cls).__new__(cls, path)
+        with open(path, 'rb') as stream:
+            markup = stream.read()
+        markup, item.encoding = xml_to_unicode(markup)
+        if run_char_count:
+            item.character_count = character_count(markup)
+        else:
+            item.character_count = 10000
+        if read_anchor_map:
+            item.anchor_map = anchor_map(markup)
+        else:
+            item.anchor_map = {}
+        item.start_page = item.pages = item.max_page = -1
+        item.index_entries = []
+        if mime_type is None:
+            mime_type = guess_type(item)[0]
+        item.mime_type = mime_type
+        return item
+
+class IndexEntry(object):
+    ''' A table of contents entry together with its location in the spine:
+    ``spine_pos`` is the index of the spine item it points to (-1 if that
+    item is not in the spine), ``anchor_pos`` the offset of its anchor in
+    that item (0 if unknown) and ``depth`` its number of ancestors in the
+    table of contents. '''
+
+    def __init__(self, spine, toc_entry, num):
+        self.num = num
+        self.text = toc_entry.text or _('Unknown')
+        self.key = toc_entry.abspath
+        self.anchor = self.start_anchor = toc_entry.fragment or None
+
+        # Locate the target in the spine and the anchor inside the target
+        try:
+            position = spine.index(self.key)
+        except ValueError:
+            position = -1
+        self.spine_pos = position
+        if position > -1:
+            self.anchor_pos = spine[position].anchor_map.get(self.anchor, 0)
+        else:
+            self.anchor_pos = 0
+
+        # One level of depth for every ancestor
+        depth, ancestor = 0, toc_entry.parent
+        while ancestor is not None:
+            depth, ancestor = depth + 1, ancestor.parent
+        self.depth = depth
+
+        self.sort_key = (self.spine_pos, self.anchor_pos)
+        self.spine_count = len(spine)
+
+    def find_end(self, all_entries):
+        ''' Set ``end_spine_pos`` and ``end_anchor`` from the first entry in
+        `all_entries` that is no deeper than this one and located after it.
+        If there is no such entry, this entry ends at the last spine item
+        with no end anchor. `all_entries` is expected to be sorted by
+        (spine_pos, anchor_pos). '''
+        def comes_after(other):
+            if other.spine_pos == self.spine_pos:
+                return other.anchor_pos > self.anchor_pos
+            return other.spine_pos > self.spine_pos
+
+        ender = next((e for e in all_entries
+            if e.depth <= self.depth and comes_after(e)), None)
+        if ender is None:
+            self.end_spine_pos = self.spine_count - 1
+            self.end_anchor = None
+        else:
+            self.end_spine_pos = ender.spine_pos
+            self.end_anchor = ender.anchor
+
+def create_indexing_data(spine, toc):
+    ''' Build an IndexEntry for every entry of `toc` other than `toc`
+    itself and record, in the ``index_entries`` list of every spine item,
+    the entries whose span from spine_pos to end_spine_pos covers that
+    item. Does nothing for a false `toc`.
+
+    Each record is a named tuple (entry, start_anchor, end_anchor); both
+    anchors are None unless the entry starts in the item in question. '''
+    if not toc:
+        return
+    index_entries = [IndexEntry(spine, node, pos-1)
+            for pos, node in enumerate(toc.flat()) if node is not toc]
+    index_entries.sort(key=attrgetter('sort_key'))
+    for entry in index_entries:
+        entry.find_end(index_entries)
+
+    ie = namedtuple('IndexEntry', 'entry start_anchor end_anchor')
+
+    for spine_pos, spine_item in enumerate(spine):
+        for entry in index_entries:
+            if entry.spine_pos <= spine_pos <= entry.end_spine_pos:
+                if entry.spine_pos == spine_pos:
+                    start, end = entry.anchor, entry.end_anchor
+                else:
+                    start = end = None
+                spine_item.index_entries.append(ie(entry, start, end))
+
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -361,9 +361,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
-    etree.SubElement(head, XHTML('meta'),
-        attrib={'http-equiv': 'Content-Type',
-                'content': '%s; charset=utf-8' % XHTML_NS})
+    meta = etree.SubElement(head, XHTML('meta'),
+        attrib={'http-equiv': 'Content-Type'})
+    meta.set('content', 'text/html; charset=utf-8') # Ensure content is second
+                                                    # attribute
+
    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -347,7 +347,11 @@ class Stylizer(object):
            style = self.flatten_style(rule.style)
            self.page_rule.update(style)
        elif isinstance(rule, CSSFontFaceRule):
-            self.font_face_rules.append(rule)
+            if rule.style.length > 1:
+                # Ignore the meaningless font face rules generated by the
+                # benighted MS Word that contain only a font-family declaration
+                # and nothing else
+                self.font_face_rules.append(rule)
        return results

    def flatten_style(self, cssstyle):
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@ -157,10 +157,12 @@ class CSSFlattener(object):
            bs = body.get('style', '').split(';')
            bs.append('margin-top: 0pt')
            bs.append('margin-bottom: 0pt')
-            bs.append('margin-left : %fpt'%\
-                    float(self.context.margin_left))
-            bs.append('margin-right : %fpt'%\
-                    float(self.context.margin_right))
+            if float(self.context.margin_left) >= 0:
+                bs.append('margin-left : %gpt'%\
+                        float(self.context.margin_left))
+            if float(self.context.margin_right) >= 0:
+                bs.append('margin-right : %gpt'%\
+                        float(self.context.margin_right))
            bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
            if self.page_break_on_body:
                bs.extend(['page-break-before: always'])
@ -393,10 +395,11 @@ class CSSFlattener(object):
        l = etree.SubElement(head, XHTML('link'),
            rel='stylesheet', type=CSS_MIME, href=href)
        l.tail='\n'
-        href = item.relhref(global_href)
-        l = etree.SubElement(head, XHTML('link'),
-            rel='stylesheet', type=CSS_MIME, href=href)
-        l.tail = '\n'
+        if global_href:
+            href = item.relhref(global_href)
+            l = etree.SubElement(head, XHTML('link'),
+                rel='stylesheet', type=CSS_MIME, href=href)
+            l.tail = '\n'

    def replace_css(self, css):
        manifest = self.oeb.manifest
@ -413,14 +416,16 @@ class CSSFlattener(object):
        global_css = defaultdict(list)
        for item in self.oeb.spine:
            stylizer = self.stylizers[item]
-            stylizer.page_rule['margin-top'] = '%gpt'%\
-                    float(self.context.margin_top)
-            stylizer.page_rule['margin-bottom'] = '%gpt'%\
-                    float(self.context.margin_bottom)
+            if float(self.context.margin_top) >= 0:
+                stylizer.page_rule['margin-top'] = '%gpt'%\
+                        float(self.context.margin_top)
+            if float(self.context.margin_bottom) >= 0:
+                stylizer.page_rule['margin-bottom'] = '%gpt'%\
+                        float(self.context.margin_bottom)
            items = stylizer.page_rule.items()
            items.sort()
            css = ';\n'.join("%s: %s" % (key, val) for key, val in items)
-            css = '@page {\n%s\n}\n'%css
+            css = ('@page {\n%s\n}\n'%css) if items else ''
            rules = [r.cssText for r in stylizer.font_face_rules]
            raw = '\n\n'.join(rules)
            css += '\n\n' + raw
@ -429,9 +434,11 @@ class CSSFlattener(object):
        gc_map = {}
        manifest = self.oeb.manifest
        for css in global_css:
-            id_, href = manifest.generate('page_css', 'page_styles.css')
-            manifest.add(id_, href, CSS_MIME, data=cssutils.parseString(css,
-                validate=False))
+            href = None
+            if css.strip():
+                id_, href = manifest.generate('page_css', 'page_styles.css')
+                manifest.add(id_, href, CSS_MIME, data=cssutils.parseString(css,
+                    validate=False))
            gc_map[css] = href

        ans = {}
--- a/src/calibre/ebooks/oeb/transforms/metadata.py
+++ b/src/calibre/ebooks/oeb/transforms/metadata.py
@ -6,7 +6,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os
+import os, re
 from calibre.utils.date import isoformat, now
 from calibre import guess_type

@ -141,7 +141,7 @@ class MergeMetadata(object):
                item = self.oeb.manifest.hrefs[old_cover.href]
                if not cdata:
                    return item.id
-                self.oeb.manifest.remove(item)
+                self.remove_old_cover(item)
            elif not cdata:
                id = self.oeb.manifest.generate(id='cover')
                self.oeb.manifest.add(id, old_cover.href, 'image/jpeg')
@ -152,3 +152,41 @@ class MergeMetadata(object):
            self.oeb.guide.add('cover', 'Cover', href)
        return id

+    def remove_old_cover(self, cover_item):
+        ''' Remove `cover_item` from the manifest, along with every <img>
+        in the spine that refers to it. A spine item that is left without
+        any text or image after that is removed from the book as well. '''
+        from calibre.ebooks.oeb.base import XPath
+        from lxml import etree
+
+        self.oeb.manifest.remove(cover_item)
+
+        # Remove any references to the cover in the HTML
+        affected_items = set()
+        for item in self.oeb.spine:
+            # Items whose data cannot be queried are treated as having no
+            # images
+            try:
+                images = XPath('//h:img[@src]')(item.data)
+            except:
+                images = []
+            removed = False
+            for img in images:
+                href = item.abshref(img.get('src'))
+                if href == cover_item.href:
+                    # NOTE(review): lxml's remove() presumably also drops
+                    # the tail text of img -- confirm that is acceptable
+                    img.getparent().remove(img)
+                    removed = True
+            if removed:
+                affected_items.add(item)
+
+        # Check if the resulting HTML has no content, if so remove it
+        for item in affected_items:
+            body = XPath('//h:body')(item.data)
+            if body:
+                text = etree.tostring(body[0], method='text', encoding=unicode)
+            else:
+                text = ''
+            # Whitespace does not count as content
+            text = re.sub(r'\s+', '', text)
+            if not text and not XPath('//h:img|//svg:svg')(item.data):
+                self.log('Removing %s as it is a wrapper around'
+                        ' the cover image'%item.href)
+                self.oeb.spine.remove(item)
+                self.oeb.manifest.remove(item)
+
+
--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -372,8 +372,8 @@ class ParseRtf:
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
-                msg = 'Older RTF\n'
-                msg += 'self.__run_level is "%s"\n' % self.__run_level
+                msg = 'Older RTF\n' \
+                'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException, msg
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
@ -381,7 +381,7 @@ class ParseRtf:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
-                        'Will do the best to convert.\n'
+                        'Will do the best to convert...\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(
                    in_file = self.__temp_file,
--- a/src/calibre/ebooks/rtf2xml/add_brackets.py
+++ b/src/calibre/ebooks/rtf2xml/add_brackets.py
@ -20,6 +20,9 @@ class AddBrackets:
    """
    Add brackets for old RTF.
    Logic:
+    When control words without their own brackets are encountered
+    and in the list of allowed words, this will add brackets
+    to facilitate the treatment of the file
    """
    def __init__(self, in_file,
            bug_handler,
@ -41,53 +44,56 @@ class AddBrackets:
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level
-
-    def __initiate_values(self):
-        """
-        """
        self.__state_dict = {
            'before_body'           : self.__before_body_func,
            'in_body'               : self.__in_body_func,
            'after_control_word'    : self.__after_control_word_func,
            'in_ignore'             : self.__ignore_func,
        }
+        self.__accept = [
+            'cw<ci<bold______' ,
+            'cw<ci<annotation' ,
+            'cw<ci<blue______' ,
+            # 'cw<ci<bold______' ,
+            'cw<ci<caps______' ,
+            'cw<ci<char-style' ,
+            'cw<ci<dbl-strike' ,
+            'cw<ci<emboss____' ,
+            'cw<ci<engrave___' ,
+            'cw<ci<font-color' ,
+            'cw<ci<font-down_' ,
+            'cw<ci<font-size_' ,
+            'cw<ci<font-style' ,
+            'cw<ci<font-up___' ,
+            'cw<ci<footnot-mk' ,
+            'cw<ci<green_____' ,
+            'cw<ci<hidden____' ,
+            'cw<ci<italics___' ,
+            'cw<ci<outline___' ,
+            'cw<ci<red_______' ,
+            'cw<ci<shadow____' ,
+            'cw<ci<small-caps' ,
+            'cw<ci<strike-thr' ,
+            'cw<ci<subscript_' ,
+            'cw<ci<superscrip' ,
+            'cw<ci<underlined' ,
+            # 'cw<ul<underlined' ,
+        ]
+
+    def __initiate_values(self):
+        """
+        Init temp values
+        """
        self.__state = 'before_body'
        self.__inline = {}
        self.__temp_group = []
-        self.__open_bracket = 0
-        self.__found_brackets = 0
-        self.__accept = [
-        'cw<ci<bold______',
-        'cw<ci<annotation'  ,
-        'cw<ci<blue______' ,
-        'cw<ci<bold______' ,
-        'cw<ci<caps______' ,
-        'cw<ci<char-style' ,
-        'cw<ci<dbl-strike' ,
-        'cw<ci<emboss____'  ,
-        'cw<ci<engrave___' ,
-        'cw<ci<font-color' ,
-        'cw<ci<font-down_' ,
-        'cw<ci<font-size_' ,
-        'cw<ci<font-style' ,
-        'cw<ci<font-up___',
-        'cw<ci<footnot-mk',
-        'cw<ci<green_____' ,
-        'cw<ci<hidden____',
-        'cw<ci<italics___' ,
-        'cw<ci<outline___',
-        'cw<ci<red_______' ,
-        'cw<ci<shadow____',
-        'cw<ci<small-caps' ,
-        'cw<ci<strike-thr',
-        'cw<ci<subscript_' ,
-        'cw<ci<superscrip',
-        'cw<ci<underlined' ,
-        # 'cw<ul<underlined' ,
-        ]
+        self.__open_bracket = False
+        self.__found_brackets = False
+        

    def __before_body_func(self, line):
        """
+        If we are before the body, not interest in changing anything
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
@ -95,6 +101,14 @@ class AddBrackets:

    def __in_body_func(self, line):
        """
+        Select what action to take in body:
+            1-At the end of the file close the braket if a bracket was opened
+            This happens if there is achange
+            2-If an open bracket is found the code inside is ignore
+            (written without modifications)
+            3-If an accepted control word is found put the line
+            in a buffer then chage state to after cw
+            4-Else simply write the line
        """
        if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
            self.__write_obj.write(
@ -102,7 +116,7 @@ class AddBrackets:
                    )
            self.__write_obj.write(line)
        elif self.__token_info == 'ob<nu<open-brack':
-            self.__found_brackets = 1
+            self.__found_brackets = True
            self.__state = 'in_ignore'
            self.__ignore_count = self.__ob_count
            self.__write_obj.write(line)
@ -114,6 +128,10 @@ class AddBrackets:

    def __after_control_word_func(self, line):
        """
+        After a cw either add next allowed cw to temporary list or
+        change groupe and write it.
+        If the token leading to an exit is an open bracket go to
+        ignore otherwise goto in body
        """
        if self.__token_info in self.__accept:
            self.__temp_group.append(line)
@ -129,82 +147,84 @@ class AddBrackets:

    def __write_group(self):
        """
+        Write a tempory group after accepted control words end
+        But this is mostly useless in my opinion as there is no list of rejected cw
+        This may be a way to implement future old rtf processing for cw
+        Utility: open a group to just put brackets but why be so complicated?
+        Scheme: open brackets, write cw then go to body and back with cw after 
        """
        if self.__open_bracket:
            self.__write_obj.write(
                'cb<nu<clos-brack<0003\n'
                )
-            self.__open_bracket = 0
-        inline_string = ''
-        the_keys = self.__inline.keys()
-        for the_key in the_keys:
-            value = self.__inline[the_key]
-            if value != 'false':
-                inline_string += '%s<nu<%s\n' % (the_key, value)
+            self.__open_bracket = False
+
+        inline_string = ''.join(['%s<nu<%s\n' % (k, v) \
+                for k, v in self.__inline.iteritems() \
+                    if v != 'false'])
        if inline_string:
-            self.__write_obj.write('ob<nu<open-brack<0003\n')
-            self.__write_obj.write(inline_string)
-            self.__open_bracket = 1
+            self.__write_obj.write('ob<nu<open-brack<0003\n'
+                '%s' % inline_string)
+            self.__open_bracket = True
        self.__temp_group = []

    def __change_permanent_group(self):
        """
-        use temp group to change permanent group
+        Use temp group to change permanent group
+        If the control word is not accepted remove it
+        What is the interest as it is build to accept only accepted cw
+        in __after_control_word_func?
        """
-        for line in self.__temp_group:
-            token_info = line[:16]
-            if token_info in self.__accept:
-                att = line[20:-1]
-                self.__inline[token_info] = att
+        self.__inline = {line[:16] : line[20:-1]\
+            for line in self.__temp_group\
+            # Is this really necessary?
+                if line[:16] in self.__accept}
+

    def __ignore_func(self, line):
        """
-        Don't add any brackets while inside of brackets RTF has already
-        added.
+        Just copy data inside of RTF brackets already here.
        """
        self.__write_obj.write(line)
-        if self.__token_info == 'cb<nu<clos-brack'and\
-            self.__cb_count == self.__ignore_count:
+        if self.__token_info == 'cb<nu<clos-brack'\
+            and self.__cb_count == self.__ignore_count:
            self.__state = 'in_body'

    def __check_brackets(self, in_file):
-        self.__check_brack_obj = check_brackets.CheckBrackets\
+        """
+        Return True if brackets match
+        """
+        check_brack_obj = check_brackets.CheckBrackets\
            (file = in_file)
-        good_br =  self.__check_brack_obj.check_brackets()[0]
-        if not good_br:
-            return 1
+        return check_brack_obj.check_brackets()[0]

    def add_brackets(self):
        """
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('No matching state in module add_brackets.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
-        bad_brackets = self.__check_brackets(self.__write_to)
-        if not bad_brackets:
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write(
+                            'No matching state in module add_brackets.py\n'
+                            '%s\n' % self.__state)
+                    action(line)
+        #Check bad brackets
+        if self.__check_brackets(self.__write_to):
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "add_brackets.data")
-            copy_obj.rename(self.__write_to, self.__file)
+            copy_obj.rename(self.__write_to, self.__file)  
        else:
            if self.__run_level > 0:
                sys.stderr.write(
                    'Sorry, but this files has a mix of old and new RTF.\n'
                    'Some characteristics cannot be converted.\n')
-        os.remove(self.__write_to)
+        os.remove(self.__write_to)
--- a/src/calibre/ebooks/rtf2xml/char_set.py
+++ b/src/calibre/ebooks/rtf2xml/char_set.py
--- a/src/calibre/ebooks/rtf2xml/convert_to_tags.py
+++ b/src/calibre/ebooks/rtf2xml/convert_to_tags.py
@ -1,4 +1,5 @@
 import os, sys
+from codecs import EncodedFile

 from calibre.ebooks.rtf2xml import copy, check_encoding
 from calibre.ptempfile import better_mktemp
@ -41,6 +42,7 @@ class ConvertToTags:
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__convert_utf = False
+        self.__bad_encoding = False

    def __initiate_values(self):
        """
@ -213,13 +215,14 @@ class ConvertToTags:

        if not check_encoding_obj.check_encoding(self.__file, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
-        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
+        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
            self.__convert_utf = True
        else:
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
            sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
                    ' hope for the best')
+            self.__bad_encoding = True
        self.__new_line = 0
        self.__write_new_line()
        if self.__no_dtd:
@ -247,7 +250,7 @@ class ConvertToTags:
        the appropriate function.
        The functions that are called:
            a text function for text
-            an open funciton for open tags
+            an open function for open tags
            an open with attribute function for tags with attributes
            an empty with attribute function for tags that are empty but have
            attribtes.
@ -263,20 +266,19 @@ class ConvertToTags:
                    action = self.__state_dict.get(self.__token_info)
                    if action is not None:
                        action(line)
-        self.__write_obj.close()
-        #convert all encodings to UTF8 to avoid unsupported encodings in lxml
-        if self.__convert_utf:
+        #convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
+        if self.__convert_utf or self.__bad_encoding:
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            copy_obj.rename(self.__write_to, self.__file)
+            file_encoding = "utf-8"
+            if self.__bad_encoding:
+                file_encoding = "us-ascii"
            with open(self.__file, 'r') as read_obj:
                with open(self.__write_to, 'w') as write_obj:
-                    file = read_obj.read()
-                    try:
-                        file = file.decode(self.__encoding)
-                        write_obj.write(file.encode('utf-8'))
-                    except:
-                        sys.stderr.write('Conversion to UTF-8 is not possible,'
-                        ' encoding should be very carefully checked')
+                    write_objenc = EncodedFile(write_obj, self.__encoding,
+                                    file_encoding, 'replace')
+                    for line in read_obj:
+                        write_objenc.write(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
--- a/src/calibre/ebooks/rtf2xml/header.py
+++ b/src/calibre/ebooks/rtf2xml/header.py
@ -11,6 +11,7 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

@ -31,29 +32,29 @@ class Header:
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
-        self.__found_a_header = 0
+        self.__found_a_header = False
+
    def __in_header_func(self, line):
        """
        Handle all tokens that are part of header
        """
        if self.__cb_count == self.__header_bracket_count:
-            self.__in_header = 0
+            self.__in_header = False
            self.__write_obj.write(line)
            self.__write_to_head_obj.write(
-            'mi<mk<head___clo\n')
-            self.__write_to_head_obj.write(
-            'mi<tg<close_____<header-or-footer\n')
-            self.__write_to_head_obj.write(
+            'mi<mk<head___clo\n' \
+            'mi<tg<close_____<header-or-footer\n' \
            'mi<mk<header-clo\n')
        else:
            self.__write_to_head_obj.write(line)
+
    def __found_header(self, line):
        """
        Found a header
        """
        # but this could be header or footer
-        self.__found_a_header = 1
-        self.__in_header = 1
+        self.__found_a_header = True
+        self.__in_header = True
        self.__header_count += 1
        # temporarily set this to zero so I can enter loop
        self.__cb_count = 0
@ -69,18 +70,23 @@ class Header:
                    'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
                    )
        else:
-            sys.stderr.write('module is header\n')
-            sys.stderr.write('method is __found_header\n')
-            sys.stderr.write('no dict entry\n')
-            sys.stderr.write('line is %s' % line)
+            sys.stderr.write(
+            'module is header\n' \
+            'method is __found_header\n' \
+            'no dict entry\n' \
+            'line is %s' % line)
            self.__write_to_head_obj.write(
                    'mi<tg<open-att__<header-or-footer<type>none\n'
                    )
+
    def __default_sep(self, line):
-        """Handle all tokens that are not header tokens"""
+        """
+        Handle all tokens that are not header tokens
+        """
        if self.__token_info[3:5] == 'hf':
            self.__found_header(line)
        self.__write_obj.write(line)
+
    def __initiate_sep_values(self):
        """
        initiate counters for separate_footnotes method.
@ -89,7 +95,7 @@ class Header:
        self.__ob_count = 0
        self.__cb_count = 0
        self.__header_bracket_count = 0
-        self.__in_header = 0
+        self.__in_header = False
        self.__header_count = 0
        self.__head_dict = {
            'head-left_'        :   ('header-left'),
@ -101,6 +107,7 @@ class Header:
            'header____'        :   ('header' ),
            'footer____'        :   ('footer' ),
        }
+
    def separate_headers(self):
        """
        Separate all the footnotes in an RTF file and put them at the bottom,
@ -110,53 +117,47 @@ class Header:
        bottom of the main file.
        """
        self.__initiate_sep_values()
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
        self.__header_holder = better_mktemp()
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            # keep track of opening and closing brackets
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            # In the middle of footnote text
-            if self.__in_header:
-                self.__in_header_func(line)
-            # not in the middle of footnote text
-            else:
-                self.__default_sep(line)
-        self.__write_obj.close()
-        read_obj.close()
-        self.__write_to_head_obj.close()
-        read_obj = open(self.__header_holder, 'r')
-        write_obj = open(self.__write_to, 'a')
-        write_obj.write(
-        'mi<mk<header-beg\n')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            write_obj.write(line)
-        write_obj.write(
-        'mi<mk<header-end\n')
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # keep track of opening and closing brackets
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        # In the middle of header text
+                        if self.__in_header:
+                            self.__in_header_func(line)
+                        # not in the middle of header text
+                        else:
+                            self.__default_sep(line)
+        
+        with open(self.__header_holder, 'r') as read_obj:
+            with open(self.__write_to, 'a') as write_obj:
+                write_obj.write(
+                'mi<mk<header-beg\n')
+                for line in read_obj:
+                    write_obj.write(line)
+                write_obj.write(
+                'mi<mk<header-end\n')
        os.remove(self.__header_holder)
+
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
-            copy_obj.copy_file(self.__write_to, "header_separate.info")
+            copy_obj.copy_file(self.__write_to, "header_separate.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+
    def update_info(self, file, copy):
        """
        Unused method
        """
        self.__file = file
        self.__copy = copy
+
    def __get_head_body_func(self, line):
        """
        Process lines in main body and look for beginning of headers.
@ -166,6 +167,7 @@ class Header:
            self.__state = 'head'
        else:
            self.__write_obj.write(line)
+
    def __get_head_head_func(self, line):
        """
        Copy headers and footers from bottom of file to a separate, temporary file.
@ -174,6 +176,7 @@ class Header:
            self.__state = 'body'
        else:
            self.__write_to_head_obj.write(line)
+
    def __get_headers(self):
        """
        Private method to remove footnotes from main file.  Read one line from
@ -182,21 +185,16 @@ class Header:
        These two functions do the work of separating the footnotes form the
        body.
        """
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
-            # self.__write_to = "footnote_info.data"
-        self.__write_to_head_obj = open(self.__header_holder, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            self.__token_info = line[:16]
-            if self.__state == 'body':
-                self.__get_head_body_func(line)
-            elif self.__state == 'head':
-                self.__get_head_head_func(line)
-        read_obj.close()
-        self.__write_obj.close()
-        self.__write_to_head_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__header_holder, 'w') as self.__write_to_head_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        if self.__state == 'body':
+                            self.__get_head_body_func(line)
+                        elif self.__state == 'head':
+                            self.__get_head_head_func(line)
+
    def __get_head_from_temp(self, num):
        """
        Private method for joining headers and footers to body. This method
@ -205,18 +203,17 @@ class Header:
        returns them as a string.
        """
        look_for = 'mi<mk<header-ope<' + num + '\n'
-        found_head = 0
+        found_head = False
        string_to_return = ''
-        line = 1
-        while line:
-            line = self.__read_from_head_obj.readline()
+        for line in self.__read_from_head_obj:
            if found_head:
                if line == 'mi<mk<header-clo\n':
                    return string_to_return
-                string_to_return = string_to_return + line
+                string_to_return += line
            else:
                if line == look_for:
-                    found_head = 1
+                    found_head = True
+
    def __join_from_temp(self):
        """
        Private method for rejoining footnotes to body.  Read from the
@ -227,15 +224,13 @@ class Header:
        If no footnote marker is found, simply print out the token (line).
        """
        self.__read_from_head_obj = open(self.__header_holder, 'r')
-        read_obj = open(self.__write_to, 'r')
        self.__write_obj = open(self.__write_to2, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            if line[:16] == 'mi<mk<header-ind':
-                line = self.__get_head_from_temp(line[17:-1])
-            self.__write_obj.write(line)
-        read_obj.close()
+        with open(self.__write_to, 'r') as read_obj:
+            for line in read_obj:
+                if line[:16] == 'mi<mk<header-ind':
+                    line = self.__get_head_from_temp(line[17:-1])
+                self.__write_obj.write(line)
+
    def join_headers(self):
        """
        Join the footnotes from the bottom of the file and put them in their
--- a/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
+++ b/src/calibre/ebooks/rtf2xml/hex_2_utf8.py
@ -181,7 +181,7 @@ class Hex2Utf8:
            self.__dingbats_dict.update(dingbats_base_dict)
            self.__dingbats_dict.update(ms_dingbats_dict)
        # load dictionary for caps, and make a string for the replacement
-        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
+        self.__caps_uni_dict = char_map_obj.get_char_map(map = 'caps_uni')
        # # print self.__caps_uni_dict
        # don't think I'll need this
        ##keys = self.__caps_uni_dict.keys()
--- a/src/calibre/ebooks/rtf2xml/old_rtf.py
+++ b/src/calibre/ebooks/rtf2xml/old_rtf.py
@ -11,14 +11,18 @@
 #                                                                       #
 #########################################################################
 import sys
-"""
-"""
+
 class OldRtf:
    """
    Check to see if the RTF is an older version
    Logic:
+    If allowable control word/properties happen in text without being enclosed
+    in brackets the file will be considered old rtf
    """
-    def __init__(self, in_file, bug_handler, run_level ):
+    def __init__(self, in_file,
+                bug_handler,
+                run_level,
+                ):
        """
        Required:
            'file'--file to parse
@ -32,46 +36,46 @@ class OldRtf:
            """
        self.__file = in_file
        self.__bug_handler = bug_handler
-        self.__initiate_values()
-        self.__ob_group = 0
-    def __initiate_values(self):
-        self.__previous_token = ''
-        self.__new_found = 0
+        self.__run_level = run_level
        self.__allowable = [
-        'annotation' ,
-        'blue______'  ,
-        'bold______',
-        'caps______',
-        'char-style' ,
-        'dbl-strike' ,
-        'emboss____',
-        'engrave___' ,
-        'font-color',
-        'font-down_' ,
-        'font-size_',
-        'font-style',
-        'font-up___',
-        'footnot-mk' ,
-        'green_____' ,
-        'hidden____',
-        'italics___',
-        'outline___',
-        'red_______',
-        'shadow____' ,
-        'small-caps',
-        'strike-thr',
-        'subscript_',
-        'superscrip' ,
-        'underlined' ,
+            'annotation' ,
+            'blue______'  ,
+            'bold______',
+            'caps______',
+            'char-style' ,
+            'dbl-strike' ,
+            'emboss____',
+            'engrave___' ,
+            'font-color',
+            'font-down_' ,
+            'font-size_',
+            'font-style',
+            'font-up___',
+            'footnot-mk' ,
+            'green_____' ,
+            'hidden____',
+            'italics___',
+            'outline___',
+            'red_______',
+            'shadow____' ,
+            'small-caps',
+            'strike-thr',
+            'subscript_',
+            'superscrip' ,
+            'underlined' ,
        ]
-        self.__state = 'before_body'
        self.__action_dict = {
            'before_body'   : self.__before_body_func,
            'in_body'       : self.__check_tokens_func,
            'after_pard'    : self.__after_pard_func,
        }
-        self.__is_old = 0
+
+    def __initiate_values(self):
+        self.__previous_token = ''
+        self.__state = 'before_body'
        self.__found_new = 0
+        self.__ob_group = 0
+
    def __check_tokens_func(self, line):
        if self.__inline_info in self.__allowable:
            if self.__ob_group == self.__base_ob_count:
@ -80,48 +84,56 @@ class OldRtf:
                self.__found_new += 1
        elif self.__token_info ==  'cw<pf<par-def___':
            self.__state = 'after_pard'
+
    def __before_body_func(self, line):
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
            self.__base_ob_count = self.__ob_group
+
    def __after_pard_func(self, line):
        if line[0:2] != 'cw':
            self.__state = 'in_body'
+
    def check_if_old_rtf(self):
        """
        Requires:
            nothing
        Returns:
-            1 if file is older RTf
-            0 if file is newer RTF
+            True if file is older RTF
+            False if file is newer RTF
        """
-
-        read_obj = open(self.__file, 'r')
-        line = 1
+        self.__initiate_values()
        line_num = 0
-        while line:
-            line = read_obj.readline()
-            line_num += 1
-            self.__token_info = line[:16]
-            if self.__token_info == 'mi<mk<body-close':
-                return 0
-                self.__ob_group = 0
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_group += 1
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__ob_group -= 1
-                self.__cb_count = line[-5:-1]
-            self.__inline_info = line[6:16]
-            if self.__state == 'after_body':
-                return 0
-            action = self.__action_dict.get(self.__state)
-            if not action:
-                sys.stderr.write('No action for state!\n')
-            result = action(line)
-            if result == 'new_rtf':
-                return 0
-            elif result == 'old_rtf':
-                return 1
-            self.__previous_token = line[6:16]
-        return 0
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                line_num += 1
+                self.__token_info = line[:16]
+                if self.__token_info == 'mi<mk<body-close':
+                    return False
+                if self.__token_info == 'ob<nu<open-brack':
+                    self.__ob_group += 1
+                    self.__ob_count = line[-5:-1]
+                if self.__token_info == 'cb<nu<clos-brack':
+                    self.__ob_group -= 1
+                    self.__cb_count = line[-5:-1]
+                self.__inline_info = line[6:16]
+                if self.__state == 'after_body':
+                    return False
+                action = self.__action_dict.get(self.__state)
+                if action is None:
+                    try:
+                        sys.stderr.write('No action for this state!\n')
+                    except:
+                        pass
+                result = action(line)
+                if result == 'new_rtf':
+                    return False
+                elif result == 'old_rtf':
+                    if self.__run_level > 3:
+                        sys.stderr.write(
+                            'Old rtf construction %s (bracket %s, line %s)\n' 
+                                % (self.__inline_info, str(self.__ob_group), line_num)
+                        )
+                    return True
+                self.__previous_token = line[6:16]
+        return False
--- a/src/calibre/ebooks/rtf2xml/output.py
+++ b/src/calibre/ebooks/rtf2xml/output.py
@ -10,7 +10,9 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, codecs
+import sys, os
+# , codecs
+
 class Output:
    """
    Output file
@ -19,7 +21,8 @@ class Output:
            file,
            orig_file,
            output_dir = None,
-            out_file = None
+            out_file = None,
+            no_ask = True
            ):
        """
        Required:
@ -33,8 +36,9 @@ class Output:
        self.__file = file
        self.__orig_file = orig_file
        self.__output_dir = output_dir
-        self.__no_ask = 1
+        self.__no_ask = no_ask
        self.__out_file = out_file
+
    def output(self):
        """
        Required:
@ -45,13 +49,14 @@ class Output:
            output the line to the screen if no output file given. Otherwise, output to
            the file.
        """
-        # self.__output_xml(self.__file, self.__out_file)
        if self.__output_dir:
            self.__output_to_dir_func()
        elif self.__out_file:
-            self.__output_xml(self.__file, self.__out_file)
+            self.__output_to_file_func()
+            # self.__output_xml(self.__file, self.__out_file)
        else:
            self.__output_to_standard_func()
+
    def __output_to_dir_func(self):
        """
        Requires:
@ -64,32 +69,25 @@ class Output:
        """
        base_name = os.path.basename(self.__orig_file)
        base_name, ext  = os.path.splitext(base_name)
-        output_file = '%s.xml' % base_name
-        output_file = os.path.join(self.__output_dir, output_file)
+        output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
        # change if user wants to output to a specific file
        if self.__out_file:
            output_file = os.path.join(self.__output_dir, self.__out_file)
        user_response = 'o'
-        if os.path.isfile(output_file):
-            if self.__no_ask:
-                user_response = 'o'
-            else:
-                msg = 'Do you want to over-write %s?\n' % output_file
-                msg += 'Type "o" to over-write.\n'
-                msg += 'Type any other key to print to standard output.\n'
-                sys.stderr.write(msg)
-                user_response = raw_input()
+        if os.path.isfile(output_file) and not self.__no_ask:
+            msg = 'Do you want to overwrite %s?\n' % output_file
+            msg += ('Type "o" to overwrite.\n'
+                    'Type any other key to print to standard output.\n')
+            sys.stderr.write(msg)
+            user_response = raw_input()
        if user_response == 'o':
-            read_obj = open(self.__file, 'r')
-            write_obj = open(output_file, 'w')
-            line = 1
-            while line:
-                line = read_obj.readline()
-                write_obj.write(line)
-            read_obj.close()
-            write_obj.close()
+            with open(self.__file, 'r') as read_obj:
+                with open(output_file, 'w') as write_obj:
+                    for line in read_obj:
+                        write_obj.write(line)
        else:
            self.__output_to_standard_func()
+
    def __output_to_file_func(self):
        """
        Required:
@ -99,14 +97,11 @@ class Output:
        Logic:
            read one line at a time. Output to standard
        """
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__out_file, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            write_obj.write(line)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__out_file, 'w') as write_obj:
+                for line in read_obj:
+                    write_obj.write(line)
+
    def __output_to_standard_func(self):
        """
        Required:
@ -116,26 +111,24 @@ class Output:
        Logic:
            read one line at a time. Output to standard
        """
-        read_obj = open(self.__file, 'r')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            sys.stdout.write(line)
-        read_obj.close()
-    def __output_xml(self, in_file, out_file):
-        """
-        output the ill-formed xml file
-        """
-        (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
-        write_obj = utf8_writer(open(out_file, 'w'))
-        write_obj = open(out_file, 'w')
-        read_obj = utf8_writer(open(in_file, 'r'))
-        read_obj = open(in_file, 'r')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            if isinstance(line, type(u"")):
-                line = line.encode("utf-8")
-            write_obj.write(line)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                sys.stdout.write(line)
+
+    # def __output_xml(self, in_file, out_file):
+        # """
+        # output the ill-formed xml file
+        # """
+        # (utf8_encode, utf8_decode, utf8_reader, utf8_writer) = codecs.lookup("utf-8")
+        # write_obj = utf8_writer(open(out_file, 'w'))
+        # write_obj = open(out_file, 'w')
+        # read_obj = utf8_writer(open(in_file, 'r'))
+        # read_obj = open(in_file, 'r')
+        # line = 1
+        # while line:
+            # line = read_obj.readline()
+            # if isinstance(line, type(u"")):
+                # line = line.encode("utf-8")
+            # write_obj.write(line)
+        # read_obj.close()
+        # write_obj.close()
--- a/src/calibre/ebooks/rtf2xml/paragraphs.py
+++ b/src/calibre/ebooks/rtf2xml/paragraphs.py
@ -11,31 +11,32 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

 class Paragraphs:
    """
-=================
-Purpose
-=================
-Write paragraph tags for a tokenized file. (This module won't be any use to use
-to you unless you use it as part of the other modules.)
-------------
-Method
-------------
-RTF does not tell you when a paragraph begins. It only tells you when the
-paragraph ends.
-In order to make paragraphs out of this limited info, the parser starts in the
-body of the documents and assumes it is not in a paragraph. It looks for clues
-to begin a paragraph. Text starts a paragraph; so does an inline field or
-list-text. If an end of paragraph marker (\par) is found, then this indicates
-a blank paragraph.
-Once a paragraph is found, the state changes to 'paragraph.' In this state,
-clues are looked to for the end of a paragraph. The end of a paragraph marker
-(\par) marks the end of a paragraph. So does the end of a footnote or heading;
-a paragraph definintion; the end of a field-block; and the beginning of a
-section. (How about the end of a section or the end of a field-block?)
+    =================
+    Purpose
+    =================
+    Write paragraph tags for a tokenized file. (This module won't be of any use
+    to you unless you use it as part of the other modules.)
+    -------------
+    Method
+    -------------
+    RTF does not tell you when a paragraph begins. It only tells you when the
+    paragraph ends.
+    In order to make paragraphs out of this limited info, the parser starts in the
+    body of the documents and assumes it is not in a paragraph. It looks for clues
+    to begin a paragraph. Text starts a paragraph; so does an inline field or
+    list-text. If an end of paragraph marker (\par) is found, then this indicates
+    a blank paragraph.
+    Once a paragraph is found, the state changes to 'paragraph.' In this state,
+    clues are looked to for the end of a paragraph. The end of a paragraph marker
+    (\par) marks the end of a paragraph. So does the end of a footnote or heading;
+    a paragraph definition; the end of a field-block; and the beginning of a
+    section. (How about the end of a section or the end of a field-block?)
    """
    def __init__(self,
            in_file,
@ -60,6 +61,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        self.__write_to = better_mktemp()
+
    def __initiate_values(self):
        """
        Initiate all values.
@ -77,7 +79,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__paragraph_dict = {
        'cw<pf<par-end___'      : self.__close_para_func,   # end of paragraph
        'mi<mk<headi_-end'      : self.__close_para_func,   # end of header or footer
-        ##'cw<pf<par-def___'      : self.__close_para_func,   # paragraph definition
+        ## 'cw<pf<par-def___'      : self.__close_para_func,   # paragraph definition
        # 'mi<mk<fld-bk-end'      : self.__close_para_func,   # end of field-block
        'mi<mk<fldbk-end_'      : self.__close_para_func,   # end of field-block
        'mi<mk<body-close'      : self.__close_para_func,   # end of body
@ -99,6 +101,7 @@ section. (How about the end of a section or the end of a field-block?)
        'mi<mk<pict-start'      : self.__start_para_func,
        'cw<pf<page-break'      : self.__empty_pgbk_func,    # page break
        }
+
    def __before_body_func(self, line):
        """
        Required:
@ -112,6 +115,7 @@ section. (How about the end of a section or the end of a field-block?)
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)
+
    def __not_paragraph_func(self, line):
        """
        Required:
@ -127,6 +131,7 @@ section. (How about the end of a section or the end of a field-block?)
        if action:
            action(line)
        self.__write_obj.write(line)
+
    def __paragraph_func(self, line):
        """
        Required:
@ -144,6 +149,7 @@ section. (How about the end of a section or the end of a field-block?)
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __start_para_func(self, line):
        """
        Requires:
@ -160,6 +166,7 @@ section. (How about the end of a section or the end of a field-block?)
        )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'
+
    def __empty_para_func(self, line):
        """
        Requires:
@ -176,6 +183,7 @@ section. (How about the end of a section or the end of a field-block?)
            'mi<tg<empty_____<para\n'
            )
            self.__write_obj.write(self.__end_marker)   # marker for later parsing
+
    def __empty_pgbk_func(self, line):
        """
        Requires:
@ -188,6 +196,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(
        'mi<tg<empty_____<page-break\n'
        )
+
    def __close_para_func(self, line):
        """
        Requires:
@ -205,6 +214,7 @@ section. (How about the end of a section or the end of a field-block?)
        self.__write_obj.write(self.__end_marker) # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'
+
    def __bogus_para__def_func(self, line):
        """
        Requires:
@ -215,6 +225,7 @@ section. (How about the end of a section or the end of a field-block?)
            if a \pard occurs in a paragraph, I want to ignore it. (I believe)
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')
+
    def make_paragraphs(self):
        """
        Requires:
@ -229,20 +240,18 @@ section. (How about the end of a section or the end of a field-block?)
            only other state is 'paragraph'.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module sections.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        try:
+                            sys.stderr.write('no matching state in module paragraphs.py\n')
+                            sys.stderr.write(self.__state + '\n')
+                        except:
+                            pass
+                    action(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
--- a/src/calibre/ebooks/rtf2xml/preamble_rest.py
+++ b/src/calibre/ebooks/rtf2xml/preamble_rest.py
@ -11,16 +11,24 @@
 #                                                                       #
 #########################################################################
 import sys,os
+
 from calibre.ebooks.rtf2xml import copy
+
 class Preamble:
    """
    Fix the remaining parts of the preamble. This module does very little. It
    makes sure that no text gets put in the revision of list table. In the
-    future, when I understand how to interprett he revision table and list
+    future, when I understand how to interpret the revision table and list
    table, I will make these methods more functional.
    """
-    def __init__(self, file, bug_handler,  platform, default_font, code_page,
-    copy=None, temp_dir=None):
+    def __init__(self, file,
+                bug_handler,
+                platform,
+                default_font,
+                code_page,
+                copy=None,
+                temp_dir=None,
+                ):
        """
        Required:
            file--file to parse
@ -44,6 +52,7 @@ class Preamble:
            self.__write_to = os.path.join(temp_dir,"info_table_info.data")
        else:
            self.__write_to = "info_table_info.data"
+
    def __initiate_values(self):
        """
        Initiate all values.
@ -62,12 +71,14 @@ class Preamble:
        'mi<mk<revtbl-beg'      : self.__found_revision_table_func,
        'mi<mk<body-open_'      : self.__found_body_func,
        }
+
    def __default_func(self, line):
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __found_rtf_head_func(self, line):
        """
        Requires:
@ -84,8 +95,10 @@ class Preamble:
            '<platform>%s\n' % (self.__default_font, self.__code_page,
            self.__platform)
        )
+
    def __found_list_table_func(self, line):
        self.__state = 'list_table'
+
    def __list_table_func(self, line):
        if self.__token_info == 'mi<mk<listabend_':
            self.__state = 'default'
@ -93,8 +106,10 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
+
    def __found_revision_table_func(self, line):
        self.__state = 'revision'
+
    def __revision_table_func(self, line):
        if self.__token_info == 'mi<mk<revtbl-end':
            self.__state = 'default'
@ -102,11 +117,14 @@ class Preamble:
            pass
        else:
            self.__write_obj.write(line)
+
    def __found_body_func(self, line):
        self.__state = 'body'
        self.__write_obj.write(line)
+
    def __body_func(self, line):
        self.__write_obj.write(line)
+
    def fix_preamble(self):
        """
        Requires:
@ -119,20 +137,15 @@ class Preamble:
            the list table.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module preamble_rest.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write(
+                        'no matching state in module preamble_rest.py\n' + self.__state + '\n')
+                    action(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
--- a/src/calibre/ebooks/rtf2xml/sections.py
+++ b/src/calibre/ebooks/rtf2xml/sections.py
@ -11,43 +11,44 @@
 #                                                                       #
 #########################################################################
 import sys, os
+
 from calibre.ebooks.rtf2xml import copy
 from calibre.ptempfile import better_mktemp

 class Sections:
    """
-=================
-Purpose
-=================
-Write section tags for a tokenized file. (This module won't be any use to use
-to you unless you use it as part of the other modules.)
---------------
-logic
---------------
-The tags for the first section breaks have already been written.
-RTF stores section breaks with the \sect tag. Each time this tag is
-encountered, add one to the counter.
-When I encounter the \sectd tag, I want to collect all the appropriate tokens
-that describe the section. When I reach a \pard, I know I an stop collecting
-tokens and write the section tags.
-The exception to this method occurs when sections occur in field blocks, such
-as the index. Normally, two section break occur within the index and other
-field-blocks. (If less or more section breaks occurr, this code may not work.)
-I want the sections to occurr outside of the index. That is, the index
-should be nested inside one section tag. After the index is complete, a new
-section should begin.
-In order to write the sections outside of the field blocks, I have to store
-all of the field block as a string. When I ecounter the \sect tag, add one to
-the section counter, but store this number in a list. Likewise, store the
-information describing the section in another list.
-When I reach the end of the field block, choose the first item from the
-numbered list as the section number. Choose the first item in the description
-list as the values and attributes of the section. Enclose the field string
-between the section tags.
-Start a new section outside the field-block strings. Use the second number in
-the list; use the second item in the description list.
-CHANGE (2004-04-26) No longer write sections that occurr in field-blocks.
-Instead, ingore all section information in a field-block.
+    =================
+    Purpose
+    =================
+    Write section tags for a tokenized file. (This module won't be of any use
+    to you unless you use it as part of the other modules.)
+    ---------------
+    logic
+    ---------------
+    The tags for the first section breaks have already been written.
+    RTF stores section breaks with the \sect tag. Each time this tag is
+    encountered, add one to the counter.
+    When I encounter the \sectd tag, I want to collect all the appropriate tokens
+    that describe the section. When I reach a \pard, I know I can stop collecting
+    tokens and write the section tags.
+    The exception to this method occurs when sections occur in field blocks, such
+    as the index. Normally, two section breaks occur within the index and other
+    field-blocks. (If fewer or more section breaks occur, this code may not work.)
+    I want the sections to occur outside of the index. That is, the index
+    should be nested inside one section tag. After the index is complete, a new
+    section should begin.
+    In order to write the sections outside of the field blocks, I have to store
+    all of the field block as a string. When I encounter the \sect tag, add one to
+    the section counter, but store this number in a list. Likewise, store the
+    information describing the section in another list.
+    When I reach the end of the field block, choose the first item from the
+    numbered list as the section number. Choose the first item in the description
+    list as the values and attributes of the section. Enclose the field string
+    between the section tags.
+    Start a new section outside the field-block strings. Use the second number in
+    the list; use the second item in the description list.
+    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
+    Instead, ignore all section information in a field-block.
    """
    def __init__(self,
            in_file,
--- a/src/calibre/gui2/init.py
+++ b/src/calibre/gui2/init.py
@ -137,8 +137,9 @@ def _config(): # {{{
    c.add_opt('LRF_ebook_viewer_options', default=None,
              help=_('Options for the LRF ebook viewer'))
    c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT',
-        'MOBI', 'PRC', 'AZW', 'HTML', 'FB2', 'PDB', 'RB', 'SNB', 'HTMLZ'],
-              help=_('Formats that are viewed using the internal viewer'))
+        'MOBI', 'PRC', 'POBI', 'AZW', 'AZW3', 'HTML', 'FB2', 'PDB', 'RB',
+        'SNB', 'HTMLZ'], help=_(
+            'Formats that are viewed using the internal viewer'))
    c.add_opt('column_map', default=ALL_COLUMNS,
              help=_('Columns to be displayed in the book list'))
    c.add_opt('autolaunch_server', default=False, help=_('Automatically launch content server on application startup'))
--- a/src/calibre/gui2/actions/choose_library.py
+++ b/src/calibre/gui2/actions/choose_library.py
@ -10,7 +10,7 @@ from functools import partial

 from PyQt4.Qt import (QMenu, Qt, QInputDialog, QToolButton, QDialog,
        QDialogButtonBox, QGridLayout, QLabel, QLineEdit, QIcon, QSize,
-        QCoreApplication)
+        QCoreApplication, pyqtSignal)

 from calibre import isbytestring, sanitize_file_name_unicode
 from calibre.constants import filesystem_encoding, iswindows
@ -142,6 +142,7 @@ class ChooseLibraryAction(InterfaceAction):
    dont_add_to = frozenset(['context-menu-device'])
    action_add_menu = True
    action_menu_clone_qaction = _('Switch/create library...')
+    restore_view_state = pyqtSignal(object)

    def genesis(self):
        self.base_text = _('%d books')
@ -206,6 +207,17 @@ class ChooseLibraryAction(InterfaceAction):
        self.maintenance_menu.addAction(ac)

        self.choose_menu.addMenu(self.maintenance_menu)
+        self.view_state_map = {}
+        self.restore_view_state.connect(self._restore_view_state,
+                type=Qt.QueuedConnection)
+
+    @property
+    def preserve_state_on_switch(self):
+        ans = getattr(self, '_preserve_state_on_switch', None)
+        if ans is None:
+            self._preserve_state_on_switch = ans = \
+                self.gui.library_view.preserve_state(require_selected_ids=False)
+        return ans

    def pick_random(self, *args):
        self.gui.iactions['Pick Random Book'].pick_random()
@ -221,6 +233,13 @@ class ChooseLibraryAction(InterfaceAction):
    def library_changed(self, db):
        self.stats.library_used(db)
        self.build_menus()
+        state = self.view_state_map.get(self.stats.canonicalize_path(
+            db.library_path), None)
+        if state is not None:
+            self.restore_view_state.emit(state)
+
+    def _restore_view_state(self, state):
+        self.preserve_state_on_switch.state = state

    def initialization_complete(self):
        self.library_changed(self.gui.library_view.model().db)
@ -401,8 +420,11 @@ class ChooseLibraryAction(InterfaceAction):
    def switch_requested(self, location):
        if not self.change_library_allowed():
            return
+        db = self.gui.library_view.model().db
+        current_lib = self.stats.canonicalize_path(db.library_path)
+        self.view_state_map[current_lib] = self.preserve_state_on_switch.state
        loc = location.replace('/', os.sep)
-        exists = self.gui.library_view.model().db.exists_at(loc)
+        exists = db.exists_at(loc)
        if not exists:
            d = MovedDialog(self.stats, location, self.gui)
            ret = d.exec_()
--- a/src/calibre/gui2/actions/delete.py
+++ b/src/calibre/gui2/actions/delete.py
@ -6,6 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 from functools import partial
+from collections import Counter

 from PyQt4.Qt import QObject, QTimer

@ -117,13 +118,14 @@ class DeleteAction(InterfaceAction):

    def _get_selected_formats(self, msg, ids):
        from calibre.gui2.dialogs.select_formats import SelectFormats
-        fmts = set([])
+        c = Counter()
        db = self.gui.library_view.model().db
        for x in ids:
            fmts_ = db.formats(x, index_is_id=True, verify_formats=False)
            if fmts_:
-                fmts.update(frozenset([x.lower() for x in fmts_.split(',')]))
-        d = SelectFormats(list(sorted(fmts)), msg, parent=self.gui)
+                for x in frozenset([x.lower() for x in fmts_.split(',')]):
+                    c[x] += 1
+        d = SelectFormats(c, msg, parent=self.gui)
        if d.exec_() != d.Accepted:
            return None
        return d.selected_formats
--- a/src/calibre/gui2/actions/tweak_epub.py
+++ b/src/calibre/gui2/actions/tweak_epub.py
@ -12,11 +12,11 @@ from PyQt4.Qt import (QDialog, QVBoxLayout, QHBoxLayout, QRadioButton, QFrame,

 from calibre import as_unicode
 from calibre.constants import isosx
-from calibre.gui2 import error_dialog, question_dialog, open_local_file
+from calibre.gui2 import error_dialog, question_dialog, open_local_file, gprefs
 from calibre.gui2.actions import InterfaceAction
 from calibre.ptempfile import (PersistentTemporaryDirectory,
        PersistentTemporaryFile)
-from calibre.utils.config import prefs
+from calibre.utils.config import prefs, tweaks

 class TweakBook(QDialog):

@ -32,11 +32,16 @@ class TweakBook(QDialog):
            index_is_id=True))

        button = self.fmt_choice_buttons[0]
+        button_map = {unicode(x.text()):x for x in self.fmt_choice_buttons}
        of = prefs['output_format'].upper()
-        for x in self.fmt_choice_buttons:
-            if unicode(x.text()) == of:
-                button = x
-                break
+        df = tweaks.get('default_tweak_format', None)
+        lf = gprefs.get('last_tweak_format', None)
+        if df and df.lower() == 'remember' and lf in button_map:
+            button = button_map[lf]
+        elif df and df.upper() in button_map:
+            button = button_map[df.upper()]
+        elif of in button_map:
+            button = button_map[of]
        button.setChecked(True)

        self.init_state()
@ -148,6 +153,8 @@ class TweakBook(QDialog):

    def explode(self):
        self.show_msg(_('Exploding, please wait...'))
+        if len(self.fmt_choice_buttons) > 1:
+            gprefs.set('last_tweak_format', self.current_format.upper())
        QTimer.singleShot(5, self.do_explode)

    def ask_question(self, msg):
--- a/src/calibre/gui2/comments_editor.py
+++ b/src/calibre/gui2/comments_editor.py
@ -161,8 +161,14 @@ class EditorWidget(QWebView): # {{{
        self.page().setContentEditable(True)

    def clear_text(self, *args):
+        us = self.page().undoStack()
+        us.beginMacro('clear all text')
        self.action_select_all.trigger()
-        self.action_cut.trigger()
+        self.action_remove_format.trigger()
+        self.exec_command('delete')
+        us.endMacro()
+        self.set_font_style()
+        self.setFocus(Qt.OtherFocusReason)

    def link_clicked(self, url):
        open_url(url)
@ -262,20 +268,22 @@ class EditorWidget(QWebView): # {{{

        def fset(self, val):
            self.setHtml(val)
-            fi = QFontInfo(QApplication.font(self))
-            f  = fi.pixelSize() + 1 + int(tweaks['change_book_details_font_size_by'])
-            fam = unicode(fi.family()).strip().replace('"', '')
-            if not fam:
-                fam = 'sans-serif'
-            style = 'font-size: %fpx; font-family:"%s",sans-serif;' % (f, fam)
-
-            # toList() is needed because PyQt on Debian is old/broken
-            for body in self.page().mainFrame().documentElement().findAll('body').toList():
-                body.setAttribute('style', style)
-            self.page().setContentEditable(True)
-
+            self.set_font_style()
        return property(fget=fget, fset=fset)

+    def set_font_style(self):
+        fi = QFontInfo(QApplication.font(self))
+        f  = fi.pixelSize() + 1 + int(tweaks['change_book_details_font_size_by'])
+        fam = unicode(fi.family()).strip().replace('"', '')
+        if not fam:
+            fam = 'sans-serif'
+        style = 'font-size: %fpx; font-family:"%s",sans-serif;' % (f, fam)
+
+        # toList() is needed because PyQt on Debian is old/broken
+        for body in self.page().mainFrame().documentElement().findAll('body').toList():
+            body.setAttribute('style', style)
+        self.page().setContentEditable(True)
+
    def keyPressEvent(self, ev):
        if ev.key() in (Qt.Key_Tab, Qt.Key_Escape, Qt.Key_Backtab):
            ev.ignore()
@ -627,4 +635,6 @@ if __name__ == '__main__':
    w = Editor()
    w.resize(800, 600)
    w.show()
+    w.html = '<b>testing</b>'
+    app.exec_()
    #print w.html
--- a/src/calibre/gui2/convert/bulk.py
+++ b/src/calibre/gui2/convert/bulk.py
@ -126,7 +126,8 @@ class BulkConfig(Config):
    def setup_output_formats(self, db, preferred_output_format):
        if preferred_output_format:
            preferred_output_format = preferred_output_format.lower()
-        output_formats = sorted(available_output_formats())
+        output_formats = sorted(available_output_formats(),
+                key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x))
        output_formats.remove('oeb')
        preferred_output_format = preferred_output_format if \
            preferred_output_format and preferred_output_format \
--- a/src/calibre/gui2/convert/page_setup.ui
+++ b/src/calibre/gui2/convert/page_setup.ui
@ -109,12 +109,18 @@
        </item>
        <item row="0" column="1">
         <widget class="QDoubleSpinBox" name="opt_margin_left">
+          <property name="specialValueText">
+           <string>No margin</string>
+          </property>
          <property name="suffix">
           <string> pt</string>
          </property>
          <property name="decimals">
           <number>1</number>
          </property>
+          <property name="minimum">
+           <double>-1.000000000000000</double>
+          </property>
          <property name="maximum">
           <double>200.000000000000000</double>
          </property>
@ -132,12 +138,18 @@
        </item>
        <item row="1" column="1">
         <widget class="QDoubleSpinBox" name="opt_margin_top">
+          <property name="specialValueText">
+           <string>No margin</string>
+          </property>
          <property name="suffix">
           <string> pt</string>
          </property>
          <property name="decimals">
           <number>1</number>
          </property>
+          <property name="minimum">
+           <double>-1.000000000000000</double>
+          </property>
          <property name="maximum">
           <double>200.000000000000000</double>
          </property>
@ -155,12 +167,18 @@
        </item>
        <item row="2" column="1">
         <widget class="QDoubleSpinBox" name="opt_margin_right">
+          <property name="specialValueText">
+           <string>No margin</string>
+          </property>
          <property name="suffix">
           <string> pt</string>
          </property>
          <property name="decimals">
           <number>1</number>
          </property>
+          <property name="minimum">
+           <double>-1.000000000000000</double>
+          </property>
          <property name="maximum">
           <double>200.000000000000000</double>
          </property>
@ -178,12 +196,18 @@
        </item>
        <item row="3" column="1">
         <widget class="QDoubleSpinBox" name="opt_margin_bottom">
+          <property name="specialValueText">
+           <string>No margin</string>
+          </property>
          <property name="suffix">
           <string> pt</string>
          </property>
          <property name="decimals">
           <number>1</number>
          </property>
+          <property name="minimum">
+           <double>-1.000000000000000</double>
+          </property>
          <property name="maximum">
           <double>200.000000000000000</double>
          </property>
--- a/src/calibre/gui2/convert/single.py
+++ b/src/calibre/gui2/convert/single.py
@ -242,7 +242,8 @@ class Config(ResizableDialog, Ui_Dialog):
            preferred_output_format):
        if preferred_output_format:
            preferred_output_format = preferred_output_format.lower()
-        output_formats = sorted(available_output_formats())
+        output_formats = sorted(available_output_formats(),
+                key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x))
        output_formats.remove('oeb')
        input_format, input_formats = get_input_format_for_book(db, book_id,
                preferred_input_format)
--- a/src/calibre/gui2/custom_column_widgets.py
+++ b/src/calibre/gui2/custom_column_widgets.py
@ -349,7 +349,8 @@ class Text(Base):
        return d.exec_()

    def edit(self):
-        if self.getter() != self.initial_val:
+        if (self.getter() != self.initial_val and (self.getter() or
+            self.initial_val)):
            d = self._save_dialog(self.parent, _('Values changed'),
                    _('You have changed the values. In order to use this '
                       'editor, you must either discard or apply these '
--- a/src/calibre/gui2/dialogs/search.py
+++ b/src/calibre/gui2/dialogs/search.py
@ -182,7 +182,8 @@ class SearchDialog(QDialog, Ui_Dialog):
        global box_values
        box_values = copy.deepcopy(self.box_last_values)
        if general:
-            ans.append(unicode(self.general_combo.currentText()) + ':"' + general + '"')
+            ans.append(unicode(self.general_combo.currentText()) + ':"' +
+                    self.mc + general + '"')
        if ans:
            return ' and '.join(ans)
        return ''
--- a/Show More
+++ b/Show More