[Sync] Sync with trunk. Revision 7760

This commit is contained in:
Li Fanxi 2011-01-22 23:42:35 +08:00
commit 5c52c0f6bb
297 changed files with 151783 additions and 88846 deletions

View File

@ -4,6 +4,330 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.
- version: 0.7.42
date: 2011-01-21
new features:
- title: "0.7.42 is a re-release of 0.7.41, because conversion to MOBI was broken in 0.7.41"
- title: "Conversions: Replace the remove header/footer options with a more geenric search replace option, that allows you to not only remove but also replace text"
- title: "Conversion: The preprocess html option has now become a new 'Heuristic Processing' option which allows you to control exactly which heuristics are used"
- title: "Conversion: Various improvements to Heuristic Processing (used to be preprocess HTML)"
- title: "When adding empty books to calibre, optionally set the author to the author of the currently selected book"
tickets: [7702]
- title: "Device drivers for the Archos 101, SmatQ T7 and Acer Lumiread"
- title: "Catalog generation: Make By Authors optional"
- title: "Allow bulk editing of Date and Published columns."
- title: "Add a little button to clear date and published values to the edit metadata dialogs"
- title: "When adding books by ISBN, allow the specification of special tags that will be added to the new book entries"
tickets: [8436]
- title: "Completion on multiple authors"
tickets: [8405]
- title: "Add AZW to default list of internally viewed formats, a I am tired of getting tickets about it"
- title: "Nicer error message when catalog generation fails"
- title: "Add capitalize option to context menus in the edit metadata dialog"
bug fixes:
- title: "RTF Input: Fix regression in 0.7.40 that broke conversion of some old style RTF files"
- title: "Fix Tag editor forgets position"
tickets: [8271]
- title: "When converting books in the calibre GUI, override metadata from the input document, even when empty."
description: >
"So if you have removed all the tags and comments in the calibre GUI for the book in the calibre GUI, but the actual file that is being converted still has tags and comments, they are ignored. This affects only conversions in the calibre GUI, not from the command line via ebook-convert."
tickets: [8390]
- title: "Fix memory leak when switching libraries"
- title: "RTF Output: Fix incorrent spacing between letters."
tickets: [8422]
- title: "Catalog generation: Add composite columns to Merge Comments eligible types"
- title: "Add a confirmation when closing the add a custom news source dialog."
tickets: [8460]
- title: "Another workaround for LibraryThing UA sniffing that was preventing series metadata download, sigh."
tickets: [8477]
- title: "PD Novel driver: Put books on the SD card into the eBooks folder"
- title: "When shortening filepaths to conform to windows path length limitations, remove text from the middle of each component instead of the ends."
tickets: [8451]
- title: "Make completion in most places case insensitive"
tickets: [8441]
- title: "Fix regression that caused the N key to stop working when editing a Yes/no column"
tickets: [8417]
- title: "Email: Fix bug when connecting to SMTP relays that use MD5 auth"
- title: "MOBI Output: Fix bug that could cause a link pointing to the start of a section to go to a point later in the section is the section contained an empty id attribute"
- title: "When auto converting books and the device is unplugged, do not raise an error."
tickets: [8426]
- title: "Ebook-viewer: Display cover when viewing FB2 files"
- title: "MOBI Input: Special case handling of emptu div tags with a defined height used as paragraph separators."
tickets: [8391]
- title: "Fix sorting of author names into sub categories by first letter in the Tag Browser when the first letter has diacritics"
tickets: [8378]
- title: "Fix regression in 0.7.40 that caused commas in author names to become | when converting/saving to disk"
- title: "Fix view specific format on a book with no formats gives an error"
tickets: [8352]
improved recipes:
- Blic
- Las Vegas Review Journal
- La Vanguardia
- New York Times
- El Pais
- Seattle Times
- Ars Technica
- Dilbert
- Nature News
new recipes:
- title: "kath.net"
author: "Bobus"
- title: "iHNed"
author: "Karel Bilek"
- title: "Gulf News"
author: "Darko Miletic"
- title: "South Africa Mail and Guardian"
author: "77ja65"
- version: 0.7.40
date: 2011-01-14
new features:
- title: "A new 'highlight matches' search mode"
description: >
"There is now a checkbox next to the search bar named 'Highlight'. If you check it, searching will highlight
all matched books instead of filtering the book list to all matched books."
- title: "RTF Input: Improved support for conversion of images. The bug where some images were shrunk should no longer happen"
- title: "Template language: Allow you to create your own formatting functions. Accessible via Preferences->Advanced->Template functions"
- title: "News download: Convert various HTML 5 tags into <div> to support readers that cannot handle HTML 5 tags"
- title: "RTF metadata: Add support for publisher and tags."
tickets: [6657]
- title: "BibTeX catalog: Add support for custom columns"
- title: "TXT Input: Support for textile markup"
- title: "Various minor tweaks to improve usability of Preferences->Plugins"
- title: "TXT Output: Convert <hr> to scene break marker."
- title: "Support for the Archos 70"
- title: "SONY Driver: Add an option to automatically refresh the covers on every connect. Accessible via: Preferences->Plugins->Device interface plugins"
- title: "Add access to the larger template editor from plugboards via context menu."
- title: "Speed improvement when connecting a large library to a device"
- title: "Speedup when searching on multiple words in a large library"
- title: "TXT Input: Add a heauristic formatting processor"
bug fixes:
- title: "Fix bug that caused automatic news removal to remove any book that has a tag that contains the word 'news' instead of only books that have the tag News"
- title: "Refactor the downloading social metadata message box to allow canceling."
tickets: [8234]
- title: "Kobo drive does not deal with Null value in DateCreated column"
tickets: [8308]
- title: "MOBI Input: Fix regression that caused images placed inside svg tags to be discarded"
- title: "Fix selecting Tablet output profile would actually select the Samsung Galaxy S profile"
- title: "Catalog generation: Fix a condition that could cause TOCs to not be properly generated in MOBI format catalogs"
tickets: [8295]
- title: "Zip file reading: Be more tolerant when a zip file has a damaged file directory"
- title: "RTF Input: Various code cleanups. Go back to trying to handle unicode mappings without pre-processing. This will mean that some RTF files that used to convert, won't anymore. Please open tickets and attach them."
tickets: [8171]
- title: "ImageMagick: When identifying an image don't read the entire image"
- title: "FB2 Output: Add cover to FB2 metadata."
- title: "Fix inability to customize builting recipe when more than one recipe has the same name"
tickets: [8281]
- title: "RTF Input: Fix regression that broke the Preprocess HTML option"
- title: "Fix XSS vulnerability in content server."
tickets: [7980]
- title: "TXT Output: Clean up and produce consistant output. Spacing around headings. Headings are not indented when using the remove paragraph spacing option."
- title: "Catalog generation: Handle invalid covers gracefully"
- title: "Email settings: Before displaying the email test dialog warn the user that it will expose their email password"
- title: "PDB Output: Fix regression that caused some PDB files to not work with other software"
tickets: [8231]
improved recipes:
- Financial Times UK
- Globe and Mail
- Wired Daily
- MIT Technology Review
- MSNBC
- expansion.com
- New York Times
- Heraldo de Aragon
- Exiled online
new recipes:
- title: "Yakima Herald and Tri-City Herald"
author: "Laura Gjovaag"
- title: "Wichita Eagle"
author: "Jason Cameron"
- title: "Pressthink and Zero Hedge"
author: "Darko Miletic"
- title: "tyzden"
author: "zemiak"
- title: "El Correo"
author: "desUBIKado"
- title: "Cicero"
author: "mad"
- title: "El Publico"
author: "Gerardo Diez"
- version: 0.7.38
date: 2011-01-07
new features:
- title: "Reduce startup time when using a composite custom column"
- title: "Template language: Add a list_item function for use with tags like columns. See User Manual for details"
- title: "TXT Input: Attempt to detect the input encoding when not specified. Auto detect paragraph structure and formatting markup."
- title: "Search & replace: Add ability to manipulate number and boolean columns."
- title: "Add type ahead completion to the advanced search dialog."
tickets: [8035]
- title: "Double click on plugin in Preferences dialog to customize"
tickets: [8175]
- title: "Allow customization of the SONY driver to send thumbnail to the device. Useful with newer SONY readers"
tickets: [8161]
- title: "Smarten punctuation: Convert double dashes to em dashes. Preprocessing: Various tweaks"
bug fixes:
- title: "Fix regression causing the template formatter to intepret a missing format letter as ERROR instead of 's'."
- title: "Fix regression that broke conversion of PNG images in PDF files on OS X."
tickets: [8215]
- title: "Content server: Fix improper XML escaping of category titles in the OPDS feeds"
tickets: [8225]
- title: "When decoding XML if the XML starts with a UTF-8 BOM decode as UTF-8. Fixes parsing of FB2 files with UTF-8 BOMs"
- title: "E-book viewer: When scrolling to a bookmark and the content is wider than the window, do not scroll in the horizontal direction"
- title: "E-book viewer: Fix next page skipping the bottom of chapters when the content is wider than the window."
tickets: [8153]
- title: " FB2 Output: Insert covers."
tickets: [8172]
- title: "Content server: When serving OPDS feeds handle html descriptions that have namespaced attributes."
tickets: [7938]
- title: "When downloading metadata from isbndb.com, download a maximum of 30 results rather than 1000"
- title: "Fix sorting of tags column"
- title: "Change search/replace to show commas instead of vertical bars as the separator for multiple authors"
- title: "Template language: Make all column names case insensitive"
- title: "Fix bug that prevent the Disabled option for Tag Browser partiotining from working in the Preferences dialog"
- title: "Fix bug when using tags like custom column in the template language"
- title: "Fix bug where composite custom columns using general_program_mode fields are not evaluated correctly when used in a template."
- title: "ImageMagick interface: Don't crash when asked to open empty image files"
- title: "Kobo driver: Add TXT,CBZ,CBR to supported formats list"
tickets: [8124]
- title: "Don't uneccessarily scroll the book list horizontally when re-selcting previously selected rows."
new recipes:
- title: "New London Day"
author: "Being"
- title: "Walla"
author: "marbs"
- title: "New Journal of Physics"
author: "Chema Cortes"
- title: "The Baltimore Sun"
author: "Josh Hall"
- title: "Arabian Business and Sunday Times (UK)"
author: "Darko Miletic"
- title: "Deia"
author: "Gerardo Diez"
- title: "Smarter Planet"
author: "Jack Mason"
improved recipes:
- The Atlantic
- Danas
- Ledevoir
- version: 0.7.37
date: 2011-01-02

View File

@ -1,6 +1,4 @@
@echo OFF
REM CalibreRun.bat
REM ~~~~~~~~~~~~~~
REM Batch File to start a Calibre configuration on Windows
REM giving explicit control of the location of:
REM  - Calibre Program Files
@ -24,7 +22,10 @@ REM -------------------------------------
REM Set up Calibre Config folder
REM -------------------------------------
IF EXIST CalibreConfig (
	SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
	ECHO CONFIG=%cd%\CalibreConfig
)
REM --------------------------------------------------------------
@ -38,24 +39,53 @@ REM drive letter of the USB stick.
REM Comment out any of the following that are not to be used
REM --------------------------------------------------------------
IF EXIST U:\eBooks\CalibreLibrary (
	SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
	ECHO LIBRARY=U:\eBOOKS\CalibreLibrary
)
IF EXIST CalibreLibrary (
	SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
	ECHO LIBRARY=%cd%\CalibreLibrary
)
IF EXIST CalibreBooks (
	SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
	ECHO LIBRARY=%cd%\CalibreBooks
)
REM --------------------------------------------------------------
REM Specify Location of metadata database (optional)
REM
REM Location where the metadata.db file is located. If not set
REM the same location as the book files will be assumed. This
REM option is used to get better performance when the Library is
REM on a (slow) network drive. Putting the metadata.db file
REM locally gives a big performance improvement.
REM
REM NOTE. If you use this option, then the ability to switch
REM libraries within Calibre will be disabled. Therefore
REM you do not want to set it if the metadata.db file
REM is at the same location as the book files.
REM --------------------------------------------------------------
IF EXIST CalibreBooks (
	IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreBooks" (
		SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
		ECHO DATABASE=%cd%\CalibreBooks\metadata.db
		ECHO '
		ECHO ***CAUTION*** Library Switching will be disabled
		ECHO '
	)
)
IF EXIST CalibreMetadata (
	IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreMetadata" (
		SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
		ECHO DATABASE=%cd%\CalibreMetadata\metadata.db
		ECHO '
		ECHO ***CAUTION*** Library Switching will be disabled
		ECHO '
	)
)
REM --------------------------------------------------------------
REM Specify Location of source (optional)
@ -63,13 +93,20 @@ REM
REM It is easy to run Calibre from source
REM Just set the environment variable to where the source is located
REM When running from source the GUI will have a '*' after the version
REM number that is displayed at the bottom of the Calibre main screen.
REM --------------------------------------------------------------
IF EXIST Calibre\src (
	SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
	ECHO SOURCE=%cd%\Calibre\src
)
IF EXIST D:\Calibre\Calibre\src (
	SET CALIBRE_DEVELOP_FROM=D:\Calibre\Calibre\src
	ECHO SOURCE=D:\Calibre\Calibre\src
)
REM --------------------------------------------------------------
REM Specify Location of calibre binaries (optional)
REM
REM To avoid needing Calibre to be set in the search path, ensure
REM that Calibre Program Files is current directory when starting.
@ -78,21 +115,15 @@ REM This folder can be populated by copying the Calibre2 folder from
REM an existing installation or by installing direct to here.
REM --------------------------------------------------------------
IF EXIST Calibre2 (
	CD Calibre2
	ECHO PROGRAMS=%cd%
)
REM --------------------------------------------
REM Display settings that will be used
REM --------------------------------------------
echo PROGRAMS=%cd%
echo SOURCE=%CALIBRE_DEVELOP_FROM%
echo CONFIG=%CALIBRE_CONFIG_DIRECTORY%
echo LIBRARY=%CALIBRE_LIBRARY_DIRECTORY%
echo DATABASE=%CALIBRE_OVERRIDE_DATABASE_PATH%
REM ----------------------------------------------------------
REM The following gives a chance to check the settings before
REM starting Calibre. It can be commented out if not wanted.
REM ----------------------------------------------------------
echo "Press CTRL-C if you do not want to continue" echo "Press CTRL-C if you do not want to continue"
pause pause
@ -111,4 +142,4 @@ REM Use with /WAIT to wait until Calibre completes to run a task on exit
REM --------------------------------------------------------
echo "Starting up Calibre"
START /belownormal Calibre --with-library "%CALIBRE_LIBRARY_DIRECTORY%"

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
These templates control the content of titles displayed in the various sections
Available fields:
{title} Title of the book
{series} Series name
{series_index} Number of the book in the series
{rating} Rating
{rating_parens} Rating, in parentheses
{pubyear} Year the book was published
{pubyear_parens} Year the book was published, in parentheses
'''
# Books by Author
by_authors_normal_title_template = '{title} {pubyear_parens}'
by_authors_series_title_template = '[{series_index}] {title} {pubyear_parens}'
# Books by Title
by_titles_normal_title_template = '{title}'
by_titles_series_title_template = '{title} ({series} [{series_index}])'
# Books by Series
by_series_title_template = '[{series_index}] {title} {pubyear_parens}'
# Books by Genre
by_genres_normal_title_template = '{title} {pubyear_parens}'
by_genres_series_title_template = '{series_index}. {title} {pubyear_parens}'
# Recently Added
by_recently_added_normal_title_template = '{title}'
by_recently_added_series_title_template = '{title} ({series} [{series_index}])'
# By Month added
by_month_added_normal_title_template = '{title} {pubyear_parens}'
by_month_added_series_title_template = '[{series_index}] {title} {pubyear_parens}'
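
For illustration only, a minimal sketch of how one of these templates could be customized using the fields documented in the header of this file; the particular combination of fields is an assumed example, not part of this commit:

# Hypothetical customization: also show the rating after the title in the
# "Books by Author" section, using only the fields documented above.
by_authors_normal_title_template = '{title} {rating_parens} {pubyear_parens}'
by_authors_series_title_template = '[{series_index}] {title} {rating_parens} {pubyear_parens}'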

Binary file not shown. (added; 8.8 KiB)

Binary file not shown. (added; 9.3 KiB)

Binary file not shown. (added; 1.3 KiB)

Binary file not shown. (added; 533 B)

Binary file not shown. (added; 3.0 KiB)

Binary file not shown. (added; 16 KiB)

View File

@ -1,6 +1,5 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
''' '''
arstechnica.com arstechnica.com
''' '''
@ -9,19 +8,26 @@ import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class ArsTechnica2(BasicNewsRecipe): class ArsTechnica(BasicNewsRecipe):
title = u'Ars Technica' title = u'Ars Technica'
language = 'en' language = 'en'
__author__ = 'Darko Miletic and Sujata Raman' __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou'
description = 'The art of technology' description = 'The art of technology'
publisher = 'Ars Technica' publisher = 'Ars Technica'
category = 'news, IT, technology' category = 'news, IT, technology'
oldest_article = 2 oldest_article = 5
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
encoding = 'utf-8' encoding = 'utf-8'
use_embedded_content = False use_embedded_content = False
extra_css = ' body {font-family: Arial,Helvetica,sans-serif} .title{text-align: left} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} ' extra_css = '''
body {font-family: Arial,Helvetica,sans-serif}
.title{text-align: left}
.byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
.news-item-figure-caption-text{font-size:small; font-style:italic}
.news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
'''
ignoreEtcArticles = True # Etc feed items can be ignored, as they're not real stories
conversion_options = { conversion_options = {
'comments' : description 'comments' : description
@ -31,10 +37,10 @@ class ArsTechnica2(BasicNewsRecipe):
} }
preprocess_regexps = [ #preprocess_regexps = [
(re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"') # (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>') # ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
] # ]
keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})] keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]
@ -42,7 +48,7 @@ class ArsTechnica2(BasicNewsRecipe):
dict(name=['object','link','embed']) dict(name=['object','link','embed'])
,dict(name='div', attrs={'class':'read-more-link'}) ,dict(name='div', attrs={'class':'read-more-link'})
] ]
remove_attributes=['width','height'] #remove_attributes=['width','height']
feeds = [ feeds = [
(u'Infinite Loop (Apple content)' , u'http://feeds.arstechnica.com/arstechnica/apple/' ) (u'Infinite Loop (Apple content)' , u'http://feeds.arstechnica.com/arstechnica/apple/' )
@ -56,6 +62,7 @@ class ArsTechnica2(BasicNewsRecipe):
,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/') ,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
] ]
# This deals with multi-page stories
def append_page(self, soup, appendtag, position): def append_page(self, soup, appendtag, position):
pager = soup.find('div',attrs={'class':'pager'}) pager = soup.find('div',attrs={'class':'pager'})
if pager: if pager:
@ -81,6 +88,7 @@ class ArsTechnica2(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
# Adds line breaks near the byline (not sure why this is needed)
ftag = soup.find('div', attrs={'class':'byline'}) ftag = soup.find('div', attrs={'class':'byline'})
if ftag: if ftag:
brtag = Tag(soup,'br') brtag = Tag(soup,'br')
@ -88,12 +96,33 @@ class ArsTechnica2(BasicNewsRecipe):
ftag.insert(4,brtag) ftag.insert(4,brtag)
ftag.insert(5,brtag2) ftag.insert(5,brtag2)
# Remove style items
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
# Remove id
for item in soup.findAll(id=True):
del item['id']
# For some reason, links to authors don't have the domainname
a_author = soup.find('a',{'href':re.compile("^/author")})
if a_author:
a_author['href'] = 'http://arstechnica.com'+a_author['href']
# within div class news-item-figure, we need to grab images
# Deal with multi-page stories
self.append_page(soup, soup.body, 3) self.append_page(soup, soup.body, 3)
return soup return soup
def get_article_url(self, article): def get_article_url(self, article):
# If the article title starts with Etc:, don't return it
if self.ignoreEtcArticles:
article_title = article.get('title',None)
if re.match('Etc: ',article_title) is not None:
return None
# The actual article is in a guid tag
return article.get('guid', None).rpartition('?')[0] return article.get('guid', None).rpartition('?')[0]

View File

@ -1,6 +1,6 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
''' '''
blic.rs blic.rs
''' '''
@ -21,21 +21,53 @@ class Blic(BasicNewsRecipe):
masthead_url = 'http://www.blic.rs/resources/images/header/header_back.png' masthead_url = 'http://www.blic.rs/resources/images/header/header_back.png'
language = 'sr' language = 'sr'
publication_type = 'newspaper' publication_type = 'newspaper'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Georgia, serif1, serif} .article_description{font-family: Arial, sans1, sans-serif} .img_full{float: none} img{margin-bottom: 0.8em} ' extra_css = """
@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Georgia, serif1, serif}
.articledescription,#nadnaslov,.article_info{font-family: Arial, sans1, sans-serif}
.img_full{float: none}
#nadnaslov{font-size: small}
#article_lead{font-size: 1.5em}
h1{color: red}
.potpis{font-size: x-small; color: gray}
.article_info{font-size: small}
img{margin-bottom: 0.8em; margin-top: 0.8em; display: block}
"""
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher': publisher , 'publisher': publisher
, 'language' : language , 'language' : language
, 'linearize_tables' : True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags_before = dict(name='div', attrs={'id':'article_info'}) remove_tags_before = dict(name='div', attrs={'id':'article_info'})
remove_tags = [dict(name=['object','link'])] remove_tags = [dict(name=['object','link','meta','base','object','embed'])]
remove_attributes = ['width','height'] remove_attributes = ['width','height','m_id','m_ext','mlg_id','poll_id','v_id']
feeds = [(u'Danasnje Vesti', u'http://www.blic.rs/rss/danasnje-vesti')] feeds = [
(u'Politika' , u'http://www.blic.rs/rss/Vesti/Politika')
,(u'Tema Dana' , u'http://www.blic.rs/rss/Vesti/Tema-Dana')
,(u'Svet' , u'http://www.blic.rs/rss/Vesti/Svet')
,(u'Drustvo' , u'http://www.blic.rs/rss/Vesti/Drustvo')
,(u'Ekonomija' , u'http://www.blic.rs/rss/Vesti/Ekonomija')
,(u'Hronika' , u'http://www.blic.rs/rss/Vesti/Hronika')
,(u'Beograd' , u'http://www.blic.rs/rss/Vesti/Beograd')
,(u'Srbija' , u'http://www.blic.rs/rss/Vesti/Srbija')
,(u'Vojvodina' , u'http://www.blic.rs/rss/Vesti/Vojvodina')
,(u'Republika Srpska' , u'http://www.blic.rs/rss/Vesti/Republika-Srpska')
,(u'Reportaza' , u'http://www.blic.rs/rss/Vesti/Reportaza')
,(u'Dodatak' , u'http://www.blic.rs/rss/Vesti/Dodatak')
,(u'Zabava' , u'http://www.blic.rs/rss/Zabava')
,(u'Kultura' , u'http://www.blic.rs/rss/Kultura')
,(u'Slobodno Vreme' , u'http://www.blic.rs/rss/Slobodno-vreme')
,(u'IT' , u'http://www.blic.rs/rss/IT')
,(u'Komentar' , u'http://www.blic.rs/rss/Komentar')
,(u'Intervju' , u'http://www.blic.rs/rss/Intervju')
]
def print_version(self, url): def print_version(self, url):
@ -44,4 +76,4 @@ class Blic(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
return self.adeify_images(soup) return soup

View File

@ -0,0 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Cicero(BasicNewsRecipe):
timefmt = ' [%Y-%m-%d]'
title = u'Cicero'
__author__ = 'mad@sharktooth.de'
description = u'Magazin f\xfcr politische Kultur'
oldest_article = 7
language = 'de'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
publisher = 'Ringier Publishing'
category = 'news, politics, Germany'
encoding = 'iso-8859-1'
publication_type = 'magazine'
masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
feeds = [
(u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
#(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
#(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
#(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
#(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
#(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
#(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
#(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
#(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
#(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
(u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
#(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
#(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
]
def print_version(self, url):
return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]

View File

@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
(u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf') (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
] ]
language = 'ja' language = 'ja'
encoding = 'Shift_JIS' encoding = 'utf-8'
remove_javascript = True remove_javascript = True
preprocess_regexps = [ preprocess_regexps = [

View File

@ -7,22 +7,29 @@ class DallasNews(BasicNewsRecipe):
max_articles_per_feed = 25 max_articles_per_feed = 25
no_stylesheets = True no_stylesheets = True
remove_tags_before = dict(name='h2', attrs={'class':'vitstoryheadline'}) use_embedded_content = False
remove_tags_after = dict(name='div', attrs={'style':'width: 100%; clear: right'}) remove_tags_before = dict(name='h1')
remove_tags_after = dict(name='div', attrs={'id':'article_tools_bottom'}) keep_only_tags = {'class':lambda x: x and 'article' in x}
remove_tags = [ remove_tags = [
dict(name='iframe'), {'class':['DMNSocialTools', 'article ', 'article first ', 'article premium']},
dict(name='div', attrs={'class':'biblockmore'}),
dict(name='div', attrs={'style':'width: 100%; clear: right'}),
dict(name='div', attrs={'id':'article_tools_bottom'}),
#dict(name='ul', attrs={'class':'articleTools'}),
] ]
feeds = [ feeds = [
('Latest News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslatestnews.xml'), ('Local News',
('Local News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslocalnews.xml'), 'http://www.dallasnews.com/news/politics/local-politics/?rss'),
('Nation and World', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml'), ('National Politics',
('Politics', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml'), 'http://www.dallasnews.com/news/politics/national-politic/?rss'),
('Science', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsscience.xml'), ('State Politics',
'http://www.dallasnews.com/news/politics/state-politics/?rss'),
('Religion',
'http://www.dallasnews.com/news/religion/?rss'),
('Crime',
'http://www.dallasnews.com/news/crime/headlines/?rss'),
('Celebrity News',
'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
('Nation',
'http://www.dallasnews.com/news/nation-world/nation/?rss'),
('World',
'http://www.dallasnews.com/news/nation-world/world/?rss'),
] ]

View File

@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg' cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
timefmt ='[%a, %d %b, %Y]' timefmt ='[%a, %d %b, %Y]'
encoding ='utf8' encoding ='utf8'
language ='es_ES' language ='es'
remove_javascript =True remove_javascript =True
remove_tags_after =dict(id='Texto') remove_tags_after =dict(id='Texto')
remove_tags_before =dict(id='Texto') remove_tags_before =dict(id='Texto')

View File

@ -28,7 +28,7 @@ class DilbertBig(BasicNewsRecipe):
,'publisher' : publisher ,'publisher' : publisher
} }
feeds = [(u'Dilbert', u'http://feeds.dilbert.com/DilbertDailyStrip' )] feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip' )]
def get_article_url(self, article): def get_article_url(self, article):
return article.get('feedburner_origlink', None) return article.get('feedburner_origlink', None)

View File

@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag, NavigableString from calibre.ebooks.BeautifulSoup import Tag, NavigableString
import mechanize, string, urllib, time, re import string, time, re
class Economist(BasicNewsRecipe): class Economist(BasicNewsRecipe):
@ -18,19 +18,19 @@ class Economist(BasicNewsRecipe):
__author__ = "Kovid Goyal" __author__ = "Kovid Goyal"
INDEX = 'http://www.economist.com/printedition' INDEX = 'http://www.economist.com/printedition'
description = ('Global news and current affairs from a European perspective.' description = 'Global news and current affairs from a European perspective.'
' Needs a subscription from ')+INDEX
oldest_article = 7.0 oldest_article = 7.0
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg' cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info']})] dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
keep_only_tags = [dict(id='ec-article-body')] keep_only_tags = [dict(id='ec-article-body')]
needs_subscription = True needs_subscription = False
no_stylesheets = True no_stylesheets = True
preprocess_regexps = [(re.compile('</html>.*', re.DOTALL), preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
lambda x:'</html>')] lambda x:'</html>')]
'''
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
br.open('http://www.economist.com') br.open('http://www.economist.com')
@ -50,6 +50,7 @@ class Economist(BasicNewsRecipe):
})) }))
br.open(req).read() br.open(req).read()
return br return br
'''
def parse_index(self): def parse_index(self):
try: try:

View File

@ -7,12 +7,12 @@ from lxml import html
class Economist(BasicNewsRecipe): class Economist(BasicNewsRecipe):
title = 'The Economist (free)' title = 'The Economist (RSS)'
language = 'en' language = 'en'
__author__ = "Kovid Goyal" __author__ = "Kovid Goyal"
description = ('Global news and current affairs from a European perspective.' description = ('Global news and current affairs from a European perspective.'
' Much slower than the subscription based version.') ' Much slower than the print edition based version.')
oldest_article = 7.0 oldest_article = 7.0
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg' cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'

View File

@ -0,0 +1,122 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '08 January 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.08'
__date__ = '08, January 2011'
'''
[url]http://www.elcorreo.com/[/url]
'''
import time
import re
from calibre.web.feeds.news import BasicNewsRecipe
class heraldo(BasicNewsRecipe):
__author__ = 'desUBIKado'
description = 'Daily newspaper from Biscay'
title = u'El Correo'
publisher = 'Vocento'
category = 'News, politics, culture, economy, general interest'
oldest_article = 2
delay = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'es'
timefmt = '[%a, %d %b, %Y]'
encoding = 'iso-8859-1'
remove_empty_feeds = True
remove_javascript = False
feeds = [
(u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
(u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
(u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
(u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
(u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
(u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
(u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
(u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
(u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
(u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
(u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
]
keep_only_tags = [
dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
]
remove_tags = [
dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
dict(name='div', attrs={'id':['articulopina']}),
dict(name='br', attrs={'class':'clear'}),
dict(name='form', attrs={'name':'frm_conversor2'})
]
remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#[url]http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg[/url]
#[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url]
cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
return cover
extra_css = '''
h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
.date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
img{margin-bottom: 0.4em}
'''
preprocess_regexps = [
# To present the image of the embedded video
(re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),
# To separate paragraphs with a blank line
(re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),
# To put a blank line between the subtitle and the date and time of the news
(re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),
# To put a blank line between the intro of the embedded videos and the previous text
(re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),
# To view photos from the first when these are presented as a gallery
(re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),
# To remove the link of the title
(re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
(re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),
]

View File

@ -9,13 +9,14 @@ __docformat__ = 'restructuredtext en'
elpais.es elpais.es
''' '''
from time import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ElPais(BasicNewsRecipe): class ElPais(BasicNewsRecipe):
__author__ = 'Kovid Goyal & Lorenzo Vigentini & Jordi Balcells' __author__ = 'Kovid Goyal & Lorenzo Vigentini & Jordi Balcells'
description = 'Main daily newspaper from Spain' description = 'Main daily newspaper from Spain'
cover_url = 'http://www.elpais.com/im/tit_logo_global.gif'
title = u'El Pais' title = u'El Pais'
publisher = u'Ediciones El Pa\xeds SL' publisher = u'Ediciones El Pa\xeds SL'
category = 'News, politics, culture, economy, general interest' category = 'News, politics, culture, economy, general interest'
@ -62,6 +63,6 @@ class ElPais(BasicNewsRecipe):
(u'Vi\xf1etas', u'http://www.elpais.com/rss/feed.html?feedId=17058') (u'Vi\xf1etas', u'http://www.elpais.com/rss/feed.html?feedId=17058')
] ]
def print_version(self, url): def get_cover_url(self):
url = url+'?print=1' return 'http://img5.kiosko.net/' + strftime("%Y/%m/%d") + '/es/elpais.750.jpg'
return url

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'
'''
publico.es
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
title =u'Publico.es'
__author__ ='Gerardo Diez'
publisher =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
category ='news, politics, finances, world, spain, science, catalunya'
oldest_article =1
max_articles_per_feed =100
simultaneous_downloads =10
cover_url =u'http://imagenes.publico.es/css/img/logo_publico.gif'
timefmt ='[%a, %d %b, %Y]'
encoding ='utf8'
language ='es'
remove_javascript =True
no_stylesheets =True
keep_only_tags =dict(id='main')
remove_tags =[
dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
dict(name='h5', attrs={'id':'comentarios'})
]
feeds =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
(u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
(u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
(u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
(u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
(u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
(u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
(u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
(u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]

View File

@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
encoding = 'cp1252' encoding = 'cp1252'
use_embedded_content = False use_embedded_content = False
language = 'es_ES' language = 'es'
remove_empty_feeds = True remove_empty_feeds = True
publication_type = 'newspaper' publication_type = 'newspaper'
masthead_url = 'http://www.elpais.com/im/tit_logo.gif' masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
@ -57,14 +57,14 @@ class ElPais_RSS(BasicNewsRecipe):
,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' ) ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062') ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062')
,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063') ,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063')
,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' ) ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' ) ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' ) ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' ) ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052') ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052')
,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053') ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053')
,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051') ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051')
,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060') ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060')
,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' ) ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' ) ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068') ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068')

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
''' '''
exiledonline.com exiledonline.com
''' '''
@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
encoding = 'utf8' encoding = 'utf8'
remove_javascript = True remove_javascript = True
language = 'en' language = 'en'
publication_type = 'newsblog'
cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif' masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
extra_css = """
html2lrf_options = [ body{font-family: Arial,Helvetica,sans-serif}
'--comment' , description #topslug{font-size: xx-large; font-weight: bold; color: red}
, '--base-font-size', '10' """
, '--category' , category
, '--publisher' , publisher conversion_options = {
] 'comment' : description
, 'tags' : category
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' , 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':'main'})] keep_only_tags = [dict(name='div', attrs={'id':'main'})]
@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n' for alink in soup.findAll('a'):
soup.head.insert(0,mtag) if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup return soup
def get_article_url(self, article): def get_article_url(self, article):
raw = article.get('link', None) raw = article.get('link', None)
final = raw + 'all/1/' final = raw + 'all/1/'
return final return final

View File

@ -1,59 +1,79 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'
''' '''
www.expansion.com expansion.es
''' '''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
title =u'Expansion.com'
__author__ ='Gerardo Diez'
publisher =u'Unidad Editorial Información Económica, S.L.'
category ='finances, catalunya'
oldest_article =1
max_articles_per_feed =100
simultaneous_downloads =10
cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
timefmt ='[%A, %d %B, %Y]'
encoding ='latin'
language ='es'
remove_javascript =True
no_stylesheets =True
keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
remove_tags =[
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
dict(name='span', attrs={'class':['comentarios']}),
dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
dict(name='div', attrs={'id':['comentarios_lectores_listado']})
]
feeds =[
(u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
(u'Portada: Bolsas', u'http://estaticos.expansion.com/rss/mercados.xml'),
(u'Divisas', u'http://estaticos.expansion.com/rss/mercadosdivisas.xml'),
(u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
(u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
(u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
from calibre.web.feeds.news import BasicNewsRecipe (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
from calibre.ebooks.BeautifulSoup import Tag (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
(u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
(u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
(u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
(u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
class Expansion(BasicNewsRecipe): (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
title = 'Diario Expansion' (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
__author__ = 'Darko Miletic' (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
description = 'Lider de informacion de mercados, economica y politica' (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
publisher = 'expansion.com' (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
category = 'news, politics, Spain' (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
oldest_article = 2 (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
max_articles_per_feed = 100 (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
no_stylesheets = True (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
use_embedded_content = False (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
delay = 1 (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
encoding = 'iso-8859-15' (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
language = 'es'
direction = 'ltr' (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
(u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
(u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
html2lrf_options = [ (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
'--comment' , description (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
, '--category' , category (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
(u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
(u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
(u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
feeds = [ (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
(u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178') (u'Catalu&ntilde;a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
,(u'Temas del dia' , u'http://rss.expansion.com/rss/descarga.htm?data2=178') (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
] ]
keep_only_tags = [dict(name='div', attrs={'id':'principal'})]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
]
remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]
def preprocess_html(self, soup):
soup.html['dir' ] = self.direction
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
''' '''
ft.com ft.com
''' '''
@ -52,22 +52,38 @@ class FinancialTimes(BasicNewsRecipe):
.copyright{font-size: x-small} .copyright{font-size: x-small}
""" """
def parse_index(self): def get_artlinks(self, elem):
articles = [] articles = []
for item in elem.findAll('a',href=True):
url = self.PREFIX + item['href']
title = self.tag_to_string(item)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':''
})
return articles
def parse_index(self):
feeds = []
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
wide = soup.find('div',attrs={'class':'wide'}) wide = soup.find('div',attrs={'class':'wide'})
if wide: if not wide:
for item in wide.findAll('a',href=True): return feeds
url = self.PREFIX + item['href'] strest = wide.findAll('h3', attrs={'class':'section'})
title = self.tag_to_string(item) if not strest:
date = strftime(self.timefmt) return feeds
articles.append({ st = wide.find('h4',attrs={'class':'section-no-arrow'})
'title' :title if st:
,'date' :date strest.insert(0,st)
,'url' :url for item in strest:
,'description':'' ftitle = self.tag_to_string(item)
}) self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
return [('FT UK edition',articles)] feedarts = self.get_artlinks(item.parent.ul)
feeds.append((ftitle,feedarts))
return feeds
def preprocess_html(self, soup): def preprocess_html(self, soup):
return self.adeify_images(soup) return self.adeify_images(soup)

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
import re

class NatureNews(BasicNewsRecipe):
@ -10,17 +11,76 @@ class NatureNews(BasicNewsRecipe):
max_articles_per_feed = 50
no_stylesheets = True

keep_only_tags = [dict(name='div', attrs={'id':'content'})]
# remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
# remove_tags_after = dict(name='h2', attrs={'id':'comments'})
remove_tags = [
dict(name='h2', attrs={'id':'comments'}),
dict(attrs={'alt':'Advertisement'}),
dict(name='div', attrs={'class':'ad'}),
dict(attrs={'class':'Z3988'}),
dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}),
dict(name='a', attrs={'href':'#comments'}),
dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'})
]

preprocess_regexps = [
(re.compile(r'<p>ADVERTISEMENT</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
]

extra_css = '''
.author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; }
.imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.imagecredit { font-size: x-small; font-style: normal; font-weight: bold}
'''

feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')]
def preprocess_html(self,soup):
# The author name is slightly buried - dig it up
author = soup.find('p', {'class':'byline'})
if author:
# Find out the author's name
authornamediv = author.find('span',{'class':'author fn'})
authornamelink = authornamediv.find('a')
if authornamelink:
authorname = authornamelink.contents[0]
else:
authorname = authornamediv.contents[0]
# Stick the author's name in the byline tag
tag = Tag(soup,'div')
tag['class'] = 'author'
tag.insert(0,authorname.strip())
author.replaceWith(tag)
# Change the intro from a p to a div
intro = soup.find('p',{'class':'intro'})
if intro:
tag = Tag(soup,'div')
tag['class'] = 'intro'
tag.insert(0,intro.contents[0])
intro.replaceWith(tag)
# Change span class=imagedescription to div
descr = soup.find('span',{'class':'imagedescription'})
if descr:
tag = Tag(soup,'div')
tag['class'] = 'imagedescription'
tag.insert(0,descr.renderContents())
descr.replaceWith(tag)
# The references are in a list, let's make them simpler
reflistcont = soup.find('ul',{'id':'article-refrences'})
if reflistcont:
reflist = reflistcont.li.renderContents()
tag = Tag(soup,'div')
tag['class'] = 'article-references'
tag.insert(0,reflist)
reflistcont.replaceWith(tag)
# Within the id=content div, we need to remove all the stuff after the end of the class=entry-content
entrycontent = soup.find('div',{'class':'entry-content'})
for nextSibling in entrycontent.findNextSiblings():
nextSibling.extract()
return soup

View File

@ -8,12 +8,13 @@ __docformat__ = 'restructuredtext en'
globeandmail.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1287083651(BasicNewsRecipe):
title = u'Globe & Mail'
__author__ = 'Kovid Goyal'
oldest_article = 2
no_stylesheets = True
max_articles_per_feed = 100
@ -38,24 +39,19 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
(u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss')
]

preprocess_regexps = [
(re.compile(r'<head.*?</head>', re.DOTALL), lambda m: '<head></head>'),
(re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
]

remove_tags = [
dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
dict(href=lambda x: x and 'tracking=' in x),
{'class':['articleTools', 'pagination', 'Ads', 'topad',
'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

#Use the mobile version rather than the web version
def print_version(self, url):
return url.rpartition('?')[0] + '?service=mobile'

View File

@ -0,0 +1,64 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
gulfnews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class GulfNews(BasicNewsRecipe):
title = 'Gulf News'
__author__ = 'Darko Miletic'
description = 'News from the United Arab Emirates, the Persian Gulf and the rest of the world'
publisher = 'Al Nisr Publishing LLC'
category = 'news, politics, UAE, world'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://gulfnews.com/media/img/gulf_news_logo.jpg'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
h1{font-family: Georgia, 'Times New Roman', Times, serif}
ol,ul{list-style: none}
.synopsis{font-size: small}
.details{font-size: x-small}
.image{font-size: xx-small}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [
dict(name=['meta','link','object','embed'])
,dict(attrs={'class':['quickLinks','ratings']})
,dict(attrs={'id':'imageSelector'})
]
remove_attributes=['lang']
keep_only_tags=[
dict(name='h1')
,dict(attrs={'class':['synopsis','details','image','article']})
]
feeds = [
(u'UAE News' , u'http://gulfnews.com/cmlink/1.446094')
,(u'Business' , u'http://gulfnews.com/cmlink/1.446098')
,(u'Entertainment' , u'http://gulfnews.com/cmlink/1.446095')
,(u'Sport' , u'http://gulfnews.com/cmlink/1.446096')
,(u'Life' , u'http://gulfnews.com/cmlink/1.446097')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -3,29 +3,31 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.04'
__date__ = '6, January 2011'
'''
[url]http://www.heraldo.es/[/url]
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
__author__ = 'desUBIKado'
description = 'Daily newspaper from Aragon'
title = u'Heraldo de Aragon'
publisher = 'OJD Nielsen'
category = 'News, politics, culture, economy, general interest'
language = 'es'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 2
delay = 1
max_articles_per_feed = 100
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
recursion = 10

feeds = [
(u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
@ -37,29 +39,39 @@ class heraldo(BasicNewsRecipe):
remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
dict(name='form', attrs={'class':'form'}),
dict(name='ul', attrs={'id':['cont-tags','pag-1']})]

remove_tags_before = dict(name='div' , attrs={'id':'dts'})
remove_tags_after = dict(name='div' , attrs={'id':'com'})

def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#[url]http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf[/url]
cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
return cover

extra_css = '''
.con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
.con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
.con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
.ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
img{margin-bottom: 0.4em}
'''
preprocess_regexps = [
# To separate the comments with a blank line
(re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div id="com"')
]

View File

@ -5,6 +5,7 @@ class AdvancedUserRecipe1293122276(BasicNewsRecipe):
__author__ = 'Jack Mason'
author = 'IBM Global Business Services'
publisher = 'IBM'
language = 'en'
category = 'news, technology, IT, internet of things, analytics'
oldest_article = 7
max_articles_per_feed = 30

View File

@ -0,0 +1,182 @@
import re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
class IHNed(BasicNewsRecipe):
stahnout_vsechny = True
#True  = download all articles from the homepage
#False = download only today's articles (from the day the script is run)
title = 'iHNed'
__author__ = 'Karel Bílek'
language = 'cs'
description = 'Zprávy z iHNed.cz'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = False
remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
dict(style=['text-align: center;']),
dict(id=['r-bfull']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'windows-1250'
no_stylesheets = True
remove_tags_before = dict(attrs={'class':'d-nadtit'})
remove_tags_after = dict(attrs={'class':'like'})
conversion_options = {
'linearize_tables' : True,
}
def preprocess_html(self, soup):
def makeurl(wat):
return "http://ihned.cz"+wat;
for h1 in soup.findAll('h1'):
a = h1.find('a')
if a:
string = a.string
if string:
soup.a.replaceWith(string)
for a in soup.findAll('a', href=True) :
cil = str(a['href'])
if cil.startswith("/") or cil.startswith("index"):
a['href'] = makeurl(cil)
return soup
def parse_index(self):
def makeurl(wat):
if wat.startswith("/") or wat.startswith("index"):
return "http://ihned.cz"+wat;
else:
return wat
articles = {} #the result, probably
key = None #current section
ans = [] #all sections
articles["Hlavní"] = []
ans.append("Hlavní")
was = {}
def parse_subpage(url, name):
articles[name] = []
ans.append(name)
soup = self.index_to_soup(url)
otvirak = soup.find(True, attrs={'class':['otv']})
if otvirak:
#the code is copypasted here because I don't know python. simple as that.
a = otvirak.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
txt = otvirak.find(True, attrs={'class':['txt']})
description = ''
if txt:
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
if match:
description = match.group(1)
pubdate = strftime('%d. %m.')
if not title in was:
articles[name].append(
dict(title=title, url=makeurl(a['href']), date=pubdate,
description=description,
content=''))
otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
if otv234:
for ow in otv234.findAll(True, attrs={'class':['ow']}):
a = ow.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
description=''
prx = ow.find(True, attrs={'class':['prx']});
if prx:
description = str(prx.string)
nfo = ow.find(True, attrs={'class':['nfo']});
pubdate = ''
if nfo:
dtime = time.localtime();
day = dtime[2]
month = dtime[1]
pubdate = strftime('%d. %m.')
match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
if not title in was:
articles[name].append(
dict(title=title, url=makeurl(a['href']), date=pubdate,
description=description,
content=''))
soup = self.index_to_soup('http://ihned.cz/')
otvirak = soup.find(True, attrs={'class':['otv']})
if otvirak:
a = otvirak.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
txt = otvirak.find(True, attrs={'class':['txt']})
description = ''
if txt:
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
if match:
description = match.group(1)
pubdate = strftime('%d. %m.')
feed = "Hlavní"
articles[feed].append(
dict(title=title, url=(a['href']), date=pubdate,
description=description,
content=''))
was[title]=1
otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
if otvirak2345:
for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
a = otv2.find('a', attrs={'class':['tit2']}, href=True)
title = self.tag_to_string(a, use_alt=True).strip()
description=''
span = otv2.find('span');
if span:
match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
if match:
description = match.group(1)
feed = "Hlavní"
pubdate = strftime('%d. %m.')
articles[feed].append(
dict(title=title, url=(a['href']), date=pubdate,
description=description,
content=''))
was[title]=1
parse_subpage("http://komentare.ihned.cz/", "Komentáře")
parse_subpage("http://domaci.ihned.cz", "Domácí")
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
parse_subpage("http://finweb.ihned.cz/", "Finance");
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
parse_subpage("http://kultura.ihned.cz/", "Kultura")
parse_subpage("http://sport.ihned.cz/", "Sport");
#sort the categories
ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
#return the sections, but only those that are actually present in articles...
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@ -6,6 +6,7 @@ class KANewsRecipe(BasicNewsRecipe):
description = u'Nachrichten aus Karlsruhe, Deutschland und der Welt.'
__author__ = 'tfeld'
lang='de'
language = 'de'
no_stylesheets = True
oldest_article = 7

View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1295262156(BasicNewsRecipe):
title = u'kath.net'
__author__ = 'Bobus'
oldest_article = 7
language = 'en'
max_articles_per_feed = 100
feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]
def print_version(self, url):
return url+"&print=yes"
extra_css = 'td.textb {font-size: medium;}'

View File

@ -3,12 +3,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1274742400(BasicNewsRecipe):
title = u'Las Vegas Review Journal'
__author__ = 'Kovid Goyal'
language = 'en'
oldest_article = 7
max_articles_per_feed = 100

keep_only_tags = [dict(id='content-main')]
remove_tags = [dict(id=['right-col-content', 'trending-topics']),
{'class':['ppy-outer']}
]
no_stylesheets = True

feeds = [
(u'News', u'http://www.lvrj.com/news.rss'),

View File

@ -20,8 +20,8 @@ class LaVanguardia(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
delay = 5
# encoding = 'cp1252'
language = 'es'

direction = 'ltr'

@ -35,8 +35,8 @@ class LaVanguardia(BasicNewsRecipe):
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

feeds = [
(u'Portada' , u'http://feeds.feedburner.com/lavanguardia/home' )
,(u'Cultura' , u'http://feeds.feedburner.com/lavanguardia/cultura' )
,(u'Deportes' , u'http://feeds.feedburner.com/lavanguardia/deportes' )
,(u'Economia' , u'http://feeds.feedburner.com/lavanguardia/economia' )
,(u'El lector opina' , u'http://feeds.feedburner.com/lavanguardia/lectoropina' )
@ -45,17 +45,17 @@ class LaVanguardia(BasicNewsRecipe):
,(u'Internet y tecnologia', u'http://feeds.feedburner.com/lavanguardia/internet' )
,(u'Motor' , u'http://feeds.feedburner.com/lavanguardia/motor' )
,(u'Politica' , u'http://feeds.feedburner.com/lavanguardia/politica' )
,(u'Sucesos' , u'http://feeds.feedburner.com/lavanguardia/sucesos' )
]

keep_only_tags = [
dict(name='div', attrs={'class':'detalle noticia'})
]

remove_tags = [
dict(name=['object','link','script'])
,dict(name='div', attrs={'class':['colC','peu','jstoolbar']})
]

remove_tags_after = [dict(name='div', attrs={'class':'text'})]

@ -67,4 +67,3 @@ class LaVanguardia(BasicNewsRecipe):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -0,0 +1,32 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1295081935(BasicNewsRecipe):
title = u'Mail & Guardian ZA News'
__author__ = '77ja65'
language = 'en'
oldest_article = 7
max_articles_per_feed = 30
no_stylesheets = True
masthead_url = 'http://c1608832.cdn.cloudfiles.rackspacecloud.com/mg_logo.gif'
remove_tags_after = [dict(id='content')]
feeds = [
(u'National News', u'http://www.mg.co.za/rss/national'),
(u'Top Stories', u'http://www.mg.co.za/rss'),
(u'Africa News', u'http://www.mg.co.za/rss/africa'),
(u'Sport', u'http://www.mg.co.za/rss/sport'),
(u'Business', u'http://www.mg.co.za/rss/business'),
(u'And In Other News', u'http://www.mg.co.za/rss/and-in-other-news'),
(u'World News', u'http://www.mg.co.za/rss/world')
]
def print_version(self, url):
return url.replace('http://www.mg.co.za/article/',
'http://www.mg.co.za/printformat/single/')
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
'''

View File

@ -1,10 +1,9 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
msnbc.msn.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class MsNBC(BasicNewsRecipe):

@ -19,7 +18,16 @@ class MsNBC(BasicNewsRecipe):
publisher = 'msnbc.com'
category = 'news, USA, world'
language = 'en'
extra_css = """
body{ font-family: Georgia,Times,serif }
.hide{display: none}
.caption{font-family: Arial,sans-serif; font-size: x-small}
.entry-summary{font-family: Arial,sans-serif}
.copyright{font-size: 0.95em; font-style: italic}
.source-org{font-size: small; font-family: Arial,sans-serif}
img{display: block; margin-bottom: 0.5em}
span.byline{display: none}
"""
conversion_options = {
'comments' : description
@ -28,14 +36,20 @@ class MsNBC(BasicNewsRecipe):
,'publisher': publisher
}

remove_tags_before = dict(name='h1', attrs={'id':'headline'})
remove_tags_after = dict(name='span', attrs={'class':['copyright','Linear copyright']})
keep_only_tags=[
dict(attrs={'id':['headline','deck','byline','source','intelliTXT']})
,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
]
remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']
remove_tags = [
dict(name=['iframe','object','link','embed','meta','table'])
,dict(name='span', attrs={'class':['copyright','Linear copyright']})
,dict(name='div', attrs={'class':'social'})
]

feeds = [
(u'US News' , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml' )
@ -48,11 +62,26 @@ class MsNBC(BasicNewsRecipe):
,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml' )
]
def print_version(self, url):
return url + 'print/1/displaymode/1098/'
def preprocess_html(self, soup):
for item in soup.body.findAll('html'):
item.name='div'
for item in soup.body.findAll('div'):
if item.has_key('id') and item['id'].startswith('vine-'):
item.extract()
if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
item.extract()
for item in soup.body.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
for item in soup.body.findAll('ol'):
if item.has_key('class') and item['class'].startswith('grid'):
item.extract()
for item in soup.body.findAll('span'):
if ( item.has_key('id') and item['id'].startswith('byLine') and item.string is None) or ( item.has_key('class') and item['class'].startswith('inline') ):
item.extract()
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup

View File

@ -10,6 +10,7 @@ import re
class NationalGeographicNews(BasicNewsRecipe):
title = u'National Geographic News'
oldest_article = 7
language = 'en'
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True

View File

@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe):
{'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
{'class':"cmn-article_keyword cmn-clearfix"}, {'class':"cmn-article_keyword cmn-clearfix"},
{'class':"cmn-print_headline cmn-clearfix"}, {'class':"cmn-print_headline cmn-clearfix"},
{'class':"cmn-article_list"},
dict(id="ABOUT-NIKKEI"),
{'class':"cmn-sub_market"},
] ]
remove_tags_after = {'class':"cmn-pr_list"} remove_tags_after = {'class':"cmn-pr_list"}

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
nrc.nl
'''
@ -15,13 +15,18 @@ class Pagina12(BasicNewsRecipe):
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'nl'
country = 'NL'
remove_empty_feeds = True
masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
extra_css = """
body{font-family: Georgia,serif }
img{margin-bottom: 0.4em; display: block}
.bijschrift,.sectie{font-size: x-small}
.sectie{color: gray}
"""
conversion_options = {
'comment' : description
@ -30,21 +35,42 @@ class Pagina12(BasicNewsRecipe):
, 'language' : language
}

keep_only_tags = [dict(attrs={'class':'uitstekendekeus'})]
remove_tags = [
dict(name=['meta','base','link','object','embed'])
,dict(attrs={'class':['reclamespace','tags-and-sharing']})
]
remove_attributes=['lang']
feeds = [
(u'Voor nieuws', u'http://www.nrc.nl/nieuws/categorie/nieuws/rss.php' )
,(u'Binnenland' , u'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php' )
,(u'Buitenland' , u'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php' )
,(u'Economie' , u'http://www.nrc.nl/nieuws/categorie/economie/rss.php' )
,(u'Cultuur' , u'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php' )
,(u'Sport' , u'http://www.nrc.nl/nieuws/categorie/sport/rss.php' )
,(u'Wetenschap ', u'http://www.nrc.nl/nieuws/categorie/wetenschap-nieuws/rss.php')
]
def print_version(self, url):
return url + '?service=Print'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
atritems =['href','target','rel']
for atit in atritems:
if item.has_key(atit):
del item[atit]
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup

View File

@ -586,102 +586,147 @@ class NYTimes(BasicNewsRecipe):
return self.strip_anchors(soup)

def postprocess_html(self,soup, True):
try:
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
except:
self.log("ERROR: One picture per article in postprocess_html")
try:
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
except:
self.log("ERROR: Problem in change captions to italic")

try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

try:
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")

try:
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")

try:
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
except:
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")

try:
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
except:
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")

return soup

def populate_article_metadata(self, article, soup, first):
shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
except:
self.log("Error creating article descriptions")
return

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
@ -23,6 +24,10 @@ class NYTimes(BasicNewsRecipe):
webEdition = False
oldest_article = 7

# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
# previous paid versions of the new york times to be sent to the back issues folder on the kindle
replaceKindleVersion = False

# includeSections: List of sections to include. If empty, all sections found will be included.
# Otherwise, only the sections named will be included. For example,
#
@ -94,6 +99,10 @@ class NYTimes(BasicNewsRecipe):
title='New York Times (Web)'
description = 'New York Times on the Web'
needs_subscription = True
elif replaceKindleVersion:
title='The New York Times'
description = 'Today\'s New York Times'
needs_subscription = True
else:
title='New York Times'
description = 'Today\'s New York Times'
@ -150,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
'relatedSearchesModule',
'side_tool',
'singleAd',
'entry entry-utility', #added for DealBook
'entry-tags', #added for DealBook
'footer promos clearfix', #added for DealBook
'footer links clearfix', #added for DealBook
'inlineImage module', #added for DealBook
re.compile('^subNavigation'),
re.compile('^leaderboard'),
re.compile('^module'),
@ -183,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
'side_index',
'side_tool',
'toolsRight',
'skybox', #added for DealBook
'TopAd', #added for DealBook
'related-content', #added for DealBook
]),
dict(name=['script', 'noscript', 'style','form','hr'])]
no_stylesheets = True
@ -237,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
def exclude_url(self,url):
if not url.startswith("http"):
return True
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
return True
if 'nytimes.com' not in url:
return True
@ -560,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
def preprocess_html(self, soup):
if self.webEdition & (self.oldest_article>0):
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
if date_tag:
@ -583,106 +599,189 @@ class NYTimes(BasicNewsRecipe):
img_div = soup.find('div','inlineImage module')
if img_div:
img_div.extract()
return self.strip_anchors(soup)

def postprocess_html(self,soup, True):
try:
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
except:
self.log("ERROR: One picture per article in postprocess_html")

try:
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
except:
self.log("ERROR: Problem in change captions to italic")

try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook
else:
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.renderContents()))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

try:
#if this is from a blog (dealbook, fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor:
tag = Tag(soup, "h6")
tag['class'] = "byline"
tag.insert(0, self.fixChars(bylineauthor.renderContents()))
bylineauthor.replaceWith(tag)
except:
self.log("ERROR: fixing byline author format")

try:
#if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit:
tag = Tag(soup, "h6")
tag['class'] = "credit"
tag.insert(0, self.fixChars(blogcredit.renderContents()))
blogcredit.replaceWith(tag)
except:
self.log("ERROR: fixing credit format")

try:
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
#remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'})
if blogupdated:
blogupdated.replaceWith("")
except:
self.log("ERROR: Removing strong tag")
try:
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
except:
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
try:
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
except:
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
return soup return soup
def populate_article_metadata(self, article, soup, first):
shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
except:
self.log("Error creating article descriptions")
return

View File

@ -0,0 +1,61 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
pressthink.org
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class PressThink(BasicNewsRecipe):
title = 'PressThink'
__author__ = 'Darko Miletic'
description = 'Ghost of democracy in the media machine'
oldest_article = 60
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publisher = 'Arthur L. Carter Journalism Institute'
category = 'news, USA, world, economy, politics, media'
language = 'en'
publication_type = 'blog'
extra_css = """
body{ font-family: Helvetica,Arial,sans-serif }
img{display: block; margin-bottom: 0.5em}
h6{font-size: 1.1em; font-weight: bold}
.post-author{font-family: Georgia,serif}
.post-title{color: #AB0000}
.says{color: gray}
.comment {
border-bottom: 1px dotted #555555;
border-top: 1px dotted #DDDDDD;
margin-left: 10px;
min-height: 100px;
padding: 15px 0 20px;
}
"""
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher': publisher
}
remove_tags = [dict(name=['form','iframe','embed','object','link','base','table','meta'])]
keep_only_tags = [dict(attrs={'class':['post-title','post-author','entry','postmetadata alt','commentlist']})]
feeds = [(u'Articles', u'http://pressthink.org/feed/')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('img', alt=False):
item['alt'] = 'image'
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup

View File

@ -21,16 +21,53 @@ class SeattleTimes(BasicNewsRecipe):
encoding = 'cp1252'
language = 'en'

feeds = [
(u'Top Stories',
u'http://seattletimes.nwsource.com/rss/home.xml'),
#(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')
(u'Business & Technology',
u'http://seattletimes.nwsource.com/rss/businesstechnology.xml'),
(u'Personal Technology',
u'http://seattletimes.nwsource.com/rss/personaltechnology.xml'),
(u'Entertainment & the Arts',
u'http://seattletimes.nwsource.com/rss/artsentertainment.xml'),
(u'Health',
u'http://seattletimes.nwsource.com/rss/health.xml'),
(u'Living',
u'http://seattletimes.nwsource.com/rss/living.xml'),
(u'Local News',
u'http://seattletimes.nwsource.com/rss/localnews.xml'),
(u'Nation & World',
u'http://seattletimes.nwsource.com/rss/nationworld.xml'),
(u'Opinion',
u'http://seattletimes.nwsource.com/rss/opinion.xml'),
(u'Politics',
u'http://seattletimes.nwsource.com/rss/politics.xml'),
(u'Sports',
u'http://seattletimes.nwsource.com/rss/sports.xml'),
(u'Nicole Brodeur',
u'http://seattletimes.nwsource.com/rss/nicolebrodeur.xml'),
(u'Danny Westneat',
u'http://seattletimes.nwsource.com/rss/dannywestneat.xml'),
(u'Jerry Large',
u'http://seattletimes.nwsource.com/rss/jerrylarge.xml'),
(u'Ron Judd',
u'http://seattletimes.nwsource.com/rss/ronjudd.xml'),
(u'Education',
u'http://seattletimes.nwsource.com/rss/education.xml'),
(u'Letters to the Editor',
u'http://seattletimes.nwsource.com/rss/northwestvoices.xml'),
(u'Travel',
u'http://seattletimes.nwsource.com/rss/travel.xml'),
(u'Outdoors',
u'http://seattletimes.nwsource.com/rss/outdoors.xml'),
(u'Steve Kelley',
u'http://seattletimes.nwsource.com/rss/stevekelley.xml'),
(u'Jerry Brewer',
u'http://seattletimes.nwsource.com/rss/jerrybrewer.xml'),
(u'Most Read Articles',
u'http://seattletimes.nwsource.com/rss/mostreadarticles.xml'),
]
remove_tags = [
dict(name=['object','link','script'])

View File

@ -1,5 +1,5 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
#from calibre.ebooks.BeautifulSoup import BeautifulSoup
from urllib import quote

class SportsIllustratedRecipe(BasicNewsRecipe) :
@ -91,7 +91,7 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
# expire : no idea what value to use
# All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js

'''def preprocess_html(self, soup):
header = soup.find('div', attrs = {'class' : 'siv_artheader'})
homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
body = homeMadeSoup.body
@ -115,4 +115,5 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
body.append(para)

return homeMadeSoup
'''

View File

@ -35,7 +35,6 @@ class TechnologyReview(BasicNewsRecipe):
def get_article_url(self, article):
return article.get('guid', article.get('id', None))

def print_version(self, url):
baseurl='http://www.technologyreview.com/printer_friendly_article.aspx?id='
split1 = string.split(url,"/")
@ -43,3 +42,25 @@ class TechnologyReview(BasicNewsRecipe):
split2= string.split(xxx,"/")
s = baseurl + split2[0]
return s
def postprocess_html(self,soup, True):
#remove picture
headerhtml = soup.find(True, {'class':'header'})
headerhtml.replaceWith("")
#remove close button
closehtml = soup.find(True, {'class':'close'})
closehtml.replaceWith("")
#remove banner advertisement
bannerhtml = soup.find(True, {'class':'bannerad'})
bannerhtml.replaceWith("")
#thanks kiklop74! This code removes all links from the text
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup

View File

@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class TriCityHeraldRecipe(BasicNewsRecipe):
title = u'Tri-City Herald'
description = 'The Tri-City Herald Mid-Columbia.'
language = 'en'
__author__ = 'Laura Gjovaag'
oldest_article = 1.5
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
keep_only_tags = [
dict(name='div', attrs={'id':'story_header'}),
dict(name='img', attrs={'class':'imageCycle'}),
dict(name='div', attrs={'id':['cycleImageCaption', 'story_body']})
]
remove_tags = [
dict(name='div', attrs={'id':'story_mlt'}),
dict(name='a', attrs={'id':'commentCount'}),
dict(name=['script', 'noscript', 'style'])]
extra_css = 'h1{font: bold 140%;} #cycleImageCaption{font: monospace 60%}'
feeds = [
(u'Tri-City Herald Mid-Columbia', u'http://www.tri-cityherald.com/901/index.rss')
]

View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2011, Miroslav Vasko zemiak@gmail.com'
'''
.tyzden, a weekly news magazine (a week old issue)
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
import re
class TyzdenRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'zemiak'
language = 'sk'
version = 1
publisher = u'www.tyzden.sk'
category = u'Magazine'
description = u'A conservative weekly magazine. The latest free issue'
today = date.today()
iso = today.isocalendar()
year = iso[0]
weeknum = iso[1]
if (weeknum > 1):
weeknum -= 1
title = u'tyzden'
base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
base_url = base_url_path + '.html'
oldest_article = 20
max_articles_per_feed = 100
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
keep_only_tags = []
keep_only_tags.append(dict(name = 'h1'))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_area top_nofoto'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_block'}))
remove_tags_after = [dict(name = 'div', attrs = {'class': 'text_block'})]
def find_sections(self):
soup = self.index_to_soup(self.base_url)
# find cover pic
imgdiv = soup.find('div', attrs = {'class': 'foto'})
if imgdiv is not None:
img = imgdiv.find('img')
if img is not None:
self.cover_url = 'http://www.tyzden.sk/' + img['src']
# end find cover pic
for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
yield (self.tag_to_string(s), s)
def find_articles(self, soup):
for art in soup.findAllNext('a'):
if (not art['href'].startswith('casopis/')):
break
url = art['href']
title = self.tag_to_string(art)
yield {
'title': title, 'url':self.base_url_path + '/' + url, 'description':title,
'date' : strftime('%a, %d %b'),
}
def parse_index(self):
feeds = []
for title, soup in self.find_sections():
feeds.append((title, list(self.find_articles(soup))))
return feeds

View File

@ -0,0 +1,29 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1294938721(BasicNewsRecipe):
title = u'Wichita Eagle'
language = 'en'
__author__ = 'Jason Cameron'
description = 'Daily news from the Wichita Eagle'
oldest_article = 1
max_articles_per_feed = 30
keep_only_tags = [dict(name='div', attrs={'id':'wide'})]
feeds = [
(u'Local News',
u'http://www.kansas.com/news/local/index.rss'),
(u'National News',
u'http://www.kansas.com/news/nation-world/index.rss'),
(u'Sports',
u'http://www.kansas.com/sports/index.rss'),
(u'Opinion',
u'http://www.kansas.com/opinion/index.rss'),
(u'Life',
u'http://www.kansas.com/living/index.rss'),
(u'Entertainment',
u'http://www.kansas.com/entertainment/index.rss')
]
def print_version(self, url):
urlparts = url.split('/')
newadd = urlparts[5]+'/v-print'
return newadd.join(url.split(urlparts[5]))
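For clarity, a small worked example of the rewrite above; the URL shape is invented for illustration and may not match real kansas.com feed items:

# Hypothetical article URL, purely for illustration.
url = 'http://www.kansas.com/news/local/x1234/some-headline.html'
urlparts = url.split('/')              # urlparts[5] == 'x1234'
newadd = urlparts[5] + '/v-print'      # 'x1234/v-print'
print newadd.join(url.split(urlparts[5]))
# -> http://www.kansas.com/news/local/x1234/v-print/some-headline.html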

View File

@ -2,8 +2,10 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.chardet import xml_to_unicode
class Wired_Daily(BasicNewsRecipe): class Wired_Daily(BasicNewsRecipe):
@ -15,30 +17,43 @@ class Wired_Daily(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
preprocess_regexps = [(re.compile(r'<head.*</head>', re.DOTALL), lambda m:
'<head></head>')]
remove_tags_before = dict(name='div', id='content') remove_tags_before = dict(name='div', id='content')
remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar', remove_tags = [dict(id=['header', 'commenting_module', 'post_nav',
'footer', 'advertisement', 'blog_subscription_unit', 'social_tools', 'sidebar', 'footer', 'social_wishlist', 'pgwidget',
'brightcove_component']), 'outerWrapper', 'inf_widget']),
{'class':'entryActions'}, {'class':['entryActions', 'advertisement', 'entryTags']},
dict(name=['noscript', 'script'])] dict(name=['noscript', 'script']),
dict(name='h4', attrs={'class':re.compile(r'rat\d+')}),
{'class':lambda x: x and x.startswith('contentjump')},
dict(name='li', attrs={'class':['entryCategories', 'entryEdit']})]
feeds = [ feeds = [
('Top News', 'http://feeds.wired.com/wired/index'), ('Top News', 'http://feeds.wired.com/wired/index'),
('Culture', 'http://feeds.wired.com/wired/culture'), ('Product Reviews',
('Software', 'http://feeds.wired.com/wired/software'), 'http://www.wired.com/reviews/feeds/latestProductsRss'),
('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'), ('Autopia', 'http://www.wired.com/autopia/feed/'),
('Gadgets', 'http://feeds.wired.com/wired/gadgets'), ('Danger Room', 'http://www.wired.com/dangerroom/feed/'),
('Cars', 'http://feeds.wired.com/wired/cars'), ('Epicenter', 'http://www.wired.com/epicenter/feed/'),
('Entertainment', 'http://feeds.wired.com/wired/entertainment'), ('Gadget Lab', 'http://www.wired.com/gadgetlab/feed/'),
('Gaming', 'http://feeds.wired.com/wired/gaming'), ('Geek Dad', 'http://www.wired.com/geekdad/feed/'),
('Science', 'http://feeds.wired.com/wired/science'), ('Playbook', 'http://www.wired.com/playbook/feed/'),
('Med Tech', 'http://feeds.wired.com/wired/medtech'), ('Rawfile', 'http://www.wired.com/rawfile/feed/'),
('Politics', 'http://feeds.wired.com/wired/politics'), ('This Day in Tech', 'http://www.wired.com/thisdayintech/feed/'),
('Tech Biz', 'http://feeds.wired.com/wired/techbiz'), ('Threat Level', 'http://www.wired.com/threatlevel/feed/'),
('Commentary', 'http://feeds.wired.com/wired/commentary'), ('Underwire', 'http://www.wired.com/underwire/feed/'),
('Web Monkey', 'http://www.webmonkey.com/feed/'),
('Science', 'http://www.wired.com/wiredscience/feed/'),
] ]
def populate_article_metadata(self, article, soup, first):
if article.text_summary:
article.text_summary = xml_to_unicode(article.text_summary,
resolve_entities=True)[0]
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.wired.com/', 'http://www.wired.com/print/') return url + '/all/1'

View File

@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe
class YakimaHeraldRepublicRecipe(BasicNewsRecipe):
title = u'Yakima Herald-Republic'
description = 'The Yakima Herald-Republic.'
language = 'en'
__author__ = 'Laura Gjovaag'
oldest_article = 1.5
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
keep_only_tags = [
dict(name='div', attrs={'id':['searchleft', 'headline_credit']}),
dict(name='div', attrs={'class':['photo', 'cauthor', 'photocredit']}),
dict(name='div', attrs={'id':['content_body', 'footerleft']})
]
extra_css = '.cauthor {font: monospace 60%;} .photocredit {font: monospace 60%}'
feeds = [
(u'Yakima Herald Online', u'http://feeds.feedburner.com/yhronlinenews'),
]

View File

@ -0,0 +1,33 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.zerohedge.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class ZeroHedge(BasicNewsRecipe):
title = 'Zero Hedge'
__author__ = 'Darko Miletic'
description = 'On a long enough timeline the survival rate for everyone drops to zero'
oldest_article = 10
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
encoding = 'utf8'
publisher = 'zero hedge'
category = 'news, USA, world, economy, politics'
language = 'en'
masthead_url = 'http://www.zerohedge.com/themes/newsflash/logo.png'
publication_type = 'blog'
extra_css = 'body{ font-family: sans-serif }'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher': publisher
}
feeds = [(u'Articles', u'http://feeds.feedburner.com/zerohedge/feed')]

View File

@ -0,0 +1,28 @@
{
"contains": "def evaluate(self, formatter, kwargs, mi, locals,\n val, test, value_if_present, value_if_not):\n if re.search(test, val):\n return value_if_present\n else:\n return value_if_not\n",
"divide": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x / y)\n",
"uppercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.upper()\n",
"strcat": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n i = 0\n res = ''\n for i in range(0, len(args)):\n res += args[i]\n return res\n",
"substr": "def evaluate(self, formatter, kwargs, mi, locals, str_, start_, end_):\n return str_[int(start_): len(str_) if int(end_) == 0 else int(end_)]\n",
"ifempty": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_empty):\n if val:\n return val\n else:\n return value_if_empty\n",
"field": "def evaluate(self, formatter, kwargs, mi, locals, name):\n return formatter.get_value(name, [], kwargs)\n",
"capitalize": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return capitalize(val)\n",
"list_item": "def evaluate(self, formatter, kwargs, mi, locals, val, index, sep):\n if not val:\n return ''\n index = int(index)\n val = val.split(sep)\n try:\n return val[index]\n except:\n return ''\n",
"shorten": "def evaluate(self, formatter, kwargs, mi, locals,\n val, leading, center_string, trailing):\n l = max(0, int(leading))\n t = max(0, int(trailing))\n if len(val) > l + len(center_string) + t:\n return val[0:l] + center_string + ('' if t == 0 else val[-t:])\n else:\n return val\n",
"re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
"add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
"lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
"template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n",
"print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
"titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
"test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
"eval": "def evaluate(self, formatter, kwargs, mi, locals, template):\n from formatter import eval_formatter\n template = template.replace('[[', '{').replace(']]', '}')\n return eval_formatter.safe_format(template, locals, 'EVAL', None)\n",
"multiply": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x * y)\n",
"subtract": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x - y)\n",
"count": "def evaluate(self, formatter, kwargs, mi, locals, val, sep):\n return unicode(len(val.split(sep)))\n",
"lowercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.lower()\n",
"assign": "def evaluate(self, formatter, kwargs, mi, locals, target, value):\n locals[target] = value\n return value\n",
"switch": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if (len(args) % 2) != 1:\n raise ValueError(_('switch requires an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return args[i]\n if re.search(args[i], val):\n return args[i+1]\n i += 2\n",
"strcmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n v = strcmp(x, y)\n if v < 0:\n return lt\n if v == 0:\n return eq\n return gt\n",
"cmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n x = float(x if x else 0)\n y = float(y if y else 0)\n if x < y:\n return lt\n if x == y:\n return eq\n return gt\n"
}
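Every entry in this generated resource shares the evaluate(self, formatter, kwargs, mi, locals, ...) signature shown above. As a rough sketch (the class name and registration plumbing here are illustrative, not the actual calibre internals), the 'uppercase' entry corresponds to something like:

class BuiltinUppercase(object):
    # Illustrative only: the real builtins live in
    # calibre.utils.formatter_functions and carry extra metadata.
    name = 'uppercase'

    def evaluate(self, formatter, kwargs, mi, locals, val):
        # val is the rendered field value handed over by the template engine
        return val.upper()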

View File

@ -287,7 +287,7 @@
<xsl:value-of select="count(preceding::rtf:footnote) + 1"/> <xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
<xsl:text>]</xsl:text> <xsl:text>]</xsl:text>
</xsl:when> </xsl:when>
<xsl:when test="(@superscript = 'true')"> <xsl:when test="(@superscript)">
<xsl:element name="sup"> <xsl:element name="sup">
<xsl:element name="span"> <xsl:element name="span">
<xsl:attribute name="class"> <xsl:attribute name="class">
@ -297,7 +297,7 @@
</xsl:element> </xsl:element>
</xsl:element> </xsl:element>
</xsl:when> </xsl:when>
<xsl:when test="(@underscript = 'true')"> <xsl:when test="(@underscript or @subscript)">
<xsl:element name="sub"> <xsl:element name="sub">
<xsl:element name="span"> <xsl:element name="span">
<xsl:attribute name="class"> <xsl:attribute name="class">

View File

@ -117,7 +117,6 @@ if iswindows:
poppler_inc_dirs = consolidate('POPPLER_INC_DIR', poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir)) r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir) poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
popplerqt4_lib_dirs = poppler_lib_dirs popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler'] poppler_libs = ['poppler']
@ -131,7 +130,6 @@ elif isosx:
fc_lib = '/sw/lib' fc_lib = '/sw/lib'
poppler_inc_dirs = consolidate('POPPLER_INC_DIR', poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5') '/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/sw/lib') '/sw/lib')
poppler_libs = ['poppler'] poppler_libs = ['poppler']
@ -150,9 +148,6 @@ else:
# Include directories # Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler', poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler') 'POPPLER_INC_DIR', '/usr/include/poppler')
popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
if not popplerqt4_inc_dirs:
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR', png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include') '/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick') magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
poppler_error = \ poppler_error = \
('Poppler not found on your system. Various PDF related', ('Poppler not found on your system. Various PDF related',
' functionality will not work. Use the POPPLER_INC_DIR and', ' functionality will not work. Use the POPPLER_INC_DIR and',
' POPPLER_LIB_DIR environment variables.') ' POPPLER_LIB_DIR environment variables. calibre requires '
' the poppler XPDF headers. If your distro does not '
popplerqt4_error = None ' include them you will have to re-compile poppler '
if not popplerqt4_inc_dirs or not os.path.exists( ' by hand with --enable-xpdf-headers')
os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
popplerqt4_error = \
('Poppler Qt4 bindings not found on your system.')
magick_error = None magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0], if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
'wand')): 'wand')):
magick_error = ('ImageMagick not found on your system. ' magick_error = ('ImageMagick not found on your system. '
'Try setting the environment variables MAGICK_INC ' 'Try setting the environment variables MAGICK_INC '
'and MAGICK_LIB to help calibre locate the inclue and libbrary ' 'and MAGICK_LIB to help calibre locate the include and library '
'files.') 'files.')
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib) podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)

View File

@ -43,8 +43,9 @@ class Stage3(Command):
description = 'Stage 3 of the publish process' description = 'Stage 3 of the publish process'
sub_commands = ['upload_user_manual', 'upload_demo', 'sdist', sub_commands = ['upload_user_manual', 'upload_demo', 'sdist',
'upload_to_google_code', 'tag_release', 'upload_to_server', 'upload_to_google_code', 'upload_to_sourceforge',
'upload_to_sourceforge', 'upload_to_mobileread', 'tag_release', 'upload_to_server',
'upload_to_mobileread',
] ]
class Stage4(Command): class Stage4(Command):

View File

@ -84,6 +84,23 @@ class Resources(Command):
cPickle.dump(complete, open(dest, 'wb'), -1) cPickle.dump(complete, open(dest, 'wb'), -1)
self.info('\tCreating template-functions.json')
dest = self.j(self.RESOURCES, 'template-functions.json')
function_dict = {}
import inspect
from calibre.utils.formatter_functions import all_builtin_functions
for obj in all_builtin_functions:
eval_func = inspect.getmembers(obj,
lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
try:
lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]]
except:
continue
lines = ''.join(lines)
function_dict[obj.name] = lines
import json
json.dump(function_dict, open(dest, 'wb'), indent=4)
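As a standalone illustration of the inspect/json trick used above (the Example class is a throwaway stand-in for a calibre builtin, not real calibre code):

import inspect, json

class Example(object):
    def evaluate(self, formatter, kwargs, mi, locals, val):
        return val.upper()

eval_func = inspect.getmembers(Example,
        lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
# Strip the 4-space class indentation, exactly as done above
src = ''.join(l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0])
print json.dumps({'example': src}, indent=4)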
def clean(self): def clean(self):
for x in ('scripts', 'recipes', 'ebook-convert-complete'): for x in ('scripts', 'recipes', 'ebook-convert-complete'):
x = self.j(self.RESOURCES, x+'.pickle') x = self.j(self.RESOURCES, x+'.pickle')

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time
from subprocess import check_call from subprocess import check_call
from tempfile import NamedTemporaryFile, mkdtemp from tempfile import NamedTemporaryFile, mkdtemp
@ -160,7 +160,7 @@ class UploadToGoogleCode(Command):
return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body) return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body)
def upload(self, fname, desc, labels=[]): def upload(self, fname, desc, labels=[], retry=0):
form_fields = [('summary', desc)] form_fields = [('summary', desc)]
form_fields.extend([('label', l.strip()) for l in labels]) form_fields.extend([('label', l.strip()) for l in labels])
@ -183,6 +183,10 @@ class UploadToGoogleCode(Command):
print 'Failed to upload with code %d and reason: %s'%(resp.status, print 'Failed to upload with code %d and reason: %s'%(resp.status,
resp.reason) resp.reason)
if retry < 1:
print 'Retrying in 5 seconds....'
time.sleep(5)
return self.upload(fname, desc, labels=labels, retry=retry+1)
raise Exception('Failed to upload '+fname) raise Exception('Failed to upload '+fname)

View File

@ -241,7 +241,7 @@ def get_parsed_proxy(typ='http', debug=True):
return ans return ans
def browser(honor_time=True, max_time=2, mobile_browser=False): def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
''' '''
Create a mechanize browser for web scraping. The browser handles cookies, Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses proxy if available. refresh requests and ignores robots.txt. Also uses proxy if available.
@ -253,8 +253,10 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
opener = Browser() opener = Browser()
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time) opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
opener.set_handle_robots(False) opener.set_handle_robots(False)
opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \ if user_agent is None:
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13')] user_agent = ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
opener.addheaders = [('User-agent', user_agent)]
http_proxy = get_proxies().get('http', None) http_proxy = get_proxies().get('http', None)
if http_proxy: if http_proxy:
opener.set_proxies({'http':http_proxy}) opener.set_proxies({'http':http_proxy})
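A quick usage sketch of the new user_agent keyword (the UA string is just an example):

from calibre import browser

# Overrides both the desktop and mobile defaults shown above;
# passing nothing keeps the old behaviour.
br = browser(user_agent='Mozilla/5.0 (compatible; MyRecipe/1.0)')
response = br.open('http://example.com')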
@ -459,6 +461,18 @@ def force_unicode(obj, enc=preferred_encoding):
obj = obj.decode('utf-8') obj = obj.decode('utf-8')
return obj return obj
def as_unicode(obj, enc=preferred_encoding):
if not isbytestring(obj):
try:
obj = unicode(obj)
except:
try:
obj = str(obj)
except:
obj = repr(obj)
return force_unicode(obj, enc=enc)
def human_readable(size): def human_readable(size):
""" Convert a size in bytes into a human readable form """ """ Convert a size in bytes into a human readable form """

View File

@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
__appname__ = 'calibre' __appname__ = 'calibre'
__version__ = '0.7.37' __version__ = '0.7.42'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
import re import re

View File

@ -705,13 +705,17 @@ class ActionTweakEpub(InterfaceActionBase):
name = 'Tweak ePub' name = 'Tweak ePub'
actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction' actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction'
class ActionNextMatch(InterfaceActionBase):
name = 'Next Match'
actual_plugin = 'calibre.gui2.actions.next_match:NextMatchAction'
plugins += [ActionAdd, ActionFetchAnnotations, ActionGenerateCatalog, plugins += [ActionAdd, ActionFetchAnnotations, ActionGenerateCatalog,
ActionConvert, ActionDelete, ActionEditMetadata, ActionView, ActionConvert, ActionDelete, ActionEditMetadata, ActionView,
ActionFetchNews, ActionSaveToDisk, ActionShowBookDetails, ActionFetchNews, ActionSaveToDisk, ActionShowBookDetails,
ActionRestart, ActionOpenFolder, ActionConnectShare, ActionRestart, ActionOpenFolder, ActionConnectShare,
ActionSendToDevice, ActionHelp, ActionPreferences, ActionSimilarBooks, ActionSendToDevice, ActionHelp, ActionPreferences, ActionSimilarBooks,
ActionAddToLibrary, ActionEditCollections, ActionChooseLibrary, ActionAddToLibrary, ActionEditCollections, ActionChooseLibrary,
ActionCopyToLibrary, ActionTweakEpub] ActionCopyToLibrary, ActionTweakEpub, ActionNextMatch]
# }}} # }}}
@ -843,6 +847,17 @@ class Plugboard(PreferencesPlugin):
config_widget = 'calibre.gui2.preferences.plugboard' config_widget = 'calibre.gui2.preferences.plugboard'
description = _('Change metadata fields before saving/sending') description = _('Change metadata fields before saving/sending')
class TemplateFunctions(PreferencesPlugin):
name = 'TemplateFunctions'
icon = I('template_funcs.png')
gui_name = _('Template Functions')
category = 'Advanced'
gui_category = _('Advanced')
category_order = 5
name_order = 4
config_widget = 'calibre.gui2.preferences.template_functions'
description = _('Create your own template functions')
class Email(PreferencesPlugin): class Email(PreferencesPlugin):
name = 'Email' name = 'Email'
icon = I('mail.png') icon = I('mail.png')
@ -904,6 +919,6 @@ class Misc(PreferencesPlugin):
plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions, plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions,
CommonOptions, OutputOptions, Adding, Saving, Sending, Plugboard, CommonOptions, OutputOptions, Adding, Saving, Sending, Plugboard,
Email, Server, Plugins, Tweaks, Misc] Email, Server, Plugins, Tweaks, Misc, TemplateFunctions]
#}}} #}}}

View File

@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError() raise NotImplementedError()
def preprocess_html(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
the HTML, like removing hard line breaks, etc.
:param html: A unicode string
:return: A unicode string
'''
return html
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
''' '''
This method must be implemented in sub-classes. It must return This method must be implemented in sub-classes. It must return

View File

@ -441,7 +441,7 @@ class TabletOutput(iPadOutput):
class SamsungGalaxy(TabletOutput): class SamsungGalaxy(TabletOutput):
name = 'Samsung Galaxy' name = 'Samsung Galaxy'
shortname = 'galaxy' short_name = 'galaxy'
description = _('Intended for the Samsung Galaxy and similar tablet devices with ' description = _('Intended for the Samsung Galaxy and similar tablet devices with '
'a resolution of 600x1280') 'a resolution of 600x1280')
screen_size = comic_screen_size = (600, 1280) screen_size = comic_screen_size = (600, 1280)

View File

@ -21,21 +21,22 @@ class ANDROID(USBMS):
# HTC # HTC
0x0bb4 : { 0x0c02 : [0x100, 0x0227, 0x0226], 0x0c01 : [0x100, 0x0227], 0x0ff9 0x0bb4 : { 0x0c02 : [0x100, 0x0227, 0x0226], 0x0c01 : [0x100, 0x0227], 0x0ff9
: [0x0100, 0x0227, 0x0226], 0x0c87: [0x0100, 0x0227, 0x0226], : [0x0100, 0x0227, 0x0226], 0x0c87: [0x0100, 0x0227, 0x0226],
0xc92 : [0x100], 0xc97: [0x226]}, 0xc92 : [0x100], 0xc97: [0x226], 0xc99 : [0x0100]},
# Eken # Eken
0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] }, 0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] },
# Motorola # Motorola
0x22b8 : { 0x41d9 : [0x216], 0x2d61: [0x100], 0x2d67 : [0x100], 0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216] }, 0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
0x4286 : [0x216], 0x42b3 : [0x216] },
# Sony Ericsson # Sony Ericsson
0xfce : { 0xd12e : [0x0100]}, 0xfce : { 0xd12e : [0x0100]},
# Google # Google
0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226, 0x18d1 : { 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12: [0x0100, 0x226,
0x227], 0x4e21: [0x0100, 0x226, 0x227]}, 0x227], 0x4e21: [0x0100, 0x226, 0x227], 0xb058: [0x0222]},
# Samsung # Samsung
0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400], 0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],
@ -52,6 +53,9 @@ class ANDROID(USBMS):
# LG # LG
0x1004 : { 0x61cc : [0x100] }, 0x1004 : { 0x61cc : [0x100] },
# Archos
0x0e79 : { 0x1419: [0x0216], 0x1420 : [0x0216]},
} }
EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books'] EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books']
EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to ' EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to '
@ -60,18 +64,20 @@ class ANDROID(USBMS):
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN) EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)
VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER', VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE'] 'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
'TELECHIP']
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE', WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897', '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE', 'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
'SGH-T849', '_MB300'] 'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD'] 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT']
OSX_MAIN_MEM = 'HTC Android Phone Media' OSX_MAIN_MEM = 'Android Device Main Memory'
MAIN_MEMORY_VOLUME_LABEL = 'Android Phone Internal Memory' MAIN_MEMORY_VOLUME_LABEL = 'Android Device Main Memory'
SUPPORTS_SUB_DIRS = True SUPPORTS_SUB_DIRS = True

View File

@ -178,7 +178,7 @@ class INVESBOOK(EB600):
class BOOQ(EB600): class BOOQ(EB600):
name = 'Booq Device Interface' name = 'Booq Device Interface'
gui_name = 'Booq' gui_name = 'bq Reader'
FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'pdf', 'doc', 'rtf', 'txt', 'html'] FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'pdf', 'doc', 'rtf', 'txt', 'html']

View File

@ -27,7 +27,7 @@ class Book(Book_):
self.size = size # will be set later if None self.size = size # will be set later if None
if ContentType == '6': if ContentType == '6' and date is not None:
self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
else: else:
try: try:

View File

@ -33,8 +33,8 @@ class PALMPRE(USBMS):
class AVANT(USBMS): class AVANT(USBMS):
name = 'Booq Avant Device Interface' name = 'Booq Avant Device Interface'
gui_name = 'Avant' gui_name = 'bq Avant'
description = _('Communicate with the Booq Avant') description = _('Communicate with the Bq Avant')
author = 'Kovid Goyal' author = 'Kovid Goyal'
supported_platforms = ['windows', 'osx', 'linux'] supported_platforms = ['windows', 'osx', 'linux']
@ -106,7 +106,7 @@ class PDNOVEL(USBMS):
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = '__UMS_COMPOSITE' WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = '__UMS_COMPOSITE'
THUMBNAIL_HEIGHT = 130 THUMBNAIL_HEIGHT = 130
EBOOK_DIR_MAIN = 'eBooks' EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'eBooks'
SUPPORTS_SUB_DIRS = False SUPPORTS_SUB_DIRS = False
DELETE_EXTS = ['.jpg', '.jpeg', '.png'] DELETE_EXTS = ['.jpg', '.jpeg', '.png']
@ -193,6 +193,9 @@ class LUMIREAD(USBMS):
THUMBNAIL_HEIGHT = 200 THUMBNAIL_HEIGHT = 200
VENDOR_NAME = 'ACER'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'LUMIREAD_600'
def upload_cover(self, path, filename, metadata, filepath): def upload_cover(self, path, filename, metadata, filepath):
if metadata.thumbnail and metadata.thumbnail[-1]: if metadata.thumbnail and metadata.thumbnail[-1]:
cfilepath = filepath.replace('/', os.sep) cfilepath = filepath.replace('/', os.sep)

View File

@ -91,3 +91,19 @@ class NOOK_COLOR(NOOK):
EBOOK_DIR_MAIN = 'My Files/Books' EBOOK_DIR_MAIN = 'My Files/Books'
'''
def create_upload_path(self, path, mdata, fname, create_dirs=True):
filepath = NOOK.create_upload_path(self, path, mdata, fname,
create_dirs=create_dirs)
edm = self.EBOOK_DIR_MAIN.replace('/', os.sep)
npath = os.path.join(edm, _('News')) + os.sep
if npath in filepath:
filepath = filepath.replace(npath, os.sep.join('My Files',
'Magazines')+os.sep)
filedir = os.path.dirname(filepath)
if create_dirs and not os.path.exists(filedir):
os.makedirs(filedir)
return filepath
'''

View File

@ -76,17 +76,31 @@ class PRS505(USBMS):
'sending DRMed books in which you cannot change the cover.' 'sending DRMed books in which you cannot change the cover.'
' WARNING: This option should only be used with newer ' ' WARNING: This option should only be used with newer '
'SONY readers: 350, 650, 950 and newer.'), 'SONY readers: 350, 650, 950 and newer.'),
_('Refresh separate covers when using automatic management (newer readers)') +
':::' +
_('Set this option to have separate book covers uploaded '
'every time you connect your device. Unset this option if '
'you have so many books on the reader that performance is '
'unacceptable.')
] ]
EXTRA_CUSTOMIZATION_DEFAULT = [ EXTRA_CUSTOMIZATION_DEFAULT = [
', '.join(['series', 'tags']), ', '.join(['series', 'tags']),
False,
False False
] ]
OPT_COLLECTIONS = 0
OPT_UPLOAD_COVERS = 1
OPT_REFRESH_COVERS = 2
plugboard = None plugboard = None
plugboard_func = None plugboard_func = None
THUMBNAIL_HEIGHT = 200 THUMBNAIL_HEIGHT = 200
MAX_PATH_LEN = 201 # 250 - (max(len(CACHE_THUMBNAIL), len(MEDIA_THUMBNAIL)) +
# len('main_thumbnail.jpg') + 1)
def windows_filter_pnp_id(self, pnp_id): def windows_filter_pnp_id(self, pnp_id):
return '_LAUNCHER' in pnp_id return '_LAUNCHER' in pnp_id
@ -171,7 +185,7 @@ class PRS505(USBMS):
opts = self.settings() opts = self.settings()
if opts.extra_customization: if opts.extra_customization:
collections = [x.strip() for x in collections = [x.strip() for x in
opts.extra_customization[0].split(',')] opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
else: else:
collections = [] collections = []
debug_print('PRS505: collection fields:', collections) debug_print('PRS505: collection fields:', collections)
@ -183,6 +197,23 @@ class PRS505(USBMS):
c.update(blists, collections, pb) c.update(blists, collections, pb)
c.write() c.write()
if opts.extra_customization[self.OPT_REFRESH_COVERS]:
debug_print('PRS505: uploading covers in sync_booklists')
for idx,bl in blists.items():
prefix = self._card_a_prefix if idx == 1 else \
self._card_b_prefix if idx == 2 \
else self._main_prefix
for book in bl:
try:
p = os.path.join(prefix, book.lpath)
self._upload_cover(os.path.dirname(p),
os.path.splitext(os.path.basename(p))[0],
book, p)
except:
debug_print('FAILED to upload cover', p)
else:
debug_print('PRS505: NOT uploading covers in sync_booklists')
USBMS.sync_booklists(self, booklists, end_session=end_session) USBMS.sync_booklists(self, booklists, end_session=end_session)
debug_print('PRS505: finished sync_booklists') debug_print('PRS505: finished sync_booklists')
@ -199,11 +230,17 @@ class PRS505(USBMS):
def upload_cover(self, path, filename, metadata, filepath): def upload_cover(self, path, filename, metadata, filepath):
opts = self.settings() opts = self.settings()
if not opts.extra_customization[1]: if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
# Building thumbnails disabled # Building thumbnails disabled
debug_print('PRS505: not uploading covers') debug_print('PRS505: not uploading cover')
return return
debug_print('PRS505: uploading covers') debug_print('PRS505: uploading cover')
try:
self._upload_cover(path, filename, metadata, filepath)
except:
debug_print('FAILED to upload cover', filepath)
def _upload_cover(self, path, filename, metadata, filepath):
if metadata.thumbnail and metadata.thumbnail[-1]: if metadata.thumbnail and metadata.thumbnail[-1]:
path = path.replace('/', os.sep) path = path.replace('/', os.sep)
is_main = path.startswith(self._main_prefix) is_main = path.startswith(self._main_prefix)

View File

@ -98,6 +98,9 @@ class Device(DeviceConfig, DevicePlugin):
# copy these back to the library # copy these back to the library
BACKLOADING_ERROR_MESSAGE = None BACKLOADING_ERROR_MESSAGE = None
#: The maximum length of paths created on the device
MAX_PATH_LEN = 250
def reset(self, key='-1', log_packets=False, report_progress=None, def reset(self, key='-1', log_packets=False, report_progress=None,
detected_device=None): detected_device=None):
self._main_prefix = self._card_a_prefix = self._card_b_prefix = None self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
@ -875,7 +878,7 @@ class Device(DeviceConfig, DevicePlugin):
def create_upload_path(self, path, mdata, fname, create_dirs=True): def create_upload_path(self, path, mdata, fname, create_dirs=True):
path = os.path.abspath(path) path = os.path.abspath(path)
extra_components = [] maxlen = self.MAX_PATH_LEN
special_tag = None special_tag = None
if mdata.tags: if mdata.tags:
@ -902,7 +905,7 @@ class Device(DeviceConfig, DevicePlugin):
app_id = str(getattr(mdata, 'application_id', '')) app_id = str(getattr(mdata, 'application_id', ''))
# The db id will be in the created filename # The db id will be in the created filename
extra_components = get_components(template, mdata, fname, extra_components = get_components(template, mdata, fname,
timefmt=opts.send_timefmt, length=250-len(app_id)-1) timefmt=opts.send_timefmt, length=maxlen-len(app_id)-1)
if not extra_components: if not extra_components:
extra_components.append(sanitize(self.filename_callback(fname, extra_components.append(sanitize(self.filename_callback(fname,
mdata))) mdata)))
@ -937,12 +940,11 @@ class Device(DeviceConfig, DevicePlugin):
return ans return ans
extra_components = list(map(remove_trailing_periods, extra_components)) extra_components = list(map(remove_trailing_periods, extra_components))
components = shorten_components_to(250 - len(path), extra_components) components = shorten_components_to(maxlen - len(path), extra_components)
components = self.sanitize_path_components(components) components = self.sanitize_path_components(components)
filepath = os.path.join(path, *components) filepath = os.path.join(path, *components)
filedir = os.path.dirname(filepath) filedir = os.path.dirname(filepath)
if create_dirs and not os.path.exists(filedir): if create_dirs and not os.path.exists(filedir):
os.makedirs(filedir) os.makedirs(filedir)

View File

@ -18,7 +18,7 @@
__version__ = "1.0" __version__ = "1.0"
import re import re, codecs
def detect(aBuf): def detect(aBuf):
import calibre.ebooks.chardet.universaldetector as universaldetector import calibre.ebooks.chardet.universaldetector as universaldetector
@ -83,9 +83,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
if not raw: if not raw:
return u'', encoding return u'', encoding
if not isinstance(raw, unicode): if not isinstance(raw, unicode):
if raw.startswith('\xff\xfe'): if raw.startswith(codecs.BOM_UTF8):
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
elif raw.startswith(codecs.BOM_UTF16_LE):
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le' raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
elif raw.startswith('\xfe\xff'): elif raw.startswith(codecs.BOM_UTF16_BE):
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be' raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
if not isinstance(raw, unicode): if not isinstance(raw, unicode):
for pat in ENCODING_PATS: for pat in ENCODING_PATS:
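For context, a minimal standalone sketch of the BOM sniffing now done with the codecs constants (the function name is made up for illustration):

import codecs

def sniff_bom(raw):
    # Mirrors the checks above: decode, then drop the BOM character.
    if raw.startswith(codecs.BOM_UTF8):
        return raw.decode('utf-8')[1:], 'utf-8'
    if raw.startswith(codecs.BOM_UTF16_LE):
        return raw.decode('utf-16-le')[1:], 'utf-16-le'
    if raw.startswith(codecs.BOM_UTF16_BE):
        return raw.decode('utf-16-be')[1:], 'utf-16-be'
    return raw, None

print sniff_bom(codecs.BOM_UTF16_LE + u'hello'.encode('utf-16-le'))
# -> (u'hello', 'utf-16-le')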

View File

@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
def _create_oebbook(self, hhcpath, basedir, opts, log, mi): def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import DirContainer from calibre.ebooks.oeb.base import DirContainer
oeb = create_oebbook(log, None, opts, self, oeb = create_oebbook(log, None, opts,
encoding=opts.input_encoding, populate=False) encoding=opts.input_encoding, populate=False)
self.oeb = oeb self.oeb = oeb

View File

@ -42,6 +42,12 @@ option.
For full documentation of the conversion system see For full documentation of the conversion system see
''') + 'http://calibre-ebook.com/user_manual/conversion.html' ''') + 'http://calibre-ebook.com/user_manual/conversion.html'
HEURISTIC_OPTIONS = ['markup_chapter_headings',
'italicize_common_cases', 'fix_indents',
'html_unwrap_factor', 'unwrap_lines',
'delete_blank_paragraphs', 'format_scene_breaks',
'dehyphenate', 'renumber_headings']
def print_help(parser, log): def print_help(parser, log):
help = parser.format_help().encode(preferred_encoding, 'replace') help = parser.format_help().encode(preferred_encoding, 'replace')
log(help) log(help)
@ -83,6 +89,8 @@ def option_recommendation_to_cli_option(add_option, rec):
if opt.long_switch == 'verbose': if opt.long_switch == 'verbose':
attrs['action'] = 'count' attrs['action'] = 'count'
attrs.pop('type', '') attrs.pop('type', '')
if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True:
switches = ['--disable-'+opt.long_switch]
add_option(Option(*switches, **attrs)) add_option(Option(*switches, **attrs))
def add_input_output_options(parser, plumber): def add_input_output_options(parser, plumber):
@ -126,18 +134,33 @@ def add_pipeline_options(parser, plumber):
'margin_top', 'margin_left', 'margin_right', 'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification', 'margin_bottom', 'change_justification',
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size', 'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
'asciiize', 'remove_header', 'header_regex', 'asciiize',
'remove_footer', 'footer_regex',
] ]
), ),
'HEURISTIC PROCESSING' : (
_('Modify the document text and structure using common'
' patterns. Disabled by default. Use %s to enable. '
' Individual actions can be disabled with the %s options.')
% ('--enable-heuristics', '--disable-*'),
['enable_heuristics'] + HEURISTIC_OPTIONS
),
'SEARCH AND REPLACE' : (
_('Modify the document text and structure using user defined patterns.'),
[
'sr1_search', 'sr1_replace',
'sr2_search', 'sr2_replace',
'sr3_search', 'sr3_replace',
]
),
'STRUCTURE DETECTION' : ( 'STRUCTURE DETECTION' : (
_('Control auto-detection of document structure.'), _('Control auto-detection of document structure.'),
[ [
'chapter', 'chapter_mark', 'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image', 'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before', 'insert_metadata', 'page_breaks_before',
'preprocess_html', 'html_unwrap_factor',
] ]
), ),
@ -164,7 +187,8 @@ def add_pipeline_options(parser, plumber):
} }
group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION', group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
'TABLE OF CONTENTS', 'METADATA', 'DEBUG'] 'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
for group in group_order: for group in group_order:

View File

@ -72,7 +72,8 @@ class Plumber(object):
] ]
def __init__(self, input, output, log, report_progress=DummyReporter(), def __init__(self, input, output, log, report_progress=DummyReporter(),
dummy=False, merge_plugin_recs=True, abort_after_input_dump=False): dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
override_input_metadata=False):
''' '''
:param input: Path to input file. :param input: Path to input file.
:param output: Path to output file/directory :param output: Path to output file/directory
@ -87,7 +88,9 @@ class Plumber(object):
self.log = log self.log = log
self.ui_reporter = report_progress self.ui_reporter = report_progress
self.abort_after_input_dump = abort_after_input_dump self.abort_after_input_dump = abort_after_input_dump
self.override_input_metadata = override_input_metadata
# Pipeline options {{{
# Initialize the conversion options that are independent of input and # Initialize the conversion options that are independent of input and
# output formats. The input and output plugins can still disable these # output formats. The input and output plugins can still disable these
# options via recommendations. # options via recommendations.
@ -375,23 +378,6 @@ OptionRecommendation(name='insert_metadata',
) )
), ),
OptionRecommendation(name='preprocess_html',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Attempt to detect and correct hard line breaks and other '
'problems in the source file. This may make things worse, so use '
'with care.'
)
),
OptionRecommendation(name='html_unwrap_factor',
recommended_value=0.40, level=OptionRecommendation.LOW,
help=_('Scale used to determine the length at which a line should '
'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
'default is 0.40, just below the median line length. This will unwrap typical books '
' with hard line breaks, but should be reduced if the line length is variable.'
)
),
OptionRecommendation(name='smarten_punctuation', OptionRecommendation(name='smarten_punctuation',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
help=_('Convert plain quotes, dashes and ellipsis to their ' help=_('Convert plain quotes, dashes and ellipsis to their '
@ -400,32 +386,6 @@ OptionRecommendation(name='smarten_punctuation',
) )
), ),
OptionRecommendation(name='remove_header',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Use a regular expression to try and remove the header.'
)
),
OptionRecommendation(name='header_regex',
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
level=OptionRecommendation.LOW,
help=_('The regular expression to use to remove the header.'
)
),
OptionRecommendation(name='remove_footer',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Use a regular expression to try and remove the footer.'
)
),
OptionRecommendation(name='footer_regex',
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
level=OptionRecommendation.LOW,
help=_('The regular expression to use to remove the footer.'
)
),
OptionRecommendation(name='read_metadata_from_opf', OptionRecommendation(name='read_metadata_from_opf',
recommended_value=None, level=OptionRecommendation.LOW, recommended_value=None, level=OptionRecommendation.LOW,
short_switch='m', short_switch='m',
@ -526,7 +486,91 @@ OptionRecommendation(name='timestamp',
recommended_value=None, level=OptionRecommendation.LOW, recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the book timestamp (used by the date column in calibre).')), help=_('Set the book timestamp (used by the date column in calibre).')),
OptionRecommendation(name='enable_heuristics',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Enable heuristic processing. This option must be set for any '
'heuristic processing to take place.')),
OptionRecommendation(name='markup_chapter_headings',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Detect unformatted chapter headings and sub headings. Change '
'them to h2 and h3 tags. This setting will not create a TOC, '
'but can be used in conjunction with structure detection to create '
'one.')),
OptionRecommendation(name='italicize_common_cases',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Look for common words and patterns that denote '
'italics and italicize them.')),
OptionRecommendation(name='fix_indents',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Turn indentation created from multiple non-breaking space entities '
'into CSS indents.')),
OptionRecommendation(name='html_unwrap_factor',
recommended_value=0.40, level=OptionRecommendation.LOW,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.4, just below the median line length. If only a '
'few lines in the document require unwrapping, this value should '
'be reduced.')),
OptionRecommendation(name='unwrap_lines',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Unwrap lines using punctuation and other formatting clues.')),
OptionRecommendation(name='delete_blank_paragraphs',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Remove empty paragraphs from the document when they exist between '
'every other paragraph.')),
OptionRecommendation(name='format_scene_breaks',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Left aligned scene break markers are center aligned. '
'Replace soft scene breaks that use multiple blank lines with '
'horizontal rules.')),
OptionRecommendation(name='dehyphenate',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Analyze hyphenated words throughout the document. The '
'document itself is used as a dictionary to determine whether hyphens '
'should be retained or removed.')),
OptionRecommendation(name='renumber_headings',
recommended_value=True, level=OptionRecommendation.LOW,
help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
'The tags are renumbered to prevent splitting in the middle '
'of chapter headings.')),
OptionRecommendation(name='sr1_search',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Search pattern (regular expression) to be replaced with '
'sr1-replace.')),
OptionRecommendation(name='sr1_replace',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Replacement to replace the text found with sr1-search.')),
OptionRecommendation(name='sr2_search',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Search pattern (regular expression) to be replaced with '
'sr2-replace.')),
OptionRecommendation(name='sr2_replace',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Replacement to replace the text found with sr2-search.')),
OptionRecommendation(name='sr3_search',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Search pattern (regular expression) to be replaced with '
'sr3-replace.')),
OptionRecommendation(name='sr3_replace',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Replacement to replace the text found with sr3-search.')),
] ]
# }}}
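For the new search-and-replace options, a hedged sketch of what an sr1 pair does during preprocessing (pattern, replacement and input are invented; the actual wiring lives in HTMLPreProcessor):

import re

# Illustrative values for --sr1-search / --sr1-replace
sr1_search = r'<h2 class="pagenum">.*?</h2>'
sr1_replace = ''

html = '<p>Text</p><h2 class="pagenum">Page 12</h2><p>More</p>'
html = re.compile(sr1_search).sub(sr1_replace, html)
# -> '<p>Text</p><p>More</p>'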
input_fmt = os.path.splitext(self.input)[1] input_fmt = os.path.splitext(self.input)[1]
if not input_fmt: if not input_fmt:
@ -859,7 +903,6 @@ OptionRecommendation(name='timestamp',
self.opts_to_mi(self.user_metadata) self.opts_to_mi(self.user_metadata)
if not hasattr(self.oeb, 'manifest'): if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts, self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin,
encoding=self.input_plugin.output_encoding) encoding=self.input_plugin.output_encoding)
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log) self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
self.opts.is_image_collection = self.input_plugin.is_image_collection self.opts.is_image_collection = self.input_plugin.is_image_collection
@ -883,7 +926,8 @@ OptionRecommendation(name='timestamp',
self.opts.dest = self.opts.output_profile self.opts.dest = self.opts.output_profile
from calibre.ebooks.oeb.transforms.metadata import MergeMetadata from calibre.ebooks.oeb.transforms.metadata import MergeMetadata
MergeMetadata()(self.oeb, self.user_metadata, self.opts) MergeMetadata()(self.oeb, self.user_metadata, self.opts,
override_input_metadata=self.override_input_metadata)
pr(0.2) pr(0.2)
self.flush() self.flush()
@ -969,14 +1013,15 @@ OptionRecommendation(name='timestamp',
self.log(self.output_fmt.upper(), 'output written to', self.output) self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush() self.flush()
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, def create_oebbook(log, path_or_stream, opts, reader=None,
encoding='utf-8', populate=True): encoding='utf-8', populate=True):
''' '''
Create an OEBBook. Create an OEBBook.
''' '''
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, html_preprocessor = HTMLPreProcessor(log, opts)
opts.preprocess_html, opts) if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor, oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print, input_encoding=encoding) pretty_print=opts.pretty_print, input_encoding=encoding)
if not populate: if not populate:

View File

@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
import functools, re import functools, re
from calibre import entity_to_unicode from calibre import entity_to_unicode, as_unicode
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg' SVG_NS = 'http://www.w3.org/2000/svg'
@ -78,6 +78,8 @@ class DocAnalysis(object):
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL) linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html': elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
linere = re.compile('.*?\n')
self.lines = linere.findall(raw) self.lines = linere.findall(raw)
def line_length(self, percent): def line_length(self, percent):
@ -172,13 +174,19 @@ class Dehyphenator(object):
retain hyphens. retain hyphens.
''' '''
def __init__(self): def __init__(self, verbose=0, log=None):
self.log = log
self.verbose = verbose
# Add common suffixes to the regex below to increase the likelihood of a match - # Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex' # don't add suffixes which are also complete words, such as 'able' or 'sex'
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) # only remove if it's not already the point of hyphenation
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation # remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE) self.prefix_string = '^(dis|re|un|in|ex)'
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE) self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
def dehyphenate(self, match): def dehyphenate(self, match):
firsthalf = match.group('firstpart') firsthalf = match.group('firstpart')
@ -189,31 +197,48 @@ class Dehyphenator(object):
wraptags = '' wraptags = ''
hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf) hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
dehyphenated = unicode(firsthalf) + unicode(secondhalf) dehyphenated = unicode(firsthalf) + unicode(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated) if self.suffixes.match(secondhalf) is None:
if self.prefixes.match(firsthalf) is None: lookupword = self.removesuffixes.sub('', dehyphenated)
else:
lookupword = dehyphenated
if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword) lookupword = self.removeprefix.sub('', lookupword)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) if self.verbose > 2:
self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
try: try:
searchresult = self.html.find(lookupword.lower()) searchresult = self.html.find(lookupword.lower())
except: except:
return hyphenated return hyphenated
if self.format == 'html_cleanup': if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1: if self.html.find(lookupword) != -1 or searchresult != -1:
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated) if self.verbose > 2:
self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
return dehyphenated return dehyphenated
elif self.html.find(hyphenated) != -1: elif self.html.find(hyphenated) != -1:
#print "Cleanup:returned hyphenated word: " + str(hyphenated) if self.verbose > 2:
self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
return hyphenated return hyphenated
else: else:
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) if self.verbose > 2:
self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
return firsthalf+u'\u2014'+wraptags+secondhalf return firsthalf+u'\u2014'+wraptags+secondhalf
else: else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + str(hyphenated))
return hyphenated
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + str(hyphenated))
return hyphenated
if self.html.find(lookupword) != -1 or searchresult != -1: if self.html.find(lookupword) != -1 or searchresult != -1:
#print "returned dehyphenated word: " + str(dehyphenated) if self.verbose > 2:
self.log(" returned dehyphenated word: " + str(dehyphenated))
return dehyphenated return dehyphenated
else: else:
#print " returned hyphenated word: " + str(hyphenated) if self.verbose > 2:
self.log(" returned hyphenated word: " + str(hyphenated))
return hyphenated return hyphenated
     def __call__(self, html, format, length=1):
@@ -223,10 +248,15 @@ class Dehyphenator(object):
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+        elif format == 'txt':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)')
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>\w+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
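As an aside for readers following along, the dehyphenation idea is easiest to see outside calibre. Below is a minimal standalone sketch (plain re, not the Dehyphenator API above): join a word broken across a wrapped line only when the joined form already occurs elsewhere in the text.

import re

def join_wrapped_word(match, text):
    # Keep "fam-" + "ily" -> "family" only if the joined word is seen elsewhere;
    # otherwise leave the hyphenated form alone (it may be a real hyphen).
    joined = match.group('first') + match.group('second')
    if joined.lower() in text.lower():
        return joined
    return match.group(0)

sample = u'His fam-\nily was waiting. The family had been there for hours.'
pattern = re.compile(r'(?P<first>\w+)-\s*\n\s*(?P<second>\w+)', re.UNICODE)
print(pattern.sub(lambda m: join_wrapped_word(m, sample), sample))
# His family was waiting. The family had been there for hours.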
@@ -353,7 +383,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
                   # Center separator lines
-                  (re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+                  (re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
@@ -390,10 +420,8 @@ class HTMLPreProcessor(object):
                   (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                    lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                  ]

-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
         self.extra_opts = extra_opts

     def is_baen(self, src):
@@ -429,27 +457,20 @@ class HTMLPreProcessor(object):
         if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
search_pattern = getattr(self.extra_opts, search, '')
if search_pattern:
try:
search_re = re.compile(search_pattern)
replace_txt = getattr(self.extra_opts, replace, '')
if not replace_txt:
replace_txt = ''
rules.insert(0, (search_re, replace_txt))
except Exception as e:
self.log.error('Failed to parse %r regexp because %s' %
(search, as_unicode(e)))
end_rules = [] end_rules = []
if getattr(self.extra_opts, 'remove_header', None):
try:
rules.insert(0,
(re.compile(self.extra_opts.header_regex), lambda match : '')
)
except:
import traceback
print 'Failed to parse remove_header regexp'
traceback.print_exc()
if getattr(self.extra_opts, 'remove_footer', None):
try:
rules.insert(0,
(re.compile(self.extra_opts.footer_regex), lambda match : '')
)
except:
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
# delete soft hyphens - moved here so it's executed after header/footer removal # delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml: if is_pdftohtml:
# unwrap/delete soft hyphens # unwrap/delete soft hyphens
@ -457,12 +478,6 @@ class HTMLPreProcessor(object):
# unwrap/delete soft hyphens with formatting # unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
length = -1 length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html) docanalysis = DocAnalysis('pdf', html)
@ -473,7 +488,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append( end_rules.append(
# Un wrap using punctuation # Un wrap using punctuation
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
) )
for rule in self.PREPROCESS + start_rules: for rule in self.PREPROCESS + start_rules:
@@ -505,15 +520,14 @@ class HTMLPreProcessor(object):
         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)

         if is_pdftohtml:
-            from calibre.ebooks.conversion.utils import PreProcessor
-            pdf_markup = PreProcessor(self.extra_opts, None)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
-            totalwords = pdf_markup.get_word_count(html)
-            if totalwords > 7000:
+            if pdf_markup.get_word_count(html) > 7000:
                 html = pdf_markup.markup_chapters(html, totalwords, True)

         #dump(html, 'post-preprocess')
@@ -533,8 +547,10 @@ class HTMLPreProcessor(object):
             unidecoder = Unidecoder()
             html = unidecoder.decode(html)

-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)

         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
@@ -561,8 +577,8 @@ class HTMLPreProcessor(object):
         html = html.replace(start, '<!--')
         html = html.replace(stop, '-->')
         # convert ellipsis to entities to prevent wrapping
-        html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+        html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
         # convert double dashes to em-dash
-        html = re.sub('\s--\s', u'\u2014', html)
+        html = re.sub(r'\s--\s', u'\u2014', html)
         return substitute_entites(html)
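Returning to the sr1/sr2/sr3 search-and-replace options handled earlier in this method, here is a rough self-contained sketch of how one user-supplied pair becomes a rule; the option values below are hypothetical, only the option names come from this diff.

import re

class FakeOpts(object):
    # Hypothetical values; in calibre these come from the conversion options.
    sr1_search = r'<a name=\d+></a>'
    sr1_replace = ''

opts = FakeOpts()
rules = []
if getattr(opts, 'sr1_search', ''):
    # Mirrors the loop above: compile the expression and place it ahead of
    # the built-in rules (empty replacement text falls back to '').
    rules.insert(0, (re.compile(opts.sr1_search), getattr(opts, 'sr1_replace', '') or ''))

print(rules[0][0].sub(rules[0][1], u'<p><a name=12></a>Some text</p>'))
# <p>Some text</p>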
View File
@@ -11,13 +11,22 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj

-class PreProcessor(object):
+class HeuristicProcessor(object):

     def __init__(self, extra_opts=None, log=None):
         self.log = default_log if log is None else log
         self.html_preprocess_sections = 0
         self.found_indents = 0
         self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.totalwords = 0
+        self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
+        self.blanks_deleted = False
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)

     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -27,12 +36,12 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
@@ -40,10 +49,18 @@ class PreProcessor(object):
         chap = match.group('section')
         styles = match.group('styles')
         self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log("marked " + unicode(self.html_preprocess_sections) +
+        self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                 " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
def analyze_title_matches(self, match):
#chap = match.group('chap')
title = match.group('title')
if not title:
self.chapters_no_title = self.chapters_no_title + 1
else:
self.chapters_with_title = self.chapters_with_title + 1
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -75,8 +92,8 @@ class PreProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
-                unicode(tot_htm_ends) + " marked up endings")
+        #self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        #        unicode(tot_htm_ends) + " marked up endings")

         if percent > 1:
             percent = 1
@@ -84,9 +101,8 @@ class PreProcessor(object):
             percent = 0

         min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
-        if min_lns > tot_htm_ends:
-            return True
+        #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        return min_lns > tot_htm_ends

     def dump(self, raw, where):
         import os
@@ -112,16 +128,55 @@ class PreProcessor(object):
         wordcount = get_wordcount_obj(word_count_text)
         return wordcount.words
def markup_italicis(self, html):
ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
'Mlle.', 'Mons.', 'PS.', 'PPS.',
]
ITALICIZE_STYLE_PATS = [
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
]
for word in ITALICIZE_WORDS:
html = html.replace(word, '<i>%s</i>' % word)
for pat in ITALICIZE_STYLE_PATS:
html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
return html
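A quick illustration of one of the ITALICIZE_STYLE_PATS above (the plain-underscore form), using nothing but re; the sample sentence is made up.

import re

pat = r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)'
sample = u'It was _very_ late.'
print(re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), sample))
# It was <i>very</i> late.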
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document
+        attempts multiple patterns based on likelihood of a match
+        with minimum false positives. Exits after finding a successful pattern
+        '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
-        self.min_chapters = 1
+        # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 7000.))
-            #print "minimum chapters required are: "+str(self.min_chapters)
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
+        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+        self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
@@ -151,88 +206,160 @@ class PreProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
+
+        analysis_result = []

         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
             ]
# Start with most typical chapter headings, get more aggressive until one works def recurse_patterns(html, analyze):
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: # Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections >= self.min_chapters: for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
break n_lookahead = ''
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close hits = 0
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) self.chapters_no_title = 0
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) self.chapters_with_title = 0
if lookahead_ignorecase:
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close if n_lookahead_req:
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) lp_n_lookahead_open = n_lookahead_open
else: lp_n_lookahead_close = n_lookahead_close
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close else:
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) lp_n_lookahead_open = ''
html = chapdetect.sub(self.chapter_head, html) lp_n_lookahead_close = ''
if strict_title:
lp_title = default_title
else:
lp_title = simple_title
if ignorecase:
arg_ignorecase = r'(?i)'
else:
arg_ignorecase = ''
if title_req:
lp_opt_title_open = ''
lp_opt_title_close = ''
else:
lp_opt_title_open = opt_title_open
lp_opt_title_close = opt_title_close
if self.html_preprocess_sections >= self.min_chapters:
break
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
if n_lookahead_req:
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
if not analyze:
self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
chapdetect = re.compile(r'%s' % chapter_marker)
if analyze:
hits = len(chapdetect.findall(html))
if hits:
chapdetect.sub(self.analyze_title_matches, html)
if float(self.chapters_with_title) / float(hits) > .5:
title_req = True
strict_title = False
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
elif self.min_chapters <= hits < max_chapters:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break
else:
html = chapdetect.sub(self.chapter_head, html)
return html
recurse_patterns(html, True)
chapter_types = analysis_result
html = recurse_patterns(html, False)
         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:
             words_per_chptr = wordcount / self.html_preprocess_sections
-        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+        self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
         return html
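To make the chapter-count floor above concrete, this is a small standalone illustration of the arithmetic markup_chapters uses (the word counts are made-up examples):

from math import ceil

for wordcount in (30000, 90000, 250000):
    typical_chapters = 15000. if wordcount > 200000 else 7000.
    min_chapters = int(ceil(wordcount / typical_chapters))
    print("%d words -> look for at least %d headings" % (wordcount, min_chapters))
# 30000 words -> look for at least 5 headings
# 90000 words -> look for at least 13 headings
# 250000 words -> look for at least 17 headings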
def punctuation_unwrap(self, length, content, format):
'''
Unwraps lines based on line length and punctuation
supports a range of html markup and text files
'''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
soft_hyphen = u"\xad"
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
def __call__(self, html): if format == 'txt':
self.log("********* Preprocessing HTML *********") unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap
# Count the words in the document to estimate how many chapters to look for and whether unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
# other types of processing are attempted em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
totalwords = 0 shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
totalwords = self.get_word_count(html)
if totalwords < 20: content = unwrap.sub(' ', content)
self.log("not enough text, not preprocessing") content = em_en_unwrap.sub('', content)
return html content = shy_unwrap.sub('', content)
return content
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly def txt_process(self, match):
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
content = match.group('text')
content = separate_paragraphs_single_line(content)
content = preserve_spaces(content)
content = convert_basic(content, epub_split_size_kb=0)
return content
def markup_pre(self, html):
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) >= 1:
self.log.debug("Running Text Processing")
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub(self.txt_process, html)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
return html
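The naive fallback in markup_pre above just turns bare line feeds into paragraph boundaries; a tiny standalone example of that substitution:

import re

add_markup = re.compile('(?<!>)(\n)')
sample = 'First line\nSecond line\nThird line'
print(add_markup.sub('</p>\n<p>', sample))
# First line</p>
# <p>Second line</p>
# <p>Third line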
def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html) html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html) html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
return html
###### Check Markup ###### def fix_nbsp_indents(self, html):
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
# check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub('\g<text>', html)
html = separate_paragraphs_single_line(html)
html = preserve_spaces(html)
html = convert_basic(html, epub_split_size_kb=0)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
###### Mark Indents/Cleanup ######
#
# Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html) html = txtindent.sub(self.insert_indent, html)
if self.found_indents > 1: if self.found_indents > 1:
self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles") self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
return html
def cleanup_markup(self, html):
# remove remaining non-breaking spaces # remove remaining non-breaking spaces
html = re.sub(ur'\u00a0', ' ', html) html = re.sub(ur'\u00a0', ' ', html)
# Get rid of various common microsoft specific tags which can cause issues later # Get rid of various common microsoft specific tags which can cause issues later
@ -240,109 +367,166 @@ class PreProcessor(object):
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
# Delete microsoft 'smart' tags # Delete microsoft 'smart' tags
html = re.sub('(?i)</?st1:\w+>', '', html) html = re.sub('(?i)</?st1:\w+>', '', html)
# Get rid of empty span, bold, & italics tags # Get rid of empty span, bold, font, em, & italics tags
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html) html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
# ADE doesn't render <br />, change to empty paragraphs html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html) self.deleted_nbsps = True
return html
# If more than 40% of the lines are empty paragraphs and the user has enabled remove def analyze_line_endings(self, html):
# paragraph spacing then delete blank lines to clean up spacing '''
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) determines the type of html line ending used most commonly in a document
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) use before calling docanalysis functions
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE) '''
blanklines = blankreg.findall(html)
lines = linereg.findall(html)
blanks_between_paragraphs = False
if len(lines) > 1:
self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
'remove_paragraph_spacing', False):
self.log("deleting blank lines")
html = blankreg.sub('', html)
elif float(len(blanklines)) / float(len(lines)) > 0.40:
blanks_between_paragraphs = True
#print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
#self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
#
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
###### Unwrap lines ######
#
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE) paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE) spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
paras = len(paras_reg.findall(html)) paras = len(paras_reg.findall(html))
spans = len(spans_reg.findall(html)) spans = len(spans_reg.findall(html))
if spans > 1: if spans > 1:
if float(paras) / float(spans) < 0.75: if float(paras) / float(spans) < 0.75:
format = 'spanned_html' return 'spanned_html'
else: else:
format = 'html' return 'html'
else: else:
format = 'html' return 'html'
def analyze_blanks(self, html):
blanklines = self.blankreg.findall(html)
lines = self.linereg.findall(html)
if len(lines) > 1:
self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40:
return True
else:
return False
def cleanup_required(self):
for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
if getattr(self.extra_opts, option, False):
return True
return False
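cleanup_required() and the per-option gates used below only fire when the corresponding heuristic option is enabled; a minimal stand-in for extra_opts is enough to see the gating (the option object here is hypothetical, only the option names come from this diff):

def cleanup_required(opts):
    # Standalone restatement of HeuristicProcessor.cleanup_required() above.
    return any(getattr(opts, option, False) for option in
               ('unwrap_lines', 'markup_chapter_headings',
                'format_scene_breaks', 'delete_blank_paragraphs'))

class FakeOpts(object):
    unwrap_lines = True   # everything else defaults to False via getattr()

print(cleanup_required(FakeOpts()))
# True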
def __call__(self, html):
self.log.debug("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
try:
self.totalwords = self.get_word_count(html)
except:
self.log.warn("Can't get wordcount")
if self.totalwords < 50:
self.log.warn("flow is too short, not running heuristics")
return html
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
if self.cleanup_required():
###### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
# fix indents must run after this step
if self.no_markup(html, 0.1):
self.log.debug("not enough paragraph markers, adding now")
# markup using text processing
html = self.markup_pre(html)
# Replace series of non-breaking spaces with text-indent
if getattr(self.extra_opts, 'fix_indents', False):
html = self.fix_nbsp_indents(html)
if self.cleanup_required():
# fix indents must run before this step, as it removes non-breaking spaces
html = self.cleanup_markup(html)
# ADE doesn't render <br />, change to empty paragraphs
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
# Determine whether the document uses interleaved blank lines
blanks_between_paragraphs = self.analyze_blanks(html)
#self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
if getattr(self.extra_opts, 'markup_chapter_headings', False):
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
if getattr(self.extra_opts, 'italicize_common_cases', False):
html = self.markup_italicis(html)
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
# blank paragraphs then delete blank lines to clean up spacing
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
self.log.debug("deleting blank lines")
self.blanks_deleted = True
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
html = self.blankreg.sub('', html)
# Determine line ending type
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
format = self.analyze_line_endings(html)
# Check Line histogram to determine if the document uses hard line breaks, If 50% or # Check Line histogram to determine if the document uses hard line breaks, If 50% or
# more of the lines break in the same region of the document then unwrapping is required # more of the lines break in the same region of the document then unwrapping is required
docanalysis = DocAnalysis(format, html) docanalysis = DocAnalysis(format, html)
hardbreaks = docanalysis.line_histogram(.50) hardbreaks = docanalysis.line_histogram(.50)
self.log("Hard line breaks check returned "+unicode(hardbreaks)) self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))
# Calculate Length # Calculate Length
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4) unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
length = docanalysis.line_length(unwrap_factor) length = docanalysis.line_length(unwrap_factor)
self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format") self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4:
self.log("Unwrapping required, unwrapping Lines")
# Unwrap em/en dashes
html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
# Dehyphenate
self.log("Unwrapping/Removing hyphens")
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html', length)
self.log("Done dehyphenating")
# Unwrap lines using punctation and line length
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
#check any remaining hyphens, but only unwrap if there is a match
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
else:
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
self.log("Cleaning up hyphenation")
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
self.log("Done dehyphenating")
# delete soft hyphens ###### Unwrap lines ######
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) if getattr(self.extra_opts, 'unwrap_lines', False):
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4:
self.log.debug("Unwrapping required, unwrapping Lines")
# Dehyphenate with line length limiters
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length)
html = self.punctuation_unwrap(length, html, 'html')
if getattr(self.extra_opts, 'dehyphenate', False):
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
self.log.debug("Fixing hyphenated content")
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html_cleanup', length)
html = dehyphenator(html, 'individual_words', length)
# If still no sections after unwrapping mark split points on lines with no punctuation # If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < self.min_chapters: if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
self.log("Looking for more split points based on punctuation," self.log.debug("Looking for more split points based on punctuation,"
" currently have " + unicode(self.html_preprocess_sections)) " currently have " + unicode(self.html_preprocess_sections))
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html) html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
# put back non-breaking spaces in empty paragraphs to preserve original formatting if getattr(self.extra_opts, 'renumber_headings', False):
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html) # search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
# Center separator lines if getattr(self.extra_opts, 'format_scene_breaks', False):
html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html) # Center separator lines
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
if not self.blanks_deleted:
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs to preserve original formatting
html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
        return html
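As a footnote to the blank-line handling earlier in this class (analyze_blanks and the delete_blank_paragraphs option), here is a standalone illustration of the 40% blank-paragraph threshold using the same two expressions; the sample markup is invented.

import re

blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)

html = u'<p>one</p>\n<p></p>\n<p>two</p>\n<p></p>\n<p></p>\n'
blanks, lines = len(blankreg.findall(html)), len(linereg.findall(html))
print("%d of %d paragraphs blank -> %s" % (blanks, lines, blanks / float(lines) > 0.40))
# 3 of 5 paragraphs blank -> True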
View File
@ -16,7 +16,6 @@ import uuid
from lxml import etree from lxml import etree
from calibre import guess_type
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__ from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
@ -41,7 +40,7 @@ class FB2MLizer(object):
# in different directories. FB2 images are all in a flat layout so we rename all images # in different directories. FB2 images are all in a flat layout so we rename all images
# into a sequential numbering system to ensure there are no collisions between image names. # into a sequential numbering system to ensure there are no collisions between image names.
self.image_hrefs = {} self.image_hrefs = {}
# Mapping of toc items and their # Mapping of toc items and their
self.toc = {} self.toc = {}
# Used to see whether a new <section> needs to be opened # Used to see whether a new <section> needs to be opened
self.section_level = 0 self.section_level = 0
@ -51,7 +50,7 @@ class FB2MLizer(object):
self.oeb_book = oeb_book self.oeb_book = oeb_book
self.opts = opts self.opts = opts
self.reset_state() self.reset_state()
# Used for adding <section>s and <title>s to allow readers # Used for adding <section>s and <title>s to allow readers
# to generate toc from the document. # to generate toc from the document.
if self.opts.sectionize == 'toc': if self.opts.sectionize == 'toc':
@ -75,20 +74,20 @@ class FB2MLizer(object):
text = re.sub(r'(?miu)<p>\s*</p>', '', text) text = re.sub(r'(?miu)<p>\s*</p>', '', text)
text = re.sub(r'(?miu)\s*</p>', '</p>', text) text = re.sub(r'(?miu)\s*</p>', '</p>', text)
text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text) text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
text = re.sub(r'(?miu)<title>\s*</title>', '', text) text = re.sub(r'(?miu)<title>\s*</title>', '', text)
text = re.sub(r'(?miu)\s+</title>', '</title>', text) text = re.sub(r'(?miu)\s+</title>', '</title>', text)
text = re.sub(r'(?miu)<section>\s*</section>', '', text) text = re.sub(r'(?miu)<section>\s*</section>', '', text)
text = re.sub(r'(?miu)\s*</section>', '\n</section>', text) text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text) text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
text = re.sub(r'(?miu)\s*<section>', '\n<section>', text) text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
text = re.sub(r'(?miu)<section>\s*', '<section>\n', text) text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text) text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
if self.opts.insert_blank_line: if self.opts.insert_blank_line:
text = re.sub(r'(?miu)</p>', '</p><empty-line />', text) text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
return text return text
def fb2_header(self): def fb2_header(self):
@ -102,6 +101,7 @@ class FB2MLizer(object):
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year)
metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en' metadata['lang'] = u''.join(self.oeb_book.metadata.lang) if self.oeb_book.metadata.lang else 'en'
metadata['id'] = None metadata['id'] = None
metadata['cover'] = self.get_cover()
author_parts = self.oeb_book.metadata.creator[0].value.split(' ') author_parts = self.oeb_book.metadata.creator[0].value.split(' ')
if len(author_parts) == 1: if len(author_parts) == 1:
@ -121,10 +121,11 @@ class FB2MLizer(object):
break break
if metadata['id'] is None: if metadata['id'] is None:
self.log.warn('No UUID identifier found') self.log.warn('No UUID identifier found')
metadata['id'] = str(uuid.uuid4()) metadata['id'] = str(uuid.uuid4())
         for key, value in metadata.items():
-            metadata[key] = prepare_string_for_xml(value)
+            if not key == 'cover':
+                metadata[key] = prepare_string_for_xml(value)
return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \ return u'<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">' \
'<description>' \ '<description>' \
@ -136,6 +137,7 @@ class FB2MLizer(object):
'<last-name>%(author_last)s</last-name>' \ '<last-name>%(author_last)s</last-name>' \
'</author>' \ '</author>' \
'<book-title>%(title)s</book-title>' \ '<book-title>%(title)s</book-title>' \
'%(cover)s' \
'<lang>%(lang)s</lang>' \ '<lang>%(lang)s</lang>' \
'</title-info>' \ '</title-info>' \
'<document-info>' \ '<document-info>' \
@ -154,48 +156,64 @@ class FB2MLizer(object):
def fb2_footer(self): def fb2_footer(self):
return u'</FictionBook>' return u'</FictionBook>'
def get_cover(self):
cover_href = None
# Get the raster cover if it's available.
if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
id = unicode(self.oeb_book.metadata.cover[0])
cover_item = self.oeb_book.manifest.ids[id]
if cover_item.media_type in OEB_RASTER_IMAGES:
cover_href = cover_item.href
else:
# Figure out if we have a title page or a cover page
page_name = ''
if 'titlepage' in self.oeb_book.guide:
page_name = 'titlepage'
elif 'cover' in self.oeb_book.guide:
page_name = 'cover'
if page_name:
cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href]
# Get the first image in the page
for img in cover_item.xpath('//img'):
cover_href = cover_item.abshref(img.get('src'))
break
if cover_href:
# Only write the image tag if it is in the manifest.
if cover_href in self.oeb_book.manifest.hrefs.keys():
if cover_href not in self.image_hrefs.keys():
self.image_hrefs[cover_href] = '_%s.jpg' % len(self.image_hrefs.keys())
return u'<coverpage><image xlink:href="#%s" /></coverpage>' % self.image_hrefs[cover_href]
return u''
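A small self-contained sketch of the naming scheme get_cover() uses above; the manifest href is hypothetical, and the generated id matches what the sequential image numbering described at the top of this file expects.

image_hrefs = {}
cover_href = 'images/cover.jpg'   # hypothetical manifest href
if cover_href not in image_hrefs:
    image_hrefs[cover_href] = '_%s.jpg' % len(image_hrefs)
print(u'<coverpage><image xlink:href="#%s" /></coverpage>' % image_hrefs[cover_href])
# <coverpage><image xlink:href="#_0.jpg" /></coverpage>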
def get_text(self): def get_text(self):
text = ['<body>'] text = ['<body>']
# Create main section if there are no others to create # Create main section if there are no others to create
if self.opts.sectionize == 'nothing': if self.opts.sectionize == 'nothing':
text.append('<section>') text.append('<section>')
self.section_level += 1 self.section_level += 1
# Insert the title page / cover into the spine if it is not already referenced.
title_name = u''
if 'titlepage' in self.oeb_book.guide:
title_name = 'titlepage'
elif 'cover' in self.oeb_book.guide:
title_name = 'cover'
if title_name:
title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
self.oeb_book.spine.insert(0, title_item, True)
# Create xhtml page to reference cover image so it can be used.
if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
id = unicode(self.oeb_book.metadata.cover[0])
cover_item = self.oeb_book.manifest.ids[id]
if cover_item.media_type in OEB_RASTER_IMAGES:
self.insert_image_cover(cover_item.href)
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href) self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
# Start a <section> if we must sectionize each file or if the TOC references this page # Start a <section> if we must sectionize each file or if the TOC references this page
page_section_open = False page_section_open = False
if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page': if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page':
text.append('<section>') text.append('<section>')
page_section_open = True page_section_open = True
self.section_level += 1 self.section_level += 1
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
if page_section_open: if page_section_open:
text.append('</section>') text.append('</section>')
self.section_level -= 1 self.section_level -= 1
# Close any open sections # Close any open sections
while self.section_level > 0: while self.section_level > 0:
text.append('</section>') text.append('</section>')
@ -203,17 +221,6 @@ class FB2MLizer(object):
return ''.join(text) + '</body>' return ''.join(text) + '</body>'
def insert_image_cover(self, image_href):
from calibre.ebooks.oeb.base import RECOVER_PARSER
try:
root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href), parser=RECOVER_PARSER)
except:
root = etree.fromstring(u'', parser=RECOVER_PARSER)
id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml')
item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root)
self.oeb_book.spine.insert(0, item, True)
def fb2mlize_images(self): def fb2mlize_images(self):
''' '''
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
@ -345,7 +352,7 @@ class FB2MLizer(object):
self.toc[page.href] = None self.toc[page.href] = None
elif toc_entry and elem_tree.attrib.get('id', None): elif toc_entry and elem_tree.attrib.get('id', None):
newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None) newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)
# Start a new section if necessary # Start a new section if necessary
if newlevel: if newlevel:
if not (newlevel > self.section_level): if not (newlevel > self.section_level):
View File
@ -46,15 +46,19 @@ class FB2Input(InputFormatPlugin):
log.debug('Parsing XML...') log.debug('Parsing XML...')
raw = stream.read().replace('\0', '') raw = stream.read().replace('\0', '')
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True)[0] assume_utf8=True, resolve_entities=True)[0]
try: try:
doc = etree.fromstring(raw) doc = etree.fromstring(raw)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
try: try:
doc = etree.fromstring(raw, parser=RECOVER_PARSER) doc = etree.fromstring(raw, parser=RECOVER_PARSER)
if doc is None:
raise Exception('parse failed')
except: except:
doc = etree.fromstring(raw.replace('& ', '&amp;'), doc = etree.fromstring(raw.replace('& ', '&amp;'),
parser=RECOVER_PARSER) parser=RECOVER_PARSER)
if doc is None:
raise ValueError('The FB2 file is not valid XML')
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]') stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
css = '' css = ''
for s in stylesheets: for s in stylesheets:
@@ -100,13 +104,17 @@ class FB2Input(InputFormatPlugin):
         entries = [(f, guess_type(f)[0]) for f in os.listdir('.')]
         opf.create_manifest(entries)
         opf.create_spine(['index.xhtml'])
-        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
-            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
-            if href is not None:
-                if href.startswith('#'):
-                    href = href[1:]
-                opf.guide.set_cover(os.path.abspath(href))
+        if mi.cover_data and mi.cover_data[1]:
+            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
+                f.write(mi.cover_data[1])
+            opf.guide.set_cover(os.path.abspath('fb2_cover_calibre_mi.jpg'))
+        else:
+            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
+                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
+                if href is not None:
+                    if href.startswith('#'):
+                        href = href[1:]
+                    opf.guide.set_cover(os.path.abspath(href))
         opf.render(open('metadata.opf', 'wb'))
         return os.path.join(os.getcwd(), 'metadata.opf')
View File
@ -21,10 +21,9 @@ from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation from calibre.customize.conversion import OptionRecommendation
from calibre.constants import islinux, isfreebsd, iswindows from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path from calibre import unicode_path, as_unicode
from calibre.utils.localization import get_lang from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.ebooks.conversion.utils import PreProcessor
class Link(object): class Link(object):
''' '''
@@ -112,14 +111,14 @@ class HTMLFile(object):
             with open(self.path, 'rb') as f:
                 src = f.read()
         except IOError, err:
-            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
+            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
             if level == 0:
                 raise IOError(msg)
             raise IgnoreFile(msg, err.errno)

         self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
         if not self.is_binary:
-            if encoding is None:
+            if not encoding:
                 encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                 self.encoding = encoding
             else:
@ -296,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
return oeb return oeb
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, stream.name, opts, self, return create_oebbook(log, stream.name, opts,
encoding=opts.input_encoding) encoding=opts.input_encoding)
def is_case_sensitive(self, path): def is_case_sensitive(self, path):
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
self.log.exception('Failed to read CSS file: %r'%link) self.log.exception('Failed to read CSS file: %r'%link)
return (None, None) return (None, None)
return (None, raw) return (None, raw)
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)
View File
@ -7,8 +7,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
class LITInput(InputFormatPlugin): class LITInput(InputFormatPlugin):
@ -22,7 +20,7 @@ class LITInput(InputFormatPlugin):
from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
self.log = log self.log = log
return create_oebbook(log, stream, options, self, reader=LitReader) return create_oebbook(log, stream, options, reader=LitReader)
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
@ -39,10 +37,13 @@ class LITInput(InputFormatPlugin):
body = body[0] body = body[0]
if len(body) == 1 and body[0].tag == XHTML('pre'): if len(body) == 1 and body[0].tag == XHTML('pre'):
pre = body[0] pre = body[0]
from calibre.ebooks.txt.processor import convert_basic from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
from lxml import etree from lxml import etree
import copy import copy
html = convert_basic(pre.text).replace('<html>', html = separate_paragraphs_single_line(pre.text)
html = preserve_spaces(html)
html = convert_basic(html).replace('<html>',
'<html xmlns="%s">'%XHTML_NS) '<html xmlns="%s">'%XHTML_NS)
root = etree.fromstring(html) root = etree.fromstring(html)
body = XPath('//h:body')(root) body = XPath('//h:body')(root)
@ -51,10 +52,3 @@ class LITInput(InputFormatPlugin):
for elem in body: for elem in body:
ne = copy.deepcopy(elem) ne = copy.deepcopy(elem)
pre.append(ne) pre.append(ne)
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree from lxml import etree
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type from calibre import guess_type
class Canvas(etree.XSLTExtension): class Canvas(etree.XSLTExtension):
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
f.write(result) f.write(result)
styles.write() styles.write()
return os.path.abspath('content.opf') return os.path.abspath('content.opf')
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -324,14 +324,16 @@ class Metadata(object):
if metadata is None: if metadata is None:
traceback.print_stack() traceback.print_stack()
return return
metadata = copy.deepcopy(metadata) m = {}
if '#value#' not in metadata: for k in metadata:
if metadata['datatype'] == 'text' and metadata['is_multiple']: m[k] = copy.copy(metadata[k])
metadata['#value#'] = [] if '#value#' not in m:
if m['datatype'] == 'text' and m['is_multiple']:
m['#value#'] = []
else: else:
metadata['#value#'] = None m['#value#'] = None
_data = object.__getattribute__(self, '_data') _data = object.__getattribute__(self, '_data')
_data['user_metadata'][field] = metadata _data['user_metadata'][field] = m
def template_to_attribute(self, other, ops): def template_to_attribute(self, other, ops):
''' '''

View File

@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Fetch cover from LibraryThing.com based on ISBN number. Fetch cover from LibraryThing.com based on ISBN number.
''' '''
import sys, socket, os, re import sys, socket, os, re, random
from lxml import html from lxml import html
import mechanize import mechanize
@ -16,13 +16,26 @@ from calibre.ebooks.chardet import strip_encoding_declarations
OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false' OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'
def get_ua():
choices = [
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
'Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
]
return choices[random.randint(0, len(choices)-1)]
class HeadRequest(mechanize.Request): class HeadRequest(mechanize.Request):
def get_method(self): def get_method(self):
return 'HEAD' return 'HEAD'
def check_for_cover(isbn, timeout=5.): def check_for_cover(isbn, timeout=5.):
br = browser() br = browser(user_agent=get_ua())
br.set_handle_redirect(False) br.set_handle_redirect(False)
try: try:
br.open_novisit(HeadRequest(OPENLIBRARY%isbn), timeout=timeout) br.open_novisit(HeadRequest(OPENLIBRARY%isbn), timeout=timeout)
@ -51,7 +64,7 @@ def login(br, username, password, force=True):
def cover_from_isbn(isbn, timeout=5., username=None, password=None): def cover_from_isbn(isbn, timeout=5., username=None, password=None):
src = None src = None
br = browser() br = browser(user_agent=get_ua())
try: try:
return br.open(OPENLIBRARY%isbn, timeout=timeout).read(), 'jpg' return br.open(OPENLIBRARY%isbn, timeout=timeout).read(), 'jpg'
except: except:
@ -100,7 +113,7 @@ def get_social_metadata(title, authors, publisher, isbn, username=None,
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
mi = MetaInformation(title, authors) mi = MetaInformation(title, authors)
if isbn: if isbn:
br = browser() br = browser(user_agent=get_ua())
if username and password: if username and password:
try: try:
login(br, username, password, force=False) login(br, username, password, force=False)

View File

@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors
title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL) title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL) author_pat = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL) comment_pat = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL) tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
def get_document_info(stream): def get_document_info(stream):
""" """
@ -82,61 +83,73 @@ def decode(raw, codec):
def get_metadata(stream): def get_metadata(stream):
""" Return metadata as a L{MetaInfo} object """ """ Return metadata as a L{MetaInfo} object """
title, author, comment, category = None, None, None, None
stream.seek(0) stream.seek(0)
if stream.read(5) != r'{\rtf': if stream.read(5) != r'{\rtf':
return MetaInformation(None, None) return MetaInformation(_('Unknown'))
block = get_document_info(stream)[0] block = get_document_info(stream)[0]
if not block: if not block:
return MetaInformation(None, None) return MetaInformation(_('Unknown'))
stream.seek(0) stream.seek(0)
cpg = detect_codepage(stream) cpg = detect_codepage(stream)
stream.seek(0) stream.seek(0)
title_match = title_pat.search(block) title_match = title_pat.search(block)
if title_match: if title_match is not None:
title = decode(title_match.group(1).strip(), cpg) title = decode(title_match.group(1).strip(), cpg)
else:
title = _('Unknown')
author_match = author_pat.search(block) author_match = author_pat.search(block)
if author_match: if author_match is not None:
author = decode(author_match.group(1).strip(), cpg) author = decode(author_match.group(1).strip(), cpg)
comment_match = comment_pat.search(block) else:
if comment_match: author = None
comment = decode(comment_match.group(1).strip(), cpg) mi = MetaInformation(title)
category_match = category_pat.search(block)
if category_match:
category = decode(category_match.group(1).strip(), cpg)
mi = MetaInformation(title, author)
if author: if author:
mi.authors = string_to_authors(author) mi.authors = string_to_authors(author)
mi.comments = comment
mi.category = category comment_match = comment_pat.search(block)
if comment_match is not None:
comment = decode(comment_match.group(1).strip(), cpg)
mi.comments = comment
tags_match = tags_pat.search(block)
if tags_match is not None:
tags = decode(tags_match.group(1).strip(), cpg)
mi.tags = tags
publisher_match = publisher_pat.search(block)
if publisher_match is not None:
publisher = decode(publisher_match.group(1).strip(), cpg)
mi.publisher = publisher
return mi return mi
def create_metadata(stream, options): def create_metadata(stream, options):
md = r'{\info' md = [r'{\info']
if options.title: if options.title:
title = options.title.encode('ascii', 'ignore') title = options.title.encode('ascii', 'ignore')
md += r'{\title %s}'%(title,) md.append(r'{\title %s}'%(title,))
if options.authors: if options.authors:
au = options.authors au = options.authors
if not isinstance(au, basestring): if not isinstance(au, basestring):
au = u', '.join(au) au = u', '.join(au)
author = au.encode('ascii', 'ignore') author = au.encode('ascii', 'ignore')
md += r'{\author %s}'%(author,) md.append(r'{\author %s}'%(author,))
if options.get('category', None):
category = options.category.encode('ascii', 'ignore')
md += r'{\category %s}'%(category,)
comp = options.comment if hasattr(options, 'comment') else options.comments comp = options.comment if hasattr(options, 'comment') else options.comments
if comp: if comp:
comment = comp.encode('ascii', 'ignore') comment = comp.encode('ascii', 'ignore')
md += r'{\subject %s}'%(comment,) md.append(r'{\subject %s}'%(comment,))
if len(md) > 6: if options.publisher:
md += '}' publisher = options.publisher.encode('ascii', 'ignore')
md.append(r'{\manager %s}'%(publisher,))
if options.tags:
tags = u', '.join(options.tags)
tags = tags.encode('ascii', 'ignore')
md.append(r'{\category %s}'%(tags,))
if len(md) > 1:
md.append('}')
stream.seek(0) stream.seek(0)
src = stream.read() src = stream.read()
ans = src[:6] + md + src[6:] ans = src[:6] + u''.join(md) + src[6:]
stream.seek(0) stream.seek(0)
stream.write(ans) stream.write(ans)
@ -156,7 +169,7 @@ def set_metadata(stream, options):
base_pat = r'\{\\name(.*?)(?<!\\)\}' base_pat = r'\{\\name(.*?)(?<!\\)\}'
title = options.title title = options.title
if title != None: if title is not None:
title = title.encode('ascii', 'replace') title = title.encode('ascii', 'replace')
pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL) pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL)
if pat.search(src): if pat.search(src):
@ -164,7 +177,7 @@ def set_metadata(stream, options):
else: else:
src = add_metadata_item(src, 'title', title) src = add_metadata_item(src, 'title', title)
comment = options.comments comment = options.comments
if comment != None: if comment is not None:
comment = comment.encode('ascii', 'replace') comment = comment.encode('ascii', 'replace')
pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL) pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
if pat.search(src): if pat.search(src):
@ -172,7 +185,7 @@ def set_metadata(stream, options):
else: else:
src = add_metadata_item(src, 'subject', comment) src = add_metadata_item(src, 'subject', comment)
author = options.authors author = options.authors
if author != None: if author is not None:
author = ', '.join(author) author = ', '.join(author)
author = author.encode('ascii', 'ignore') author = author.encode('ascii', 'ignore')
pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL) pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
@ -180,14 +193,23 @@ def set_metadata(stream, options):
src = pat.sub(r'{\\author ' + author + r'}', src) src = pat.sub(r'{\\author ' + author + r'}', src)
else: else:
src = add_metadata_item(src, 'author', author) src = add_metadata_item(src, 'author', author)
category = options.get('category', None) tags = options.tags
if category != None: if tags is not None:
category = category.encode('ascii', 'replace') tags = ', '.join(tags)
tags = tags.encode('ascii', 'replace')
pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL) pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
if pat.search(src): if pat.search(src):
src = pat.sub(r'{\\category ' + category + r'}', src) src = pat.sub(r'{\\category ' + tags + r'}', src)
else: else:
src = add_metadata_item(src, 'category', category) src = add_metadata_item(src, 'category', tags)
publisher = options.publisher
if publisher is not None:
publisher = publisher.encode('ascii', 'replace')
pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
if pat.search(src):
src = pat.sub(r'{\\manager ' + publisher + r'}', src)
else:
src = add_metadata_item(src, 'manager', publisher)
stream.seek(pos + olen) stream.seek(pos + olen)
after = stream.read() after = stream.read()
stream.seek(pos) stream.seek(pos)

View File

@ -3,7 +3,6 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin): class MOBIInput(InputFormatPlugin):
@ -39,11 +38,3 @@ class MOBIInput(InputFormatPlugin):
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path return mr.created_opf_path
def preprocess_html(self, options, html):
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
return html

View File

@ -139,7 +139,7 @@ class BookHeader(object):
65001: 'utf-8', 65001: 'utf-8',
}[self.codepage] }[self.codepage]
except (IndexError, KeyError): except (IndexError, KeyError):
self.codec = 'cp1252' if user_encoding is None else user_encoding self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage, log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec)) self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \ if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
@ -542,7 +542,17 @@ class MobiReader(object):
elif tag.tag == 'img': elif tag.tag == 'img':
tag.set('height', height) tag.set('height', height)
else: else:
styles.append('margin-top: %s' % self.ensure_unit(height)) if tag.tag == 'div' and not tag.text and \
(not tag.tail or not tag.tail.strip()) and \
not len(list(tag.iterdescendants())):
# Paragraph spacer
# Insert nbsp so that the element is never
# discarded by a renderer
tag.text = u'\u00a0' # nbsp
styles.append('height: %s' %
self.ensure_unit(height))
else:
styles.append('margin-top: %s' % self.ensure_unit(height))
if attrib.has_key('width'): if attrib.has_key('width'):
width = attrib.pop('width').strip() width = attrib.pop('width').strip()
if width and re.search(r'\d+', width): if width and re.search(r'\d+', width):
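The "Paragraph spacer" comment above captures the idea behind this hunk: an empty div that only carries a CSS height tends to be collapsed or discarded by renderers, so the reader now gives it a no-break space as content. A minimal sketch of the same trick with lxml, using illustrative markup rather than anything taken from this commit:
from lxml import etree

root = etree.fromstring('<body><div></div><p>text</p></body>')
spacer = root[0]
# Give the otherwise empty div real content plus an explicit height,
# so a renderer will not drop it.
spacer.text = u'\u00a0'   # no-break space
spacer.set('style', 'height: 1em')
print(etree.tostring(root))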
@ -632,9 +642,18 @@ class MobiReader(object):
attrib['class'] = cls attrib['class'] = cls
for tag in svg_tags: for tag in svg_tags:
p = tag.getparent() images = tag.xpath('descendant::img[@src]')
if hasattr(p, 'remove'): parent = tag.getparent()
p.remove(tag)
if images and hasattr(parent, 'find'):
index = parent.index(tag)
for img in images:
img.getparent().remove(img)
img.tail = img.text = None
parent.insert(index, img)
if hasattr(parent, 'remove'):
parent.remove(tag)
def create_opf(self, htmlfile, guide=None, root=None): def create_opf(self, htmlfile, guide=None, root=None):
mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)

View File

@ -251,7 +251,7 @@ class Serializer(object):
tag = prefixname(elem.tag, nsrmap) tag = prefixname(elem.tag, nsrmap)
# Previous layers take care of @name # Previous layers take care of @name
id = elem.attrib.pop('id', None) id = elem.attrib.pop('id', None)
if id is not None: if id:
href = '#'.join((item.href, id)) href = '#'.join((item.href, id))
offset = self.anchor_offset or buffer.tell() offset = self.anchor_offset or buffer.tell()
self.id_offsets[urlnormalize(href)] = offset self.id_offsets[urlnormalize(href)] = offset
@ -1541,7 +1541,10 @@ class MobiWriter(object):
exth.write(data) exth.write(data)
nrecs += 1 nrecs += 1
if term == 'rights' : if term == 'rights' :
rights = unicode(oeb.metadata.rights[0]).encode('utf-8') try:
rights = unicode(oeb.metadata.rights[0]).encode('utf-8')
except:
rights = 'Unknown'
exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8)) exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))
exth.write(rights) exth.write(rights)

View File

@ -1892,7 +1892,7 @@ class OEBBook(object):
return fix_data(data.decode(bom_enc)) return fix_data(data.decode(bom_enc))
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
if self.input_encoding is not None: if self.input_encoding:
try: try:
return fix_data(data.decode(self.input_encoding, 'replace')) return fix_data(data.decode(self.input_encoding, 'replace'))
except UnicodeDecodeError: except UnicodeDecodeError:

View File

@ -199,8 +199,8 @@ class EbookIterator(object):
not hasattr(self.pathtoopf, 'manifest'): not hasattr(self.pathtoopf, 'manifest'):
if hasattr(self.pathtoopf, 'manifest'): if hasattr(self.pathtoopf, 'manifest'):
self.pathtoopf = write_oebbook(self.pathtoopf, self.base) self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts, self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
plumber.input_plugin) plumber.opts)
if hasattr(self.pathtoopf, 'manifest'): if hasattr(self.pathtoopf, 'manifest'):
self.pathtoopf = write_oebbook(self.pathtoopf, self.base) self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
@ -227,7 +227,7 @@ class EbookIterator(object):
self.log.warn('Missing spine item:', repr(spath)) self.log.warn('Missing spine item:', repr(spath))
cover = self.opf.cover cover = self.opf.cover
if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover: if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover:
cfile = os.path.join(self.base, 'calibre_iterator_cover.html') cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
chtml = (TITLEPAGE%os.path.relpath(cover, self.base).replace(os.sep, chtml = (TITLEPAGE%os.path.relpath(cover, self.base).replace(os.sep,
'/')).encode('utf-8') '/')).encode('utf-8')

View File

@ -10,7 +10,7 @@ import os
from calibre.utils.date import isoformat, now from calibre.utils.date import isoformat, now
from calibre import guess_type from calibre import guess_type
def meta_info_to_oeb_metadata(mi, m, log): def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
from calibre.ebooks.oeb.base import OPF from calibre.ebooks.oeb.base import OPF
if not mi.is_null('title'): if not mi.is_null('title'):
m.clear('title') m.clear('title')
@ -29,15 +29,23 @@ def meta_info_to_oeb_metadata(mi, m, log):
if not mi.is_null('book_producer'): if not mi.is_null('book_producer'):
m.filter('contributor', lambda x : x.role.lower() == 'bkp') m.filter('contributor', lambda x : x.role.lower() == 'bkp')
m.add('contributor', mi.book_producer, role='bkp') m.add('contributor', mi.book_producer, role='bkp')
elif override_input_metadata:
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
if not mi.is_null('comments'): if not mi.is_null('comments'):
m.clear('description') m.clear('description')
m.add('description', mi.comments) m.add('description', mi.comments)
elif override_input_metadata:
m.clear('description')
if not mi.is_null('publisher'): if not mi.is_null('publisher'):
m.clear('publisher') m.clear('publisher')
m.add('publisher', mi.publisher) m.add('publisher', mi.publisher)
elif override_input_metadata:
m.clear('publisher')
if not mi.is_null('series'): if not mi.is_null('series'):
m.clear('series') m.clear('series')
m.add('series', mi.series) m.add('series', mi.series)
elif override_input_metadata:
m.clear('series')
if not mi.is_null('isbn'): if not mi.is_null('isbn'):
has = False has = False
for x in m.identifier: for x in m.identifier:
@ -46,19 +54,27 @@ def meta_info_to_oeb_metadata(mi, m, log):
has = True has = True
if not has: if not has:
m.add('identifier', mi.isbn, scheme='ISBN') m.add('identifier', mi.isbn, scheme='ISBN')
elif override_input_metadata:
m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')
if not mi.is_null('language'): if not mi.is_null('language'):
m.clear('language') m.clear('language')
m.add('language', mi.language) m.add('language', mi.language)
if not mi.is_null('series_index'): if not mi.is_null('series_index'):
m.clear('series_index') m.clear('series_index')
m.add('series_index', mi.format_series_index()) m.add('series_index', mi.format_series_index())
elif override_input_metadata:
m.clear('series_index')
if not mi.is_null('rating'): if not mi.is_null('rating'):
m.clear('rating') m.clear('rating')
m.add('rating', '%.2f'%mi.rating) m.add('rating', '%.2f'%mi.rating)
elif override_input_metadata:
m.clear('rating')
if not mi.is_null('tags'): if not mi.is_null('tags'):
m.clear('subject') m.clear('subject')
for t in mi.tags: for t in mi.tags:
m.add('subject', t) m.add('subject', t)
elif override_input_metadata:
m.clear('subject')
if not mi.is_null('pubdate'): if not mi.is_null('pubdate'):
m.clear('date') m.clear('date')
m.add('date', isoformat(mi.pubdate)) m.add('date', isoformat(mi.pubdate))
@ -71,6 +87,7 @@ def meta_info_to_oeb_metadata(mi, m, log):
if not mi.is_null('publication_type'): if not mi.is_null('publication_type'):
m.clear('publication_type') m.clear('publication_type')
m.add('publication_type', mi.publication_type) m.add('publication_type', mi.publication_type)
if not m.timestamp: if not m.timestamp:
m.add('timestamp', isoformat(now())) m.add('timestamp', isoformat(now()))
@ -78,11 +95,12 @@ def meta_info_to_oeb_metadata(mi, m, log):
class MergeMetadata(object): class MergeMetadata(object):
'Merge in user metadata, including cover' 'Merge in user metadata, including cover'
def __call__(self, oeb, mi, opts): def __call__(self, oeb, mi, opts, override_input_metadata=False):
self.oeb, self.log = oeb, oeb.log self.oeb, self.log = oeb, oeb.log
m = self.oeb.metadata m = self.oeb.metadata
self.log('Merging user specified metadata...') self.log('Merging user specified metadata...')
meta_info_to_oeb_metadata(mi, m, oeb.log) meta_info_to_oeb_metadata(mi, m, oeb.log,
override_input_metadata=override_input_metadata)
cover_id = self.set_cover(mi, opts.prefer_metadata_cover) cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
m.clear('cover') m.clear('cover')
if cover_id is not None: if cover_id is not None:

View File

@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin): class PDBInput(InputFormatPlugin):
@ -32,8 +31,3 @@ class PDBInput(InputFormatPlugin):
opf = reader.extract_content(os.getcwd()) opf = reader.extract_content(os.getcwd())
return opf return opf
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -65,9 +65,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
txt_plugin = plugin_for_input_format('txt') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options: for opt in txt_plugin.options:
if not hasattr(self.options, option.option.name): if not hasattr(self.options, opt.option.name):
setattr(self.options, option.name, option.recommended_value) setattr(self.options, opt.option.name, opt.recommended_value)
stream.seek(0) stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@ -31,9 +31,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
pdf_plugin = plugin_for_input_format('pdf') pdf_plugin = plugin_for_input_format('pdf')
for option in pdf_plugin.options: for opt in pdf_plugin.options:
if not hasattr(self.options, option.option.name): if not hasattr(self.options, opt.option.name):
setattr(self.options, option.name, option.recommended_value) setattr(self.options, opt.option.name, opt.recommended_value)
pdf.seek(0) pdf.seek(0)
return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {}) return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})

View File

@ -83,9 +83,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
txt_plugin = plugin_for_input_format('txt') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options: for opt in txt_plugin.options:
if not hasattr(self.options, option.option.name): if not hasattr(self.options, opt.option.name):
setattr(self.options, option.name, option.recommended_value) setattr(self.options, opt.option.name, opt.recommended_value)
stream.seek(0) stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@ -34,18 +34,15 @@ class PML_HTMLizer(object):
'ra', 'ra',
'c', 'c',
'r', 'r',
't',
's', 's',
'l', 'l',
'k', 'k',
'T',
'FN', 'FN',
'SB', 'SB',
] ]
STATES_VALUE_REQ = [ STATES_VALUE_REQ = [
'a', 'a',
'T',
'FN', 'FN',
'SB', 'SB',
] ]
@ -96,8 +93,6 @@ class PML_HTMLizer(object):
'Sb': 'sb', 'Sb': 'sb',
'c': 'c', 'c': 'c',
'r': 'r', 'r': 'r',
't': 't',
'T': 'T',
'i': 'i', 'i': 'i',
'I': 'i', 'I': 'i',
'u': 'u', 'u': 'u',
@ -133,8 +128,6 @@ class PML_HTMLizer(object):
DIV_STATES = [ DIV_STATES = [
'c', 'c',
'r', 'r',
't',
'T',
'FN', 'FN',
'SB', 'SB',
] ]
@ -255,8 +248,6 @@ class PML_HTMLizer(object):
for key, val in self.state.items(): for key, val in self.state.items():
if val[0]: if val[0]:
if key == 'T':
self.state['T'][0] = False
if key in self.DIV_STATES: if key in self.DIV_STATES:
div.append(key) div.append(key)
elif key in self.SPAN_STATES: elif key in self.SPAN_STATES:
@ -506,6 +497,9 @@ class PML_HTMLizer(object):
self.toc = TOC() self.toc = TOC()
self.file_name = file_name self.file_name = file_name
indent_state = {'t': False, 'T': False}
adv_indent_val = ''
for s in self.STATES: for s in self.STATES:
self.state[s] = [False, '']; self.state[s] = [False, ''];
@ -515,6 +509,8 @@ class PML_HTMLizer(object):
parsed = [] parsed = []
empty = True empty = True
basic_indent = indent_state['t']
adv_indent = indent_state['T']
# Must use StringIO, cStringIO does not support unicode # Must use StringIO, cStringIO does not support unicode
line = StringIO.StringIO(line) line = StringIO.StringIO(line)
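The comment above points at a Python 2 limitation: cStringIO only accepts unicode that can be encoded as plain ASCII, while StringIO handles arbitrary unicode text. A small illustration, not part of this commit:
import StringIO, cStringIO

text = u'caf\u00e9'
assert StringIO.StringIO(text).read() == text   # StringIO round-trips non-ASCII unicode
try:
    cStringIO.StringIO(text)                    # cStringIO rejects it at construction
except UnicodeEncodeError:
    print('cStringIO cannot hold non-ASCII unicode')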
@ -527,7 +523,7 @@ class PML_HTMLizer(object):
if c == '\\': if c == '\\':
c = line.read(1) c = line.read(1)
if c in 'qcrtTiIuobBlk': if c in 'qcriIuobBlk':
text = self.process_code(c, line) text = self.process_code(c, line)
elif c in 'FS': elif c in 'FS':
l = line.read(1) l = line.read(1)
@ -574,6 +570,15 @@ class PML_HTMLizer(object):
elif c == 'w': elif c == 'w':
empty = False empty = False
text = '<hr width="%s" />' % self.code_value(line) text = '<hr width="%s" />' % self.code_value(line)
elif c == 't':
indent_state[c] = not indent_state[c]
if indent_state[c]:
basic_indent = True
elif c == 'T':
indent_state[c] = not indent_state[c]
if indent_state[c]:
adv_indent = True
adv_indent_val = self.code_value(line)
elif c == '-': elif c == '-':
empty = False empty = False
text = '&shy;' text = '&shy;'
@ -590,6 +595,16 @@ class PML_HTMLizer(object):
if not empty: if not empty:
text = self.end_line() text = self.end_line()
parsed.append(text) parsed.append(text)
if basic_indent:
parsed.insert(0, self.STATES_TAGS['t'][0])
parsed.append(self.STATES_TAGS['t'][1])
elif adv_indent:
parsed.insert(0, self.STATES_TAGS['T'][0] % adv_indent_val)
parsed.append(self.STATES_TAGS['T'][1])
indent_state['T'] = False
adv_indent_val = ''
output.append(u''.join(parsed)) output.append(u''.join(parsed))
line.close() line.close()

View File

@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree from lxml import etree
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
border_style_map = { border_style_map = {
'single' : 'solid', 'single' : 'solid',
@ -77,7 +76,15 @@ class RTFInput(InputFormatPlugin):
def generate_xml(self, stream): def generate_xml(self, stream):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
ofile = 'out.xml' ofile = 'dataxml.xml'
run_lev, debug_dir = 1, None
if getattr(self.opts, 'debug_pipeline', None) is not None:
try:
os.mkdir(debug_dir)
debug_dir = 'rtfdebug'
run_lev = 4
except:
pass
parser = ParseRtf( parser = ParseRtf(
in_file = stream, in_file = stream,
out_file = ofile, out_file = ofile,
@ -115,43 +122,45 @@ class RTFInput(InputFormatPlugin):
# Write or do not write paragraphs. Default is 0. # Write or do not write paragraphs. Default is 0.
empty_paragraphs = 1, empty_paragraphs = 1,
#debug
deb_dir = debug_dir,
run_level = run_lev,
) )
parser.parse_rtf() parser.parse_rtf()
ans = open('out.xml').read() with open(ofile, 'rb') as f:
os.remove('out.xml') return f.read()
return ans
def extract_images(self, picts): def extract_images(self, picts):
import imghdr
self.log('Extracting images...') self.log('Extracting images...')
with open(picts, 'rb') as f:
raw = f.read()
picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))
hex = re.compile(r'[^a-fA-F0-9]')
encs = [hex.sub('', pict) for pict in picts]
count = 0 count = 0
raw = open(picts, 'rb').read()
starts = []
for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
starts.append(match.start(1))
imap = {} imap = {}
for enc in encs:
for start in starts:
pos, bc = start, 1
while bc > 0:
if raw[pos] == '}': bc -= 1
elif raw[pos] == '{': bc += 1
pos += 1
pict = raw[start:pos+1]
enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
if len(enc) % 2 == 1: if len(enc) % 2 == 1:
enc = enc[:-1] enc = enc[:-1]
data = enc.decode('hex') data = enc.decode('hex')
fmt = imghdr.what(None, data)
if fmt is None:
fmt = 'wmf'
count += 1 count += 1
name = (('%4d'%count).replace(' ', '0'))+'.wmf' name = '%04d.%s' % (count, fmt)
open(name, 'wb').write(data) with open(name, 'wb') as f:
f.write(data)
imap[count] = name imap[count] = name
#open(name+'.hex', 'wb').write(enc) #open(name+'.hex', 'wb').write(enc)
return self.convert_images(imap) return self.convert_images(imap)
def convert_images(self, imap): def convert_images(self, imap):
for count, val in imap.items(): self.default_img = None
for count, val in imap.iteritems():
try: try:
imap[count] = self.convert_image(val) imap[count] = self.convert_image(val)
except: except:
@ -159,11 +168,34 @@ class RTFInput(InputFormatPlugin):
return imap return imap
def convert_image(self, name): def convert_image(self, name):
from calibre.utils.magick import Image if not name.endswith('.wmf'):
img = Image() return name
img.open(name) try:
return self.rasterize_wmf(name)
except:
self.log.exception('Failed to convert WMF image %r'%name)
return self.replace_wmf(name)
def replace_wmf(self, name):
from calibre.ebooks import calibre_cover
if self.default_img is None:
self.default_img = calibre_cover('Conversion of WMF images is not supported',
'Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.', title_size=36,
author_size=20)
name = name.replace('.wmf', '.jpg') name = name.replace('.wmf', '.jpg')
img.save(name) with open(name, 'wb') as f:
f.write(self.default_img)
return name
def rasterize_wmf(self, name):
from calibre.utils.wmf.parse import wmf_unwrap
with open(name, 'rb') as f:
data = f.read()
data = wmf_unwrap(data)
name = name.replace('.wmf', '.png')
with open(name, 'wb') as f:
f.write(data)
return name return name
@ -192,27 +224,27 @@ class RTFInput(InputFormatPlugin):
css += '\n'+'\n'.join(font_size_classes) css += '\n'+'\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes) css += '\n' +'\n'.join(color_classes)
for cls, val in border_styles.items(): for cls, val in border_styles.iteritems():
css += '\n\n.%s {\n%s\n}'%(cls, val) css += '\n\n.%s {\n%s\n}'%(cls, val)
with open('styles.css', 'ab') as f: with open('styles.css', 'ab') as f:
f.write(css) f.write(css)
def preprocess(self, fname): # def preprocess(self, fname):
self.log('\tPreprocessing to convert unicode characters') # self.log('\tPreprocessing to convert unicode characters')
try: # try:
data = open(fname, 'rb').read() # data = open(fname, 'rb').read()
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
tokenizer = RtfTokenizer(data) # tokenizer = RtfTokenizer(data)
tokens = RtfTokenParser(tokenizer.tokens) # tokens = RtfTokenParser(tokenizer.tokens)
data = tokens.toRTF() # data = tokens.toRTF()
fname = 'preprocessed.rtf' # fname = 'preprocessed.rtf'
with open(fname, 'wb') as f: # with open(fname, 'wb') as f:
f.write(data) # f.write(data)
except: # except:
self.log.exception( # self.log.exception(
'Failed to preprocess RTF to convert unicode sequences, ignoring...') # 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
return fname # return fname
def convert_borders(self, doc): def convert_borders(self, doc):
border_styles = [] border_styles = []
@ -249,17 +281,13 @@ class RTFInput(InputFormatPlugin):
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
#Name of the preprocessed RTF file #Name of the preprocessed RTF file
fname = self.preprocess(stream.name) # fname = self.preprocess(stream.name)
try: try:
xml = self.generate_xml(fname) xml = self.generate_xml(stream.name)
except RtfInvalidCodeException, e: except RtfInvalidCodeException, e:
raise ValueError(_('This RTF file has a feature calibre does not ' raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e) 'support. Convert it to HTML first and then try it.\n%s')%e)
'''dataxml = open('dataxml.xml', 'w')
dataxml.write(xml)
dataxml.close'''
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d: if d:
imap = {} imap = {}
@ -290,13 +318,9 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result) res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
if not getattr(self.opts, 'remove_paragraph_spacing', False): res = re.sub('\s*<body>', '<body>', res)
res = re.sub('\s*<body>', '<body>', res) res = re.sub('(?<=\n)\n{2}',
res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n'.encode('utf-8'), res)
u'<p>\u00a0</p>\n'.encode('utf-8'), res)
if self.opts.preprocess_html:
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
res = preprocessor(res)
f.write(res) f.write(res)
self.write_inline_css(inline_class, border_styles) self.write_inline_css(inline_class, border_styles)
stream.seek(0) stream.seek(0)

View File

@ -262,7 +262,7 @@ class RTFMLizer(object):
if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
if 'block' in tag_stack: if 'block' in tag_stack:
text += '%s ' % txt2rtf(elem.tail) text += '%s' % txt2rtf(elem.tail)
else: else:
text += '{\\par \\pard \\hyphpar %s}' % txt2rtf(elem.tail) text += '{\\par \\pard \\hyphpar %s}' % txt2rtf(elem.tail)

View File

@ -17,7 +17,8 @@
######################################################################### #########################################################################
# $Revision: 1.41 $ # $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $ # $Date: 2006/03/24 23:50:07 $
import sys,os import sys, os
from calibre.ebooks.rtf2xml import headings_to_sections, \ from calibre.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \ line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \ make_lists, preamble_div, header, colors, group_borders, \
@ -90,7 +91,6 @@ class ParseRtf:
out_file = '', out_file = '',
out_dir = None, out_dir = None,
dtd = '', dtd = '',
#debug = 0, #why? calibre
deb_dir = None, deb_dir = None,
convert_symbol = None, convert_symbol = None,
convert_wingdings = None, convert_wingdings = None,
@ -107,6 +107,7 @@ class ParseRtf:
no_dtd = 0, no_dtd = 0,
char_data = '', char_data = '',
): ):
""" """
Requires: Requires:
'file' --file to parse 'file' --file to parse
@ -119,12 +120,11 @@ class ParseRtf:
script tries to output to the directory where the script is executed.) script tries to output to the directory where the script is executed.)
'deb_dir' --debug directory. If a debug_dir is provided, the script 'deb_dir' --debug directory. If a debug_dir is provided, the script
will copy each run through as a file to examine in the debug_dir will copy each run through as a file to examine in the debug_dir
'perl_script'--use perl to make tokens. This runs just a bit faster.
(I will probably phase this out.)
'check_brackets' -- make sure the brackets match up after each run 'check_brackets' -- make sure the brackets match up after each run
through a file. Only for debugging. through a file. Only for debugging.
Returns: Nothing Returns: Nothing
""" """
self.__file = in_file self.__file = in_file
self.__out_file = out_file self.__out_file = out_file
self.__out_dir = out_dir self.__out_dir = out_dir
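Going by the parameters documented in this docstring and by the ParseRtf(...) call added to generate_xml earlier in this diff, a minimal invocation would look roughly like the sketch below; the file names and run level are illustrative and error handling is omitted:
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf

parser = ParseRtf(
    in_file='book.rtf',       # path or file-like object to parse
    out_file='dataxml.xml',   # where the generated XML ends up
    deb_dir=None,             # point at a directory to keep per-stage debug copies
    run_level=1,              # higher levels add extra checks such as bracket matching
)
parser.parse_rtf()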
@ -132,7 +132,7 @@ class ParseRtf:
self.__dtd_path = dtd self.__dtd_path = dtd
self.__check_file(in_file,"file_to_parse") self.__check_file(in_file,"file_to_parse")
self.__char_data = char_data self.__char_data = char_data
self.__debug_dir = deb_dir #self.__debug_dir = debug calibre self.__debug_dir = deb_dir
self.__check_dir(self.__temp_dir) self.__check_dir(self.__temp_dir)
self.__copy = self.__check_dir(self.__debug_dir) self.__copy = self.__check_dir(self.__debug_dir)
self.__convert_caps = convert_caps self.__convert_caps = convert_caps
@ -155,25 +155,24 @@ class ParseRtf:
if hasattr(the_file, 'read'): return if hasattr(the_file, 'read'): return
if the_file == None: if the_file == None:
if type == "file_to_parse": if type == "file_to_parse":
message = "You must provide a file for the script to work" msg = "\nYou must provide a file for the script to work"
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
elif os.path.exists(the_file): elif os.path.exists(the_file):
pass # do nothing pass # do nothing
else: else:
message = "The file '%s' cannot be found" % the_file msg = "\nThe file '%s' cannot be found" % the_file
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __check_dir(self, the_dir): def __check_dir(self, the_dir):
"""Check to see if directory exists""" """Check to see if directory exists"""
if not the_dir : if not the_dir :
return return
dir_exists = os.path.isdir(the_dir) dir_exists = os.path.isdir(the_dir)
if not dir_exists: if not dir_exists:
message = "%s is not a directory" % the_dir msg = "\n%s is not a directory" % the_dir
msg = message
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
return 1 return 1
def parse_rtf(self): def parse_rtf(self):
""" """
Parse the file by calling on other classes. Parse the file by calling on other classes.
@ -194,13 +193,14 @@ class ParseRtf:
copy_obj.set_dir(self.__debug_dir) copy_obj.set_dir(self.__debug_dir)
copy_obj.remove_files() copy_obj.remove_files()
copy_obj.copy_file(self.__temp_file, "original_file") copy_obj.copy_file(self.__temp_file, "original_file")
# new as of 2005-08-02. Do I want this? # Function to check whether brackets are well handled
if self.__debug_dir or self.__run_level > 2: if self.__debug_dir or self.__run_level > 2:
self.__check_brack_obj = check_brackets.CheckBrackets\ self.__check_brack_obj = check_brackets.CheckBrackets\
(file = self.__temp_file, (file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
# convert Macintosh line endings to Unix line endings #convert Macintosh and Windows line endings to Unix line endings
#why do this if you don't wb after?
line_obj = line_endings.FixLineEndings( line_obj = line_endings.FixLineEndings(
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
@ -208,13 +208,13 @@ class ParseRtf:
run_level = self.__run_level, run_level = self.__run_level,
replace_illegals = self.__replace_illegals, replace_illegals = self.__replace_illegals,
) )
return_value = line_obj.fix_endings() return_value = line_obj.fix_endings() #calibre return what?
self.__return_code(return_value) self.__return_code(return_value)
tokenize_obj = tokenize.Tokenize( tokenize_obj = tokenize.Tokenize(
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
in_file = self.__temp_file, in_file = self.__temp_file,
copy = self.__copy, copy = self.__copy,
run_level = self.__run_level,) run_level = self.__run_level)
tokenize_obj.tokenize() tokenize_obj.tokenize()
process_tokens_obj = process_tokens.ProcessTokens( process_tokens_obj = process_tokens.ProcessTokens(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -226,15 +226,27 @@ class ParseRtf:
try: try:
return_value = process_tokens_obj.process_tokens() return_value = process_tokens_obj.process_tokens()
except InvalidRtfException, msg: except InvalidRtfException, msg:
#Check to see if the file is correctly encoded
encode_obj = default_encoding.DefaultEncoding(
in_file = self.__temp_file,
run_level = self.__run_level,
bug_handler = RtfInvalidCodeException,
check_raw = True,
)
platform, code_page, default_font_num = encode_obj.find_default_encoding()
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
enc = 'cp' + encode_obj.get_codepage()
msg = 'Exception in token processing'
if check_encoding_obj.check_encoding(self.__file, enc):
file_name = self.__file if isinstance(self.__file, str) \
else self.__file.encode('utf-8')
msg = 'File %s does not appear to be correctly encoded.\n' % file_name
try: try:
os.remove(self.__temp_file) os.remove(self.__temp_file)
except OSError: except OSError:
pass pass
check_encoding_obj = check_encoding.CheckEncoding(
bug_handler = RtfInvalidCodeException,
)
check_encoding_obj.check_encoding(self.__file)
sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
raise InvalidRtfException, msg raise InvalidRtfException, msg
delete_info_obj = delete_info.DeleteInfo( delete_info_obj = delete_info.DeleteInfo(
in_file = self.__temp_file, in_file = self.__temp_file,
@ -508,6 +520,7 @@ class ParseRtf:
indent = self.__indent, indent = self.__indent,
run_level = self.__run_level, run_level = self.__run_level,
no_dtd = self.__no_dtd, no_dtd = self.__no_dtd,
encoding = encode_obj.get_codepage(),
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,
) )
tags_obj.convert_to_tags() tags_obj.convert_to_tags()
@ -520,35 +533,28 @@ class ParseRtf:
output_obj.output() output_obj.output()
os.remove(self.__temp_file) os.remove(self.__temp_file)
return self.__exit_level return self.__exit_level
def __bracket_match(self, file_name): def __bracket_match(self, file_name):
if self.__run_level > 2: if self.__run_level > 2:
good_br, msg = self.__check_brack_obj.check_brackets() good_br, msg = self.__check_brack_obj.check_brackets()
if good_br: if good_br:
pass pass
# sys.stderr.write( msg + ' in ' + file_name + "\n") #sys.stderr.write( msg + ' in ' + file_name + "\n")
else: else:
msg += msg + " in file '" + file_name + "'\n" msg = '%s in file %s\n' % (msg, file_name)
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
def __return_code(self, num): def __return_code(self, num):
if num == None: if num == None:
return return
if int(num) > self.__exit_level: if int(num) > self.__exit_level:
self.__exit_level = num self.__exit_level = num
def __make_temp_file(self,file): def __make_temp_file(self,file):
"""Make a temporary file to parse""" """Make a temporary file to parse"""
write_file="rtf_write_file" write_file="rtf_write_file"
read_obj = file if hasattr(file, 'read') else open(file,'r') read_obj = file if hasattr(file, 'read') else open(file,'r')
write_obj = open(write_file, 'w') with open(write_file, 'wb') as write_obj:
line = "dummy" for line in read_obj:
while line: write_obj.write(line)
line = read_obj.read(1000)
write_obj.write(line )
write_obj.close()
return write_file return write_file
"""
mi<tg<open______<style-sheet\n
mi<tg<close_____<style-sheet\n
mi<tg<open-att__<footnote<num>1\n
mi<tg<empty-att_<page-definition<margin>33\n
mi<tg<empty_____<para\n
"""

View File

@ -24,38 +24,38 @@ class CheckBrackets:
self.__ob_count = 0 self.__ob_count = 0
self.__cb_count = 0 self.__cb_count = 0
self.__open_bracket_num = [] self.__open_bracket_num = []
def open_brack(self, line): def open_brack(self, line):
num = line[-5:-1] num = line[-5:-1]
self.__open_bracket_num.append(num) self.__open_bracket_num.append(num)
self.__bracket_count += 1 self.__bracket_count += 1
def close_brack(self, line): def close_brack(self, line):
num = line[-5:-1] num = line[-5:-1]
##self.__open_bracket_num.append(num)
try: try:
last_num = self.__open_bracket_num.pop() last_num = self.__open_bracket_num.pop()
except: except:
return 0 return False
if num != last_num: if num != last_num:
return 0 return False
self.__bracket_count -= 1 self.__bracket_count -= 1
return 1 return True
def check_brackets(self): def check_brackets(self):
read_obj = open(self.__file, 'r')
line = 'dummy'
line_count = 0 line_count = 0
while line: with open(self.__file, 'r') as read_obj:
line_count += 1 for line in read_obj:
line = read_obj.readline() line_count += 1
self.__token_info = line[:16] self.__token_info = line[:16]
if self.__token_info == 'ob<nu<open-brack': if self.__token_info == 'ob<nu<open-brack':
self.open_brack(line) self.open_brack(line)
if self.__token_info == 'cb<nu<clos-brack': if self.__token_info == 'cb<nu<clos-brack':
right_count = self.close_brack(line) if not self.close_brack(line):
if not right_count: return (False, "closed bracket doesn't match, line %s" % line_count)
return (0, "closed bracket doesn't match, line %s" % line_count)
read_obj.close()
if self.__bracket_count != 0: if self.__bracket_count != 0:
msg = 'At end of file open and closed brackets don\'t match\n' msg = ('At end of file open and closed brackets don\'t match\n' \
msg = msg + 'total number of brackets is %s' % self.__bracket_count 'total number of brackets is %s') % self.__bracket_count
return (0, msg) return (False, msg)
return (1, "brackets match!") return (True, "Brackets match!")

View File

@ -1,8 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
class CheckEncoding: class CheckEncoding:
def __init__(self, bug_handler): def __init__(self, bug_handler):
self.__bug_handler = bug_handler self.__bug_handler = bug_handler
def __get_position_error(self, line, encoding, line_num): def __get_position_error(self, line, encoding, line_num):
char_position = 0 char_position = 0
for char in line: for char in line:
@ -12,21 +15,23 @@ class CheckEncoding:
except UnicodeError, msg: except UnicodeError, msg:
sys.stderr.write('line: %s char: %s\n' % (line_num, char_position)) sys.stderr.write('line: %s char: %s\n' % (line_num, char_position))
sys.stderr.write(str(msg) + '\n') sys.stderr.write(str(msg) + '\n')
def check_encoding(self, path, encoding='us-ascii'):
read_obj = open(path, 'r') def check_encoding(self, path, encoding='us-ascii', verbose=True):
line_to_read = 1
line_num = 0 line_num = 0
while line_to_read: with open(path, 'r') as read_obj:
line_num += 1 for line in read_obj:
line_to_read = read_obj.readline() line_num += 1
line = line_to_read try:
try: line.decode(encoding)
line.decode(encoding) except UnicodeError:
except UnicodeError: if verbose:
if len(line) < 1000: if len(line) < 1000:
self.__get_position_error(line, encoding, line_num) self.__get_position_error(line, encoding, line_num)
else: else:
sys.stderr.write('line: %d has bad encoding\n'%line_num) sys.stderr.write('line: %d has bad encoding\n' % line_num)
return True
return False
if __name__ == '__main__': if __name__ == '__main__':
check_encoding_obj = CheckEncoding() check_encoding_obj = CheckEncoding()
check_encoding_obj.check_encoding(sys.argv[1]) check_encoding_obj.check_encoding(sys.argv[1])

Some files were not shown because too many files have changed in this diff.