Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

commit 916bb44179: Merge from trunk

recipes/bbc_brasil.recipe (new file, 595 lines)
@@ -0,0 +1,595 @@
##
## Title: BBC News, Sport, and Blog Calibre Recipe
## Contact: mattst - jmstanfield@gmail.com
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: mattst - jmstanfield@gmail.com
##
## Written: November 2011
## Last Edited: 2011-11-19
##

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'mattst - jmstanfield@gmail.com'

'''
BBC News, Sport, and Blog Calibre Recipe
'''

# Import the regular expressions module.
import re

# Import the BasicNewsRecipe class which this class extends.
from calibre.web.feeds.recipes import BasicNewsRecipe


class BBCBrasilRecipe(BasicNewsRecipe):

#
# **** IMPORTANT USERS READ ME ****
#
# First select the feeds you want, then scroll down below the feeds list
# and select the values you want for the other user preferences, like
# oldest_article and such like.
#
# Select the BBC rss feeds which you want in your ebook.
# Selected feeds have NO '#' at their start; de-selected feeds begin with a '#'.
#
# Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
# Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
#
# There are 68 feeds below which constitute the bulk of the available rss
# feeds on the BBC web site. These include 5 blogs by editors and
# correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
# Wales, Scotland Business), and 7 Welsh language feeds.
#
# Some of the feeds are low volume (Eg. blogs) or very low volume (Eg. Click),
# so if "oldest_article = 1.5" (only articles published in the last 36 hours)
# you may get some 'empty feeds' which will not then be included in the ebook.
#
# The 15 feeds currently selected below are simply my default ones.
#
# Note: With all 68 feeds selected, oldest_article set to 2,
# max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
# the ebook creation took 29 minutes on my speedy 100 Mbps net connection
# and fairly high-end desktop PC running Linux (Ubuntu Lucid Lynx).
# More realistically, with 15 feeds selected, oldest_article set to 1.5,
# max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
# it took 6 minutes. If that's too slow, increase 'simultaneous_downloads'.
#
# Select / de-select the feeds you want in your ebook.
#
feeds = [
    (u'Primeira P\xe1gina', u'http://www.bbc.co.uk/portuguese/index.xml'),
    (u'\xdaltimas Not\xedcias', u'http://www.bbc.co.uk/portuguese/ultimas_noticias/index.xml'),
    (u'Internacional', u'http://www.bbc.co.uk/portuguese/topicos/internacional/index.xml'),
    (u'Brasil', u'http://www.bbc.co.uk/portuguese/topicos/brasil/index.xml'),
    (u'Am\xe9rica Latina', u'http://www.bbc.co.uk/portuguese/topicos/america_latina/index.xml'),
    (u'Economia', u'http://www.bbc.co.uk/portuguese/topicos/economia/index.xml'),
    (u'Sa\xfade', u'http://www.bbc.co.uk/portuguese/topicos/saude/index.xml'),
    (u'Ci\xeancia e Tecnologia', u'http://www.bbc.co.uk/portuguese/topicos/ciencia_e_tecnologia/index.xml'),
    (u'Cultura', u'http://www.bbc.co.uk/portuguese/topicos/cultura/index.xml'),
    (u'V\xeddeos e Fotos', u'http://www.bbc.co.uk/portuguese/videos_e_fotos/index.xml'),
    (u'Especiais', u'http://www.bbc.co.uk/portuguese/especiais/index.xml')
]

# **** SELECT YOUR USER PREFERENCES ****

# Title to use for the ebook.
#
title = 'BBC Brasil'

# A brief description for the ebook.
#
description = u'Not\xedcias do Brasil e do mundo pela British Broadcasting Corporation'

# The max number of articles which may be downloaded from each feed.
# I've never seen more than about 70 articles in a single feed in the
# BBC feeds.
#
max_articles_per_feed = 100

# The max age of articles which may be downloaded from each feed. This is
# specified in days - note that fractions of days are allowed, Eg. 2.5 (2 and a
# half days). My default of 1.5 days is the last 36 hours, the point at
# which I've decided 'news' becomes 'old news', but be warned this is not
# so good for the blogs, technology, magazine, etc., and sports feeds.
# You may wish to extend this to 2-5, but watch out: ebook creation time will
# increase as well. Setting this to 30 will get everything (AFAICT) as long
# as max_articles_per_feed remains set high (except for 'Click', which is
# very low volume and whose oldest article is currently from 4 Feb 2011).
#
oldest_article = 1.5

# Number of simultaneous downloads. 20 is consistently working fine on the
# BBC News feeds with no problems. Speeds things up from the default of 5.
# If you have a lot of feeds and/or have increased oldest_article above 2,
# then you may wish to try increasing simultaneous_downloads to 25-30 -
# or, of course, if you are in a hurry. [I've not tried beyond 20.]
#
simultaneous_downloads = 20

# Timeout for fetching files from the server in seconds. The default of
# 120 seconds seems somewhat excessive.
#
timeout = 30

# The format string for the date shown on the ebook's first page.
# List of all values: http://docs.python.org/library/time.html
# Default in news.py has a leading space so that's mirrored here.
# As with 'feeds', select/de-select by adding/removing the initial '#';
# only one timefmt should be selected. Here are a few to choose from.
#
timefmt = ' [%a, %d %b %Y]'            # [Mon, 14 Nov 2011] (Calibre default)
#timefmt = ' [%a, %d %b %Y %H:%M]'     # [Mon, 14 Nov 2011 18:30]
#timefmt = ' [%a, %d %b %Y %I:%M %p]'  # [Mon, 14 Nov 2011 06:30 PM]
#timefmt = ' [%d %b %Y]'               # [14 Nov 2011]
#timefmt = ' [%d %b %Y %H:%M]'         # [14 Nov 2011 18:30]
#timefmt = ' [%Y-%m-%d]'               # [2011-11-14]
#timefmt = ' [%Y-%m-%d-%H-%M]'         # [2011-11-14-18-30]

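# A hedged aside for choosing between the formats above: timefmt is an
# ordinary Python strftime string, so it can be previewed in a shell
# before settling on one. For example, for Monday 14 Nov 2011 at 18:30:
#
#   >>> import time
#   >>> when = time.strptime('2011-11-14 18:30', '%Y-%m-%d %H:%M')
#   >>> time.strftime(' [%a, %d %b %Y]', when)
#   ' [Mon, 14 Nov 2011]'
#   >>> time.strftime(' [%Y-%m-%d-%H-%M]', when)
#   ' [2011-11-14-18-30]'
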
#
# **** IMPORTANT ****
#
# DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
#
# DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
#
# I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
#
# **** IMPORTANT ****
#


# Author of this recipe.
__author__ = 'claviola'

# Specify English as the language of the RSS feeds (ISO 639 code).
language = 'en_GB'

# Set tags.
tags = 'news, sport, blog'

# Set publisher and publication type.
publisher = 'BBC'
publication_type = 'newspaper'

# Disable stylesheets from the site.
no_stylesheets = True

# Specifies an override encoding for sites that have an incorrect charset
# specified. The default of None means auto-detect. Some other BBC recipes
# use 'utf8', which works fine (so use that if necessary), but auto-detecting
# with None is working fine, so stick with that for robustness.
encoding = None

# Sets whether a feed has full articles embedded in it. The BBC feeds do not.
use_embedded_content = False

# Removes empty feeds - why keep them!?
remove_empty_feeds = True

# Create a custom title which fits nicely in the Kindle title list.
# Requires "import time" above the class declaration, and replacing
# title with custom_title in conversion_options (right column only).
# Example of the string below: "BBC News - 14 Nov 2011"
# (a short sketch of this change follows the conversion options below).
#
# custom_title = "BBC News - " + time.strftime('%d %b %Y')

'''
# Conversion options for advanced users, but don't forget to comment out the
# current conversion_options below. Avoid setting 'linearize_tables' as that
# plays havoc with the 'old style' table based pages.
#
conversion_options = { 'title'               : title,
                       'comments'            : description,
                       'tags'                : tags,
                       'language'            : language,
                       'publisher'           : publisher,
                       'authors'             : publisher,
                       'smarten_punctuation' : True
                     }
'''

conversion_options = { 'smarten_punctuation' : True }

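# A minimal sketch of the custom-title variant described in the comments
# above (the recipe author's suggestion, shown here for illustration only;
# it would also need "import time" above the class declaration):
#
#   custom_title = "BBC News - " + time.strftime('%d %b %Y')
#   conversion_options = { 'title'               : custom_title,
#                          'smarten_punctuation' : True }
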
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
||||||
|
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||||
|
.introduction, .first { font-weight: bold; } \
|
||||||
|
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||||
|
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
||||||
|
.cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
|
||||||
|
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
||||||
|
.correspondent-portrait img, .byline-lead-in, .name, .role, .bbc-role { display: block; \
|
||||||
|
text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
|
||||||
|
.story-date, .published, .datestamp { font-size: 80%; } \
|
||||||
|
table { width: 100%; } \
|
||||||
|
td img { display: block; margin: 5px auto; } \
|
||||||
|
ul { padding-top: 10px; } \
|
||||||
|
ol { padding-top: 10px; } \
|
||||||
|
li { padding-top: 5px; padding-bottom: 5px; } \
|
||||||
|
h1 { text-align: center; font-size: 175%; font-weight: bold; } \
|
||||||
|
h2 { text-align: center; font-size: 150%; font-weight: bold; } \
|
||||||
|
h3 { text-align: center; font-size: 125%; font-weight: bold; } \
|
||||||
|
h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
|
||||||
|
|
||||||
|
# Remove various tag attributes to improve the look of the ebook pages.
|
||||||
|
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
||||||
|
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
||||||
|
|
||||||
|
# Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
|
||||||
|
# cause a section of the ebook to start in an unsightly fashion or, more
|
||||||
|
# frequently, a "<br />" will muck up the formatting of a correspondent's byline.
|
||||||
|
# "<br />" and "<br clear/>" are far more frequently used on the table formatted
|
||||||
|
# style of pages, and really spoil the look of the ebook pages.
|
||||||
|
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||||
|
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
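# A brief, hedged illustration of what the two expressions above do to a
# made-up byline fragment (doctest style, nothing here is from a real page):
#
#   >>> import re
#   >>> frag = 'By A Correspondent<br />BBC News<br clear="all"/>'
#   >>> frag = re.sub(r'<br[ ]*/>', '', frag, flags=re.IGNORECASE)
#   >>> re.sub(r'<br[ ]*clear.*/>', '', frag, flags=re.IGNORECASE)
#   'By A CorrespondentBBC News'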
|
||||||
|
|
||||||
|
|
||||||
|
# Create regular expressions for tag keeping and removal to make the matches more
|
||||||
|
# robust against minor changes and errors in the HTML, Eg. double spaces, leading
|
||||||
|
# and trailing spaces, missing hyphens, and such like.
|
||||||
|
# Python regular expression ('re' class) page: http://docs.python.org/library/re.html
|
||||||
|
|
||||||
|
# ***************************************
|
||||||
|
# Regular expressions for keep_only_tags:
|
||||||
|
# ***************************************
|
||||||
|
|
||||||
|
# The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
|
||||||
|
# page which contains the main text of the article. Match storybody variants: 'storybody',
|
||||||
|
# 'story-body', 'story body','storybody ', etc.
|
||||||
|
storybody_reg_exp = '^.*story[_ -]*body.*$'
|
||||||
|
|
||||||
|
# The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
|
||||||
|
# and published date. This is one level above the usual news pages which have the title
|
||||||
|
# and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
|
||||||
|
# resulting in a lot of extra things to be removed by remove_tags.
|
||||||
|
blq_content_reg_exp = '^.*blq[_ -]*content.*$'
|
||||||
|
|
||||||
|
# The BBC has an alternative page design structure, which I suspect is an out-of-date
|
||||||
|
# design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
|
||||||
|
# (travel), and in some sport pages. These alternative pages are table based (which is
|
||||||
|
# why I think they are an out-of-date design) and account for - I'm guesstimating - less
|
||||||
|
# than 1% of all articles. They use a table class 'storycontent' to hold the article
|
||||||
|
# and like blq_content (above) have required lots of extra removal by remove_tags.
|
||||||
|
story_content_reg_exp = '^.*story[_ -]*content.*$'
|
||||||
|
|
||||||
|
# Keep the sections of the HTML which match the list below. The HTML page created by
|
||||||
|
# Calibre will fill <body> with those sections which are matched. Note that the
|
||||||
|
# blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
|
||||||
|
# it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
|
||||||
|
# will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
|
||||||
|
# all). If they are the other way around in keep_only_tags then blq_content_reg_exp
|
||||||
|
# will end up being discarded.
|
||||||
|
keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]
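# A hedged sanity check of the pattern style used above, run in a Python
# shell; it shows storybody_reg_exp catching the common class-name variants
# while leaving unrelated names alone:
#
#   >>> import re
#   >>> pat = re.compile('^.*story[_ -]*body.*$', re.IGNORECASE)
#   >>> [bool(pat.match(c)) for c in ('storybody', 'story-body', 'story body ', 'StoryBody')]
#   [True, True, True, True]
#   >>> bool(pat.match('blq_content'))
#   False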
|
||||||
|
|
||||||
|
# ************************************
|
||||||
|
# Regular expressions for remove_tags:
|
||||||
|
# ************************************
|
||||||
|
|
||||||
|
# Regular expression to remove share-help and variant tags. The share-help class
|
||||||
|
# is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
|
||||||
|
# twitter, email. Removed to avoid page clutter.
|
||||||
|
share_help_reg_exp = '^.*share[_ -]*help.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove embedded-hyper and variant tags. This class is used to
|
||||||
|
# display links to other BBC News articles on the same/similar subject.
|
||||||
|
embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove hypertabs and variant tags. This class is used to
|
||||||
|
# display a tab bar at the top of an article which allows the user to switch to
|
||||||
|
# an article (viewed on the same page) providing further info., 'in depth' analysis,
|
||||||
|
# an editorial, a correspondent's blog entry, and such like. The ability to handle
|
||||||
|
# a tab bar of this nature is currently beyond the scope of this recipe and
|
||||||
|
# possibly of Calibre itself (not sure about that - TO DO - check!).
|
||||||
|
hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
|
||||||
|
# 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
|
||||||
|
# This class is used to add additional info. boxes, or small lists, outside of
|
||||||
|
# the main story. TO DO: Work out a way to incorporate these neatly.
|
||||||
|
story_feature_reg_exp = '^.*story[_ -]*feature.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
|
||||||
|
# 'videoInStoryC'. This class is used to embed video.
|
||||||
|
video_reg_exp = '^.*video.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
|
||||||
|
# This class is used to embed audio.
|
||||||
|
audio_reg_exp = '^.*audio.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
|
||||||
|
# This class is used to embed a photo slideshow. See also 'slideshow' below.
|
||||||
|
picture_gallery_reg_exp = '^.*picture.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
|
||||||
|
# This class is used to embed a slideshow (not necessarily photo) but both
|
||||||
|
# 'slideshow' and 'pictureGallery' are used for slideshows.
|
||||||
|
slideshow_reg_exp = '^.*slide[_ -]*show.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove social-links and variant tags. This class is used to
|
||||||
|
# display links to a BBC bloggers main page, used in various columnist's blogs
|
||||||
|
# (Eg. Nick Robinson, Robert Peston).
|
||||||
|
social_links_reg_exp = '^.*social[_ -]*links.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
|
||||||
|
# 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
|
||||||
|
# removed by 'story-feature' removal (as they are usually within them), but
|
||||||
|
# not always. The quotation removed is always (AFAICT) in the article text
|
||||||
|
# as well but a 2nd copy is placed in a quote tag to draw attention to it.
|
||||||
|
# The quote class tags may or may not appear in div's.
|
||||||
|
quote_reg_exp = '^.*quote.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove hidden and variant tags, Eg. 'hidden'.
|
||||||
|
# The purpose of these is unclear, they seem to be an internal link to a
|
||||||
|
# section within the article, but the text of the link (Eg. 'Continue reading
|
||||||
|
# the main story') never seems to be displayed anyway. Removed to avoid clutter.
|
||||||
|
# The hidden class tags may or may not appear in div's.
|
||||||
|
hidden_reg_exp = '^.*hidden.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
|
||||||
|
# Used on the site to display text about registered users entering comments.
|
||||||
|
comment_reg_exp = '^.*comment.*$'
|
||||||
|
|
||||||
|
# Regular expression to remove form and variant tags, Eg. 'comment-form'.
|
||||||
|
# Used on the site to allow registered BBC users to fill in forms, typically
|
||||||
|
# for entering comments about an article.
|
||||||
|
form_reg_exp = '^.*form.*$'
|
||||||
|
|
||||||
|
# Extra things to remove due to the addition of 'blq_content' in keep_only_tags.
|
||||||
|
|
||||||
|
#<div class="story-actions"> Used on sports pages for 'email' and 'print'.
|
||||||
|
story_actions_reg_exp = '^.*story[_ -]*actions.*$'
|
||||||
|
|
||||||
|
#<div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
|
||||||
|
# social networking links).
|
||||||
|
bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'
|
||||||
|
|
||||||
|
#<div id="secondary-content" class="content-group">
|
||||||
|
# NOTE: Don't remove class="content-group" that is needed.
|
||||||
|
# Used on sports pages to link to 'similar stories'.
|
||||||
|
secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'
|
||||||
|
|
||||||
|
#<div id="featured-content" class="content-group">
|
||||||
|
# NOTE: Don't remove class="content-group" that is needed.
|
||||||
|
# Used on sports pages to link to pages like 'tables', 'fixtures', etc.
|
||||||
|
featured_content_reg_exp = '^.*featured[_ -]*content.*$'
|
||||||
|
|
||||||
|
#<div id="navigation">
|
||||||
|
# Used on sports pages to link to pages like 'tables', 'fixtures', etc.
|
||||||
|
# Used sometimes instead of "featured-content" above.
|
||||||
|
navigation_reg_exp = '^.*navigation.*$'
|
||||||
|
|
||||||
|
#<a class="skip" href="#blq-container-inner">Skip to top</a>
|
||||||
|
# Used on sports pages to link to the top of the page.
|
||||||
|
skip_reg_exp = '^.*skip.*$'
|
||||||
|
|
||||||
|
# Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
|
||||||
|
# which are the alternative table-design based pages. The purpose of some of these
|
||||||
|
# is not entirely clear from the pages (which are a total mess!).
|
||||||
|
|
||||||
|
# Remove mapping based tags, Eg. <map id="world_map">
|
||||||
|
# The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
|
||||||
|
map_reg_exp = '^.*map.*$'
|
||||||
|
|
||||||
|
# Remove social bookmarking variation, called 'socialBookMarks'.
|
||||||
|
social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'
|
||||||
|
|
||||||
|
# Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
|
||||||
|
blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'
|
||||||
|
|
||||||
|
# Remove 'sharesb', I think this is a generic 'sharing' class. It seems to appear
|
||||||
|
# alongside 'socialBookMarks' whenever that appears. I am removing it as well
|
||||||
|
# under the assumption that it can appear alone as well.
|
||||||
|
sharesb_reg_exp = '^.*sharesb.*$'
|
||||||
|
|
||||||
|
# Remove class 'o'. The worst named user created css class of all time. The creator
|
||||||
|
# should immediately be fired. I've seen it used to hold nothing at all but with
|
||||||
|
# 20 or so empty lines in it. Also to hold a single link to another article.
|
||||||
|
# Whatever it was designed to do it is not wanted by this recipe. Exact match only.
|
||||||
|
o_reg_exp = '^o$'
|
||||||
|
|
||||||
|
# Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
|
||||||
|
# use two reg expressions to make removing this (and variants) robust.
|
||||||
|
promo_top_reg_exp = '^.*promotopbg.*$'
|
||||||
|
promo_bottom_reg_exp = '^.*promobottombg.*$'
|
||||||
|
|
||||||
|
# Remove 'nlp', provides heading for link lists. Requires an exact match due to
|
||||||
|
# risk of matching those letters in something needed, unless I see a variation
|
||||||
|
# of 'nlp' used at a later date.
|
||||||
|
nlp_reg_exp = '^nlp$'
|
||||||
|
|
||||||
|
# Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
|
||||||
|
# has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
|
||||||
|
# matching those letters in something needed.
|
||||||
|
mva_or_mvb_reg_exp = '^mv[ab]$'
|
||||||
|
|
||||||
|
# Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
|
||||||
|
mvtb_reg_exp = '^mvtb$'
|
||||||
|
|
||||||
|
# Remove 'blq-toplink', class to provide a link to the top of the page.
|
||||||
|
blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'
|
||||||
|
|
||||||
|
# Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
|
||||||
|
# Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
|
||||||
|
# use two reg expressions to make removing this (and variants) robust.
|
||||||
|
prods_services_01_reg_exp = '^.*servicev4.*$'
|
||||||
|
prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'
|
||||||
|
|
||||||
|
# Remove -what I think is- some kind of navigation tools helper class, though I am
|
||||||
|
# not sure, it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
|
||||||
|
# frequently and it is not wanted. Have decided to use two reg expressions to make
|
||||||
|
# removing this (and variants) robust.
|
||||||
|
blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
|
||||||
|
blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'
|
||||||
|
|
||||||
|
# Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
|
||||||
|
# need removing - I have no clue what it does other than it contains links.
|
||||||
|
# Whatever it is - it is not part of the article and is not wanted.
|
||||||
|
puffbox_reg_exp = '^.*puffbox.*$'
|
||||||
|
|
||||||
|
# Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
|
||||||
|
sibtbg_reg_exp = '^.*sibtbg.*$'
|
||||||
|
|
||||||
|
# Remove 'storyextra' - links to relevant articles and external sites.
|
||||||
|
storyextra_reg_exp = '^.*story[_ -]*extra.*$'
|
||||||
|
|
||||||
|
|
||||||
|
remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)}),
|
||||||
|
dict(name='div', attrs={'class':'tools-container'}),
|
||||||
|
dict(name='div', attrs={'class':'tools-container-end'}),
|
||||||
|
dict(name='div', attrs={'class':'g-block story-body contextual-links'}),
|
||||||
|
dict(name='div', attrs={'class':' g-w11 sidebar'})
|
||||||
|
]
|
||||||
|
|
||||||
|
# Uses url to create and return the 'printer friendly' version of the url.
|
||||||
|
# In other words the 'print this page' address of the page.
|
||||||
|
#
|
||||||
|
# There are 3 types of urls used in the BBC site's rss feeds. There is just
|
||||||
|
# 1 type for the standard news while there are 2 used for sports feed urls.
|
||||||
|
# Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
|
||||||
|
# there is a major story of interest to 'everyone'. So even if no BBC sports
|
||||||
|
# feeds are added to 'feeds' the logic of this method is still needed to avoid
|
||||||
|
# blank / missing / empty articles which have an index title and then no body.
|
||||||
|
def print_version(self, url):
|
||||||
|
|
||||||
|
# Handle sports page urls type 01:
|
||||||
|
if (url.find("go/rss/-/sport1/") != -1):
|
||||||
|
temp_url = url.replace("go/rss/-/", "")
|
||||||
|
|
||||||
|
# Handle sports page urls type 02:
|
||||||
|
elif (url.find("go/rss/int/news/-/sport1/") != -1):
|
||||||
|
temp_url = url.replace("go/rss/int/news/-/", "")
|
||||||
|
|
||||||
|
# Handle regular news page urls:
|
||||||
|
else:
|
||||||
|
temp_url = url.replace("go/rss/int/news/-/", "")
|
||||||
|
|
||||||
|
# Always add "?print=true" to the end of the url.
|
||||||
|
print_url = temp_url + "?print=true"
|
||||||
|
|
||||||
|
return print_url
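# A quick, hedged illustration of the rewriting above, using two made-up
# feed URLs (only the 'go/rss/...' markers matter; the rest of each path
# is hypothetical):
#
#   'http://www.bbc.co.uk/go/rss/int/news/-/news/world-12345678'
#       -> 'http://www.bbc.co.uk/news/world-12345678?print=true'
#   'http://www.bbc.co.uk/go/rss/-/sport1/hi/football/12345678.stm'
#       -> 'http://www.bbc.co.uk/sport1/hi/football/12345678.stm?print=true'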
|
||||||
|
|
||||||
|
|
||||||
|
# Remove articles in feeds based on a string in the article title or url.
|
||||||
|
#
|
||||||
|
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
|
||||||
|
# thread, in post with title: "Remove articles from feed", see url:
|
||||||
|
# http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
|
||||||
|
# Many thanks and all credit to Starson17.
|
||||||
|
#
|
||||||
|
# Starson17's code has obviously been altered to suit my requirements.
|
||||||
|
def parse_feeds(self):
|
||||||
|
|
||||||
|
# Call parent's method.
|
||||||
|
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||||
|
|
||||||
|
# Loop through all feeds.
|
||||||
|
for feed in feeds:
|
||||||
|
|
||||||
|
# Loop through all articles in feed.
|
||||||
|
for article in feed.articles[:]:
|
||||||
|
|
||||||
|
# Match key words and remove article if there's a match.
|
||||||
|
|
||||||
|
# Most BBC rss feed video only 'articles' use upper case 'VIDEO'
|
||||||
|
# as a title prefix. Just match upper case 'VIDEO', so that
|
||||||
|
# articles like 'Video game banned' won't be matched and removed.
|
||||||
|
if 'VIDEO' in article.title:
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
# Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
|
||||||
|
# as a title prefix. Just match upper case 'AUDIO', so that
|
||||||
|
# articles like 'Hi-Def audio...' won't be matched and removed.
|
||||||
|
elif 'AUDIO' in article.title:
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
# Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
|
||||||
|
# 'In pictures', and 'in pictures', somewhere in their title.
|
||||||
|
# Match any case of that phrase.
|
||||||
|
elif 'IN PICTURES' in article.title.upper():
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
# As above, but user contributed pictures. Match any case.
|
||||||
|
elif 'YOUR PICTURES' in article.title.upper():
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
# 'Sportsday Live' are articles which contain a constantly and
|
||||||
|
# dynamically updated 'running commentary' during a live sporting
|
||||||
|
# event. Match any case.
|
||||||
|
elif 'SPORTSDAY LIVE' in article.title.upper():
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
# Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
|
||||||
|
# These are being matched below using 'Live - ' because removing all
|
||||||
|
# articles with 'live' in their titles would remove some articles
|
||||||
|
# that are in fact not live sports pages. Match any case.
|
||||||
|
elif 'LIVE - ' in article.title.upper():
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
# 'Quiz of the week' is a Flash player weekly news quiz. Match only
|
||||||
|
# the 'Quiz of the' part in anticipation of monthly and yearly
|
||||||
|
# variants. Match any case.
|
||||||
|
elif 'QUIZ OF THE' in article.title.upper():
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
# Remove articles with 'scorecards' in the url. These are BBC sports
|
||||||
|
# pages which just display a cricket scorecard. The pages have a mass
|
||||||
|
# of table and css entries to display the scorecards nicely. Probably
|
||||||
|
# could make them work with this recipe, but might take a whole day
|
||||||
|
# of work to sort out all the css - basically a formatting nightmare.
|
||||||
|
elif 'scorecards' in article.url:
|
||||||
|
feed.articles.remove(article)
|
||||||
|
|
||||||
|
return feeds
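# An equivalent, more compact way to express the filtering above (a hedged
# sketch only; note that 'VIDEO' and 'AUDIO' are deliberately matched
# case-sensitively, the remaining phrases in any case):
#
#   CASE_SENSITIVE = ('VIDEO', 'AUDIO')
#   ANY_CASE = ('IN PICTURES', 'YOUR PICTURES', 'SPORTSDAY LIVE',
#               'LIVE - ', 'QUIZ OF THE')
#   def unwanted(a):
#       return (any(k in a.title for k in CASE_SENSITIVE) or
#               any(k in a.title.upper() for k in ANY_CASE) or
#               'scorecards' in a.url)
#   for feed in feeds:
#       feed.articles = [a for a in feed.articles if not unwanted(a)]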
|
||||||
|
|
||||||
|
# End of class and file.
|
Binary file not shown.
resources/viewer/blank.html (new file, 11 lines)
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>blank</title>
<meta http-equiv="content-type" content="text/html; charset=utf-8"/>
</head>
<body>
<div> </div>
</body>
</html>

@@ -34,6 +34,7 @@ isfrozen = hasattr(sys, 'frozen')
|
|||||||
isunix = isosx or islinux
|
isunix = isosx or islinux
|
||||||
isportable = os.environ.get('CALIBRE_PORTABLE_BUILD', None) is not None
|
isportable = os.environ.get('CALIBRE_PORTABLE_BUILD', None) is not None
|
||||||
ispy3 = sys.version_info.major > 2
|
ispy3 = sys.version_info.major > 2
|
||||||
|
isxp = iswindows and sys.getwindowsversion().major < 6
|
||||||
|
|
||||||
try:
|
try:
|
||||||
preferred_encoding = locale.getpreferredencoding()
|
preferred_encoding = locale.getpreferredencoding()
|
||||||
|
@@ -224,7 +224,18 @@ def parse_index_record(table, data, control_byte_count, tags, codec,
|
|||||||
for j in xrange(entry_count):
|
for j in xrange(entry_count):
|
||||||
start, end = idx_positions[j:j+2]
|
start, end = idx_positions[j:j+2]
|
||||||
rec = data[start:end]
|
rec = data[start:end]
|
||||||
ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
|
# Sometimes (in the guide table if the type attribute has non ascii
|
||||||
|
# values) the ident is UTF-16 encoded. Try to handle that.
|
||||||
|
try:
|
||||||
|
ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
ident, consumed = decode_string(rec, codec='utf-16', ordt_map=ordt_map)
|
||||||
|
if u'\x00' in ident:
|
||||||
|
try:
|
||||||
|
ident, consumed = decode_string(rec, codec='utf-16',
|
||||||
|
ordt_map=ordt_map)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
ident = ident.replace('u\x00', u'')
|
||||||
rec = rec[consumed:]
|
rec = rec[consumed:]
|
||||||
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
|
tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
|
||||||
table[ident] = tag_map
|
table[ident] = tag_map
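# Why the u'\x00' check above helps (a hedged aside): UTF-16 encoded
# ASCII text will often decode "successfully" under an 8-bit codec, but
# every other character comes out as a NUL byte, e.g.:
#
#   >>> u'toc'.encode('utf-16-le').decode('latin-1')
#   u't\x00o\x00c\x00'
#
# so an ident that still contains NULs after decoding is a strong hint that
# the record was really UTF-16 and should be decoded again with that codec.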
|
||||||
|
src/calibre/ebooks/oeb/display/paged.coffee (new file, 250 lines)
@@ -0,0 +1,250 @@
|
|||||||
|
#!/usr/bin/env coffee
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
###
|
||||||
|
Copyright 2012, Kovid Goyal <kovid@kovidgoyal.net>
|
||||||
|
Released under the GPLv3 License
|
||||||
|
###
|
||||||
|
|
||||||
|
log = (args...) -> # {{{
|
||||||
|
if args
|
||||||
|
msg = args.join(' ')
|
||||||
|
if window?.console?.log
|
||||||
|
window.console.log(msg)
|
||||||
|
else if process?.stdout?.write
|
||||||
|
process.stdout.write(msg + '\n')
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
body_height = () -> # {{{
|
||||||
|
db = document.body
|
||||||
|
dde = document.documentElement
|
||||||
|
if db? and dde?
|
||||||
|
return Math.max(db.scrollHeight, dde.scrollHeight, db.offsetHeight,
|
||||||
|
dde.offsetHeight, db.clientHeight, dde.clientHeight)
|
||||||
|
return 0
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
window_scroll_pos = (win=window) -> # {{{
|
||||||
|
if typeof(win.pageXOffset) == 'number'
|
||||||
|
x = win.pageXOffset
|
||||||
|
y = win.pageYOffset
|
||||||
|
else # IE < 9
|
||||||
|
if document.body and ( document.body.scrollLeft or document.body.scrollTop )
|
||||||
|
x = document.body.scrollLeft
|
||||||
|
y = document.body.scrollTop
|
||||||
|
else if document.documentElement and ( document.documentElement.scrollLeft or document.documentElement.scrollTop)
|
||||||
|
y = document.documentElement.scrollTop
|
||||||
|
x = document.documentElement.scrollLeft
|
||||||
|
return [x, y]
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
viewport_to_document = (x, y, doc=window?.document) -> # {{{
|
||||||
|
until doc == window.document
|
||||||
|
# We are in a frame
|
||||||
|
frame = doc.defaultView.frameElement
|
||||||
|
rect = frame.getBoundingClientRect()
|
||||||
|
x += rect.left
|
||||||
|
y += rect.top
|
||||||
|
doc = frame.ownerDocument
|
||||||
|
win = doc.defaultView
|
||||||
|
[wx, wy] = window_scroll_pos(win)
|
||||||
|
x += wx
|
||||||
|
y += wy
|
||||||
|
return [x, y]
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
absleft = (elem) -> # {{{
|
||||||
|
r = elem.getBoundingClientRect()
|
||||||
|
return viewport_to_document(r.left, 0, elem.ownerDocument)[0]
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
class PagedDisplay
|
||||||
|
###
|
||||||
|
This class is a namespace to expose functions via the
|
||||||
|
window.paged_display object. The most important functions are:
|
||||||
|
|
||||||
|
layout(): causes the currently loaded document to be laid out in columns.
|
||||||
|
###
|
||||||
|
|
||||||
|
constructor: () ->
|
||||||
|
this.set_geometry()
|
||||||
|
this.page_width = 0
|
||||||
|
this.screen_width = 0
|
||||||
|
this.in_paged_mode = false
|
||||||
|
this.current_margin_side = 0
|
||||||
|
|
||||||
|
set_geometry: (cols_per_screen=2, margin_top=20, margin_side=40, margin_bottom=20) ->
|
||||||
|
this.margin_top = margin_top
|
||||||
|
this.margin_side = margin_side
|
||||||
|
this.margin_bottom = margin_bottom
|
||||||
|
this.cols_per_screen = cols_per_screen
|
||||||
|
|
||||||
|
layout: () ->
|
||||||
|
ww = window.innerWidth
|
||||||
|
wh = window.innerHeight
|
||||||
|
body_height = wh - this.margin_bottom - this.margin_top
|
||||||
|
n = this.cols_per_screen
|
||||||
|
|
||||||
|
# Calculate the column width so that cols_per_screen columns fit in the
|
||||||
|
# window in such a way the right margin of the last column is <=
|
||||||
|
# side_margin (it may be less if the window width is not a
|
||||||
|
# multiple of n*(col_width+2*side_margin).
|
||||||
|
|
||||||
|
adjust = ww - Math.floor(ww/n)*n
|
||||||
|
# Ensure that the margins are large enough that the adjustment does not
|
||||||
|
# cause them to become negative semidefinite
|
||||||
|
sm = Math.max(2*adjust, this.margin_side)
|
||||||
|
# Minimum column width, for the cases when the window is too
|
||||||
|
# narrow
|
||||||
|
col_width = Math.max(100, ((ww - adjust)/n) - 2*sm)
|
||||||
|
this.page_width = col_width + 2*sm
|
||||||
|
this.screen_width = this.page_width * this.cols_per_screen
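# A worked example of the arithmetic above (an illustrative aside, the
# numbers are made up): with ww = 1367, cols_per_screen = 2, margin_side = 40:
#   adjust       = 1367 - Math.floor(1367/2)*2   = 1
#   sm           = max(2*1, 40)                  = 40
#   col_width    = max(100, (1366/2) - 2*40)     = 603
#   page_width   = 603 + 2*40                    = 683
#   screen_width = 683 * 2                       = 1366
# so the two columns plus their margins fit the 1367px window with 1px to spare.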
|
||||||
|
|
||||||
|
body_style = window.getComputedStyle(document.body)
|
||||||
|
fgcolor = body_style.getPropertyValue('color')
|
||||||
|
bs = document.body.style
|
||||||
|
|
||||||
|
bs.setProperty('-webkit-column-gap', (2*sm)+'px')
|
||||||
|
bs.setProperty('-webkit-column-width', col_width+'px')
|
||||||
|
bs.setProperty('-webkit-column-rule-color', fgcolor)
|
||||||
|
bs.setProperty('overflow', 'visible')
|
||||||
|
bs.setProperty('height', 'auto')
|
||||||
|
bs.setProperty('width', 'auto')
|
||||||
|
bs.setProperty('margin-top', this.margin_top+'px')
|
||||||
|
bs.setProperty('margin-bottom', this.margin_bottom+'px')
|
||||||
|
bs.setProperty('margin-left', sm+'px')
|
||||||
|
bs.setProperty('margin-right', sm+'px')
|
||||||
|
for edge in ['left', 'right', 'top', 'bottom']
|
||||||
|
bs.setProperty('padding-'+edge, '0px')
|
||||||
|
bs.setProperty('border-'+edge+'-width', '0px')
|
||||||
|
bs.setProperty('min-width', '0')
|
||||||
|
bs.setProperty('max-width', 'none')
|
||||||
|
bs.setProperty('min-height', '0')
|
||||||
|
bs.setProperty('max-height', 'none')
|
||||||
|
|
||||||
|
# Ensure that the top margin is correct, otherwise for some documents,
|
||||||
|
# webkit lays out the body with a lot of space on top
|
||||||
|
brect = document.body.getBoundingClientRect()
|
||||||
|
if brect.top > this.margin_top
|
||||||
|
bs.setProperty('margin-top', (this.margin_top - brect.top)+'px')
|
||||||
|
brect = document.body.getBoundingClientRect()
|
||||||
|
this.in_paged_mode = true
|
||||||
|
this.current_margin_side = sm
|
||||||
|
return sm
|
||||||
|
|
||||||
|
scroll_to_pos: (frac) ->
|
||||||
|
# Scroll to the position represented by frac (number between 0 and 1)
|
||||||
|
xpos = Math.floor(document.body.scrollWidth * frac)
|
||||||
|
this.scroll_to_xpos(xpos)
|
||||||
|
|
||||||
|
scroll_to_xpos: (xpos) ->
|
||||||
|
# Scroll so that the column containing xpos is the left most column in
|
||||||
|
# the viewport
|
||||||
|
if typeof(xpos) != 'number'
|
||||||
|
log(xpos, 'is not a number, cannot scroll to it!')
|
||||||
|
return
|
||||||
|
pos = 0
|
||||||
|
until (pos <= xpos < pos + this.page_width)
|
||||||
|
pos += this.page_width
|
||||||
|
limit = document.body.scrollWidth - this.screen_width
|
||||||
|
pos = limit if pos > limit
|
||||||
|
window.scrollTo(pos, 0)
|
||||||
|
|
||||||
|
current_pos: (frac) ->
|
||||||
|
# The current scroll position as a fraction between 0 and 1
|
||||||
|
limit = document.body.scrollWidth - window.innerWidth
|
||||||
|
if limit <= 0
|
||||||
|
return 0.0
|
||||||
|
return window.pageXOffset / limit
|
||||||
|
|
||||||
|
current_column_location: () ->
|
||||||
|
# The location of the left edge of the left most column currently
|
||||||
|
# visible in the viewport
|
||||||
|
x = window.pageXOffset + Math.max(10, this.current_margin_side)
|
||||||
|
edge = Math.floor(x/this.page_width)
|
||||||
|
while edge < x
|
||||||
|
edge += this.page_width
|
||||||
|
return edge - this.page_width
|
||||||
|
|
||||||
|
next_screen_location: () ->
|
||||||
|
# The position to scroll to for the next screen (which could contain
|
||||||
|
# more than one page). Returns -1 if no further scrolling is possible.
|
||||||
|
cc = this.current_column_location()
|
||||||
|
ans = cc + this.screen_width
|
||||||
|
limit = document.body.scrollWidth - window.innerWidth
|
||||||
|
if ans > limit
|
||||||
|
ans = if window.pageXOffset < limit then limit else -1
|
||||||
|
return ans
|
||||||
|
|
||||||
|
previous_screen_location: () ->
|
||||||
|
# The position to scroll to for the previous screen (which could contain
|
||||||
|
# more than one page). Returns -1 if no further scrolling is possible.
|
||||||
|
cc = this.current_column_location()
|
||||||
|
ans = cc - this.screen_width
|
||||||
|
if ans < 0
|
||||||
|
# We ignore small scrolls (less than 15px) when going to previous
|
||||||
|
# screen
|
||||||
|
ans = if window.pageXOffset > 15 then 0 else -1
|
||||||
|
return ans
|
||||||
|
|
||||||
|
next_col_location: () ->
|
||||||
|
# The position to scroll to for the next column (same as
|
||||||
|
# next_screen_location() if columns per screen == 1). Returns -1 if no
|
||||||
|
# further scrolling is possible.
|
||||||
|
cc = this.current_column_location()
|
||||||
|
ans = cc + this.page_width
|
||||||
|
limit = document.body.scrollWidth - window.innerWidth
|
||||||
|
if ans > limit
|
||||||
|
ans = if window.pageXOffset < limit then limit else -1
|
||||||
|
return ans
|
||||||
|
|
||||||
|
previous_col_location: () ->
|
||||||
|
# The position to scroll to for the previous column (same as
|
||||||
|
# previous_screen_location() if columns per screen == 1). Returns -1 if
|
||||||
|
# no further scrolling is possible.
|
||||||
|
cc = this.current_column_location()
|
||||||
|
ans = cc - this.page_width
|
||||||
|
if ans < 0
|
||||||
|
ans = if window.pageXOffset > 0 then 0 else -1
|
||||||
|
return ans
|
||||||
|
|
||||||
|
jump_to_anchor: (name) ->
|
||||||
|
# Jump to the element identified by anchor name. Ensures that the left
|
||||||
|
# most column in the viewport is the column containing the start of the
|
||||||
|
# element and that the scroll position is at the start of the column.
|
||||||
|
elem = document.getElementById(name)
|
||||||
|
if !elem
|
||||||
|
elems = document.getElementsByName(name)
|
||||||
|
if elems
|
||||||
|
elem = elems[0]
|
||||||
|
if !elem
|
||||||
|
return
|
||||||
|
elem.scrollIntoView()
|
||||||
|
if this.in_paged_mode
|
||||||
|
# Ensure we are scrolled to the column containing elem
|
||||||
|
this.scroll_to_xpos(absleft(elem) + 5)
|
||||||
|
|
||||||
|
snap_to_selection: () ->
|
||||||
|
# Ensure that the viewport is positioned at the start of the column
|
||||||
|
# containing the start of the current selection
|
||||||
|
if this.in_paged_mode
|
||||||
|
sel = window.getSelection()
|
||||||
|
r = sel.getRangeAt(0).getBoundingClientRect()
|
||||||
|
node = sel.anchorNode
|
||||||
|
left = viewport_to_document(r.left, r.top, doc=node.ownerDocument)[0]
|
||||||
|
|
||||||
|
# Ensure we are scrolled to the column containing the start of the
|
||||||
|
# selection
|
||||||
|
this.scroll_to_xpos(left+5)
|
||||||
|
|
||||||
|
if window?
|
||||||
|
window.paged_display = new PagedDisplay()
|
||||||
|
|
||||||
|
# TODO:
|
||||||
|
# css pagebreak rules
|
||||||
|
# CFI and bookmarks
|
||||||
|
# Go to reference positions
|
||||||
|
# Indexing
|
||||||
|
# Resizing of images
|
||||||
|
# Special handling for identifiable covers (colspan)?
|
||||||
|
# Full screen mode
|
@@ -31,12 +31,15 @@ def self_closing_sub(match):
|
|||||||
return '<%s %s></%s>'%(match.group(1), match.group(2), match.group(1))
|
return '<%s %s></%s>'%(match.group(1), match.group(2), match.group(1))
|
||||||
|
|
||||||
def load_html(path, view, codec='utf-8', mime_type=None,
|
def load_html(path, view, codec='utf-8', mime_type=None,
|
||||||
pre_load_callback=lambda x:None):
|
pre_load_callback=lambda x:None, path_is_html=False):
|
||||||
from PyQt4.Qt import QUrl, QByteArray
|
from PyQt4.Qt import QUrl, QByteArray
|
||||||
if mime_type is None:
|
if mime_type is None:
|
||||||
mime_type = guess_type(path)[0]
|
mime_type = guess_type(path)[0]
|
||||||
with open(path, 'rb') as f:
|
if path_is_html:
|
||||||
html = f.read().decode(codec, 'replace')
|
html = path
|
||||||
|
else:
|
||||||
|
with open(path, 'rb') as f:
|
||||||
|
html = f.read().decode(codec, 'replace')
|
||||||
|
|
||||||
html = EntityDeclarationProcessor(html).processed_html
|
html = EntityDeclarationProcessor(html).processed_html
|
||||||
has_svg = re.search(r'<[:a-zA-Z]*svg', html) is not None
|
has_svg = re.search(r'<[:a-zA-Z]*svg', html) is not None
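# Hedged usage note for the new path_is_html parameter above: callers that
# already hold the markup in memory can skip the file read, for example
# (the markup literal and the explicit mime_type are illustrative only):
#
#   load_html(u'<html><body><p>Hello</p></body></html>', view,
#             mime_type='text/html', path_is_html=True)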
|
||||||
|
@@ -7,7 +7,7 @@ from urllib import unquote
|
|||||||
from PyQt4.Qt import (QVariant, QFileInfo, QObject, SIGNAL, QBuffer, Qt,
|
from PyQt4.Qt import (QVariant, QFileInfo, QObject, SIGNAL, QBuffer, Qt,
|
||||||
QByteArray, QTranslator, QCoreApplication, QThread,
|
QByteArray, QTranslator, QCoreApplication, QThread,
|
||||||
QEvent, QTimer, pyqtSignal, QDateTime, QDesktopServices,
|
QEvent, QTimer, pyqtSignal, QDateTime, QDesktopServices,
|
||||||
QFileDialog, QFileIconProvider, QSettings,
|
QFileDialog, QFileIconProvider, QSettings, QColor,
|
||||||
QIcon, QApplication, QDialog, QUrl, QFont, QPalette)
|
QIcon, QApplication, QDialog, QUrl, QFont, QPalette)
|
||||||
|
|
||||||
ORG_NAME = 'KovidsBrain'
|
ORG_NAME = 'KovidsBrain'
|
||||||
@@ -738,11 +738,18 @@ class Application(QApplication):
|
|||||||
def load_calibre_style(self):
|
def load_calibre_style(self):
|
||||||
# On OS X QtCurve resets the palette, so we preserve it explicitly
|
# On OS X QtCurve resets the palette, so we preserve it explicitly
|
||||||
orig_pal = QPalette(self.palette())
|
orig_pal = QPalette(self.palette())
|
||||||
|
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins
|
||||||
pi = plugins['progress_indicator'][0]
|
pi = plugins['progress_indicator'][0]
|
||||||
path = os.path.join(sys.extensions_location, 'calibre_style.'+(
|
path = os.path.join(sys.extensions_location, 'calibre_style.'+(
|
||||||
'pyd' if iswindows else 'so'))
|
'pyd' if iswindows else 'so'))
|
||||||
pi.load_style(path, 'Calibre')
|
pi.load_style(path, 'Calibre')
|
||||||
|
# On OSX, on some machines, colors can be invalid. See https://bugs.launchpad.net/bugs/1014900
|
||||||
|
for role in (orig_pal.Button, orig_pal.Window):
|
||||||
|
c = orig_pal.brush(role).color()
|
||||||
|
if not c.isValid() or not c.toRgb().isValid():
|
||||||
|
orig_pal.setColor(role, QColor(u'lightgray'))
|
||||||
|
|
||||||
self.setPalette(orig_pal)
|
self.setPalette(orig_pal)
|
||||||
style = self.style()
|
style = self.style()
|
||||||
icon_map = {}
|
icon_map = {}
|
||||||
|
@@ -11,7 +11,7 @@ import zipfile
|
|||||||
|
|
||||||
from PyQt4.Qt import QFont, QVariant, QDialog
|
from PyQt4.Qt import QFont, QVariant, QDialog
|
||||||
|
|
||||||
from calibre.constants import iswindows
|
from calibre.constants import iswindows, isxp
|
||||||
from calibre.utils.config import Config, StringConfig
|
from calibre.utils.config import Config, StringConfig
|
||||||
from calibre.gui2.shortcuts import ShortcutConfig
|
from calibre.gui2.shortcuts import ShortcutConfig
|
||||||
from calibre.gui2.viewer.config_ui import Ui_Dialog
|
from calibre.gui2.viewer.config_ui import Ui_Dialog
|
||||||
@@ -113,7 +113,10 @@ class ConfigDialog(QDialog, Ui_Dialog):
|
|||||||
p = self.tabs.widget(1)
|
p = self.tabs.widget(1)
|
||||||
p.layout().addWidget(self.shortcut_config)
|
p.layout().addWidget(self.shortcut_config)
|
||||||
self.opt_fit_images.setChecked(opts.fit_images)
|
self.opt_fit_images.setChecked(opts.fit_images)
|
||||||
|
if isxp:
|
||||||
|
self.hyphenate.setVisible(False)
|
||||||
|
self.hyphenate_default_lang.setVisible(False)
|
||||||
|
self.hyphenate_label.setVisible(False)
|
||||||
|
|
||||||
def accept(self, *args):
|
def accept(self, *args):
|
||||||
if self.shortcut_config.is_editing:
|
if self.shortcut_config.is_editing:
|
||||||
|
@@ -196,7 +196,7 @@
|
|||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="6" column="0">
|
<item row="6" column="0">
|
||||||
<widget class="QLabel" name="label_8">
|
<widget class="QLabel" name="hyphenate_label">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Default &language for hyphenation:</string>
|
<string>Default &language for hyphenation:</string>
|
||||||
</property>
|
</property>
|
||||||
|
@@ -22,7 +22,8 @@ from calibre.gui2.viewer.javascript import JavaScriptLoader
|
|||||||
from calibre.gui2.viewer.position import PagePosition
|
from calibre.gui2.viewer.position import PagePosition
|
||||||
from calibre.gui2.viewer.config import config, ConfigDialog
|
from calibre.gui2.viewer.config import config, ConfigDialog
|
||||||
from calibre.ebooks.oeb.display.webview import load_html
|
from calibre.ebooks.oeb.display.webview import load_html
|
||||||
|
from calibre.utils.config import tweaks
|
||||||
|
from calibre.constants import isxp
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def load_builtin_fonts():
|
def load_builtin_fonts():
|
||||||
@@ -59,10 +60,12 @@ class Document(QWebPage): # {{{
|
|||||||
def __init__(self, shortcuts, parent=None, debug_javascript=False):
|
def __init__(self, shortcuts, parent=None, debug_javascript=False):
|
||||||
QWebPage.__init__(self, parent)
|
QWebPage.__init__(self, parent)
|
||||||
self.setObjectName("py_bridge")
|
self.setObjectName("py_bridge")
|
||||||
|
self.in_paged_mode = tweaks.get('viewer_test_paged_mode', False)
|
||||||
# Use this to pass arbitrary JSON encodable objects between python and
|
# Use this to pass arbitrary JSON encodable objects between python and
|
||||||
# javascript. In python get/set the value as: self.bridge_value. In
|
# javascript. In python get/set the value as: self.bridge_value. In
|
||||||
# javascript, get/set the value as: py_bridge.value
|
# javascript, get/set the value as: py_bridge.value
|
||||||
self.bridge_value = None
|
self.bridge_value = None
|
||||||
|
self.first_load = True
|
||||||
|
|
||||||
self.debug_javascript = debug_javascript
|
self.debug_javascript = debug_javascript
|
||||||
self.anchor_positions = {}
|
self.anchor_positions = {}
|
||||||
@@ -104,6 +107,13 @@ class Document(QWebPage): # {{{
|
|||||||
self.mainFrame().javaScriptWindowObjectCleared.connect(
|
self.mainFrame().javaScriptWindowObjectCleared.connect(
|
||||||
self.add_window_objects)
|
self.add_window_objects)
|
||||||
|
|
||||||
|
self.turn_off_internal_scrollbars()
|
||||||
|
|
||||||
|
def turn_off_internal_scrollbars(self):
|
||||||
|
mf = self.mainFrame()
|
||||||
|
mf.setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
|
||||||
|
mf.setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
|
||||||
|
|
||||||
def set_user_stylesheet(self):
|
def set_user_stylesheet(self):
|
||||||
raw = config().parse().user_css
|
raw = config().parse().user_css
|
||||||
raw = '::selection {background:#ffff00; color:#000;}\nbody {background-color: white;}\n'+raw
|
raw = '::selection {background:#ffff00; color:#000;}\nbody {background-color: white;}\n'+raw
|
||||||
@@ -154,7 +164,8 @@ class Document(QWebPage): # {{{
|
|||||||
|
|
||||||
@pyqtSignature("")
|
@pyqtSignature("")
|
||||||
def init_hyphenate(self):
|
def init_hyphenate(self):
|
||||||
if self.hyphenate and getattr(self, 'loaded_lang', ''):
|
# Qt fails to render soft hyphens correctly on windows xp
|
||||||
|
if not isxp and self.hyphenate and getattr(self, 'loaded_lang', ''):
|
||||||
self.javascript('do_hyphenation("%s")'%self.loaded_lang)
|
self.javascript('do_hyphenation("%s")'%self.loaded_lang)
|
||||||
|
|
||||||
def _pass_json_value_getter(self):
|
def _pass_json_value_getter(self):
|
||||||
@@ -175,9 +186,12 @@ class Document(QWebPage): # {{{
|
|||||||
'document.body.style.marginLeft').toString())
|
'document.body.style.marginLeft').toString())
|
||||||
self.initial_right_margin = unicode(self.javascript(
|
self.initial_right_margin = unicode(self.javascript(
|
||||||
'document.body.style.marginRight').toString())
|
'document.body.style.marginRight').toString())
|
||||||
|
if self.in_paged_mode:
|
||||||
|
self.switch_to_paged_mode()
|
||||||
if self.in_fullscreen_mode:
|
if self.in_fullscreen_mode:
|
||||||
self.switch_to_fullscreen_mode()
|
self.switch_to_fullscreen_mode()
|
||||||
self.read_anchor_positions(use_cache=False)
|
self.read_anchor_positions(use_cache=False)
|
||||||
|
self.first_load = False
|
||||||
|
|
||||||
def read_anchor_positions(self, use_cache=True):
|
def read_anchor_positions(self, use_cache=True):
|
||||||
self.bridge_value = tuple(self.index_anchors)
|
self.bridge_value = tuple(self.index_anchors)
|
||||||
@@ -190,6 +204,22 @@ class Document(QWebPage): # {{{
|
|||||||
self.anchor_positions = {}
|
self.anchor_positions = {}
|
||||||
return self.anchor_positions
|
return self.anchor_positions
|
||||||
|
|
||||||
|
def switch_to_paged_mode(self, onresize=False):
|
||||||
|
side_margin = self.javascript('paged_display.layout()', typ=int)
|
||||||
|
# Setup the contents size to ensure that there is a right most margin.
|
||||||
|
# Without this webkit renders the final column with no margin, as the
|
||||||
|
# columns extend beyond the boundaries (and margin) of body
|
||||||
|
mf = self.mainFrame()
|
||||||
|
sz = mf.contentsSize()
|
||||||
|
if sz.width() > self.window_width:
|
||||||
|
sz.setWidth(sz.width()+side_margin)
|
||||||
|
self.setPreferredContentsSize(sz)
|
||||||
|
|
||||||
|
def after_resize(self):
|
||||||
|
if self.in_paged_mode:
|
||||||
|
self.setPreferredContentsSize(QSize())
|
||||||
|
self.switch_to_paged_mode(onresize=True)
|
||||||
|
|
||||||
def switch_to_fullscreen_mode(self):
|
def switch_to_fullscreen_mode(self):
|
||||||
self.in_fullscreen_mode = True
|
self.in_fullscreen_mode = True
|
||||||
self.javascript('''
|
self.javascript('''
|
||||||
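switch_to_paged_mode() above queries paged_display.layout() for the column side margin and widens the preferred contents size so the last column keeps a right margin. A minimal pure-Python sketch of that width adjustment, with plain integers standing in for the real QSize and hypothetical numbers:

# Sketch only: the real code manipulates a QSize from mainFrame().contentsSize().
def widened_contents_width(contents_width, window_width, side_margin):
    # Only grow the contents when the columns actually overflow the window,
    # so the final column gets the same margin as the others.
    if contents_width > window_width:
        return contents_width + side_margin
    return contents_width

# Hypothetical numbers: three 600px columns in an 800px window, 40px gutter.
# widened_contents_width(1800, 800, 40) -> 1840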
@@ -233,20 +263,21 @@ class Document(QWebPage): # {{{
 
     def javascript(self, string, typ=None):
         ans = self.mainFrame().evaluateJavaScript(string)
-        if typ == 'int':
+        if typ in {'int', int}:
             ans = ans.toInt()
             if ans[1]:
                 return ans[0]
             return 0
+        if typ in {'float', float}:
+            ans = ans.toReal()
+            return ans[0] if ans[1] else 0.0
         if typ == 'string':
             return unicode(ans.toString())
         return ans
 
     def javaScriptConsoleMessage(self, msg, lineno, msgid):
         if self.debug_javascript:
-            prints( 'JS:', msgid, lineno)
             prints(msg)
-            prints(' ')
         else:
             return QWebPage.javaScriptConsoleMessage(self, msg, lineno, msgid)
 
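The hunk above lets Document.javascript() accept either a string or the builtin type for int/float and adds float coercion. A rough, Qt-free analogue of that coercion, using a plain (value, ok) tuple where the real code gets a QVariant from toInt()/toReal(); the helper name is illustrative only:

# Sketch: QVariant.toInt()/.toReal() return a (value, ok) pair; a tuple mimics
# that here so the fallback-to-zero behaviour can be exercised without Qt.
def coerce(pair, typ=None):
    value, ok = pair
    if typ in {'int', int}:
        return value if ok else 0
    if typ in {'float', float}:
        return value if ok else 0.0
    return pair

# coerce((3, True), typ=int)    -> 3
# coerce((0, False), typ=float) -> 0.0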
@@ -263,13 +294,7 @@ class Document(QWebPage): # {{{
         self.mainFrame().setScrollPosition(QPoint(x, y))
 
     def jump_to_anchor(self, anchor):
-        self.javascript('document.location.hash = "%s"'%anchor)
-
-    def quantize(self):
-        if self.height > self.window_height:
-            r = self.height%self.window_height
-            if r > 0:
-                self.javascript('document.body.style.paddingBottom = "%dpx"'%r)
+        self.javascript('paged_display.jump_to_anchor("%s")'%anchor)
 
     def element_ypos(self, elem):
         ans, ok = elem.evaluateJavaScript('$(this).offset().top').toInt()
@@ -314,15 +339,22 @@ class Document(QWebPage): # {{{
     @dynamic_property
     def scroll_fraction(self):
         def fget(self):
-            try:
-                return abs(float(self.ypos)/(self.height-self.window_height))
-            except ZeroDivisionError:
-                return 0.
+            if self.in_paged_mode:
+                return self.javascript('paged_display.current_pos()',
+                        typ='float')
+            else:
+                try:
+                    return abs(float(self.ypos)/(self.height-self.window_height))
+                except ZeroDivisionError:
+                    return 0.
         def fset(self, val):
-            npos = val * (self.height - self.window_height)
-            if npos < 0:
-                npos = 0
-            self.scroll_to(x=self.xpos, y=npos)
+            if self.in_paged_mode:
+                self.javascript('paged_display.scroll_to_pos(%f)'%val)
+            else:
+                npos = val * (self.height - self.window_height)
+                if npos < 0:
+                    npos = 0
+                self.scroll_to(x=self.xpos, y=npos)
         return property(fget=fget, fset=fset)
 
     @property
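In flow mode the scroll_fraction getter above is plain arithmetic; in paged mode the value comes from paged_display.current_pos() instead. The flow-mode formula as a standalone function (helper name illustrative):

def flow_scroll_fraction(ypos, height, window_height):
    # Fraction of the scrollable range that has been scrolled past.
    try:
        return abs(float(ypos) / (height - window_height))
    except ZeroDivisionError:
        # Document fits in a single window: treat it as the start.
        return 0.

# flow_scroll_fraction(500, 2000, 1000) -> 0.5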
@@ -363,6 +395,7 @@ class DocumentView(QWebView): # {{{
     DISABLED_BRUSH = QBrush(Qt.lightGray, Qt.Dense5Pattern)
 
     def initialize_view(self, debug_javascript=False):
+        self.setRenderHints(QPainter.Antialiasing|QPainter.TextAntialiasing|QPainter.SmoothPixmapTransform)
         self.flipper = SlideFlip(self)
         self.is_auto_repeat_event = False
         self.debug_javascript = debug_javascript
@@ -555,9 +588,11 @@ class DocumentView(QWebView): # {{{
         return property(fget=fget, fset=fset)
 
     def search(self, text, backwards=False):
-        if backwards:
-            return self.findText(text, self.document.FindBackward)
-        return self.findText(text)
+        flags = self.document.FindBackward if backwards else self.document.FindFlags(0)
+        found = self.findText(text, flags)
+        if found and self.document.in_paged_mode:
+            self.document.javascript('paged_display.snap_to_selection()')
+        return found
 
     def path(self):
         return os.path.abspath(unicode(self.url().toLocalFile()))
@@ -570,7 +605,7 @@ class DocumentView(QWebView): # {{{
         if self.manager is not None:
             self.manager.load_started()
 
-        load_html(path, self, codec=path.encoding, mime_type=getattr(path,
+        load_html(path, self, codec=getattr(path, 'encoding', 'utf-8'), mime_type=getattr(path,
             'mime_type', None), pre_load_callback=callback)
         entries = set()
         for ie in getattr(path, 'index_entries', []):
@@ -579,10 +614,12 @@ class DocumentView(QWebView): # {{{
             if ie.end_anchor:
                 entries.add(ie.end_anchor)
         self.document.index_anchors = entries
-        self.turn_off_internal_scrollbars()
 
     def initialize_scrollbar(self):
         if getattr(self, 'scrollbar', None) is not None:
+            if self.document.in_paged_mode:
+                self.scrollbar.setVisible(False)
+                return
             delta = self.document.width - self.size().width()
             if delta > 0:
                 self._ignore_scrollbar_signals = True
@@ -623,7 +660,6 @@ class DocumentView(QWebView): # {{{
                 self.manager.scrolled(self.document.scroll_fraction,
                         onload=True)
 
-        self.turn_off_internal_scrollbars()
         if self.flipper.isVisible():
             if self.flipper.running:
                 self.flipper.setVisible(False)
@@ -631,12 +667,6 @@ class DocumentView(QWebView): # {{{
                 self.flipper(self.current_page_image(),
                         duration=self.document.page_flip_duration)
 
-
-    def turn_off_internal_scrollbars(self):
-        self.document.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
-        self.document.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
-
-
     @classmethod
     def test_line(cls, img, y):
         'Test if line contains pixels of exactly the same color'
@@ -651,6 +681,7 @@ class DocumentView(QWebView): # {{{
             overlap = self.height()
         img = QImage(self.width(), overlap, QImage.Format_ARGB32_Premultiplied)
         painter = QPainter(img)
+        painter.setRenderHints(self.renderHints())
         self.document.mainFrame().render(painter, QRegion(0, 0, self.width(), overlap))
         painter.end()
         return img
@@ -670,6 +701,28 @@ class DocumentView(QWebView): # {{{
             return
         epf = self.document.enable_page_flip and not self.is_auto_repeat_event
 
+        if self.document.in_paged_mode:
+            loc = self.document.javascript(
+                    'paged_display.previous_screen_location()', typ='int')
+            if loc < 0:
+                if self.manager is not None:
+                    if epf:
+                        self.flipper.initialize(self.current_page_image(),
+                                forwards=False)
+                    self.manager.previous_document()
+            else:
+                if epf:
+                    self.flipper.initialize(self.current_page_image(),
+                            forwards=False)
+                self.document.scroll_to(x=loc, y=0)
+                if epf:
+                    self.flipper(self.current_page_image(),
+                            duration=self.document.page_flip_duration)
+                if self.manager is not None:
+                    self.manager.scrolled(self.scroll_fraction)
+
+            return
+
         delta_y = self.document.window_height - 25
         if self.document.at_top:
             if self.manager is not None:
@@ -700,6 +753,26 @@ class DocumentView(QWebView): # {{{
             return
         epf = self.document.enable_page_flip and not self.is_auto_repeat_event
 
+        if self.document.in_paged_mode:
+            loc = self.document.javascript(
+                    'paged_display.next_screen_location()', typ='int')
+            if loc < 0:
+                if self.manager is not None:
+                    if epf:
+                        self.flipper.initialize(self.current_page_image())
+                    self.manager.next_document()
+            else:
+                if epf:
+                    self.flipper.initialize(self.current_page_image())
+                self.document.scroll_to(x=loc, y=0)
+                if epf:
+                    self.flipper(self.current_page_image(),
+                            duration=self.document.page_flip_duration)
+                if self.manager is not None:
+                    self.manager.scrolled(self.scroll_fraction)
+
+            return
+
         window_height = self.document.window_height
         document_height = self.document.height
         ddelta = document_height - window_height
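Both paged-mode branches above follow the same pattern: ask paged_display for the next or previous screen location, and when that location is negative (no further screen in this file) hand over to the manager's neighbouring document instead of scrolling. A minimal sketch of that decision, with placeholder callables standing in for the real document/manager methods:

def turn_screen(screen_location, scroll_to, change_document):
    # Negative location means there is nothing left to show in this file.
    if screen_location < 0:
        change_document()            # e.g. manager.next_document()
    else:
        scroll_to(screen_location)   # e.g. document.scroll_to(x=loc, y=0)

# turn_screen(-1, scroll_to=lambda x: None, change_document=lambda: None)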
@@ -762,25 +835,38 @@ class DocumentView(QWebView): # {{{
         #print 'After all:', self.document.ypos
 
     def scroll_by(self, x=0, y=0, notify=True):
-        old_pos = self.document.ypos
+        old_pos = (self.document.xpos if self.document.in_paged_mode else
+                self.document.ypos)
         self.document.scroll_by(x, y)
-        if notify and self.manager is not None and self.document.ypos != old_pos:
+        new_pos = (self.document.xpos if self.document.in_paged_mode else
+                self.document.ypos)
+        if notify and self.manager is not None and new_pos != old_pos:
             self.manager.scrolled(self.scroll_fraction)
 
     def scroll_to(self, pos, notify=True):
         if self._ignore_scrollbar_signals:
             return
-        old_pos = self.document.ypos
-        if isinstance(pos, basestring):
-            self.document.jump_to_anchor(pos)
-        else:
-            if pos >= 1:
-                self.document.scroll_to(0, self.document.height)
-            else:
-                y = int(math.ceil(
-                    pos*(self.document.height-self.document.window_height)))
-                self.document.scroll_to(0, y)
-        if notify and self.manager is not None and self.document.ypos != old_pos:
+        old_pos = (self.document.xpos if self.document.in_paged_mode else
+                self.document.ypos)
+        if self.document.in_paged_mode:
+            if isinstance(pos, basestring):
+                self.document.jump_to_anchor(pos)
+            else:
+                self.document.scroll_fraction = pos
+        else:
+            if isinstance(pos, basestring):
+                self.document.jump_to_anchor(pos)
+            else:
+                if pos >= 1:
+                    self.document.scroll_to(0, self.document.height)
+                else:
+                    y = int(math.ceil(
+                        pos*(self.document.height-self.document.window_height)))
+                    self.document.scroll_to(0, y)
+
+        new_pos = (self.document.xpos if self.document.in_paged_mode else
+                self.document.ypos)
+        if notify and self.manager is not None and new_pos != old_pos:
             self.manager.scrolled(self.scroll_fraction)
 
     @dynamic_property
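In flow mode, scroll_to() above treats a string as an anchor name and a number as a fraction of the scrollable height (clamped to the bottom at 1.0); in paged mode the fraction is simply assigned to document.scroll_fraction. The flow-mode target calculation as a standalone function (helper name illustrative):

import math

def flow_scroll_target(pos, height, window_height):
    # pos is a fraction in [0, 1]; values >= 1 mean "go to the very end".
    if pos >= 1:
        return height
    return int(math.ceil(pos * (height - window_height)))

# flow_scroll_target(0.25, 2000, 1000) -> 250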
@@ -813,9 +899,8 @@ class DocumentView(QWebView): # {{{
         return QWebView.changeEvent(self, event)
 
     def paintEvent(self, event):
-        self.turn_off_internal_scrollbars()
-
         painter = QPainter(self)
+        painter.setRenderHints(self.renderHints())
         self.document.mainFrame().render(painter, event.region())
         if not self.isEnabled():
             painter.fillRect(event.region().boundingRect(), self.DISABLED_BRUSH)
@@ -827,6 +912,18 @@ class DocumentView(QWebView): # {{{
             if self.manager is not None and event.delta() != 0:
                 (self.manager.font_size_larger if event.delta() > 0 else
                         self.manager.font_size_smaller)()
+                return
+
+        if self.document.in_paged_mode:
+            if abs(event.delta()) < 15: return
+            typ = 'screen' if self.document.wheel_flips_pages else 'col'
+            direction = 'next' if event.delta() < 0 else 'previous'
+            loc = self.document.javascript('paged_display.%s_%s_location()'%(
+                direction, typ), typ='int')
+            if loc > -1:
+                self.document.scroll_to(x=loc, y=0)
+            return
+
         if event.delta() < -14:
             if self.document.wheel_flips_pages:
                 self.next_page()
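The wheel handling added above for paged mode ignores tiny deltas, maps the sign of the delta to a direction, and lets wheel_flips_pages choose between a whole screen and a single column. A small sketch of that mapping that only builds the paged_display call string (function name illustrative):

def wheel_request(delta, wheel_flips_pages):
    # Deltas below the threshold are treated as noise and ignored.
    if abs(delta) < 15:
        return None
    typ = 'screen' if wheel_flips_pages else 'col'
    direction = 'next' if delta < 0 else 'previous'
    return 'paged_display.%s_%s_location()' % (direction, typ)

# wheel_request(-120, False) -> 'paged_display.next_col_location()'
# wheel_request(120, True)   -> 'paged_display.previous_screen_location()'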
@@ -866,6 +963,17 @@ class DocumentView(QWebView): # {{{
         if not self.handle_key_press(event):
             return QWebView.keyPressEvent(self, event)
 
+    def paged_col_scroll(self, forward=True):
+        dir = 'next' if forward else 'previous'
+        loc = self.document.javascript(
+                'paged_display.%s_col_location()'%dir, typ='int')
+        if loc > -1:
+            self.document.scroll_to(x=loc, y=0)
+            self.manager.scrolled(self.document.scroll_fraction)
+        else:
+            (self.manager.next_document() if forward else
+                    self.manager.previous_document())
+
     def handle_key_press(self, event):
         handled = True
         key = self.shortcuts.get_match(event)
@@ -877,21 +985,33 @@ class DocumentView(QWebView): # {{{
             finally:
                 self.is_auto_repeat_event = False
         elif key == 'Down':
-            if (not self.document.line_scrolling_stops_on_pagebreaks and
-                    self.document.at_bottom):
-                self.manager.next_document()
+            if self.document.in_paged_mode:
+                self.paged_col_scroll()
             else:
-                self.scroll_by(y=15)
+                if (not self.document.line_scrolling_stops_on_pagebreaks and
+                        self.document.at_bottom):
+                    self.manager.next_document()
+                else:
+                    self.scroll_by(y=15)
         elif key == 'Up':
-            if (not self.document.line_scrolling_stops_on_pagebreaks and
-                    self.document.at_top):
-                self.manager.previous_document()
+            if self.document.in_paged_mode:
+                self.paged_col_scroll(forward=False)
             else:
-                self.scroll_by(y=-15)
+                if (not self.document.line_scrolling_stops_on_pagebreaks and
+                        self.document.at_top):
+                    self.manager.previous_document()
+                else:
+                    self.scroll_by(y=-15)
         elif key == 'Left':
-            self.scroll_by(x=-15)
+            if self.document.in_paged_mode:
+                self.paged_col_scroll(forward=False)
+            else:
+                self.scroll_by(x=-15)
         elif key == 'Right':
-            self.scroll_by(x=15)
+            if self.document.in_paged_mode:
+                self.paged_col_scroll()
+            else:
+                self.scroll_by(x=15)
         else:
             handled = False
         return handled
@@ -30,10 +30,11 @@ class JavaScriptLoader(object):
     CS = {
             'cfi':'ebooks.oeb.display.cfi',
             'indexing':'ebooks.oeb.display.indexing',
+            'paged':'ebooks.oeb.display.paged',
         }
 
     ORDER = ('jquery', 'jquery_scrollTo', 'bookmarks', 'referencing', 'images',
-            'hyphenation', 'hyphenator', 'cfi', 'indexing',)
+            'hyphenation', 'hyphenator', 'cfi', 'indexing', 'paged')
 
 
     def __init__(self, dynamic_coffeescript=False):
@@ -747,6 +747,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
         # There hasn't been a resize event for some time
         # restore the current page position.
         self.resize_in_progress = False
+        self.view.document.after_resize()
         if self.window_mode_changed:
             # This resize is part of a window mode change, special case it
             self.handle_window_mode_toggle()
@@ -1003,6 +1004,12 @@ def main(args=sys.argv):
     QApplication.setApplicationName(APP_UID)
     main = EbookViewer(args[1] if len(args) > 1 else None,
            debug_javascript=opts.debug_javascript, open_at=open_at)
+    # This is needed for paged mode. Without it, the first document that is
+    # loaded will have extra blank space at the bottom, as
+    # turn_off_internal_scrollbars does not take effect for the first
+    # rendered document
+    main.view.load_path(P('viewer/blank.html', allow_user_override=False))
+
     sys.excepthook = main.unhandled_exception
     main.show()
     if opts.raise_window:
@@ -57,4 +57,14 @@ class DBPrefs(dict):
     def set(self, key, val):
         self.__setitem__(key, val)
 
+    def get_namespaced(self, namespace, key, default=None):
+        key = u'namespaced:%s:%s'%(namespace, key)
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            return default
+
+    def set_namespaced(self, namespace, key, val):
+        key = u'namespaced:%s:%s'%(namespace, key)
+        self[key] = val
 
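The DBPrefs hunk above stores per-namespace values under keys of the form 'namespaced:<namespace>:<key>'. A minimal sketch of the same scheme on a plain dict subclass so it can be run outside calibre; the class and the namespace/key names in the usage line are hypothetical:

class NamespacedPrefs(dict):

    def get_namespaced(self, namespace, key, default=None):
        return self.get(u'namespaced:%s:%s' % (namespace, key), default)

    def set_namespaced(self, namespace, key, val):
        self[u'namespaced:%s:%s' % (namespace, key)] = val

p = NamespacedPrefs()
p.set_namespaced('viewer', 'in_paged_mode', True)   # hypothetical names
print(p.get_namespaced('viewer', 'in_paged_mode'))  # True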