Sync to trunk.

2025-10-19 21:10:30 -04:00 · 2011-09-20 19:13:14 -04:00 · 2011-09-20 19:13:14 -04:00 · beea6bcd11
commit beea6bcd11
parent cb94627b3a 84ee53fc18
14 changed files with 560 additions and 62 deletions
--- a/recipes/cicero.recipe
+++ b/recipes/cicero.recipe
@ -1,38 +1,47 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class BasicUserRecipe1316245412(BasicNewsRecipe):
-
    title = u'Cicero Online'
    description = u'Magazin f\xfcr politische Kultur'
    publisher = 'Ringier Publishing GmbH'
    category = 'news, politics, Germany'
    language = 'de'
    encoding = 'UTF-8'
-    __author__ = 'Armin Geller' # 2011-09-17
+    __author__ = 'Armin Geller' # Upd. 2011-09-19

-    oldest_article = 7
+    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = False

+#    remove_javascript = True
+
    remove_tags = [
-    dict(name='div', attrs={'id':["header", "navigation", "skip-link", "header-print", "header-print-url", "meta-toolbar", "footer"]}),
-    dict(name='div', attrs={'class':["region region-sidebar-first column sidebar", "breadcrumb", "breadcrumb-title", "meta", "comment-wrapper",
-    "field field-name-field-show-teaser-right field-type-list-boolean field-label-above"]}),
-    dict(name='div', attrs={'title':["Dossier Auswahl"]}),
-    dict(name='h2', attrs={'class':["title comment-form"]}),
-    dict(name='form', attrs={'class':["comment-form user-info-from-cookie"]}),
-    ]
+                    dict(name='div', attrs={'id':["header", "navigation", "skip-link", "header-print", "header-print-url", "meta-toolbar", "footer"]}),
+                    dict(name='div', attrs={'class':["region region-sidebar-first column sidebar", "breadcrumb", "breadcrumb-title", "meta", "comment-wrapper",
+                                                        "field field-name-field-show-teaser-right field-type-list-boolean field-label-above"]}),
+                    dict(name='div', attrs={'title':["Dossier Auswahl"]}),
+                    dict(name='h2', attrs={'class':["title comment-form"]}),
+                    dict(name='form', attrs={'class':["comment-form user-info-from-cookie"]}),
+                    # 2011-09-19 remove historical caricature and video preview pictures and social icons from the first feed
+                    dict(name='table', attrs={'class':["mcx-social-horizontal", "page-header"]}),   # 2011-09-19
+                    dict(name='div', attrs={'class':["page-header", "view view-alle-karikaturen view-id-alle_karikaturen view-display-id-default view-dom-id-1",
+                                                      "pagination",
+                                                      "view view-letzte-videos view-id-letzte_videos view-display-id-default view-dom-id-1"]}), # 2011-09-19
+                   ]

    feeds = [
-    (u'Das gesamte Portfolio', u'http://www.cicero.de/rss.xml'),
-    (u'Berliner Republik', u'http://www.cicero.de/berliner-republik.xml'),
-    (u'Weltb\xfchne', u'http://www.cicero.de/weltbuehne.xml'),
-    (u'Kapital', u'http://www.cicero.de/kapital.xml'),
-    (u'Salon', u'http://www.cicero.de/salon.xml'),
-    (u'Blogs', u'http://www.cicero.de/blogs.xml'), #seems not to be in use at the moment
-    ]
+              (u'Das gesamte Portfolio', u'http://www.cicero.de/rss.xml'),
+              (u'Berliner Republik', u'http://www.cicero.de/berliner-republik.xml'),
+              (u'Weltb\xfchne', u'http://www.cicero.de/weltbuehne.xml'),
+              (u'Kapital', u'http://www.cicero.de/kapital.xml'),
+              (u'Salon', u'http://www.cicero.de/salon.xml'),
+              (u'Blogs', u'http://www.cicero.de/blogs.xml'), #seems not to be in use at the moment
+             ]

    def print_version(self, url):
-        return url + '?print'
+          return url + '?print'
+
+#    def get_cover_url(self):
+#          return 'http://www.cicero.de/sites/all/themes/cicero/logo.png' # need to find a good logo on their home page!

--- a/recipes/idg_se.recipe
+++ b/recipes/idg_se.recipe
@ -4,19 +4,19 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class IDGse(BasicNewsRecipe):
    title               = 'IDG'
-    description = 'IDG.se'
-    language = 'se'
    __author__ = 'zapt0'
+    language = 'sv'
+    description = 'IDG.se'
    oldest_article = 1
-    max_articles_per_feed = 40
+    max_articles_per_feed = 256
    no_stylesheets = True
    encoding = 'ISO-8859-1'
    remove_javascript = True

-    feeds          = [(u'Senaste nytt',u'http://feeds.idg.se/idg/vzzs')]
+    feeds          = [(u'Dagens IDG-nyheter',u'http://feeds.idg.se/idg/ETkj?format=xml')]

    def print_version(self,url):
-            return url + '?articleRenderMode=print&m=print'
+        return url + '?articleRenderMode=print&m=print'

    def get_cover_url(this):
        return 'http://idgmedia.idg.se/polopoly_fs/2.3275!images/idgmedia_logo_75.jpg'
@ -30,4 +30,3 @@ class IDGse(BasicNewsRecipe):
                                    dict(name='div', attrs={'id':['preamble_ad']}),
                                    dict(name='ul', attrs={'class':['share']})
                                ]
-
--- a/recipes/macleans.recipe
+++ b/recipes/macleans.recipe
@ -4,25 +4,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1308306308(BasicNewsRecipe):
    title          = u'Macleans Magazine'
    language = 'en_CA'
-    __author__ = 'sexymax15'
-    oldest_article = 30
-    max_articles_per_feed = 12
+    __author__ = 'Medius'
+    oldest_article = 7
+    cover_url = 'http://www.rogersmagazines.com/rms_covers/md/CLE_md.jpg'

    use_embedded_content = False

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True
-    remove_tags = [dict(name ='img'),dict (id='header'),{'class':'postmetadata'}]
-    remove_tags_after = {'class':'postmetadata'}
+    remove_tags = [dict(id='header'),{'class':'comment'}]
+    remove_tags_after = {'class':'pagination'}

-    feeds          = [(u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/'),
- (u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
-(u'World', u'http://www2.macleans.ca/category/world-from-the-magazine/feed/'),
-(u'Business', u'http://www2.macleans.ca/category/business/feed/'),
-(u'Arts & Culture', u'http://www2.macleans.ca/category/arts-culture/feed/'),
-(u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'),
-(u'Health', u'http://www2.macleans.ca/category/health-from-the-magazine/feed/'),
- (u'Environment', u'http://www2.macleans.ca/category/environment-from-the-magazine/feed/')]
-    def print_version(self, url):
-        return url + 'print/'
+    feeds          = [(u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
+(u'World', u'http://www2.macleans.ca/category/news-politics/world/feed/'), (u'Business', u'http://www2.macleans.ca/category/business/feed/'), (u'Arts & Culture', u'http://www2.macleans.ca/category/arts/feed/'), (u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'), (u'Health', u'http://www2.macleans.ca/category/life/health/feed/'), (u'Sports', u'http://www2.macleans.ca/category/life/sports/feed/'), (u'Environment', u'http://www2.macleans.ca/category/life/environment/feed/'), (u'Technology', u'http://www2.macleans.ca/category/life/technology/feed/'), (u'Travel', u'http://www2.macleans.ca/category/life/travel/feed/'), (u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/')]
--- a/recipes/ming_pao.recipe
+++ b/recipes/ming_pao.recipe
@ -16,6 +16,7 @@ __UseLife__ = True

 '''
 Change Log:
+2011/09/18: parse "column" section stuff from source text files directly.
 2011/09/07: disable "column" section as it is no longer offered free.
 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
            provide options to remove all images in the file
@ -52,16 +53,19 @@ class MPRecipe(BasicNewsRecipe):
        title       = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
        category    = 'Chinese, News, Hong Kong'
-        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
        keep_only_tags = [dict(name='h1'),
                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'class':['heading']}),  # for heading from txt
                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
                          dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['content']}),  # for content from txt
                          dict(attrs={'class':['photo']}),
                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}),  # content in printed version of life.mingpao.com
-                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
+                          dict(attrs={'class':['images']})   # for images from txt
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='style'),
@ -232,12 +236,18 @@ class MPRecipe(BasicNewsRecipe):
                                           (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                                           (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
                                           (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
-                                           #(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
                                          ]:
                    articles = self.parse_section2(url, keystr)
                    if articles:
                        feeds.append((title, articles))

+                # parse column section articles directly from .txt files
+                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                          ]:
+                    articles = self.parse_section2_col(url, keystr)
+                    if articles:
+                        feeds.append((title, articles))
+
                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                    articles = self.parse_section(url)
@ -358,6 +368,24 @@ class MPRecipe(BasicNewsRecipe):
        current_articles.reverse()
        return current_articles

+    # parse from life.mingpao.com
+    def parse_section2_col(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/')  # point at the article's .txt source file
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
    # parse from www.mingpaovan.com
    def parse_section3(self, url, baseUrl):
        self.get_fetchdate()
@ -440,6 +468,39 @@ class MPRecipe(BasicNewsRecipe):
        current_articles.reverse()
        return current_articles

+    # preprocess those .txt based files
+    def preprocess_raw_html(self, raw_html, url):
+        if url.rfind('ftp') == -1:
+            return raw_html
+        else:
+            splitter = re.compile(r'\n') # Split the raw text into lines
+            new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
+            next_is_img_txt = False
+            title_started = False
+            met_article_start_char = False
+            for item in splitter.split(raw_html):
+                if item.startswith(u'\u3010'):
+                    met_article_start_char = True
+                    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                else:
+                    if next_is_img_txt == False:
+                        if item.startswith('='):
+                            next_is_img_txt = True
+                            new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+                        else:
+                            if met_article_start_char == False:
+                                if title_started == False:
+                                    new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
+                                    title_started = True
+                                else:
+                                    new_raw_html = new_raw_html + item + '\n'
+                            else:
+                                new_raw_html = new_raw_html + item + '<p>\n'
+                    else:
+                        next_is_img_txt = False
+                        new_raw_html = new_raw_html + item + '\n'
+            return new_raw_html + '</div></body></html>'
+
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
@ -593,3 +654,4 @@ class MPRecipe(BasicNewsRecipe):

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
+
--- a/recipes/taipei.recipe
+++ b/recipes/taipei.recipe
@ -0,0 +1,30 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TN(BasicNewsRecipe):
+    title          = u'Taipei Times'
+    language       = 'en_CN'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    use_embedded_content = False
+
+    no_stylesheets = True
+    auto_cleanup = True
+    auto_cleanup_keep = '//*[@class="main_ipic"]'
+
+    feeds          = [
+('Editorials',
+ 'http://www.taipeitimes.com/xml/editorials.rss'),
+('Taiwan',
+ 'http://www.taipeitimes.com/xml/taiwan.rss'),
+('Features',
+ 'http://www.taipeitimes.com/xml/feat.rss'),
+('Business',
+ 'http://www.taipeitimes.com/xml/biz.rss'),
+('World',
+ 'http://www.taipeitimes.com/xml/world.rss'),
+('Sports',
+ 'http://www.taipeitimes.com/xml/sport.rss'),
+]
+
+
--- a/src/calibre/ebooks/metadata/meta.py
+++ b/src/calibre/ebooks/metadata/meta.py
@ -31,7 +31,7 @@ def metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    try:
        return _metadata_from_formats(formats, force_read_metadata, pattern)
    except:
-        mi = metadata_from_filename(list(iter(formats), pattern)[0])
+        mi = metadata_from_filename(list(iter(formats))[0], pat=pattern)
        if not mi.authors:
            mi.authors = [_('Unknown')]
        return mi
--- a/src/calibre/gui2/catalog/catalog_bibtex.ui
+++ b/src/calibre/gui2/catalog/catalog_bibtex.ui
@ -110,9 +110,9 @@
      <string>Some explanation about this template:
 -The fields availables are 'author_sort', 'authors', 'id',
    'isbn', 'pubdate', 'publisher', 'series_index', 'series',
-   'tags', 'timestamp', 'title', 'uuid'
+   'tags', 'timestamp', 'title', 'uuid', 'title_sort'
 -For list types ie authors and tags, only the first element
-   wil be selected.
+   will be selected.
 -For time field, only the date will be used. </string>
     </property>
     <property name="scaledContents">
--- a/src/calibre/gui2/catalog/catalog_csv_xml.py
+++ b/src/calibre/gui2/catalog/catalog_csv_xml.py
@ -29,7 +29,7 @@ class PluginWidget(QWidget, Ui_Form):
                QListWidgetItem(x, self.db_fields)

        db = db_()
-        for x in  sorted(db.custom_field_keys()):
+        for x in sorted(db.custom_field_keys()):
            self.all_fields.append(x)
            QListWidgetItem(x, self.db_fields)

--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@ -87,7 +87,7 @@ class DeviceJob(BaseJob): # {{{
            self.failed = True
            ex = as_unicode(err)
            self._details = ex + '\n\n' + \
-                traceback.format_exc()
+                force_unicode(traceback.format_exc())
            self.exception = err
        finally:
            self.job_done()
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@ -32,7 +32,7 @@ FIELDS = ['all', 'title', 'title_sort', 'author_sort', 'authors', 'comments',
          'rating', 'series_index', 'series', 'size', 'tags', 'timestamp', 'uuid']

 #Allowed fields for template
-TEMPLATE_ALLOWED_FIELDS = [ 'author_sort', 'authors', 'id', 'isbn', 'pubdate',
+TEMPLATE_ALLOWED_FIELDS = [ 'author_sort', 'authors', 'id', 'isbn', 'pubdate', 'title_sort',
    'publisher', 'series_index', 'series', 'tags', 'timestamp', 'title', 'uuid' ]

 class CSV_XML(CatalogPlugin): # {{{
@ -324,7 +324,7 @@ class BIBTEX(CatalogPlugin): # {{{
    def run(self, path_to_output, opts, db, notification=DummyReporter()):

        def create_bibtex_entry(entry, fields, mode, template_citation,
-            bibtexdict, citation_bibtex=True, calibre_files=True):
+                                    bibtexdict, db, citation_bibtex=True, calibre_files=True):

            #Bibtex doesn't like UTF-8 but keep unicode until writing
            #Define starting chain or if book valid strict and not book return a Fail string
@ -345,7 +345,13 @@ class BIBTEX(CatalogPlugin): # {{{
                bibtex_entry = [u' '.join(bibtex_entry)]

            for field in fields:
-                item = entry[field]
+                if field.startswith('#'):
+                        item = db.get_field(entry['id'],field,index_is_id=True)
+                elif field == 'title_sort':
+                    item = entry['sort']
+                else:
+                    item = entry[field]
+
                #check if the field should be included (none or empty)
                if item is None:
                    continue
@ -358,10 +364,6 @@ class BIBTEX(CatalogPlugin): # {{{
                if field == 'authors' :
                    bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item))

-                elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
-                        'author_sort', 'series'] :
-                    bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))
-
                elif field == 'id' :
                    bibtex_entry.append(u'calibreid = "%s"' % int(item))

@ -409,6 +411,14 @@ class BIBTEX(CatalogPlugin): # {{{
                    bibtex_entry.append(u'year = "%s"' % item.year)
                    bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item)))

+                elif field.startswith('#') :
+                    bibtex_entry.append(u'%s = "%s"' % (field[1:], bibtexdict.utf8ToBibtex(item)))
+
+                else:
+                    # elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
+                        # 'author_sort', 'series', 'title_sort'] :
+                    bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))
+
            bibtex_entry = u',\n    '.join(bibtex_entry)
            bibtex_entry += u' }\n\n'

@ -588,7 +598,7 @@ class BIBTEX(CatalogPlugin): # {{{

            for entry in data:
                outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation,
-                    bibtexc, citation_bibtex, addfiles_bibtex))
+                    bibtexc, db, citation_bibtex, addfiles_bibtex))
 # }}}

 class EPUB_MOBI(CatalogPlugin):
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -147,13 +147,14 @@ class BasicNewsRecipe(Recipe):
    #: Specify elements that the auto cleanup algorithm should never remove
    #: The syntax is a XPath expression. For example::
    #:
-    #: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
+    #:   auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
    #:                                                  id="article-image"
-    #: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
+    #:   auto_cleanup_keep = '//*[@class="important"]' will keep all elements
    #:                                               with class="important"
-    #: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
+    #:   auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
    #:                     will keep all divs with id="article-image" and spans
    #:                     with class="important"
+    #:
    auto_cleanup_keep = None

    #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
--- a/src/calibre/web/jsbrowser/browser.py
+++ b/src/calibre/web/jsbrowser/browser.py
@ -7,16 +7,22 @@ __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, pprint
+import os, pprint, time

 from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
-        QNetworkProxy, QNetworkProxyFactory)
-from PyQt4.QtWebKit import QWebPage
+        QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl,
+        QDialog, QVBoxLayout, QSize)
+from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView

 from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
 from calibre.constants import ispy3, config_dir
 from calibre.utils.logging import ThreadSafeLog
 from calibre.gui2 import must_use_qt
+from calibre.web.jsbrowser.forms import FormsMixin
+
+class Timeout(Exception): pass
+
+class LoadError(Exception): pass

 class WebPage(QWebPage): # {{{

@ -24,6 +30,7 @@ class WebPage(QWebPage): # {{{
            confirm_callback=None,
            prompt_callback=None,
            user_agent=USER_AGENT,
+            enable_developer_tools=False,
            parent=None):
        QWebPage.__init__(self, parent)

@ -33,6 +40,12 @@ class WebPage(QWebPage): # {{{
        self.prompt_callback = prompt_callback
        self.setForwardUnsupportedContent(True)
        self.unsupportedContent.connect(self.on_unsupported_content)
+        settings = self.settings()
+        if enable_developer_tools:
+            settings.setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
+        QWebSettings.enablePersistentStorage(os.path.join(config_dir, 'caches',
+                'webkit-persistence'))
+        QWebSettings.setMaximumPagesInCache(0)

    def userAgentForUrl(self, url):
        return self.user_agent
@ -173,7 +186,36 @@ class NetworkAccessManager(QNetworkAccessManager): # {{{
            self.log.debug('\n'.join(debug))
 # }}}

-class Browser(QObject):
+class LoadWatcher(QObject): # {{{
+
+    def __init__(self, page, parent=None):
+        QObject.__init__(self, parent)
+        self.is_loading = True
+        self.loaded_ok = None
+        page.loadFinished.connect(self)
+        self.page = page
+
+    def __call__(self, ok):
+        self.loaded_ok = ok
+        self.is_loading = False
+        self.page.loadFinished.disconnect(self)
+        self.page = None
+# }}}
+
+class BrowserView(QDialog): # {{{
+
+    def __init__(self, page, parent=None):
+        QDialog.__init__(self, parent)
+        self.l = l = QVBoxLayout(self)
+        self.setLayout(l)
+        self.webview = QWebView(self)
+        l.addWidget(self.webview)
+        self.resize(QSize(1024, 768))
+        self.webview.setPage(page)
+
+# }}}
+
+class Browser(QObject, FormsMixin):

    '''
    Browser (WebKit with no GUI).
@ -202,16 +244,21 @@ class Browser(QObject):
            # If True a disk cache is used
            use_disk_cache=True,

+            # Enable Inspect element functionality
+            enable_developer_tools=False,
+
            # Verbosity
            verbosity = 0
        ):
        must_use_qt()
        QObject.__init__(self)
+        FormsMixin.__init__(self)

        if log is None:
            log = ThreadSafeLog()
        if verbosity:
            log.filter_level = log.DEBUG
+        self.log = log

        self.jquery_lib = P('content_server/jquery.js', data=True,
                allow_user_override=False).decode('utf-8')
@ -220,7 +267,64 @@ class Browser(QObject):

        self.page = WebPage(log, confirm_callback=confirm_callback,
                prompt_callback=prompt_callback, user_agent=user_agent,
+                enable_developer_tools=enable_developer_tools,
                parent=self)
        self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
        self.page.setNetworkAccessManager(self.nam)

+    def _wait_for_load(self, timeout, url=None):
+        loop = QEventLoop(self)
+        start_time = time.time()
+        end_time = start_time + timeout
+        lw = LoadWatcher(self.page, parent=self)
+        while lw.is_loading and end_time > time.time():
+            if not loop.processEvents():
+                time.sleep(0.01)
+        if lw.is_loading:
+            raise Timeout('Loading of %r took longer than %d seconds'%(
+                url, timeout))
+
+        return lw.loaded_ok
+
+    def visit(self, url, timeout=30.0):
+        '''
+        Open the page specified in URL and wait for it to complete loading.
+        Note that when this method returns, there may still be javascript
+        that needs to execute (this method returns when the loadFinished()
+        signal is emitted by QWebPage). This method will raise a Timeout
+        exception if loading takes more than timeout seconds.
+
+        Returns True if loading was successful, False otherwise.
+        '''
+        self.current_form = None
+        self.page.mainFrame().load(QUrl(url))
+        return self._wait_for_load(timeout, url)
+
+    def click(self, qwe, wait_for_load=True, ajax_replies=0, timeout=30.0):
+        '''
+        Click the QWebElement pointed to by qwe.
+
+        :param wait_for_load: If you know that the click is going to cause a
+                              new page to be loaded, set this to True to have
+                              the method block until the new page is loaded
+        :param ajax_replies: Number of replies to wait for after clicking a link
+                            that triggers some AJAX interaction
+        '''
+        js = '''
+            var e = document.createEvent('MouseEvents');
+            e.initEvent( 'click', true, true );
+            this.dispatchEvent(e);
+        '''
+        qwe.evaluateJavaScript(js)
+        if ajax_replies > 0:
+            raise NotImplementedError('AJAX clicking not implemented')
+        elif wait_for_load and not self._wait_for_load(timeout):
+            raise LoadError('Clicking resulted in a failed load')
+
+    def show_browser(self):
+        '''
+        Show the currently loaded web page in a window. Useful for debugging.
+        '''
+        view = BrowserView(self.page)
+        view.exec_()
+
--- a/src/calibre/web/jsbrowser/forms.py
+++ b/src/calibre/web/jsbrowser/forms.py
@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from future_builtins import map
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre import as_unicode
+
+class Control(object):
+
+    def __init__(self, qwe):
+        self.qwe = qwe
+        self.name = unicode(qwe.attribute('name'))
+        self.type = unicode(qwe.attribute('type'))
+
+    def __repr__(self):
+        return unicode(self.qwe.toOuterXml())
+
+    @dynamic_property
+    def value(self):
+        def fget(self):
+            if self.type in ('checkbox', 'radio'):
+                return unicode(self.qwe.attribute('checked')) == 'checked'
+            if self.type in ('text', 'password'):
+                return unicode(self.qwe.attribute('value'))
+
+        def fset(self, val):
+            if self.type in ('checkbox', 'radio'):
+                if val:
+                    self.qwe.setAttribute('checked', 'checked')
+                else:
+                    self.qwe.removeAttribute('checked')
+            elif self.type in ('text', 'password'):
+                self.qwe.setAttribute('value', as_unicode(val))
+
+        return property(fget=fget, fset=fset)
+
+class RadioControl(object):
+
+    def __init__(self, name, controls):
+        self.name = name
+        self.type = 'radio'
+        self.values = {unicode(c.attribute('value')):c for c in controls}
+
+    def __repr__(self):
+        return 'RadioControl(%s)'%(', '.join(self.values))
+
+    @dynamic_property
+    def value(self):
+        def fget(self):
+            for val, x in self.values.iteritems():
+                if unicode(x.attribute('checked')) == 'checked':
+                    return val
+
+        def fset(self, val):
+            control = None
+            for value, x in self.values.iteritems():
+                if val == value:
+                    control = x
+                    break
+            if control is not None:
+                for x in self.values.itervalues():
+                    x.removeAttribute('checked')
+                control.setAttribute('checked', 'checked')
+
+        return property(fget=fget, fset=fset)
+
+class Form(object):
+
+    def __init__(self, qwe):
+        self.qwe = qwe
+        self.attributes = {unicode(x):unicode(qwe.attribute(x)) for x in
+                qwe.attributeNames()}
+        self.input_controls = list(map(Control, qwe.findAll('input')))
+        rc = [x for x in self.input_controls if x.type == 'radio']
+        self.input_controls = [x for x in self.input_controls if x.type != 'radio']
+        rc_names = {x.name for x in rc}
+        self.radio_controls = {name:RadioControl(name, [x.qwe for x in rc if x.name == name]) for name in rc_names}
+
+    def __getitem__(self, key):
+        for x in self.input_controls:
+            if key == x.name:
+                return x
+        try:
+            return self.radio_controls.get(key)
+        except KeyError:
+            pass
+        raise KeyError('No control with the name %s in this form'%key)
+
+    def __repr__(self):
+        attrs = ['%s=%s'%(k, v) for k, v in self.attributes.iteritems()]
+        return '<form %s>'%(' '.join(attrs))
+
+    def submit_control(self, submit_control_selector=None):
+        if submit_control_selector is not None:
+            sc = self.qwe.findFirst(submit_control_selector)
+            if not sc.isNull():
+                return sc
+        for c in self.input_controls:
+            if c.type == 'submit':
+                return c
+        for c in self.input_controls:
+            if c.type == 'image':
+                return c
+
+
+
+class FormsMixin(object):
+
+    def __init__(self):
+        self.current_form = None
+
+    def find_form(self, css2_selector=None, nr=None):
+        mf = self.page.mainFrame()
+        if css2_selector is not None:
+            candidate = mf.findFirstElement(css2_selector)
+            if not candidate.isNull():
+                return Form(candidate)
+        if nr is not None and int(nr) > -1:
+            nr = int(nr)
+            forms = mf.findAllElements('form')
+            if nr < forms.count():
+                return Form(forms.at(nr))
+
+    def all_forms(self):
+        '''
+        Return all forms present in the current page.
+        '''
+        mf = self.page.mainFrame()
+        return list(map(Form, mf.findAllElements('form').toList()))
+
+    def select_form(self, css2_selector=None, nr=None):
+        '''
+        Select a form for further processing. Specify the form either with
+        css2_selector or nr. Raises ValueError if no matching form is found.
+
+        :param css2_selector: A CSS2 selector, for example:
+                    'form[action="/accounts/login"]' or 'form[id="loginForm"]'
+
+        :param nr: An integer >= 0. Selects the nr'th form in the current page.
+
+        '''
+        self.current_form = self.find_form(css2_selector=css2_selector, nr=nr)
+        if self.current_form is None:
+            raise ValueError('No such form found')
+        return self.current_form
+
+    def submit(self, submit_control_selector=None, ajax_replies=0, timeout=30.0):
+        if self.current_form is None:
+            raise ValueError('No form selected, use select_form() first')
+        sc = self.current_form.submit_control(submit_control_selector)
+        if sc is None:
+            raise ValueError('No submit control found in the current form')
+        self.current_form = None
+        self.click(sc.qwe, ajax_replies=ajax_replies, timeout=timeout)
+
--- a/src/calibre/web/jsbrowser/test.py
+++ b/src/calibre/web/jsbrowser/test.py
@ -0,0 +1,131 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import unittest, pprint, threading
+
+import cherrypy
+
+from calibre.web.jsbrowser.browser import Browser
+
+class Server(object):
+    '''
+    CherryPy application used by the tests: it serves a page with two forms
+    and remembers the fields of the last form posted back to it.
+    '''
+
+    def __init__(self):
+        # Fields received by the most recent request to controls_test()
+        self.form_data = {}
+
+    @cherrypy.expose
+    def index(self):
+        'Serve the HTML page that contains the test forms'
+        return '''
+    <html>
+    <head><title>JS Browser test</title></head>
+    <body>
+    <form id="controls_test" method="post" action="controls_test">
+        <h3>Test controls</h3>
+        <div><label>Simple Text:</label><input type="text" name="text"/></div>
+        <div><label>Password:</label><input type="password" name="password"/></div>
+        <div><label>Checked Checkbox:</label><input type="checkbox" checked="checked" name="checked_checkbox"/></div>
+        <div><label>UnChecked Checkbox:</label><input type="checkbox" name="unchecked_checkbox"/></div>
+        <div><input type="radio" name="sex" value="male" checked="checked" /> Male</div>
+        <div><input type="radio" name="sex" value="female" /> Female</div>
+        <div><input type="submit" value="Submit" /></div>
+    </form>
+    <form id="image_test" method="post" action="controls_test">
+        <h3>Test Image submit</h3>
+        <div><label>Simple Text:</label><input type="text" name="text" value="Image Test" /></div>
+        <input type="image" src="button_image" alt="Submit" />
+    </form>
+    </body>
+    </html>
+    '''
+
+    @cherrypy.expose
+    def controls_test(self, **kwargs):
+        'Store a copy of the submitted fields and echo them back as text'
+        submitted = dict(kwargs)
+        self.form_data = submitted
+        return pprint.pformat(kwargs)
+
+    @cherrypy.expose
+    def button_image(self):
+        'Serve the image used by the image submit control'
+        headers = cherrypy.response.headers
+        headers['Content-Type'] = 'image/png'
+        # NOTE(review): I is not defined or imported in this module,
+        # presumably a calibre builtin that loads an image resource - confirm
+        return I('next.png', data=True)
+
+class Test(unittest.TestCase):
+    '''
+    Drive a Browser against a CherryPy server running on 127.0.0.1 and check
+    the form data that the server receives.
+    '''
+
+    @classmethod
+    def run_server(cls):
+        'Thread target: start the CherryPy engine and block on it'
+        engine = cherrypy.engine
+        engine.start()
+        try:
+            engine.block()
+        except:
+            # Anything raised by block() is deliberately ignored
+            pass
+
+    @classmethod
+    def setUpClass(cls):
+        'Configure and launch the server thread, then create the browser'
+        cls.port = 17983
+        cls.server = Server()
+
+        settings = {
+            'log.screen'             : False,
+            'checker.on'             : False,
+            'engine.autoreload_on'   : False,
+            'request.show_tracebacks': True,
+            'server.socket_host'     : b'127.0.0.1',
+            'server.socket_port'     : cls.port,
+            'server.socket_timeout'  : 10, #seconds
+            'server.thread_pool'     : 1, # number of threads
+            'server.shutdown_timeout': 0.1, # minutes
+        }
+        cherrypy.config.update(settings)
+        cherrypy.tree.mount(cls.server, '/', config={'/':{}})
+
+        # The server runs in a daemon thread for the lifetime of the class
+        worker = threading.Thread(target=cls.run_server)
+        cls.server_thread = worker
+        worker.daemon = True
+        worker.start()
+
+        cls.browser = Browser(verbosity=0)
+
+    @classmethod
+    def tearDownClass(cls):
+        'Ask the CherryPy engine to exit and drop the browser reference'
+        cherrypy.engine.exit()
+        cls.browser = None
+
+    def test_control_types(self):
+        'Test setting data in the various control types'
+        url = 'http://127.0.0.1:%d'%self.port
+        self.assertEqual(self.browser.visit(url), True)
+
+        # control name -> (value assigned in the browser,
+        #                  value expected in the data received by the server)
+        values = {
+                'checked_checkbox'  : (False, None),
+                'unchecked_checkbox': (True, 'on'),
+                'text': ('some text', 'some text'),
+                'password': ('some password', 'some password'),
+                'sex': ('female', 'female'),
+        }
+
+        form = self.browser.select_form('#controls_test')
+        for name, (assigned, _expected) in values.iteritems():
+            form[name].value = assigned
+        self.browser.submit()
+
+        received = self.server.form_data
+        for name, (_assigned, expected) in values.iteritems():
+            actual = received.get(name, None)
+            self.assertEqual(expected, actual,
+                    'Field %s: %r != %r'%(name, expected, actual))
+
+
+    def test_image_submit(self):
+        'Test submitting a form with a image as the submit control'
+        url = 'http://127.0.0.1:%d'%self.port
+        self.assertEqual(self.browser.visit(url), True)
+
+        self.browser.select_form('#image_test')
+        self.browser.submit()
+
+        posted = self.server.form_data
+        self.assertEqual(posted['text'], 'Image Test')
+
+def tests():
+    'Return a test suite containing all the tests defined in Test'
+    loader = unittest.TestLoader()
+    return loader.loadTestsFromTestCase(Test)
+
+def run():
+    'Run the suite returned by tests() with a verbose text runner'
+    runner = unittest.TextTestRunner(verbosity=2)
+    runner.run(tests())
+
+# Run the test suite when this module is executed as a script
+if __name__ == '__main__':
+    run()
+