diff --git a/recipes/cicero.recipe b/recipes/cicero.recipe
index de6676ae44..beb323cf31 100644
--- a/recipes/cicero.recipe
+++ b/recipes/cicero.recipe
@@ -1,38 +1,47 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class BasicUserRecipe1316245412(BasicNewsRecipe):
-
     title = u'Cicero Online'
     description = u'Magazin f\xfcr politische Kultur'
     publisher = 'Ringier Publishing GmbH'
     category = 'news, politics, Germany'
     language = 'de'
     encoding = 'UTF-8'
-    __author__ = 'Armin Geller' # 2011-09-17
+    __author__ = 'Armin Geller' # Upd. 2011-09-19
 
-    oldest_article = 7
+    oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
     auto_cleanup = False
+# remove_javascript = True
 
     remove_tags = [
-        dict(name='div', attrs={'id':["header", "navigation", "skip-link", "header-print", "header-print-url", "meta-toolbar", "footer"]}),
-        dict(name='div', attrs={'class':["region region-sidebar-first column sidebar", "breadcrumb", "breadcrumb-title", "meta", "comment-wrapper",
-                                         "field field-name-field-show-teaser-right field-type-list-boolean field-label-above"]}),
-        dict(name='div', attrs={'title':["Dossier Auswahl"]}),
-        dict(name='h2', attrs={'class':["title comment-form"]}),
-        dict(name='form', attrs={'class':["comment-form user-info-from-cookie"]}),
-    ]
+        dict(name='div', attrs={'id':["header", "navigation", "skip-link", "header-print", "header-print-url", "meta-toolbar", "footer"]}),
+        dict(name='div', attrs={'class':["region region-sidebar-first column sidebar", "breadcrumb", "breadcrumb-title", "meta", "comment-wrapper",
+                                         "field field-name-field-show-teaser-right field-type-list-boolean field-label-above"]}),
+        dict(name='div', attrs={'title':["Dossier Auswahl"]}),
+        dict(name='h2', attrs={'class':["title comment-form"]}),
+        dict(name='form', attrs={'class':["comment-form user-info-from-cookie"]}),
+        # 2011-09-19 clean-up on first feed historical caricature- and video preview pictures and social icons
+        dict(name='table', attrs={'class':["mcx-social-horizontal", "page-header"]}), # 2011-09-19
+        dict(name='div', attrs={'class':["page-header", "view view-alle-karikaturen view-id-alle_karikaturen view-display-id-default view-dom-id-1",
+                                         "pagination",
+                                         "view view-letzte-videos view-id-letzte_videos view-display-id-default view-dom-id-1"]}), # 2011-09-19
+    ]
 
     feeds = [
-        (u'Das gesamte Portfolio', u'http://www.cicero.de/rss.xml'),
-        (u'Berliner Republik', u'http://www.cicero.de/berliner-republik.xml'),
-        (u'Weltb\xfchne', u'http://www.cicero.de/weltbuehne.xml'),
-        (u'Kapital', u'http://www.cicero.de/kapital.xml'),
-        (u'Salon', u'http://www.cicero.de/salon.xml'),
-        (u'Blogs', u'http://www.cicero.de/blogs.xml'), #seems not to be in use at the moment
-    ]
+        (u'Das gesamte Portfolio', u'http://www.cicero.de/rss.xml'),
+        (u'Berliner Republik', u'http://www.cicero.de/berliner-republik.xml'),
+        (u'Weltb\xfchne', u'http://www.cicero.de/weltbuehne.xml'),
+        (u'Kapital', u'http://www.cicero.de/kapital.xml'),
+        (u'Salon', u'http://www.cicero.de/salon.xml'),
+        (u'Blogs', u'http://www.cicero.de/blogs.xml'), #seems not to be in use at the moment
+    ]
 
     def print_version(self, url):
-        return url + '?print'
+        return url + '?print'
+
+# def get_cover_url(self):
+#     return 'http://www.cicero.de/sites/all/themes/cicero/logo.png' # need to find a good logo on their home page!
diff --git a/recipes/idg_se.recipe b/recipes/idg_se.recipe
index b4e86f9643..e5f0203e09 100644
--- a/recipes/idg_se.recipe
+++ b/recipes/idg_se.recipe
@@ -4,19 +4,19 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class IDGse(BasicNewsRecipe):
     title = 'IDG'
-    description = 'IDG.se'
-    language = 'se'
     __author__ = 'zapt0'
+    language = 'sv'
+    description = 'IDG.se'
     oldest_article = 1
-    max_articles_per_feed = 40
+    max_articles_per_feed = 256
     no_stylesheets = True
     encoding = 'ISO-8859-1'
     remove_javascript = True
 
-    feeds = [(u'Senaste nytt',u'http://feeds.idg.se/idg/vzzs')]
+    feeds = [(u'Dagens IDG-nyheter',u'http://feeds.idg.se/idg/ETkj?format=xml')]
 
     def print_version(self,url):
-        return url + '?articleRenderMode=print&m=print'
+        return url + '?articleRenderMode=print&m=print'
 
     def get_cover_url(this):
         return 'http://idgmedia.idg.se/polopoly_fs/2.3275!images/idgmedia_logo_75.jpg'
@@ -30,4 +30,3 @@ class IDGse(BasicNewsRecipe):
     dict(name='div', attrs={'id':['preamble_ad']}),
     dict(name='ul', attrs={'class':['share']})
     ]
-
diff --git a/recipes/macleans.recipe b/recipes/macleans.recipe
index 22f94638d9..0421a9c560 100644
--- a/recipes/macleans.recipe
+++ b/recipes/macleans.recipe
@@ -4,25 +4,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class AdvancedUserRecipe1308306308(BasicNewsRecipe):
     title = u'Macleans Magazine'
     language = 'en_CA'
-    __author__ = 'sexymax15'
-    oldest_article = 30
-    max_articles_per_feed = 12
+    __author__ = 'Medius'
+    oldest_article = 7
+    cover_url = 'http://www.rogersmagazines.com/rms_covers/md/CLE_md.jpg'
     use_embedded_content = False
     remove_empty_feeds = True
     no_stylesheets = True
     remove_javascript = True
-    remove_tags = [dict(name ='img'),dict (id='header'),{'class':'postmetadata'}]
-    remove_tags_after = {'class':'postmetadata'}
+    remove_tags = [dict(id='header'),{'class':'comment'}]
+    remove_tags_after = {'class':'pagination'}
 
-    feeds = [(u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/'),
-            (u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
-(u'World', u'http://www2.macleans.ca/category/world-from-the-magazine/feed/'),
-(u'Business', u'http://www2.macleans.ca/category/business/feed/'),
-(u'Arts & Culture', u'http://www2.macleans.ca/category/arts-culture/feed/'),
-(u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'),
-(u'Health', u'http://www2.macleans.ca/category/health-from-the-magazine/feed/'),
-    (u'Environment', u'http://www2.macleans.ca/category/environment-from-the-magazine/feed/')]
-    def print_version(self, url):
-        return url + 'print/'
+    feeds = [(u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
+(u'World', u'http://www2.macleans.ca/category/news-politics/world/feed/'), (u'Business', u'http://www2.macleans.ca/category/business/feed/'), (u'Arts & Culture', u'http://www2.macleans.ca/category/arts/feed/'), (u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'), (u'Health', u'http://www2.macleans.ca/category/life/health/feed/'), (u'Sports', u'http://www2.macleans.ca/category/life/sports/feed/'), (u'Environment', u'http://www2.macleans.ca/category/life/environment/feed/'), (u'Technology', u'http://www2.macleans.ca/category/life/technology/feed/'), (u'Travel', u'http://www2.macleans.ca/category/life/travel/feed/'), (u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/')]
diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe
index 7060a7cd3e..ef8ad98bb9 100644
--- a/recipes/ming_pao.recipe
+++ b/recipes/ming_pao.recipe
@@ -16,6 +16,7 @@ __UseLife__ = True
 
 '''
 Change Log:
+2011/09/18: parse "column" section stuff from source text files directly.
 2011/09/07: disable "column" section as it is no longer offered free.
 2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
             provide options to remove all images in the file
@@ -52,16 +53,19 @@ class MPRecipe(BasicNewsRecipe):
         title = 'Ming Pao - Hong Kong'
         description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
         category = 'Chinese, News, Hong Kong'
-        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
+        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
         masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
         keep_only_tags = [dict(name='h1'),
                           dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                           dict(name='font', attrs={'color':['AA0000']}), # for column articles title
+                          dict(attrs={'class':['heading']}), # for heading from txt
                           dict(attrs={'id':['newscontent']}), # entertainment and column page content
                           dict(attrs={'id':['newscontent01','newscontent02']}),
+                          dict(attrs={'class':['content']}), # for content from txt
                           dict(attrs={'class':['photo']}),
                           dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
-                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
+                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
+                          dict(attrs={'class':['images']}) # for images from txt
                          ]
         if __KeepImages__:
             remove_tags = [dict(name='style'),
@@ -232,12 +236,18 @@ class MPRecipe(BasicNewsRecipe):
                               (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                               (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
                               (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
-                              #(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
                              ]:
                 articles = self.parse_section2(url, keystr)
                 if articles:
                     feeds.append((title, articles))
 
+            # parse column section articles directly from .txt files
+            for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
+                                      ]:
+                articles = self.parse_section2_col(url, keystr)
+                if articles:
+                    feeds.append((title, articles))
+
             for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                 articles = self.parse_section(url)
@@ -358,6 +368,24 @@ class MPRecipe(BasicNewsRecipe):
         current_articles.reverse()
         return current_articles
 
+    # parse from life.mingpao.com
+    def parse_section2_col(self, url, keystr):
+        self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href=True)
+        a.reverse()
+        current_articles = []
+        included_urls = []
+        for i in a:
+            title = self.tag_to_string(i)
+            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
+            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
+                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
+                current_articles.append({'title': title, 'url': url, 'description': ''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles
+
     # parse from www.mingpaovan.com
     def parse_section3(self, url, baseUrl):
         self.get_fetchdate()
@@ -440,6 +468,39 @@ class MPRecipe(BasicNewsRecipe):
         current_articles.reverse()
         return current_articles
 
+    # preprocess those .txt based files
+    def preprocess_raw_html(self, raw_html, url):
+        if url.rfind('ftp') == -1:
+            return raw_html
+        else:
+            splitter = re.compile(r'\n')  # Match non-digits
+            new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
+            next_is_img_txt = False
+            title_started = False
+            met_article_start_char = False
+            for item in splitter.split(raw_html):
+                if item.startswith(u'\u3010'):
+                    met_article_start_char = True
+                    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
+                else:
+                    if next_is_img_txt == False:
+                        if item.startswith('='):
+                            next_is_img_txt = True
+                            new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
+                        else:
+                            if met_article_start_char == False:
+                                if title_started == False:
+                                    new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
+                                    title_started = True
+                                else:
+                                    new_raw_html = new_raw_html + item + '\n'
+                            else:
+                                new_raw_html = new_raw_html + item + '<p>\n'
+                    else:
+                        next_is_img_txt = False
+                        new_raw_html = new_raw_html + item + '\n'
+            return new_raw_html + '</div></body></html>'
+
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
@@ -593,3 +654,4 @@ class MPRecipe(BasicNewsRecipe):
 
         with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
             opf.render(opf_file, ncx_file)
+
diff --git a/recipes/taipei.recipe b/recipes/taipei.recipe
new file mode 100644
index 0000000000..58579bf65c
--- /dev/null
+++ b/recipes/taipei.recipe
@@ -0,0 +1,30 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TN(BasicNewsRecipe):
+    title = u'Taipei Times'
+    language = 'en_CN'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    use_embedded_content = False
+
+    no_stylesheets = True
+    auto_cleanup = True
+    auto_cleanup_keep = '//*[@class="main_ipic"]'
+
+    feeds = [
+('Editorials',
+ 'http://www.taipeitimes.com/xml/editorials.rss'),
+('Taiwan',
+ 'http://www.taipeitimes.com/xml/taiwan.rss'),
+('Features',
+ 'http://www.taipeitimes.com/xml/feat.rss'),
+('Business',
+ 'http://www.taipeitimes.com/xml/biz.rss'),
+('World',
+ 'http://www.taipeitimes.com/xml/world.rss'),
+('Sports',
+ 'http://www.taipeitimes.com/xml/sport.rss'),
+]
+
+
diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py
index 3f3cc3e142..d82a2268fa 100644
--- a/src/calibre/ebooks/metadata/meta.py
+++ b/src/calibre/ebooks/metadata/meta.py
@@ -31,7 +31,7 @@ def metadata_from_formats(formats, force_read_metadata=False, pattern=None):
     try:
         return _metadata_from_formats(formats, force_read_metadata, pattern)
     except:
-        mi = metadata_from_filename(list(iter(formats), pattern)[0])
+        mi = metadata_from_filename(list(iter(formats))[0], pat=pattern)
         if not mi.authors:
             mi.authors = [_('Unknown')]
         return mi
diff --git a/src/calibre/gui2/catalog/catalog_bibtex.ui b/src/calibre/gui2/catalog/catalog_bibtex.ui
index 5b41e96267..b3d2b56b65 100644
--- a/src/calibre/gui2/catalog/catalog_bibtex.ui
+++ b/src/calibre/gui2/catalog/catalog_bibtex.ui
@@ -110,9 +110,9 @@
 Some explanation about this template:
 -The fields availables are 'author_sort', 'authors', 'id', 'isbn', 'pubdate', 'publisher', 'series_index', 'series',
-     'tags', 'timestamp', 'title', 'uuid'
+     'tags', 'timestamp', 'title', 'uuid', 'title_sort'
 -For list types ie authors and tags, only the first element
-     wil be selected.
+     will be selected.
 -For time field, only the date will be used.
diff --git a/src/calibre/gui2/catalog/catalog_csv_xml.py b/src/calibre/gui2/catalog/catalog_csv_xml.py
index 18f2c210dc..a64816cf98 100644
--- a/src/calibre/gui2/catalog/catalog_csv_xml.py
+++ b/src/calibre/gui2/catalog/catalog_csv_xml.py
@@ -29,7 +29,7 @@ class PluginWidget(QWidget, Ui_Form):
             QListWidgetItem(x, self.db_fields)
 
         db = db_()
-        for x in sorted(db.custom_field_keys()):
+        for x in sorted(db.custom_field_keys()):
             self.all_fields.append(x)
             QListWidgetItem(x, self.db_fields)
 
diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index e929825245..f3ee0e575e 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -87,7 +87,7 @@ class DeviceJob(BaseJob): # {{{
             self.failed = True
             ex = as_unicode(err)
             self._details = ex + '\n\n' + \
-                traceback.format_exc()
+                force_unicode(traceback.format_exc())
             self.exception = err
         finally:
             self.job_done()
diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 0f5a31e1d7..1aa114762f 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -32,7 +32,7 @@ FIELDS = ['all', 'title', 'title_sort', 'author_sort', 'authors', 'comments',
           'rating', 'series_index', 'series', 'size', 'tags', 'timestamp', 'uuid']
 
 #Allowed fields for template
-TEMPLATE_ALLOWED_FIELDS = [ 'author_sort', 'authors', 'id', 'isbn', 'pubdate',
+TEMPLATE_ALLOWED_FIELDS = [ 'author_sort', 'authors', 'id', 'isbn', 'pubdate', 'title_sort',
     'publisher', 'series_index', 'series', 'tags', 'timestamp', 'title', 'uuid' ]
 
 class CSV_XML(CatalogPlugin): # {{{
@@ -324,7 +324,7 @@ class BIBTEX(CatalogPlugin): # {{{
     def run(self, path_to_output, opts, db, notification=DummyReporter()):
 
         def create_bibtex_entry(entry, fields, mode, template_citation,
-                                    bibtexdict, citation_bibtex=True, calibre_files=True):
+                                    bibtexdict, db, citation_bibtex=True, calibre_files=True):
 
             #Bibtex doesn't like UTF-8 but keep unicode until writing
             #Define starting chain or if book valid strict and not book return a Fail string
@@ -345,7 +345,13 @@ class BIBTEX(CatalogPlugin): # {{{
             bibtex_entry = [u' '.join(bibtex_entry)]
 
             for field in fields:
-                item = entry[field]
+                if field.startswith('#'):
+                    item = db.get_field(entry['id'],field,index_is_id=True)
+                elif field == 'title_sort':
+                    item = entry['sort']
+                else:
+                    item = entry[field]
+
                 #check if the field should be included (none or empty)
                 if item is None:
                     continue
@@ -358,10 +364,6 @@ class BIBTEX(CatalogPlugin): # {{{
                 if field == 'authors' :
                     bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item))
 
-                elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
-                               'author_sort', 'series'] :
-                    bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))
-
                 elif field == 'id' :
                     bibtex_entry.append(u'calibreid = "%s"' % int(item))
 
@@ -409,6 +411,14 @@ class BIBTEX(CatalogPlugin): # {{{
                         bibtex_entry.append(u'year = "%s"' % item.year)
                         bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item)))
 
+                elif field.startswith('#') :
+                    bibtex_entry.append(u'%s = "%s"' % (field[1:], bibtexdict.utf8ToBibtex(item)))
+
+                else:
+                    # elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
+                    #     'author_sort', 'series', 'title_sort'] :
+                    bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))
+
             bibtex_entry = u',\n    '.join(bibtex_entry)
             bibtex_entry += u' }\n\n'
 
@@ -588,7 +598,7 @@ class BIBTEX(CatalogPlugin): # {{{
 
             for entry in data:
                 outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation,
-                                  bibtexc, citation_bibtex, addfiles_bibtex))
+                                  bibtexc, db, citation_bibtex, addfiles_bibtex))
 
 # }}}
 
 class EPUB_MOBI(CatalogPlugin):
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index b7efd611e0..da037ca43b 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -147,13 +147,14 @@ class BasicNewsRecipe(Recipe):
     #: Specify elements that the auto cleanup algorithm should never remove
     #: The syntax is a XPath expression. For example::
     #:
-    #:   auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
+    #:     auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
     #:   id="article-image"
-    #:   auto_cleanup_keep = '//*[@class="important"]' will keep all elements
+    #:     auto_cleanup_keep = '//*[@class="important"]' will keep all elements
     #:   with class="important"
-    #:   auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
+    #:     auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
     #:   will keep all divs with id="article-image" and spans
     #:   with class="important"
+    #:
     auto_cleanup_keep = None
 
     #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
diff --git a/src/calibre/web/jsbrowser/browser.py b/src/calibre/web/jsbrowser/browser.py
index 133d720aac..5d569a3fac 100644
--- a/src/calibre/web/jsbrowser/browser.py
+++ b/src/calibre/web/jsbrowser/browser.py
@@ -7,16 +7,22 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, pprint
+import os, pprint, time
 
 from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
-        QNetworkProxy, QNetworkProxyFactory)
-from PyQt4.QtWebKit import QWebPage
+        QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl,
+        QDialog, QVBoxLayout, QSize)
+from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView
 
 from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
 from calibre.constants import ispy3, config_dir
 from calibre.utils.logging import ThreadSafeLog
 from calibre.gui2 import must_use_qt
+from calibre.web.jsbrowser.forms import FormsMixin
+
+class Timeout(Exception): pass
+
+class LoadError(Exception): pass
 
 class WebPage(QWebPage): # {{{
 
@@ -24,6 +30,7 @@ class WebPage(QWebPage): # {{{
             confirm_callback=None,
             prompt_callback=None,
             user_agent=USER_AGENT,
+            enable_developer_tools=False,
             parent=None):
         QWebPage.__init__(self, parent)
 
@@ -33,6 +40,12 @@ class WebPage(QWebPage): # {{{
         self.prompt_callback = prompt_callback
         self.setForwardUnsupportedContent(True)
         self.unsupportedContent.connect(self.on_unsupported_content)
+        settings = self.settings()
+        if enable_developer_tools:
+            settings.setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
+        QWebSettings.enablePersistentStorage(os.path.join(config_dir, 'caches',
+                'webkit-persistence'))
+        QWebSettings.setMaximumPagesInCache(0)
 
     def userAgentForUrl(self, url):
         return self.user_agent
@@ -173,7 +186,36 @@ class NetworkAccessManager(QNetworkAccessManager): # {{{
             self.log.debug('\n'.join(debug))
 # }}}
 
-class Browser(QObject):
+class LoadWatcher(QObject): # {{{
+
+    def __init__(self, page, parent=None):
+        QObject.__init__(self, parent)
+        self.is_loading = True
+        self.loaded_ok = None
+        page.loadFinished.connect(self)
+        self.page = page
+
+    def __call__(self, ok):
+        self.loaded_ok = ok
+        self.is_loading = False
+        self.page.loadFinished.disconnect(self)
+        self.page = None
+# }}}
+
+class BrowserView(QDialog): # {{{
+
+    def __init__(self, page, parent=None):
+        QDialog.__init__(self, parent)
+        self.l = l = QVBoxLayout(self)
+        self.setLayout(l)
+        self.webview = QWebView(self)
+        l.addWidget(self.webview)
+        self.resize(QSize(1024, 768))
+        self.webview.setPage(page)
+
+# }}}
+
+class Browser(QObject, FormsMixin):
 
     '''
     Browser (WebKit with no GUI).
@@ -202,16 +244,21 @@
             # If True a disk cache is used
             use_disk_cache=True,
 
+            # Enable Inspect element functionality
+            enable_developer_tools=False,
+
             # Verbosity
             verbosity = 0
         ):
         must_use_qt()
         QObject.__init__(self)
+        FormsMixin.__init__(self)
 
         if log is None:
             log = ThreadSafeLog()
         if verbosity:
            log.filter_level = log.DEBUG
+        self.log = log
 
         self.jquery_lib = P('content_server/jquery.js', data=True,
                 allow_user_override=False).decode('utf-8')
@@ -220,7 +267,64 @@
         self.page = WebPage(log, confirm_callback=confirm_callback,
                 prompt_callback=prompt_callback, user_agent=user_agent,
+                enable_developer_tools=enable_developer_tools,
                 parent=self)
         self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache,
                 parent=self)
         self.page.setNetworkAccessManager(self.nam)
 
+    def _wait_for_load(self, timeout, url=None):
+        loop = QEventLoop(self)
+        start_time = time.time()
+        end_time = start_time + timeout
+        lw = LoadWatcher(self.page, parent=self)
+        while lw.is_loading and end_time > time.time():
+            if not loop.processEvents():
+                time.sleep(0.01)
+        if lw.is_loading:
+            raise Timeout('Loading of %r took longer than %d seconds'%(
+                url, timeout))
+
+        return lw.loaded_ok
+
+    def visit(self, url, timeout=30.0):
+        '''
+        Open the page specified in URL and wait for it to complete loading.
+        Note that when this method returns, there may still be javascript
+        that needs to execute (this method returns when the loadFinished()
+        signal is called on QWebPage). This method will raise a Timeout
+        exception if loading takes more than timeout seconds.
+
+        Returns True if loading was successful, False otherwise.
+        '''
+        self.current_form = None
+        self.page.mainFrame().load(QUrl(url))
+        return self._wait_for_load(timeout, url)
+
+    def click(self, qwe, wait_for_load=True, ajax_replies=0, timeout=30.0):
+        '''
+        Click the QWebElement pointed to by qwe.
+
+        :param wait_for_load: If you know that the click is going to cause a
+                              new page to be loaded, set this to True to have
+                              the method block until the new page is loaded
+        :para ajax_replies: Number of replies to wait for after clicking a link
+                            that triggers some AJAX interaction
+        '''
+        js = '''
+            var e = document.createEvent('MouseEvents');
+            e.initEvent( 'click', true, true );
+            this.dispatchEvent(e);
+        '''
+        qwe.evaluateJavaScript(js)
+        if ajax_replies > 0:
+            raise NotImplementedError('AJAX clicking not implemented')
+        elif wait_for_load and not self._wait_for_load(timeout):
+            raise LoadError('Clicking resulted in a failed load')
+
+    def show_browser(self):
+        '''
+        Show the currently loaded web page in a window. Useful for debugging.
+        '''
+        view = BrowserView(self.page)
+        view.exec_()
+
diff --git a/src/calibre/web/jsbrowser/forms.py b/src/calibre/web/jsbrowser/forms.py
new file mode 100644
index 0000000000..9f68e1e003
--- /dev/null
+++ b/src/calibre/web/jsbrowser/forms.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from future_builtins import map
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre import as_unicode
+
+class Control(object):
+
+    def __init__(self, qwe):
+        self.qwe = qwe
+        self.name = unicode(qwe.attribute('name'))
+        self.type = unicode(qwe.attribute('type'))
+
+    def __repr__(self):
+        return unicode(self.qwe.toOuterXml())
+
+    @dynamic_property
+    def value(self):
+        def fget(self):
+            if self.type in ('checkbox', 'radio'):
+                return unicode(self.qwe.attribute('checked')) == 'checked'
+            if self.type in ('text', 'password'):
+                return unicode(self.qwe.attribute('value'))
+
+        def fset(self, val):
+            if self.type in ('checkbox', 'radio'):
+                if val:
+                    self.qwe.setAttribute('checked', 'checked')
+                else:
+                    self.qwe.removeAttribute('checked')
+            elif self.type in ('text', 'password'):
+                self.qwe.setAttribute('value', as_unicode(val))
+
+        return property(fget=fget, fset=fset)
+
+class RadioControl(object):
+
+    def __init__(self, name, controls):
+        self.name = name
+        self.type = 'radio'
+        self.values = {unicode(c.attribute('value')):c for c in controls}
+
+    def __repr__(self):
+        return 'RadioControl(%s)'%(', '.join(self.values))
+
+    @dynamic_property
+    def value(self):
+        def fget(self):
+            for val, x in self.values.iteritems():
+                if unicode(x.attribute('checked')) == 'checked':
+                    return val
+
+        def fset(self, val):
+            control = None
+            for value, x in self.values.iteritems():
+                if val == value:
+                    control = x
+                    break
+            if control is not None:
+                for x in self.values.itervalues():
+                    x.removeAttribute('checked')
+                control.setAttribute('checked', 'checked')
+
+        return property(fget=fget, fset=fset)
+
+class Form(object):
+
+    def __init__(self, qwe):
+        self.qwe = qwe
+        self.attributes = {unicode(x):unicode(qwe.attribute(x)) for x in
+                qwe.attributeNames()}
+        self.input_controls = list(map(Control, qwe.findAll('input')))
+        rc = [x for x in self.input_controls if x.type == 'radio']
+        self.input_controls = [x for x in self.input_controls if x.type != 'radio']
+        rc_names = {x.name for x in rc}
+        self.radio_controls = {name:RadioControl(name, [x.qwe for x in rc if x.name == name]) for name in rc_names}
+
+    def __getitem__(self, key):
+        for x in self.input_controls:
+            if key == x.name:
+                return x
+        try:
+            return self.radio_controls.get(key)
+        except KeyError:
+            pass
+        raise KeyError('No control with the name %s in this form'%key)
+
+    def __repr__(self):
+        attrs = ['%s=%s'%(k, v) for k, v in self.attributes.iteritems()]
+        return '<form %s>'%(' '.join(attrs))
+
+    def submit_control(self, submit_control_selector=None):
+        if submit_control_selector is not None:
+            sc = self.qwe.findFirst(submit_control_selector)
+            if not sc.isNull():
+                return sc
+        for c in self.input_controls:
+            if c.type == 'submit':
+                return c
+        for c in self.input_controls:
+            if c.type == 'image':
+                return c
+
+
+class FormsMixin(object):
+
+    def __init__(self):
+        self.current_form = None
+
+    def find_form(self, css2_selector=None, nr=None):
+        mf = self.page.mainFrame()
+        if css2_selector is not None:
+            candidate = mf.findFirstElement(css2_selector)
+            if not candidate.isNull():
+                return Form(candidate)
+        if nr is not None and int(nr) > -1:
+            nr = int(nr)
+            forms = mf.findAllElements('form')
+            if nr < forms.count():
+                return Form(forms.at(nr))
+
+    def all_forms(self):
+        '''
+        Return all forms present in the current page.
+        '''
+        mf = self.page.mainFrame()
+        return list(map(Form, mf.findAllElements('form').toList()))
+
+    def select_form(self, css2_selector=None, nr=None):
+        '''
+        Select a form for further processing. Specify the form either with
+        css2_selector or nr. Raises ValueError if no matching form is found.
+
+        :param css2_selector: A CSS2 selector, for example:
+                              'form[action="/accounts/login"]' or 'form[id="loginForm"]'
+
+        :param nr: An integer >= 0. Selects the nr'th form in the current page.
+
+        '''
+        self.current_form = self.find_form(css2_selector=css2_selector, nr=nr)
+        if self.current_form is None:
+            raise ValueError('No such form found')
+        return self.current_form
+
+    def submit(self, submit_control_selector=None, ajax_replies=0, timeout=30.0):
+        if self.current_form is None:
+            raise ValueError('No form selected, use select_form() first')
+        sc = self.current_form.submit_control(submit_control_selector)
+        if sc is None:
+            raise ValueError('No submit control found in the current form')
+        self.current_form = None
+        self.click(sc.qwe, ajax_replies=ajax_replies, timeout=timeout)
+
diff --git a/src/calibre/web/jsbrowser/test.py b/src/calibre/web/jsbrowser/test.py
new file mode 100644
index 0000000000..b5de5321be
--- /dev/null
+++ b/src/calibre/web/jsbrowser/test.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import unittest, pprint, threading
+
+import cherrypy
+
+from calibre.web.jsbrowser.browser import Browser
+
+class Server(object):
+
+    def __init__(self):
+        self.form_data = {}
+
+    @cherrypy.expose
+    def index(self):
+        return '''
+    <html>
+    <head><title>JS Browser test</title></head>
+    <body>
+        <form id="controls_test" method="post" action="controls_test">
+            <h3>Test controls</h3>
+            <input type="checkbox" name="checked_checkbox" checked="checked" />
+            <input type="checkbox" name="unchecked_checkbox" />
+            <input type="text" name="text" value="initial text" />
+            <input type="password" name="password" value="initial password" />
+            <input type="radio" name="sex" value="male" checked="checked" /> Male
+            <input type="radio" name="sex" value="female" /> Female
+            <input type="submit" value="Submit" />
+        </form>
+        <form id="image_test" method="post" action="controls_test">
+            <h3>Test Image submit</h3>
+            <input type="text" name="text" value="Image Test" />
+            <input type="image" src="button_image" alt="Submit" />
+        </form>
+    </body>
+    </html>
+    '''
+
+    @cherrypy.expose
+    def controls_test(self, **kwargs):
+        self.form_data = kwargs.copy()
+        #pprint.pprint(kwargs)
+        return pprint.pformat(kwargs)
+
+    @cherrypy.expose
+    def button_image(self):
+        cherrypy.response.headers['Content-Type'] = 'image/png'
+        return I('next.png', data=True)
+
+class Test(unittest.TestCase):
+
+    @classmethod
+    def run_server(cls):
+        cherrypy.engine.start()
+        try:
+            cherrypy.engine.block()
+        except:
+            pass
+
+    @classmethod
+    def setUpClass(cls):
+        cls.port = 17983
+        cls.server = Server()
+        cherrypy.config.update({
+            'log.screen'             : False,
+            'checker.on'             : False,
+            'engine.autoreload_on'   : False,
+            'request.show_tracebacks': True,
+            'server.socket_host'     : b'127.0.0.1',
+            'server.socket_port'     : cls.port,
+            'server.socket_timeout'  : 10, #seconds
+            'server.thread_pool'     : 1, # number of threads
+            'server.shutdown_timeout': 0.1, # minutes
+        })
+        cherrypy.tree.mount(cls.server, '/', config={'/':{}})
+
+        cls.server_thread = threading.Thread(target=cls.run_server)
+        cls.server_thread.daemon = True
+        cls.server_thread.start()
+        cls.browser = Browser(verbosity=0)
+
+    @classmethod
+    def tearDownClass(cls):
+        cherrypy.engine.exit()
+        cls.browser = None
+
+    def test_control_types(self):
+        'Test setting data in the various control types'
+        self.assertEqual(self.browser.visit('http://127.0.0.1:%d'%self.port),
+                True)
+        values = {
+                'checked_checkbox'  : (False, None),
+                'unchecked_checkbox': (True, 'on'),
+                'text': ('some text', 'some text'),
+                'password': ('some password', 'some password'),
+                'sex': ('female', 'female'),
+        }
+        f = self.browser.select_form('#controls_test')
+        for k, vals in values.iteritems():
+            f[k].value = vals[0]
+        self.browser.submit()
+        dat = self.server.form_data
+        for k, vals in values.iteritems():
+            self.assertEqual(vals[1], dat.get(k, None),
+                'Field %s: %r != %r'%(k, vals[1], dat.get(k, None)))
+
+
+    def test_image_submit(self):
+        'Test submitting a form with a image as the submit control'
+        self.assertEqual(self.browser.visit('http://127.0.0.1:%d'%self.port),
+                True)
+        self.browser.select_form('#image_test')
+        self.browser.submit()
+        self.assertEqual(self.server.form_data['text'], 'Image Test')
+
+def tests():
+    return unittest.TestLoader().loadTestsFromTestCase(Test)
+
+def run():
+    unittest.TextTestRunner(verbosity=2).run(tests())
+
+if __name__ == '__main__':
+    run()
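Usage note (not part of the patch above): the new jsbrowser form-filling API introduced in browser.py and forms.py is driven much as test.py does against its local cherrypy server — create a Browser, visit() a page, select_form(), assign control values, then submit(). The sketch below illustrates that calling sequence; the URL, the form selector, and the field names are placeholders chosen for illustration, not values taken from calibre.

from calibre.web.jsbrowser.browser import Browser

def demo():
    # Headless WebKit browser (needs a Qt environment; must_use_qt() is called internally)
    br = Browser(verbosity=1)

    # visit() blocks until QWebPage emits loadFinished() or the timeout expires
    if not br.visit('http://example.com/login', timeout=30.0):  # placeholder URL
        raise SystemExit('initial page failed to load')

    # Select a form by CSS2 selector (or by position with nr=0, nr=1, ...)
    form = br.select_form('form[id="loginForm"]')  # placeholder selector

    # Control objects expose a value property; these field names are hypothetical
    form['username'].value = 'someuser'   # text control
    form['password'].value = 'secret'     # password control
    form['remember_me'].value = True      # checkbox control

    # submit() locates the form's submit (or image) control and click()s it
    br.submit(timeout=30.0)

    # Pop up a QDialog showing the current page; handy for debugging
    br.show_browser()

if __name__ == '__main__':
    demo()

The same pattern is exercised by test.py above, which is the closest thing in this patch to a reference for the intended calling sequence.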