mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Sync to trunk.
This commit is contained in:
commit
beea6bcd11
@ -1,38 +1,47 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1316245412(BasicNewsRecipe):
|
||||
|
||||
title = u'Cicero Online'
|
||||
description = u'Magazin f\xfcr politische Kultur'
|
||||
publisher = 'Ringier Publishing GmbH'
|
||||
category = 'news, politics, Germany'
|
||||
language = 'de'
|
||||
encoding = 'UTF-8'
|
||||
__author__ = 'Armin Geller' # 2011-09-17
|
||||
__author__ = 'Armin Geller' # Upd. 2011-09-19
|
||||
|
||||
oldest_article = 7
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
auto_cleanup = False
|
||||
|
||||
# remove_javascript = True
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id':["header", "navigation", "skip-link", "header-print", "header-print-url", "meta-toolbar", "footer"]}),
|
||||
dict(name='div', attrs={'class':["region region-sidebar-first column sidebar", "breadcrumb", "breadcrumb-title", "meta", "comment-wrapper",
|
||||
"field field-name-field-show-teaser-right field-type-list-boolean field-label-above"]}),
|
||||
dict(name='div', attrs={'title':["Dossier Auswahl"]}),
|
||||
dict(name='h2', attrs={'class':["title comment-form"]}),
|
||||
dict(name='form', attrs={'class':["comment-form user-info-from-cookie"]}),
|
||||
]
|
||||
dict(name='div', attrs={'id':["header", "navigation", "skip-link", "header-print", "header-print-url", "meta-toolbar", "footer"]}),
|
||||
dict(name='div', attrs={'class':["region region-sidebar-first column sidebar", "breadcrumb", "breadcrumb-title", "meta", "comment-wrapper",
|
||||
"field field-name-field-show-teaser-right field-type-list-boolean field-label-above"]}),
|
||||
dict(name='div', attrs={'title':["Dossier Auswahl"]}),
|
||||
dict(name='h2', attrs={'class':["title comment-form"]}),
|
||||
dict(name='form', attrs={'class':["comment-form user-info-from-cookie"]}),
|
||||
# 2011-09-19 clean-up on first feed historical caricature- and video preview pictures and social icons
|
||||
dict(name='table', attrs={'class':["mcx-social-horizontal", "page-header"]}), # 2011-09-19
|
||||
dict(name='div', attrs={'class':["page-header", "view view-alle-karikaturen view-id-alle_karikaturen view-display-id-default view-dom-id-1",
|
||||
"pagination",
|
||||
"view view-letzte-videos view-id-letzte_videos view-display-id-default view-dom-id-1"]}), # 2011-09-19
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Das gesamte Portfolio', u'http://www.cicero.de/rss.xml'),
|
||||
(u'Berliner Republik', u'http://www.cicero.de/berliner-republik.xml'),
|
||||
(u'Weltb\xfchne', u'http://www.cicero.de/weltbuehne.xml'),
|
||||
(u'Kapital', u'http://www.cicero.de/kapital.xml'),
|
||||
(u'Salon', u'http://www.cicero.de/salon.xml'),
|
||||
(u'Blogs', u'http://www.cicero.de/blogs.xml'), #seems not to be in use at the moment
|
||||
]
|
||||
(u'Das gesamte Portfolio', u'http://www.cicero.de/rss.xml'),
|
||||
(u'Berliner Republik', u'http://www.cicero.de/berliner-republik.xml'),
|
||||
(u'Weltb\xfchne', u'http://www.cicero.de/weltbuehne.xml'),
|
||||
(u'Kapital', u'http://www.cicero.de/kapital.xml'),
|
||||
(u'Salon', u'http://www.cicero.de/salon.xml'),
|
||||
(u'Blogs', u'http://www.cicero.de/blogs.xml'), #seems not to be in use at the moment
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?print'
|
||||
return url + '?print'
|
||||
|
||||
# def get_cover_url(self):
|
||||
# return 'http://www.cicero.de/sites/all/themes/cicero/logo.png' # need to find a good logo on their home page!
|
||||
|
||||
|
@ -4,19 +4,19 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class IDGse(BasicNewsRecipe):
|
||||
title = 'IDG'
|
||||
description = 'IDG.se'
|
||||
language = 'se'
|
||||
__author__ = 'zapt0'
|
||||
language = 'sv'
|
||||
description = 'IDG.se'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 40
|
||||
max_articles_per_feed = 256
|
||||
no_stylesheets = True
|
||||
encoding = 'ISO-8859-1'
|
||||
remove_javascript = True
|
||||
|
||||
feeds = [(u'Senaste nytt',u'http://feeds.idg.se/idg/vzzs')]
|
||||
feeds = [(u'Dagens IDG-nyheter',u'http://feeds.idg.se/idg/ETkj?format=xml')]
|
||||
|
||||
def print_version(self,url):
|
||||
return url + '?articleRenderMode=print&m=print'
|
||||
return url + '?articleRenderMode=print&m=print'
|
||||
|
||||
def get_cover_url(this):
|
||||
return 'http://idgmedia.idg.se/polopoly_fs/2.3275!images/idgmedia_logo_75.jpg'
|
||||
@ -30,4 +30,3 @@ class IDGse(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'id':['preamble_ad']}),
|
||||
dict(name='ul', attrs={'class':['share']})
|
||||
]
|
||||
|
||||
|
@ -4,25 +4,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1308306308(BasicNewsRecipe):
|
||||
title = u'Macleans Magazine'
|
||||
language = 'en_CA'
|
||||
__author__ = 'sexymax15'
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 12
|
||||
__author__ = 'Medius'
|
||||
oldest_article = 7
|
||||
cover_url = 'http://www.rogersmagazines.com/rms_covers/md/CLE_md.jpg'
|
||||
|
||||
use_embedded_content = False
|
||||
|
||||
remove_empty_feeds = True
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
remove_tags = [dict(name ='img'),dict (id='header'),{'class':'postmetadata'}]
|
||||
remove_tags_after = {'class':'postmetadata'}
|
||||
remove_tags = [dict(id='header'),{'class':'comment'}]
|
||||
remove_tags_after = {'class':'pagination'}
|
||||
|
||||
feeds = [(u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/'),
|
||||
(u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
|
||||
(u'World', u'http://www2.macleans.ca/category/world-from-the-magazine/feed/'),
|
||||
(u'Business', u'http://www2.macleans.ca/category/business/feed/'),
|
||||
(u'Arts & Culture', u'http://www2.macleans.ca/category/arts-culture/feed/'),
|
||||
(u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'),
|
||||
(u'Health', u'http://www2.macleans.ca/category/health-from-the-magazine/feed/'),
|
||||
(u'Environment', u'http://www2.macleans.ca/category/environment-from-the-magazine/feed/')]
|
||||
def print_version(self, url):
|
||||
return url + 'print/'
|
||||
feeds = [(u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
|
||||
(u'World', u'http://www2.macleans.ca/category/news-politics/world/feed/'), (u'Business', u'http://www2.macleans.ca/category/business/feed/'), (u'Arts & Culture', u'http://www2.macleans.ca/category/arts/feed/'), (u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'), (u'Health', u'http://www2.macleans.ca/category/life/health/feed/'), (u'Sports', u'http://www2.macleans.ca/category/life/sports/feed/'), (u'Environment', u'http://www2.macleans.ca/category/life/environment/feed/'), (u'Technology', u'http://www2.macleans.ca/category/life/technology/feed/'), (u'Travel', u'http://www2.macleans.ca/category/life/travel/feed/'), (u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/')]
|
||||
|
@ -16,6 +16,7 @@ __UseLife__ = True
|
||||
|
||||
'''
|
||||
Change Log:
|
||||
2011/09/18: parse "column" section stuff from source text files directly.
|
||||
2011/09/07: disable "column" section as it is no longer offered free.
|
||||
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
|
||||
provide options to remove all images in the file
|
||||
@ -52,16 +53,19 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
|
||||
category = 'Chinese, News, Hong Kong'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
|
||||
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
|
||||
dict(attrs={'class':['heading']}), # for heading from txt
|
||||
dict(attrs={'id':['newscontent']}), # entertainment and column page content
|
||||
dict(attrs={'id':['newscontent01','newscontent02']}),
|
||||
dict(attrs={'class':['content']}), # for content from txt
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
|
||||
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
|
||||
dict(attrs={'class':['images']}) # for images from txt
|
||||
]
|
||||
if __KeepImages__:
|
||||
remove_tags = [dict(name='style'),
|
||||
@ -232,12 +236,18 @@ class MPRecipe(BasicNewsRecipe):
|
||||
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
|
||||
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
|
||||
#(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
]:
|
||||
articles = self.parse_section2(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
# parse column section articles directly from .txt files
|
||||
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
|
||||
]:
|
||||
articles = self.parse_section2_col(url, keystr)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
@ -358,6 +368,24 @@ class MPRecipe(BasicNewsRecipe):
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2_col(self, url, keystr):
    '''
    Build the article list for the column section of life.mingpao.com.

    Every link on the index page at `url` whose address contains both
    '.txt' and `keystr` is kept, and its address is rewritten to point at
    the plain-text source under ftp/Life3/ rather than the dailynews3.cfm
    viewer page.

    Returns a list of dicts with the keys 'title', 'url' and 'description',
    in the same relative order as the links appear on the page. When a link
    occurs more than once, only its last occurrence on the page is kept.
    '''
    self.get_fetchdate()
    soup = self.index_to_soup(url)
    links = soup.findAll('a', href=True)
    # Walk the page bottom-up; the result is flipped back before returning.
    links.reverse()
    current_articles = []
    # Duplicates are tracked on the address as found on the page. The
    # rewritten address can never equal a freshly built page address, so
    # tracking the rewritten form would not filter anything.
    seen_links = set()
    for link in links:
        title = self.tag_to_string(link)
        # findAll(href=True) guarantees the attribute is present
        page_url = 'http://life.mingpao.com/cfm/' + link['href']
        if page_url in seen_links:
            continue
        if '.txt' not in page_url or keystr not in page_url:
            continue
        seen_links.add(page_url)
        # use printed version of the article
        txt_url = page_url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/')
        current_articles.append({'title': title, 'url': txt_url, 'description': ''})
    current_articles.reverse()
    return current_articles
|
||||
|
||||
# parse from www.mingpaovan.com
|
||||
def parse_section3(self, url, baseUrl):
|
||||
self.get_fetchdate()
|
||||
@ -440,6 +468,39 @@ class MPRecipe(BasicNewsRecipe):
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
# preprocess those .txt based files
|
||||
def preprocess_raw_html(self, raw_html, url):
    '''
    Wrap the plain-text (.txt) articles in a minimal HTML document.

    Anything whose URL does not contain 'ftp' is returned untouched.
    Otherwise the text is processed line by line:

      * a line starting with '=' becomes an <img> tag (the rest of the
        line plus '.jpg' is the image source) and the line that follows
        it is emitted as plain text
      * the lines before the first line starting with U+3010 go into a
        <div class="heading">
      * the first line starting with U+3010 opens <div class="content">;
        it and every later ordinary line are emitted followed by <p>
    '''
    if url.rfind('ftp') == -1:
        return raw_html

    pieces = ['<html><head><title>Untitled</title></head><body><div class="images">']
    caption_pending = False   # previous line was an '=' image marker
    heading_open = False      # the heading <div> has been started
    in_body = False           # a line starting with U+3010 has been seen
    # one entry per line of the text file
    for line in raw_html.split('\n'):
        if line.startswith(u'\u3010'):
            in_body = True
            pieces.append('</div><div class="content"><p>' + line + '<p>\n')
        elif caption_pending:
            caption_pending = False
            pieces.append(line + '\n')
        elif line.startswith('='):
            caption_pending = True
            pieces.append('<img src="' + str(line)[1:].strip() + '.jpg" /><p>\n')
        elif in_body:
            pieces.append(line + '<p>\n')
        elif heading_open:
            pieces.append(line + '\n')
        else:
            heading_open = True
            pieces.append('</div><div class="heading">' + line + '\n')
    pieces.append('</div></body></html>')
    return ''.join(pieces)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
@ -593,3 +654,4 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
||||
|
30
recipes/taipei.recipe
Normal file
30
recipes/taipei.recipe
Normal file
@ -0,0 +1,30 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TN(BasicNewsRecipe):
    # Recipe that downloads the Taipei Times from its per-section RSS feeds.
    title = u'Taipei Times'
    # NOTE(review): 'en_CN' is the code for English (China); the feeds are
    # served from taipeitimes.com -- confirm whether 'en_TW' was intended.
    language = 'en_CN'
    __author__ = 'Krittika Goyal'
    oldest_article = 1 #days
    max_articles_per_feed = 25
    use_embedded_content = False

    no_stylesheets = True
    auto_cleanup = True
    # XPath of elements the automatic cleanup must not remove
    auto_cleanup_keep = '//*[@class="main_ipic"]'

    # (section title, feed URL) pairs
    feeds = [
        ('Editorials', 'http://www.taipeitimes.com/xml/editorials.rss'),
        ('Taiwan', 'http://www.taipeitimes.com/xml/taiwan.rss'),
        ('Features', 'http://www.taipeitimes.com/xml/feat.rss'),
        ('Business', 'http://www.taipeitimes.com/xml/biz.rss'),
        ('World', 'http://www.taipeitimes.com/xml/world.rss'),
        ('Sports', 'http://www.taipeitimes.com/xml/sport.rss'),
    ]
|
||||
|
||||
|
@ -31,7 +31,7 @@ def metadata_from_formats(formats, force_read_metadata=False, pattern=None):
|
||||
try:
|
||||
return _metadata_from_formats(formats, force_read_metadata, pattern)
|
||||
except:
|
||||
mi = metadata_from_filename(list(iter(formats), pattern)[0])
|
||||
mi = metadata_from_filename(list(iter(formats))[0], pat=pattern)
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
return mi
|
||||
|
@ -110,9 +110,9 @@
|
||||
<string>Some explanation about this template:
|
||||
-The fields availables are 'author_sort', 'authors', 'id',
|
||||
'isbn', 'pubdate', 'publisher', 'series_index', 'series',
|
||||
'tags', 'timestamp', 'title', 'uuid'
|
||||
'tags', 'timestamp', 'title', 'uuid', 'title_sort'
|
||||
-For list types ie authors and tags, only the first element
|
||||
wil be selected.
|
||||
will be selected.
|
||||
-For time field, only the date will be used. </string>
|
||||
</property>
|
||||
<property name="scaledContents">
|
||||
|
@ -29,7 +29,7 @@ class PluginWidget(QWidget, Ui_Form):
|
||||
QListWidgetItem(x, self.db_fields)
|
||||
|
||||
db = db_()
|
||||
for x in sorted(db.custom_field_keys()):
|
||||
for x in sorted(db.custom_field_keys()):
|
||||
self.all_fields.append(x)
|
||||
QListWidgetItem(x, self.db_fields)
|
||||
|
||||
|
@ -87,7 +87,7 @@ class DeviceJob(BaseJob): # {{{
|
||||
self.failed = True
|
||||
ex = as_unicode(err)
|
||||
self._details = ex + '\n\n' + \
|
||||
traceback.format_exc()
|
||||
force_unicode(traceback.format_exc())
|
||||
self.exception = err
|
||||
finally:
|
||||
self.job_done()
|
||||
|
@ -32,7 +32,7 @@ FIELDS = ['all', 'title', 'title_sort', 'author_sort', 'authors', 'comments',
|
||||
'rating', 'series_index', 'series', 'size', 'tags', 'timestamp', 'uuid']
|
||||
|
||||
#Allowed fields for template
|
||||
TEMPLATE_ALLOWED_FIELDS = [ 'author_sort', 'authors', 'id', 'isbn', 'pubdate',
|
||||
TEMPLATE_ALLOWED_FIELDS = [ 'author_sort', 'authors', 'id', 'isbn', 'pubdate', 'title_sort',
|
||||
'publisher', 'series_index', 'series', 'tags', 'timestamp', 'title', 'uuid' ]
|
||||
|
||||
class CSV_XML(CatalogPlugin): # {{{
|
||||
@ -324,7 +324,7 @@ class BIBTEX(CatalogPlugin): # {{{
|
||||
def run(self, path_to_output, opts, db, notification=DummyReporter()):
|
||||
|
||||
def create_bibtex_entry(entry, fields, mode, template_citation,
|
||||
bibtexdict, citation_bibtex=True, calibre_files=True):
|
||||
bibtexdict, db, citation_bibtex=True, calibre_files=True):
|
||||
|
||||
#Bibtex doesn't like UTF-8 but keep unicode until writing
|
||||
#Define starting chain or if book valid strict and not book return a Fail string
|
||||
@ -345,7 +345,13 @@ class BIBTEX(CatalogPlugin): # {{{
|
||||
bibtex_entry = [u' '.join(bibtex_entry)]
|
||||
|
||||
for field in fields:
|
||||
item = entry[field]
|
||||
if field.startswith('#'):
|
||||
item = db.get_field(entry['id'],field,index_is_id=True)
|
||||
elif field == 'title_sort':
|
||||
item = entry['sort']
|
||||
else:
|
||||
item = entry[field]
|
||||
|
||||
#check if the field should be included (none or empty)
|
||||
if item is None:
|
||||
continue
|
||||
@ -358,10 +364,6 @@ class BIBTEX(CatalogPlugin): # {{{
|
||||
if field == 'authors' :
|
||||
bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item))
|
||||
|
||||
elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
|
||||
'author_sort', 'series'] :
|
||||
bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))
|
||||
|
||||
elif field == 'id' :
|
||||
bibtex_entry.append(u'calibreid = "%s"' % int(item))
|
||||
|
||||
@ -409,6 +411,14 @@ class BIBTEX(CatalogPlugin): # {{{
|
||||
bibtex_entry.append(u'year = "%s"' % item.year)
|
||||
bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item)))
|
||||
|
||||
elif field.startswith('#') :
|
||||
bibtex_entry.append(u'%s = "%s"' % (field[1:], bibtexdict.utf8ToBibtex(item)))
|
||||
|
||||
else:
|
||||
# elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
|
||||
# 'author_sort', 'series', 'title_sort'] :
|
||||
bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))
|
||||
|
||||
bibtex_entry = u',\n '.join(bibtex_entry)
|
||||
bibtex_entry += u' }\n\n'
|
||||
|
||||
@ -588,7 +598,7 @@ class BIBTEX(CatalogPlugin): # {{{
|
||||
|
||||
for entry in data:
|
||||
outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation,
|
||||
bibtexc, citation_bibtex, addfiles_bibtex))
|
||||
bibtexc, db, citation_bibtex, addfiles_bibtex))
|
||||
# }}}
|
||||
|
||||
class EPUB_MOBI(CatalogPlugin):
|
||||
|
@ -147,13 +147,14 @@ class BasicNewsRecipe(Recipe):
|
||||
#: Specify elements that the auto cleanup algorithm should never remove
|
||||
#: The syntax is an XPath expression. For example::
|
||||
#:
|
||||
#: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
|
||||
#: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
|
||||
#: id="article-image"
|
||||
#: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
|
||||
#: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
|
||||
#: with class="important"
|
||||
#: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
|
||||
#: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
|
||||
#: will keep all divs with id="article-image" and spans
|
||||
#: with class="important"
|
||||
#:
|
||||
auto_cleanup_keep = None
|
||||
|
||||
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
|
||||
|
@ -7,16 +7,22 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, pprint
|
||||
import os, pprint, time
|
||||
|
||||
from PyQt4.Qt import (QObject, QNetworkAccessManager, QNetworkDiskCache,
|
||||
QNetworkProxy, QNetworkProxyFactory)
|
||||
from PyQt4.QtWebKit import QWebPage
|
||||
QNetworkProxy, QNetworkProxyFactory, QEventLoop, QUrl,
|
||||
QDialog, QVBoxLayout, QSize)
|
||||
from PyQt4.QtWebKit import QWebPage, QWebSettings, QWebView
|
||||
|
||||
from calibre import USER_AGENT, prints, get_proxies, get_proxy_info
|
||||
from calibre.constants import ispy3, config_dir
|
||||
from calibre.utils.logging import ThreadSafeLog
|
||||
from calibre.gui2 import must_use_qt
|
||||
from calibre.web.jsbrowser.forms import FormsMixin
|
||||
|
||||
class Timeout(Exception):
    ''' Raised when a page load does not finish within the allowed time. '''
|
||||
|
||||
class LoadError(Exception):
    ''' Raised when a page load that was waited for finishes unsuccessfully. '''
|
||||
|
||||
class WebPage(QWebPage): # {{{
|
||||
|
||||
@ -24,6 +30,7 @@ class WebPage(QWebPage): # {{{
|
||||
confirm_callback=None,
|
||||
prompt_callback=None,
|
||||
user_agent=USER_AGENT,
|
||||
enable_developer_tools=False,
|
||||
parent=None):
|
||||
QWebPage.__init__(self, parent)
|
||||
|
||||
@ -33,6 +40,12 @@ class WebPage(QWebPage): # {{{
|
||||
self.prompt_callback = prompt_callback
|
||||
self.setForwardUnsupportedContent(True)
|
||||
self.unsupportedContent.connect(self.on_unsupported_content)
|
||||
settings = self.settings()
|
||||
if enable_developer_tools:
|
||||
settings.setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
|
||||
QWebSettings.enablePersistentStorage(os.path.join(config_dir, 'caches',
|
||||
'webkit-persistence'))
|
||||
QWebSettings.setMaximumPagesInCache(0)
|
||||
|
||||
def userAgentForUrl(self, url):
|
||||
return self.user_agent
|
||||
@ -173,7 +186,36 @@ class NetworkAccessManager(QNetworkAccessManager): # {{{
|
||||
self.log.debug('\n'.join(debug))
|
||||
# }}}
|
||||
|
||||
class Browser(QObject):
|
||||
class LoadWatcher(QObject): # {{{

    '''
    One-shot listener for the loadFinished signal of a page.

    After construction `is_loading` is True and `loaded_ok` is None. When
    the signal fires, `loaded_ok` receives the signal's argument,
    `is_loading` becomes False and the watcher disconnects itself.
    '''

    def __init__(self, page, parent=None):
        QObject.__init__(self, parent)
        self.loaded_ok = None
        self.is_loading = True
        # The instance itself is the slot, see __call__
        page.loadFinished.connect(self)
        self.page = page

    def __call__(self, ok):
        # Record the outcome first, then detach from the page
        self.loaded_ok = ok
        self.is_loading = False
        self.page.loadFinished.disconnect(self)
        self.page = None
# }}}
|
||||
|
||||
class BrowserView(QDialog): # {{{

    ''' Dialog holding a single QWebView that displays the given page. '''

    def __init__(self, page, parent=None):
        QDialog.__init__(self, parent)
        layout = QVBoxLayout(self)
        self.l = layout
        self.setLayout(layout)
        view = QWebView(self)
        self.webview = view
        layout.addWidget(view)
        self.resize(QSize(1024, 768))
        view.setPage(page)

# }}}
|
||||
|
||||
class Browser(QObject, FormsMixin):
|
||||
|
||||
'''
|
||||
Browser (WebKit with no GUI).
|
||||
@ -202,16 +244,21 @@ class Browser(QObject):
|
||||
# If True a disk cache is used
|
||||
use_disk_cache=True,
|
||||
|
||||
# Enable Inspect element functionality
|
||||
enable_developer_tools=False,
|
||||
|
||||
# Verbosity
|
||||
verbosity = 0
|
||||
):
|
||||
must_use_qt()
|
||||
QObject.__init__(self)
|
||||
FormsMixin.__init__(self)
|
||||
|
||||
if log is None:
|
||||
log = ThreadSafeLog()
|
||||
if verbosity:
|
||||
log.filter_level = log.DEBUG
|
||||
self.log = log
|
||||
|
||||
self.jquery_lib = P('content_server/jquery.js', data=True,
|
||||
allow_user_override=False).decode('utf-8')
|
||||
@ -220,7 +267,64 @@ class Browser(QObject):
|
||||
|
||||
self.page = WebPage(log, confirm_callback=confirm_callback,
|
||||
prompt_callback=prompt_callback, user_agent=user_agent,
|
||||
enable_developer_tools=enable_developer_tools,
|
||||
parent=self)
|
||||
self.nam = NetworkAccessManager(log, use_disk_cache=use_disk_cache, parent=self)
|
||||
self.page.setNetworkAccessManager(self.nam)
|
||||
|
||||
def _wait_for_load(self, timeout, url=None):
    '''
    Process events until the current page load finishes.

    :param timeout: Maximum number of seconds to wait
    :param url: Only used in the message of the Timeout exception

    Returns the value delivered by the loadFinished signal. Raises Timeout
    if the load has not finished when the time is up.
    '''
    loop = QEventLoop(self)
    deadline = time.time() + timeout
    watcher = LoadWatcher(self.page, parent=self)
    while watcher.is_loading:
        if deadline <= time.time():
            break
        if not loop.processEvents():
            # Nothing was pending, avoid spinning
            time.sleep(0.01)
    if watcher.is_loading:
        raise Timeout('Loading of %r took longer than %d seconds'%(
            url, timeout))

    return watcher.loaded_ok
|
||||
|
||||
def visit(self, url, timeout=30.0):
    '''
    Load the page at url and block until loading has completed.

    Completion means that the loadFinished() signal of the QWebPage has
    been emitted; javascript in the page may still have work left to do
    at that point. A Timeout exception is raised if loading takes more
    than timeout seconds. Any previously selected form is forgotten.

    Returns True if loading was successful, False otherwise.
    '''
    self.current_form = None
    frame = self.page.mainFrame()
    frame.load(QUrl(url))
    return self._wait_for_load(timeout, url)
|
||||
|
||||
def click(self, qwe, wait_for_load=True, ajax_replies=0, timeout=30.0):
    '''
    Dispatch a click event on the QWebElement qwe.

    :param wait_for_load: If the click is going to cause a new page to be
                          loaded, leave this as True to block until the
                          new page has loaded
    :param ajax_replies: Number of replies to wait for after clicking a
                         link that triggers some AJAX interaction. Not
                         implemented; any value > 0 raises
                         NotImplementedError
    :param timeout: Seconds to wait for the new page when wait_for_load
                    is True

    Raises LoadError if the load that was waited for did not succeed.
    '''
    js = '''
        var e = document.createEvent('MouseEvents');
        e.initEvent( 'click', true, true );
        this.dispatchEvent(e);
    '''
    qwe.evaluateJavaScript(js)
    if ajax_replies > 0:
        raise NotImplementedError('AJAX clicking not implemented')
    if not wait_for_load:
        return
    if not self._wait_for_load(timeout):
        raise LoadError('Clicking resulted in a failed load')
|
||||
|
||||
def show_browser(self):
    '''
    Open a dialog displaying the currently loaded web page and run it
    until it is closed. Useful for debugging.
    '''
    BrowserView(self.page).exec_()
|
||||
|
||||
|
160
src/calibre/web/jsbrowser/forms.py
Normal file
160
src/calibre/web/jsbrowser/forms.py
Normal file
@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
from future_builtins import map
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre import as_unicode
|
||||
|
||||
class Control(object):

    '''
    Wrapper around a single <input> element (a QWebElement).

    `name` and `type` are the element's attributes of the same name.
    `value` is a bool for checkboxes and radio buttons (whether the
    checked attribute is set), the value attribute for text and password
    inputs, and None for every other type. Assigning to `value` on those
    other types does nothing.
    '''

    def __init__(self, qwe):
        self.qwe = qwe
        attr = qwe.attribute
        self.name = unicode(attr('name'))
        self.type = unicode(attr('type'))

    def __repr__(self):
        return unicode(self.qwe.toOuterXml())

    @dynamic_property
    def value(self):
        toggles = ('checkbox', 'radio')
        textual = ('text', 'password')

        def fget(self):
            if self.type in toggles:
                return unicode(self.qwe.attribute('checked')) == 'checked'
            if self.type in textual:
                return unicode(self.qwe.attribute('value'))

        def fset(self, val):
            if self.type in toggles:
                if not val:
                    self.qwe.removeAttribute('checked')
                else:
                    self.qwe.setAttribute('checked', 'checked')
                return
            if self.type in textual:
                self.qwe.setAttribute('value', as_unicode(val))

        return property(fget=fget, fset=fset)
|
||||
|
||||
class RadioControl(object):

    '''
    A group of radio buttons sharing one name.

    `values` maps the value attribute of every button to its element.
    `value` is the value of the checked button, or None if no button is
    checked. Assigning one of the known values checks that button and
    unchecks the others; assigning an unknown value changes nothing.
    '''

    def __init__(self, name, controls):
        self.name = name
        self.type = 'radio'
        self.values = dict((unicode(c.attribute('value')), c) for c in controls)

    def __repr__(self):
        return 'RadioControl(%s)'%(', '.join(self.values))

    @dynamic_property
    def value(self):

        def fget(self):
            checked = (val for val, x in self.values.iteritems()
                    if unicode(x.attribute('checked')) == 'checked')
            return next(checked, None)

        def fset(self, val):
            matches = (x for value, x in self.values.iteritems() if val == value)
            control = next(matches, None)
            if control is None:
                return
            for x in self.values.itervalues():
                x.removeAttribute('checked')
            control.setAttribute('checked', 'checked')

        return property(fget=fget, fset=fset)
|
||||
|
||||
class Form(object):

    '''
    Wrapper around a <form> element (a QWebElement).

    `attributes` maps the names of the form's attributes to their values.
    `input_controls` is the list of Control objects for all <input>
    elements that are not radio buttons. `radio_controls` maps the name
    of each radio button group to a RadioControl.

    Controls are looked up by name with form['name'].
    '''

    def __init__(self, qwe):
        self.qwe = qwe
        self.attributes = {unicode(x):unicode(qwe.attribute(x)) for x in
                qwe.attributeNames()}
        self.input_controls = list(map(Control, qwe.findAll('input')))
        # Radio buttons are handled per group rather than per element
        rc = [x for x in self.input_controls if x.type == 'radio']
        self.input_controls = [x for x in self.input_controls if x.type != 'radio']
        rc_names = {x.name for x in rc}
        self.radio_controls = {name:RadioControl(name, [x.qwe for x in rc if x.name == name]) for name in rc_names}

    def __getitem__(self, key):
        '''
        Return the control named key. Plain input controls are searched
        first, then radio button groups. Raises KeyError if there is no
        control with that name.
        '''
        for x in self.input_controls:
            if key == x.name:
                return x
        # Index directly: dict.get() never raises, so using it here would
        # hand back None for an unknown name instead of reaching the
        # KeyError below.
        try:
            return self.radio_controls[key]
        except KeyError:
            pass
        raise KeyError('No control with the name %s in this form'%key)

    def __repr__(self):
        attrs = ['%s=%s'%(k, v) for k, v in self.attributes.iteritems()]
        return '<form %s>'%(' '.join(attrs))

    def submit_control(self, submit_control_selector=None):
        '''
        Find the control that submits this form.

        If submit_control_selector is given and matches an element inside
        the form, that element is returned as a bare QWebElement (not
        wrapped in a Control). Otherwise the first input of type submit,
        or failing that the first input of type image, is returned as a
        Control. Returns None if nothing suitable is found.
        '''
        if submit_control_selector is not None:
            sc = self.qwe.findFirst(submit_control_selector)
            if not sc.isNull():
                return sc
        for c in self.input_controls:
            if c.type == 'submit':
                return c
        for c in self.input_controls:
            if c.type == 'image':
                return c
|
||||
|
||||
|
||||
|
||||
class FormsMixin(object):

    '''
    Form handling for a browser object. The class this is mixed into is
    expected to provide self.page (the web page whose main frame is
    searched for forms) and a click() method accepting the ajax_replies
    and timeout keyword arguments.
    '''

    def __init__(self):
        self.current_form = None

    def find_form(self, css2_selector=None, nr=None):
        '''
        Return a Form for the first element matching css2_selector or, if
        that finds nothing, for the nr'th form in the page. Returns None
        if no form is found.
        '''
        mf = self.page.mainFrame()
        if css2_selector is not None:
            candidate = mf.findFirstElement(css2_selector)
            if not candidate.isNull():
                return Form(candidate)
        if nr is not None and int(nr) > -1:
            nr = int(nr)
            forms = mf.findAllElements('form')
            if nr < forms.count():
                return Form(forms.at(nr))

    def all_forms(self):
        '''
        Return all forms present in the current page.
        '''
        mf = self.page.mainFrame()
        return list(map(Form, mf.findAllElements('form').toList()))

    def select_form(self, css2_selector=None, nr=None):
        '''
        Select a form for further processing. Specify the form either with
        css2_selector or nr. Raises ValueError if no matching form is found.

        :param css2_selector: A CSS2 selector, for example:
            'form[action="/accounts/login"]' or 'form[id="loginForm"]'

        :param nr: An integer >= 0. Selects the nr'th form in the current page.

        '''
        self.current_form = self.find_form(css2_selector=css2_selector, nr=nr)
        if self.current_form is None:
            raise ValueError('No such form found')
        return self.current_form

    def submit(self, submit_control_selector=None, ajax_replies=0, timeout=30.0):
        '''
        Submit the currently selected form by clicking its submit control.

        :param submit_control_selector: Optional selector for the element
            to click. Without it (or if it matches nothing) the form's own
            submit control is used.
        :param ajax_replies: Passed on to click()
        :param timeout: Passed on to click()

        Raises ValueError if no form is selected or if the form has no
        submit control. The form is deselected before clicking.
        '''
        if self.current_form is None:
            raise ValueError('No form selected, use select_form() first')
        sc = self.current_form.submit_control(submit_control_selector)
        if sc is None:
            raise ValueError('No submit control found in the current form')
        self.current_form = None
        # submit_control() returns a bare element when the selector matched
        # and a wrapper with a .qwe attribute otherwise; accept both.
        element = getattr(sc, 'qwe', sc)
        self.click(element, ajax_replies=ajax_replies, timeout=timeout)
|
||||
|
131
src/calibre/web/jsbrowser/test.py
Normal file
131
src/calibre/web/jsbrowser/test.py
Normal file
@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import unittest, pprint, threading
|
||||
|
||||
import cherrypy
|
||||
|
||||
from calibre.web.jsbrowser.browser import Browser
|
||||
|
||||
class Server(object):
    '''
    CherryPy application used by the tests. It serves a page with two
    forms and remembers the fields of the most recent form submission in
    self.form_data.
    '''

    def __init__(self):
        # Keyword arguments received by the last call to controls_test()
        self.form_data = {}

    @cherrypy.expose
    def index(self):
        'Return the HTML page containing the forms under test.'
        return '''
<html>
<head><title>JS Browser test</title></head>
<body>
<form id="controls_test" method="post" action="controls_test">
<h3>Test controls</h3>
<div><label>Simple Text:</label><input type="text" name="text"/></div>
<div><label>Password:</label><input type="password" name="password"/></div>
<div><label>Checked Checkbox:</label><input type="checkbox" checked="checked" name="checked_checkbox"/></div>
<div><label>UnChecked Checkbox:</label><input type="checkbox" name="unchecked_checkbox"/></div>
<div><input type="radio" name="sex" value="male" checked="checked" /> Male</div>
<div><input type="radio" name="sex" value="female" /> Female</div>
<div><input type="submit" value="Submit" /></div>
</form>
<form id="image_test" method="post" action="controls_test">
<h3>Test Image submit</h3>
<div><label>Simple Text:</label><input type="text" name="text" value="Image Test" /></div>
<input type="image" src="button_image" alt="Submit" />
</form>
</body>
</html>
'''

    @cherrypy.expose
    def controls_test(self, **kwargs):
        '''
        Target of both forms: store a copy of the submitted fields in
        self.form_data and return them pretty-printed.
        '''
        submitted = dict(kwargs)
        self.form_data = submitted
        return pprint.pformat(kwargs)

    @cherrypy.expose
    def button_image(self):
        'Serve the image used by the image submit control, as image/png.'
        headers = cherrypy.response.headers
        headers['Content-Type'] = 'image/png'
        return I('next.png', data=True)
|
||||
|
||||
class Test(unittest.TestCase):
    '''
    Tests for form handling in the JS browser. A CherryPy server running
    the Server application is started once for the whole test case, in a
    daemon thread, and a single Browser instance is shared by all tests.
    '''

    @classmethod
    def run_server(cls):
        'Thread target: start the CherryPy engine and block on it.'
        cherrypy.engine.start()
        try:
            cherrypy.engine.block()
        except Exception:
            # Best effort: errors raised while the engine is blocking are
            # deliberately ignored. Only Exception is caught so that
            # SystemExit/KeyboardInterrupt are not swallowed.
            pass

    @classmethod
    def setUpClass(cls):
        'Configure and start the test server, then create the browser.'
        cls.port = 17983
        cls.server = Server()
        cherrypy.config.update({
            'log.screen'             : False,
            'checker.on'             : False,
            'engine.autoreload_on'   : False,
            'request.show_tracebacks': True,
            # A byte string, since unicode_literals is in effect in this file
            'server.socket_host'     : b'127.0.0.1',
            'server.socket_port'     : cls.port,
            'server.socket_timeout'  : 10, #seconds
            'server.thread_pool'     : 1, # number of threads
            # Originally annotated "minutes" -- TODO confirm the unit
            # against the CherryPy documentation
            'server.shutdown_timeout': 0.1,
        })
        cherrypy.tree.mount(cls.server, '/', config={'/':{}})

        cls.server_thread = threading.Thread(target=cls.run_server)
        cls.server_thread.daemon = True
        cls.server_thread.start()
        cls.browser = Browser(verbosity=0)

    @classmethod
    def tearDownClass(cls):
        'Stop the CherryPy engine and drop the shared browser.'
        cherrypy.engine.exit()
        cls.browser = None

    def test_control_types(self):
        'Test setting data in the various control types'
        self.assertEqual(self.browser.visit('http://127.0.0.1:%d'%self.port),
                True)
        # Maps field name -> (value assigned to the control,
        #                     value the server is expected to receive)
        values = {
            'checked_checkbox'  : (False, None),
            'unchecked_checkbox': (True, 'on'),
            'text': ('some text', 'some text'),
            'password': ('some password', 'some password'),
            'sex': ('female', 'female'),
        }
        f = self.browser.select_form('#controls_test')
        # items() rather than iteritems(): works on Python 2 and Python 3
        for k, vals in values.items():
            f[k].value = vals[0]
        self.browser.submit()
        dat = self.server.form_data
        for k, vals in values.items():
            expected, received = vals[1], dat.get(k, None)
            self.assertEqual(expected, received,
                    'Field %s: %r != %r'%(k, expected, received))

    def test_image_submit(self):
        'Test submitting a form with a image as the submit control'
        self.assertEqual(self.browser.visit('http://127.0.0.1:%d'%self.port),
                True)
        self.browser.select_form('#image_test')
        self.browser.submit()
        self.assertEqual(self.server.form_data['text'], 'Image Test')
|
||||
|
||||
def tests():
    'Return a test suite loaded from the Test test case.'
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(Test)
|
||||
|
||||
def run():
    'Run the suite returned by tests() with a text runner at verbosity 2.'
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(tests())
|
||||
|
||||
# Executing this file as a script runs the test suite via run()
if __name__ == '__main__':
    run()
|
||||
|
Loading…
x
Reference in New Issue
Block a user