commit 033ea53de1
Sengian 2011-09-16 00:01:05 +02:00
53 changed files with 1090 additions and 835 deletions

View File

@@ -1,59 +1,54 @@
-#!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'
+__copyright__ = '2008 Kovid Goyal kovid@kovidgoyal.net, 2010 Darko Miletic <darko.miletic at gmail.com>'
 '''
-businessweek.com
+www.businessweek.com
 '''

 from calibre.web.feeds.news import BasicNewsRecipe

 class BusinessWeek(BasicNewsRecipe):
     title = 'Business Week'
-    description = 'Business News, Stock Market and Financial Advice'
-    __author__ = 'ChuckEggDotCom and Sujata Raman'
-    language = 'en'
+    __author__ = 'Kovid Goyal and Darko Miletic'
+    description = 'Read the latest international business news & stock market news. Get updated company profiles, financial advice, global economy and technology news.'
+    publisher = 'Bloomberg L.P.'
+    category = 'Business, business news, stock market, stock market news, financial advice, company profiles, financial advice, global economy, technology news'
     oldest_article = 7
-    max_articles_per_feed = 10
+    max_articles_per_feed = 200
     no_stylesheets = True
+    encoding = 'utf8'
+    use_embedded_content = False
+    language = 'en'
+    remove_empty_feeds = True
+    publication_type = 'magazine'
+    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'
+    masthead_url = 'http://assets.businessweek.com/images/bw-logo.png'
+    extra_css = """
+        body{font-family: Helvetica,Arial,sans-serif }
+        img{margin-bottom: 0.4em; display:block}
+        .tagline{color: gray; font-style: italic}
+        .photoCredit{font-size: small; color: gray}
+    """
-    recursions = 1
-    match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
-    extra_css = '''
-        h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
-        .news_story_title{font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;}
-        h2{font-family :Arial,Helvetica,sans-serif; font-size:medium;color:#666666;}
-        h3{text-transform:uppercase;font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;}
-        h4{font-family :Arial,Helvetica,sans-serif; font-size:small;font-weight:bold;}
-        p{font-family :Arial,Helvetica,sans-serif; }
-        #lede600{font-size:x-small;}
-        #storybody{font-size:x-small;}
-        p{font-family :Arial,Helvetica,sans-serif;}
-        .strap{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#064599;}
-        .byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
-        .postedBy{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
-        .trackback{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
-        .date{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
-        .wrapper{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
-        .photoCredit{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
-        .tagline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
-        .pageCount{color:#666666;font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
-        .note{font-family :Arial,Helvetica,sans-serif; font-size:small;color:#666666;font-style:italic;}
-        .highlight{font-family :Arial,Helvetica,sans-serif; font-size:small;background-color:#FFF200;}
-        .annotation{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
-    '''
+    conversion_options = {
+          'comment'   : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+    }
-    remove_tags = [ dict(name='div', attrs={'id':["log","feedback","footer","secondarynav","secondnavbar","header","email","bw2-header","column2","wrapper-bw2-footer","wrapper-mgh-footer","inset","commentForm","commentDisplay","bwExtras","bw2-umbrella","readerComments","leg","rightcol"]}),
-                    dict(name='div', attrs={'class':["menu",'sponsorbox smallertext',"TopNavTile","graybottom leaderboard"]}),
-                    dict(name='img', alt ="News"),
-                    dict(name='td', width ="1"),
-                  ]
+    remove_tags = [
+                     dict(attrs={'class':'inStory'})
+                    ,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td'])
+                    ,dict(attrs={'id':['inset','videoDisplay']})
+                  ]
+    keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody','article_body','articleBody']})]
+    remove_attributes = ['lang']
+    match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']

     feeds = [
               (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'),
-              (u'Top News', u'http://www.businessweek.com/rss/bwdaily.rss'),
+              (u'Top News'   , u'http://www.businessweek.com/rss/bwdaily.rss'  ),
               (u'Asia', u'http://www.businessweek.com/rss/asia.rss'),
               (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'),
               (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'),
@@ -75,19 +70,36 @@ class BusinessWeek(BasicNewsRecipe):
             ]

     def get_article_url(self, article):
         url = article.get('guid', None)
-        if 'podcasts' in url or 'surveys' in url:
-            url = None
-        return url
+        if 'podcasts' in url:
+            return None
+        if 'surveys' in url:
+            return None
+        if 'images' in url:
+            return None
+        if 'feedroom' in url:
+            return None
+        if '/magazine/toc/' in url:
+            return None
+        rurl, sep, rest = url.rpartition('?')
+        if rurl:
+            return rurl
+        return rest
+
+    def print_version(self, url):
+        if '/news/' in url or '/blog/' in url:
+            return url
+        if '/magazine' in url:
+            rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/printer/')
+        else:
+            rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/')
+        return rurl.replace('/investing/','/investor/')

-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['ul','li','table','td','tr','span']):
-            tag.name = 'div'
-        for tag in soup.findAll(name= 'div',attrs={ 'id':'pageNav'}):
-            tag.extract()
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
         return soup
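Editor's note: the print_version rules added above are easy to sanity-check in isolation. A minimal standalone sketch of the same rewrite (the story URL below is made up, not a real Businessweek article):

    def print_version(url):
        # Copy of the rewrite rules from the recipe above.
        if '/news/' in url or '/blog/' in url:
            return url
        if '/magazine' in url:
            rurl = url.replace('http://www.businessweek.com/',
                               'http://www.businessweek.com/printer/')
        else:
            rurl = url.replace('http://www.businessweek.com/',
                               'http://www.businessweek.com/print/')
        return rurl.replace('/investing/', '/investor/')

    print(print_version('http://www.businessweek.com/magazine/some-story.htm'))
    # -> http://www.businessweek.com/printer/magazine/some-story.htm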

View File

@@ -4,95 +4,73 @@ __copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
 www.businessworld.in
 '''

-from calibre import strftime
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class BusinessWorldMagazine(BasicNewsRecipe):
     title = 'Business World Magazine'
-    __author__ = 'Darko Miletic'
+    __author__ = 'Kovid Goyal'
     description = 'News from India'
     publisher = 'ABP Pvt Ltd Publication'
     category = 'news, politics, finances, India, Asia'
     delay = 1
     no_stylesheets = True
-    INDEX = 'http://www.businessworld.in/bw/Magazine_Current_Issue'
+    INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php'
     ROOT = 'http://www.businessworld.in'
-    use_embedded_content = False
     encoding = 'utf-8'
     language = 'en_IN'
-    extra_css = """
-        img{display: block; margin-bottom: 0.5em}
-        body{font-family: Arial,Helvetica,sans-serif}
-        h2{color: gray; display: block}
-    """
-
-    conversion_options = {
-          'comment'   : description
-        , 'tags'      : category
-        , 'publisher' : publisher
-        , 'language'  : language
-    }
-
-    def is_in_list(self,linklist,url):
-        for litem in linklist:
-            if litem == url:
-                return True
-        return False
+    auto_cleanup = True

     def parse_index(self):
+        br = self.browser
+        br.open(self.ROOT)
+        raw = br.open(br.click_link(text_regex=re.compile('Current.*Issue',
+            re.I))).read()
+        soup = self.index_to_soup(raw)
+        mc = soup.find(attrs={'class':'mag_cover'})
+        if mc is not None:
+            img = mc.find('img', src=True)
+            if img is not None:
+                self.cover_url = img['src']
+
+        feeds = []
+        current_section = None
         articles = []
-        linklist = []
-        soup = self.index_to_soup(self.INDEX)
-
-        tough = soup.find('div', attrs={'id':'tough'})
-        if tough:
-            for item in tough.findAll('h1'):
-                description = ''
-                title_prefix = ''
-                feed_link = item.find('a')
-                if feed_link and feed_link.has_key('href'):
-                    url = self.ROOT + feed_link['href']
-                    if not self.is_in_list(linklist,url):
-                        title = title_prefix + self.tag_to_string(feed_link)
-                        date = strftime(self.timefmt)
-                        articles.append({
-                              'title'      :title
-                             ,'date'       :date
-                             ,'url'        :url
-                             ,'description':description
-                        })
-                        linklist.append(url)
-
-        for item in soup.findAll('div', attrs={'class':'nametitle'}):
-            description = ''
-            title_prefix = ''
-            feed_link = item.find('a')
-            if feed_link and feed_link.has_key('href'):
-                url = self.ROOT + feed_link['href']
-                if not self.is_in_list(linklist,url):
-                    title = title_prefix + self.tag_to_string(feed_link)
-                    date = strftime(self.timefmt)
-                    articles.append({
-                          'title'      :title
-                         ,'date'       :date
-                         ,'url'        :url
-                         ,'description':description
-                    })
-                    linklist.append(url)
-        return [(soup.head.title.string, articles)]
-
-    keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})]
-    remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])]
-
-    def print_version(self, url):
-        return url.replace('/bw/','/bw/storyContent/')
-
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup(self.INDEX)
-        cover_item = soup.find('img',attrs={'class':'toughbor'})
-        if cover_item:
-            cover_url = self.ROOT + cover_item['src']
-        return cover_url
+        for tag in soup.findAll(['h3', 'h2']):
+            inner_a = tag.find('a')
+            if tag.name == 'h3' and inner_a is not None:
+                continue
+            if tag.name == 'h2' and (inner_a is None or current_section is
+                    None):
+                continue
+
+            if tag.name == 'h3':
+                if current_section is not None and articles:
+                    feeds.append((current_section, articles))
+                current_section = self.tag_to_string(tag)
+                self.log('Found section:', current_section)
+                articles = []
+            elif tag.name == 'h2':
+                url = inner_a.get('href', None)
+                if url is None: continue
+                if url.startswith('/'): url = self.ROOT + url
+                title = self.tag_to_string(inner_a)
+                h1 = tag.findPreviousSibling('h1')
+                if h1 is not None:
+                    title = self.tag_to_string(h1) + title
+                self.log('\tFound article:', title)
+                articles.append({'title':title, 'url':url, 'date':'',
+                    'description':''})
+
+        if current_section and articles:
+            feeds.append((current_section, articles))
+
+        return feeds
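Editor's note: the h3/h2 walk in the new parse_index is a common accumulate-by-section pattern. A minimal sketch with dummy tag data (not real site markup) shows the shape of what it builds and returns:

    # Dummy (kind, text) pairs stand in for the h3/h2 tags the recipe walks.
    tags = [('h3', 'Economy'), ('h2', 'Story A'), ('h2', 'Story B'),
            ('h3', 'Politics'), ('h2', 'Story C')]
    feeds, current_section, articles = [], None, []
    for kind, text in tags:
        if kind == 'h3':   # a heading closes the previous section, opens a new one
            if current_section is not None and articles:
                feeds.append((current_section, articles))
            current_section, articles = text, []
        else:              # an h2 adds an article to the open section
            articles.append(text)
    if current_section and articles:
        feeds.append((current_section, articles))
    print(feeds)
    # [('Economy', ['Story A', 'Story B']), ('Politics', ['Story C'])]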

recipes/cio_magazine.recipe Normal file (128 lines added)
View File

@@ -0,0 +1,128 @@
# The first comments record the trouble I had with Python.
# When you get a UTF8 error, check the comments (accented characters). In Notepad++: Search, Goto, position, and you will see it.
# Edit with Notepad++. If a - appears where it should not, the indentation is wrong... Edit - Blank operations - tab to space
# I have understood what the 'from' means... they are paths inside pylib.zip...
# With 'from' you import a single symbol... with 'import', the whole library

from calibre.web.feeds.news import BasicNewsRecipe
# sys is not needed... I tried to use it to write to stderr
from calibre import strftime
# To convert the article's time
import string, re
# To use regular expressions
# Seen in pylib.zip... the first letter is uppercase
# These last two were a vague attempt at setting a cookie (unused)

class CIO_Magazine(BasicNewsRecipe):
    title = 'CIO Magazine'
    oldest_article = 14
    max_articles_per_feed = 100
    auto_cleanup = True
    __author__ = 'Julio Map'
    description = 'CIO is the leading information brand for today-s busy Chief information Officer - CIO Magazine bi-monthly '
    language = 'en'
    encoding = 'utf8'
    cover_url = 'http://www.cio.com/homepage/images/hp-cio-logo-linkedin.png'

    remove_tags_before = dict(name='div', attrs={'id':'container'})
    # Absolutely unnecessary... in the end I found a print_version (see below)

    # Within a given issue...
    # issue_details contains the title and the sections of that issue
    # DetailModule, inside issue_details, contains the URLs and summaries
    # Within a given article...
    # Article-default-body contains the text. But as I said, I found a print_version

    no_stylesheets = True
    remove_javascript = True

    def print_version(self,url):
        # The system calls this function... do not call it yourself (it would be called twice)
        # A printable version of the articles exists, changing
        # http://www.cio.com/article/<num>/<title> to
        # http://www.cio.com/article/print/<num>, which holds all the pages inside the div id=container
        if url.startswith('/'):
            url = 'http://www.cio.com'+url
        segments = url.split('/')
        printURL = '/'.join(segments[0:4]) + '/print/' + segments[4] +'#'
        return printURL

    def parse_index(self):
        ###########################################################################
        # This method should be implemented in recipes that parse a website
        # instead of feeds to generate a list of articles. Typical uses are for
        # news sources that have a Print Edition webpage that lists all the
        # articles in the current print edition. If this function is implemented,
        # it will be used in preference to BasicNewsRecipe.parse_feeds().
        #
        # It must return a list. Each element of the list must be a 2-element
        # tuple of the form ('feed title', list of articles).
        #
        # Each list of articles must contain dictionaries of the form:
        #
        # {
        #   'title'       : article title,
        #   'url'         : URL of print version,
        #   'date'        : The publication date of the article as a string,
        #   'description' : A summary of the article
        #   'content'     : The full article (can be an empty string). This is used by FullContentProfile
        # }
        #
        # For an example, see the recipe for downloading The Atlantic.
        # In addition, you can add 'author' for the author of the article.
        ###############################################################################

        # First we find out which is the most recent issue
        soupinicial = self.index_to_soup('http://www.cio.com/magazine')
        # It is the first link inside the DIV with class content_body
        a = soupinicial.find(True, attrs={'class':'content_body'}).find('a', href=True)
        INDEX = re.sub(r'\?.*', '', a['href'])
        # Since cio.com uses relative links, we prepend the domain name.
        if INDEX.startswith('/'):  # guarding against them no longer doing so
            INDEX = 'http://www.cio.com'+INDEX
        # And we confirm in the logs that we are doing it right
        print ("INDEX in parse_index: ", INDEX)
        # Now we know which issue it is... let's process it.
        soup = self.index_to_soup(INDEX)

        articles = {}
        key = None
        feeds = []
        # To start we keep only two DIVs, 'heading' and 'issue_item'
        # From the first we take the categories (key), from the second the URLs and summaries
        for div in soup.findAll(True,
            attrs={'class':['heading', 'issue_item']}):
            if div['class'] == 'heading':
                key = string.capwords(self.tag_to_string(div.span))
                print ("Key: ", key)  # This is for debugging
                articles[key] = []
                feeds.append(key)
            elif div['class'] == 'issue_item':
                a = div.find('a', href=True)
                if not a:
                    continue
                url = re.sub(r'\?.*', '', a['href'])
                print("url: ", url)  # This is for debugging
                title = self.tag_to_string(a, use_alt=True).strip()  # For extra credit: strip the last two words
                pubdate = strftime('%a, %d %b')  # Not the publication date but the date of collection
                summary = div.find('p')  # Inside the 'issue_item' div the only paragraph is the summary
                description = ''  # If there is a summary, the description will be the summary... if not, leave it blank
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)
                    print ("Description = ", description)
                feed = key if key is not None else 'Uncategorized'  # This is copied from the NY Times recipe
                if not articles.has_key(feed):
                    articles[feed] = []
                if not 'podcasts' in url:
                    articles[feed].append(
                        dict(title=title, url=url, date=pubdate,
                            description=description,
                            content=''))
        feeds = [(key, articles[key]) for key in feeds if articles.has_key(key)]
        return feeds
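Editor's note: the long comment block in parse_index above describes the structure every such recipe must return. As a concrete illustration (titles and URLs are placeholders, not real articles):

    # The value parse_index() must return: a list of
    # ('feed title', [article dicts]) tuples.
    feeds = [
        ('Feature Stories', [
            {'title': 'Some article title',
             'url': 'http://www.cio.com/article/print/12345#',
             'date': 'Mon, 01 Jan',
             'description': 'A one-line summary of the article',
             'content': ''},
        ]),
    ]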

View File

@@ -15,8 +15,10 @@ class Guardian(BasicNewsRecipe):
     title = u'The Guardian and The Observer'
     if date.today().weekday() == 6:
         base_url = "http://www.guardian.co.uk/theobserver"
+        cover_pic = 'Observer digital edition'
     else:
         base_url = "http://www.guardian.co.uk/theguardian"
+        cover_pic = 'Guardian digital edition'

     __author__ = 'Seabound and Sujata Raman'
     language = 'en_GB'
@@ -79,7 +81,7 @@ class Guardian(BasicNewsRecipe):
         # soup = self.index_to_soup("http://www.guardian.co.uk/theobserver")
         soup = self.index_to_soup(self.base_url)
         # find cover pic
-        img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'})
+        img = soup.find( 'img',attrs ={'alt':self.cover_pic})
         if img is not None:
             self.cover_url = img['src']
         # end find cover pic
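Editor's note: the weekday() test above hinges on Python's numbering, where Monday is 0. A two-line check (nothing here is Guardian-specific):

    from datetime import date
    # weekday() == 6 is Sunday, the day The Observer replaces The Guardian.
    is_observer_day = date.today().weekday() == 6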

View File

@@ -0,0 +1,29 @@
from calibre.web.feeds.news import BasicNewsRecipe

class HindustanTimes(BasicNewsRecipe):
    title = u'Hindustan Times'
    language = 'en_IN'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    use_embedded_content = False

    no_stylesheets = True
    auto_cleanup = True

    feeds = [
        ('News',
            'http://feeds.hindustantimes.com/HT-NewsSectionPage-Topstories'),
        ('Views',
            'http://feeds.hindustantimes.com/HT-ViewsSectionpage-Topstories'),
        ('Cricket',
            'http://feeds.hindustantimes.com/HT-Cricket-TopStories'),
        ('Business',
            'http://feeds.hindustantimes.com/HT-BusinessSectionpage-TopStories'),
        ('Entertainment',
            'http://feeds.hindustantimes.com/HT-HomePage-Entertainment'),
        ('Lifestyle',
            'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
    ]

BIN  binary file added (1.2 KiB), not shown

BIN  recipes/icons/rtnews.png Normal file (606 B), not shown

BIN  binary file added (200 B), not shown

View File

@@ -1,76 +1,25 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class IndiaToday(BasicNewsRecipe):
-    title = 'India Today'
-    __author__ = 'Kovid Goyal'
+    title = u'India Today'
     language = 'en_IN'
-    timefmt = ' [%d %m, %Y]'
-
-    oldest_article = 700
-    max_articles_per_feed = 10
+    __author__ = 'Krittika Goyal'
+    oldest_article = 15  # days
+    max_articles_per_feed = 25
     no_stylesheets = True
+    auto_cleanup = True

-    remove_tags_before = dict(id='content_story_title')
-    remove_tags_after = dict(id='rightblockdiv')
-    remove_tags = [dict(id=['rightblockdiv', 'share_links'])]
-
-    extra_css = '#content_story_title { font-size: 170%; font-weight: bold;}'
-    conversion_options = { 'linearize_tables': True }
-
-    def it_get_index(self):
-        soup = self.index_to_soup('http://indiatoday.intoday.in/site/archive')
-        a = soup.find('a', href=lambda x: x and 'issueId=' in x)
-        url = 'http://indiatoday.intoday.in/site/'+a.get('href')
-        img = a.find('img')
-        self.cover_url = img.get('src')
-        return self.index_to_soup(url)
-
-    def parse_index(self):
-        soup = self.it_get_index()
-        feeds, current_section, current_articles = [], None, []
-        for x in soup.findAll(name=['h1', 'a']):
-            if x.name == 'h1':
-                if current_section and current_articles:
-                    feeds.append((current_section, current_articles))
-                current_section = self.tag_to_string(x)
-                current_articles = []
-                self.log('\tFound section:', current_section)
-            elif x.name == 'a' and 'Story' in x.get('href', ''):
-                title = self.tag_to_string(x)
-                url = x.get('href')
-                url = url.replace(' ', '%20')
-                if not url.startswith('/'):
-                    url = 'http://indiatoday.intoday.in/site/' + url
-                if title and url:
-                    url += '?complete=1'
-                    self.log('\tFound article:', title)
-                    self.log('\t\t', url)
-                    desc = ''
-                    h3 = x.parent.findNextSibling('h3')
-                    if h3 is not None:
-                        desc = 'By ' + self.tag_to_string(h3)
-                        h4 = h3.findNextSibling('h4')
-                        if h4 is not None:
-                            desc = self.tag_to_string(h4) + ' ' + desc
-                    if desc:
-                        self.log('\t\t', desc)
-                    current_articles.append({'title':title, 'description':desc,
-                        'url':url, 'date':''})
-        if current_section and current_articles:
-            feeds.append((current_section, current_articles))
-        return feeds
-
-    def postprocess_html(self, soup, first):
-        a = soup.find(text='Print')
-        if a is not None:
-            tr = a.findParent('tr')
-            if tr is not None:
-                tr.extract()
-        return soup
+    feeds = [
+        ('Latest News', 'http://indiatoday.intoday.in/rss/article.jsp?sid=4'),
+        ('Cover Story', 'http://indiatoday.intoday.in/rss/article.jsp?sid=30'),
+        ('Nation', 'http://indiatoday.intoday.in/rss/article.jsp?sid=36'),
+        ('States', 'http://indiatoday.intoday.in/rss/article.jsp?sid=21'),
+        ('Economy', 'http://indiatoday.intoday.in/rss/article.jsp?sid=34'),
+        ('World', 'http://indiatoday.intoday.in/rss/article.jsp?sid=61'),
+        ('Sport', 'http://indiatoday.intoday.in/rss/article.jsp?sid=41'),
+    ]

View File

@@ -7,56 +7,33 @@ www.inquirer.net
 '''

 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class InquirerNet(BasicNewsRecipe):
     title = 'Inquirer.net'
-    __author__ = 'Darko Miletic'
+    __author__ = 'Krittika Goyal'
     description = 'News from Philipines'
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    encoding = 'cp1252'
+    encoding = 'utf8'
     publisher = 'inquirer.net'
     category = 'news, politics, philipines'
     lang = 'en'
     language = 'en'
-    extra_css = ' .fontheadline{font-size: x-large} .fontsubheadline{font-size: large} .fontkick{font-size: medium}'
-
-    html2lrf_options = [
-                          '--comment', description
-                        , '--category', category
-                        , '--publisher', publisher
-                        , '--ignore-tables'
-                        ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
-    remove_tags = [dict(name=['object','link','script','iframe','form'])]
+    use_embedded_content = False
+    no_stylesheets = True
+    auto_cleanup = True

     feeds = [
-              (u'Breaking news', u'http://services.inquirer.net/rss/breakingnews.xml' )
-             ,(u'Top stories'  , u'http://services.inquirer.net/rss/topstories.xml' )
-             ,(u'Sports'       , u'http://services.inquirer.net/rss/brk_breakingnews.xml' )
-             ,(u'InfoTech'     , u'http://services.inquirer.net/rss/infotech_tech.xml' )
-             ,(u'InfoTech'     , u'http://services.inquirer.net/rss/infotech_tech.xml' )
-             ,(u'Business'     , u'http://services.inquirer.net/rss/inq7money_breaking_news.xml' )
-             ,(u'Editorial'    , u'http://services.inquirer.net/rss/opinion_editorial.xml' )
-             ,(u'Global Nation', u'http://services.inquirer.net/rss/globalnation_breakingnews.xml')
+              (u'Inquirer', u'http://www.inquirer.net/fullfeed')
             ]

-    def preprocess_html(self, soup):
-        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mlang)
-        soup.head.insert(1,mcharset)
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
-
-    def print_version(self, url):
-        rest, sep, art = url.rpartition('/view/')
-        art_id, sp, rrest = art.partition('/')
-        return 'http://services.inquirer.net/print/print.php?article_id=' + art_id
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        br.set_handle_gzip(True)
+        return br
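Editor's note: the get_browser override above turns on gzip in the underlying mechanize browser. The standalone equivalent, assuming mechanize is installed, is just:

    import mechanize
    br = mechanize.Browser()
    br.set_handle_gzip(True)  # ask servers for gzip-compressed responses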

View File

@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 japantimes.co.jp
 '''
@@ -9,24 +7,61 @@ japantimes.co.jp
 from calibre.web.feeds.news import BasicNewsRecipe

 class JapanTimes(BasicNewsRecipe):
-    title = u'The Japan Times'
+    title = 'The Japan Times'
     __author__ = 'Darko Miletic'
-    description = 'News from Japan'
-    language = 'en'
-
-    oldest_article = 7
-    max_articles_per_feed = 100
+    description = "Daily news and features on Japan from the most widely read English-language newspaper in Japan. Coverage includes national news, business news, sports news, commentary and features on living in Japan, entertainment, the arts, education and more."
+    language = 'en_JP'
+    category = 'news, politics, japan'
+    publisher = 'The Japan Times'
+    oldest_article = 5
+    max_articles_per_feed = 150
     no_stylesheets = True
     use_embedded_content = False
+    encoding = 'utf8'
+    publication_type = 'newspaper'
+    masthead_url = 'http://search.japantimes.co.jp/images/header_title.gif'
+    extra_css = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}'

-    keep_only_tags = [ dict(name='div', attrs={'id':'searchresult'}) ]
-    remove_tags_after = [ dict(name='div', attrs={'id':'mainbody' }) ]
+    conversion_options = {
+          'comment'          : description
+        , 'tags'             : category
+        , 'publisher'        : publisher
+        , 'language'         : language
+        , 'linearize_tables' : True
+    }
+
+    keep_only_tags = [dict(name='div', attrs={'id':'printresult'})]
     remove_tags = [
-                     dict(name='div' , attrs={'id':'ads' })
-                    ,dict(name='table', attrs={'width':470})
+                     dict(name=['iframe','meta','link','embed','object','base'])
+                    ,dict(attrs={'id':'searchfooter'})
                   ]
+    feeds = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')]
+    remove_attributes = ['border']
+
+    def get_article_url(self, article):
+        rurl = BasicNewsRecipe.get_article_url(self, article)
+        return rurl.partition('?')[0]

-    feeds = [
-              (u'The Japan Times', u'http://feedproxy.google.com/japantimes')
-            ]
+    def print_version(self, url):
+        return url.replace('/cgi-bin/','/print/')

-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):
-                item['alt'] = 'image'
-        for item in soup.findAll('photo'):
-            item.name = 'div'
-        for item in soup.head.findAll('paragraph'):
-            item.extract()
-        for item in soup.findAll('wwfilename'):
-            item.extract()
-        for item in soup.findAll('jtcategory'):
-            item.extract()
-        for item in soup.findAll('nomooter'):
-            item.extract()
-        for item in soup.body.findAll('paragraph'):
-            item.name = 'p'
-        return soup

View File

@@ -14,54 +14,11 @@ class PeopleMag(BasicNewsRecipe):
     use_embedded_content = False
     oldest_article = 2
     max_articles_per_feed = 50
-    use_embedded_content = False
-
-    extra_css = '''
-        h1{font-family:verdana,arial,helvetica,sans-serif; font-size: large;}
-        h2{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
-        .body-content{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
-        .byline {font-size: small; color: #666666; font-style:italic; }
-        .lastline {font-size: small; color: #666666; font-style:italic;}
-        .contact {font-size: small; color: #666666;}
-        .contact p {font-size: small; color: #666666;}
-        .photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
-        .photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
-        .article_timestamp{font-size:x-small; color:#666666;}
-        a {font-family:verdana,arial,helvetica,sans-serif; font-size: x-small;}
-    '''
-
-    keep_only_tags = [
-        dict(name='div', attrs={'class': 'panel_news_article_main'}),
-        dict(name='div', attrs={'class':'article_content'}),
-        dict(name='div', attrs={'class': 'headline'}),
-        dict(name='div', attrs={'class': 'post'}),
-        dict(name='div', attrs={'class': 'packageheadlines'}),
-        dict(name='div', attrs={'class': 'snap_preview'}),
-        dict(name='div', attrs={'id': 'articlebody'})
-    ]
-
-    remove_tags = [
-        dict(name='div', attrs={'class':'share_comments'}),
-        dict(name='p', attrs={'class':'twitter_facebook'}),
-        dict(name='div', attrs={'class':'share_comments_bottom'}),
-        dict(name='h2', attrs={'id':'related_content'}),
-        dict(name='div', attrs={'class':'next_article'}),
-        dict(name='div', attrs={'class':'prev_article'}),
-        dict(name='ul', attrs={'id':'sharebar'}),
-        dict(name='div', attrs={'class':'sharelinkcont'}),
-        dict(name='div', attrs={'class':'categories'}),
-        dict(name='ul', attrs={'class':'categories'}),
-        dict(name='div', attrs={'class':'related_content'}),
-        dict(name='div', attrs={'id':'promo'}),
-        dict(name='div', attrs={'class':'linksWrapper'}),
-        dict(name='p', attrs={'class':'tag tvnews'}),
-        dict(name='p', attrs={'class':'tag movienews'}),
-        dict(name='p', attrs={'class':'tag musicnews'}),
-        dict(name='p', attrs={'class':'tag couples'}),
-        dict(name='p', attrs={'class':'tag gooddeeds'}),
-        dict(name='p', attrs={'class':'tag weddings'}),
-        dict(name='p', attrs={'class':'tag health'})
-    ]
+    no_stylesheets = True
+    auto_cleanup = True
+    auto_cleanup_keep = '//div[@id="article-image"]'

     feeds = [
@@ -69,26 +26,4 @@ class PeopleMag(BasicNewsRecipe):
         ('US Headlines', 'http://www.usmagazine.com/celebrity_news/rss')
     ]
-
-    def get_article_url(self, article):
-        ans = article.link
-        try:
-            self.log('Looking for full story link in', ans)
-            soup = self.index_to_soup(ans)
-            x = soup.find(text="View All")
-            if x is not None:
-                ans = ans + '?viewAll=y'
-                self.log('Found full story link', ans)
-        except:
-            pass
-        return ans
-
-    def postprocess_html(self, soup,first):
-        for tag in soup.findAll(name='div',attrs={'class':"container_ate_qandatitle"}):
-            tag.extract()
-        for tag in soup.findAll(name='br'):
-            tag.extract()
-        return soup

recipes/rtnews.recipe Normal file (64 lines added)
View File

@@ -0,0 +1,64 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
rt.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class RT_eng(BasicNewsRecipe):
    title = 'RT in English'
    __author__ = 'Darko Miletic'
    description = 'RT is the first Russian 24/7 English-language news channel which brings the Russian view on global news.'
    publisher = 'Autonomous Nonprofit Organization "TV-Novosti"'
    category = 'news, politics, economy, finances, Russia, world'
    oldest_article = 2
    no_stylesheets = True
    encoding = 'utf8'
    masthead_url = 'http://rt.com/s/css/img/printlogo.gif'
    use_embedded_content = False
    remove_empty_feeds = True
    language = 'en_RU'
    publication_type = 'newsportal'
    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif}
        h1{font-family: Georgia,"Times New Roman",Times,serif}
        .grey{color: gray}
        .fs12{font-size: small}
    """

    conversion_options = {
          'comment'  : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    keep_only_tags = [dict(name='div', attrs={'class':'all'})]
    remove_tags = [
                     dict(name=['object','link','embed','iframe','meta','link'])
                    ,dict(attrs={'class':'crumbs oh'})
                  ]
    remove_attributes = ['clear']

    feeds = [
              (u'Politics'   , u'http://rt.com/politics/rss/'             )
             ,(u'USA'        , u'http://rt.com/usa/news/rss/'             )
             ,(u'Business'   , u'http://rt.com/business/news/rss/'        )
             ,(u'Sport'      , u'http://rt.com/sport/rss/'                )
             ,(u'Art&Culture', u'http://rt.com/art-and-culture/news/rss/' )
            ]

    def print_version(self, url):
        return url + 'print/'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('a'):
            str = item.string
            if str is None:
                str = self.tag_to_string(item)
            item.replaceWith(str)
        return soup

View File

@@ -1,12 +1,9 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
-twitchfilm.net/site/
+twitchfilm.net/news/
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class Twitchfilm(BasicNewsRecipe):
     title = 'Twitch Films'
@@ -15,29 +12,46 @@ class Twitchfilm(BasicNewsRecipe):
     oldest_article = 30
     max_articles_per_feed = 100
     no_stylesheets = True
-    use_embedded_content = True
+    use_embedded_content = False
     encoding = 'utf-8'
     publisher = 'Twitch'
+    masthead_url = 'http://twitchfilm.com/img/logo.png'
     category = 'twitch, twitchfilm, movie news, movie reviews, cult cinema, independent cinema, anime, foreign cinema, geek talk'
     language = 'en'
-    lang = 'en-US'

     conversion_options = {
           'comment'  : description
         , 'tags'     : category
-        , 'publisher' : publisher
-        , 'language' : lang
-        , 'pretty_print' : True
+        , 'publisher': publisher
+        , 'language' : language
     }

-    remove_tags = [dict(name='div', attrs={'class':'feedflare'})]
+    keep_only_tags=[dict(attrs={'class':'asset-header'})]
+    remove_tags_after=dict(attrs={'class':'asset-body'})
+    remove_tags = [ dict(name='div', attrs={'class':['social','categories']})
+                  , dict(attrs={'id':'main-asset'})
+                  , dict(name=['meta','link','iframe','embed','object'])
+                  ]

-    feeds = [(u'News', u'http://feedproxy.google.com/TwitchEverything')]
+    feeds = [(u'News', u'http://feeds.twitchfilm.net/TwitchEverything')]

     def preprocess_html(self, soup):
-        mtag = Tag(soup,'meta',[('http-equiv','Content-Type'),('context','text/html; charset=utf-8')])
-        soup.head.insert(0,mtag)
-        soup.html['lang'] = self.lang
-        return self.adeify_images(soup)
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            limg = item.find('img')
+            if item.string is not None:
+                str = item.string
+                item.replaceWith(str)
+            else:
+                if limg:
+                    item.name = 'div'
+                    item.attrs = []
+                else:
+                    str = self.tag_to_string(item)
+                    item.replaceWith(str)
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        return soup

View File

@@ -13,6 +13,7 @@ class USAToday(BasicNewsRecipe):
     title = 'USA Today'
     __author__ = 'Kovid Goyal'
     oldest_article = 1
+    publication_type = 'newspaper'
     timefmt = ''
     max_articles_per_feed = 20
     language = 'en'

View File

@@ -94,9 +94,11 @@ class WallStreetJournal(BasicNewsRecipe):
         if date is not None:
             self.timefmt = ' [%s]'%self.tag_to_string(date)

-        cov = soup.find('a', attrs={'class':'icon pdf'}, href=True)
+        cov = soup.find('div', attrs={'class':'itpSectionHeaderPdf'})
         if cov is not None:
-            self.cover_url = cov['href']
+            a = cov.find('a', href=True)
+            if a is not None:
+                self.cover_url = a['href']

         feeds = []
         div = soup.find('div', attrs={'class':'itpHeader'})

View File

@@ -61,7 +61,7 @@ authors_completer_append_separator = False
 # selecting 'manage authors', and pressing 'Recalculate all author sort values'.
 # The author name suffixes are words that are ignored when they occur at the
 # end of an author name. The case of the suffix is ignored and trailing
-# periods are automatically handled.
+# periods are automatically handled. The same is true for prefixes.
 # The author name copy words are a set of words which if they occur in an
 # author name cause the automatically generated author sort string to be
 # identical to the author name. This means that the sort for a string like Acme

View File

@@ -653,6 +653,15 @@ class KOBO(USBMS):
                         debug_print('        Commit: Set FavouritesIndex')

     def update_device_database_collections(self, booklists, collections_attributes, oncard):
+        # Only process categories in this list
+        supportedcategories = {
+            "Im_Reading":1,
+            "Read":2,
+            "Closed":3,
+            "Shortlist":4,
+            # "Preview":99, # Unsupported as we don't want to change it
+        }
+
         # Define lists for the ReadStatus
         readstatuslist = {
             "Im_Reading":1,
@@ -692,6 +701,7 @@ class KOBO(USBMS):
         # Process any collections that exist
         for category, books in collections.items():
+            if category in supportedcategories:
                 debug_print("Category: ", category, " id = ", readstatuslist.get(category))
                 for book in books:
                     debug_print('    Title:', book.title, 'category: ', category)

View File

@@ -368,7 +368,10 @@ OptionRecommendation(name='remove_paragraph_spacing_indent_size',
         recommended_value=1.5, level=OptionRecommendation.LOW,
         help=_('When calibre removes blank lines between paragraphs, it automatically '
             'sets a paragraph indent, to ensure that paragraphs can be easily '
-            'distinguished. This option controls the width of that indent (in em).')
+            'distinguished. This option controls the width of that indent (in em). '
+            'If you set this value to 0, then the indent specified in the input '
+            'document is used, unless you also set the insert line between '
+            'paragraphs option.')
         ),

 OptionRecommendation(name='prefer_metadata_cover',
@@ -394,8 +397,9 @@ OptionRecommendation(name='insert_blank_line_size',
 OptionRecommendation(name='remove_first_image',
         recommended_value=False, level=OptionRecommendation.LOW,
         help=_('Remove the first image from the input ebook. Useful if the '
-            'first image in the source file is a cover and you are specifying '
-            'an external cover.'
+            'input document has a cover image that is not identified as a cover. '
+            'In this case, if you set a cover in calibre, the output document will '
+            'end up with two cover images if you do not specify this option.'
             )
         ),

View File

@@ -75,7 +75,7 @@ class IgnoreFile(Exception):

     def __init__(self, msg, errno):
         Exception.__init__(self, msg)
-        self.doesnt_exist = errno == 2
+        self.doesnt_exist = errno == errno.ENOENT
         self.errno = errno

 class HTMLFile(object):

View File

@@ -65,20 +65,27 @@ def author_to_author_sort(author, method=None):
     suffixes = set([x.lower() for x in tweaks['author_name_suffixes']])
     suffixes |= set([x+u'.' for x in suffixes])

-    last = tokens[-1].lower()
-    suffix = None
-    if last in suffixes:
-        suffix = tokens[-1]
-        tokens = tokens[:-1]
+    suffix = u''
+    while True:
+        if not tokens:
+            return author
+        last = tokens[-1].lower()
+        if last in suffixes:
+            suffix = tokens[-1] + ' ' + suffix
+            tokens = tokens[:-1]
+        else:
+            break
+    suffix = suffix.strip()

     if method == u'comma' and u',' in u''.join(tokens):
         return author

     atokens = tokens[-1:] + tokens[:-1]
+    num_toks = len(atokens)
     if suffix:
         atokens.append(suffix)

-    if method != u'nocomma' and len(atokens) > 1:
+    if method != u'nocomma' and num_toks > 1:
         atokens[0] += u','

     return u' '.join(atokens)
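Editor's note: the new loop accumulates any number of trailing suffixes instead of just one. A standalone sketch with a hard-coded suffix set (the real set comes from the author_name_suffixes tweak) shows the effect:

    def author_sort(author, suffixes=('jr', 'sr', 'phd')):
        # Peel any number of trailing suffixes, then rotate the last
        # name to the front, mirroring author_to_author_sort above.
        sufset = set(suffixes) | set(s + '.' for s in suffixes)
        tokens = author.split()
        suffix = ''
        while tokens and tokens[-1].lower() in sufset:
            suffix = tokens.pop() + ' ' + suffix
        suffix = suffix.strip()
        if not tokens:          # the whole name was suffixes: leave it alone
            return author
        atokens = tokens[-1:] + tokens[:-1]
        num_toks = len(atokens)
        if suffix:
            atokens.append(suffix)
        if num_toks > 1:        # comma only when there is more than one name token
            atokens[0] += ','
        return ' '.join(atokens)

    print(author_sort('John Smith Jr. PhD'))  # -> Smith, John Jr. PhD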

View File

@@ -330,9 +330,11 @@ class MetadataUpdater(object):
             prefs = load_defaults('mobi_output')
             pas = prefs.get('prefer_author_sort', False)
             kindle_pdoc = prefs.get('personal_doc', None)
+            share_not_sync = prefs.get('share_not_sync', False)
         except:
             pas = False
             kindle_pdoc = None
+            share_not_sync = False
         if mi.author_sort and pas:
             authors = mi.author_sort
             update_exth_record((100, normalize(authors).encode(self.codec, 'replace')))
@@ -376,7 +378,7 @@ class MetadataUpdater(object):
         # Add a 113 record if not present to allow Amazon syncing
         if (113 not in self.original_exth_records and
                 self.original_exth_records.get(501, None) == 'EBOK' and
-                not added_501):
+                not added_501 and not share_not_sync):
             from uuid import uuid4
             update_exth_record((113, str(uuid4())))
         if 503 in self.original_exth_records:

View File

@@ -116,7 +116,8 @@ def cap_author_token(token):
     lt = lower(token)
     if lt in ('von', 'de', 'el', 'van', 'le'):
         return lt
-    if re.match(r'([a-z]\.){2,}$', lt) is not None:
+    # no digits, no special characters
+    if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
         # Normalize tokens of the form J.K. to J. K.
         parts = token.split('.')
         return '. '.join(map(capitalize, parts)).strip()
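Editor's note: the switch from [a-z] to [^\d\W] with re.UNICODE makes the initials test match non-ASCII letters too. A quick demonstration (sample tokens are arbitrary):

    import re
    pat = re.compile(r'([^\d\W]\.){2,}$', re.UNICODE)
    for tok in (u'j.k.', u'в.п.', u'a1.'):
        print(tok, bool(pat.match(tok)))
    # j.k. True / в.п. True (Cyrillic initials now match) / a1. False (digit rejected)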

View File

@@ -28,7 +28,7 @@ class Ozon(Source):
     touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
                                 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
     # Test purpose only, test function does not like when sometimes some fields are empty
-    #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
+    # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
     #                            'publisher', 'pubdate', 'comments'])

     supports_gzip_transfer_encoding = True
@@ -109,8 +109,16 @@ class Ozon(Source):
     # }}}

     def get_metadata(self, log, entries, title, authors, identifiers): # {{{
+        # some book titles have extra characters like this
+        # TODO: make a tweak
+        reRemoveFromTitle = None
+        #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
+
         title = unicode(title).upper() if title else ''
-        authors = map(unicode.upper, map(unicode, authors)) if authors else None
+        if reRemoveFromTitle:
+            title = reRemoveFromTitle.sub('', title)
+        authors = map(_normalizeAuthorNameWithInitials,
+                      map(unicode.upper, map(unicode, authors))) if authors else None
         ozon_id = identifiers.get('ozon', None)

         unk = unicode(_('Unknown')).upper()
@@ -124,6 +132,7 @@ class Ozon(Source):
         def in_authors(authors, miauthors):
             for author in authors:
                 for miauthor in miauthors:
+                    #log.debug(u'=> %s <> %s'%(author, miauthor))
                     if author in miauthor: return True
             return None

@@ -131,7 +140,10 @@ class Ozon(Source):
             match = True
             if title:
                 mititle = unicode(mi.title).upper() if mi.title else ''
+                if reRemoveFromTitle:
+                    mititle = reRemoveFromTitle.sub('', mititle)
                 match = title in mititle
+                #log.debug(u't=> %s <> %s'%(title, mititle))
             if match and authors:
                 miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
                 match = in_authors(authors, miauthors)
@@ -190,7 +202,8 @@ class Ozon(Source):
             title = entry.xpath(xp_template.format('Name'))
             author = entry.xpath(xp_template.format('Author'))
-            mi = Metadata(title, author.split(','))
+            norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
+            mi = Metadata(title, norm_authors)
             ozon_id = entry.xpath(xp_template.format('ID'))
             mi.identifiers = {'ozon':ozon_id}
@@ -202,6 +215,11 @@ class Ozon(Source):
             if cover:
                 mi.ozon_cover_url = _translateToBigCoverUrl(cover)

+            pub_year = entry.xpath(xp_template.format('Year'))
+            if pub_year:
+                mi.pubdate = toPubdate(log, pub_year)
+                #log.debug('pubdate %s'%mi.pubdate)
+
             rating = entry.xpath(xp_template.format('ClientRatingValue'))
             if rating:
                 try:
@@ -269,13 +287,17 @@ class Ozon(Source):
         raw = self.browser.open_novisit(url, timeout=timeout).read()
         doc = html.fromstring(raw)

+        xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
+        xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
+
         # series
-        xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)'
+        xpt = xpt_prod_det_at % u'Сери'
+        # % u'Серия:'
         series = doc.xpath(xpt)
         if series:
             metadata.series = series

-        xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")'
+        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
         isbn_str = doc.xpath(xpt)
         if isbn_str:
             all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
@@ -283,38 +305,42 @@ class Ozon(Source):
                 metadata.all_isbns = all_isbns
                 metadata.isbn = all_isbns[0]

-        xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]'
+        xpt = xpt_prod_det_at % u'Издатель'
         publishers = doc.xpath(xpt)
         if publishers:
-            metadata.publisher = publishers[0].text
-
-            xpt = u'string(../text()[contains(., "г.")])'
-            yearIn = publishers[0].xpath(xpt)
+            metadata.publisher = publishers
+
+        displ_lang = None
+        xpt = xpt_prod_det_tx % u'Язык'
+        langs = doc.xpath(xpt)
+        if langs:
+            lng_splt = langs.split(u',')
+            if lng_splt:
+                displ_lang = lng_splt[0].strip()
+        metadata.language = _translageLanguageToCode(displ_lang)
+        #log.debug(u'language: %s'%displ_lang)
+
+        # can be set before from xml search response
+        if not metadata.pubdate:
+            xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
+            yearIn = doc.xpath(xpt)
             if yearIn:
                 matcher = re.search(r'\d{4}', yearIn)
                 if matcher:
-                    year = int(matcher.group(0))
-                    # only year is available, so use 1-st of Jan
-                    metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparison in identify.py
-                    #metadata.pubdate = datetime(year, 1, 1)
-
-            xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
-            displLang = publishers[0].xpath(xpt)
-            lang_code =_translageLanguageToCode(displLang)
-            if lang_code:
-                metadata.language = lang_code
+                    metadata.pubdate = toPubdate(log, matcher.group(0))

         # overwrite comments from HTML if any
-        # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
-        xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
-              u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
+        xpt = u'//table[@id="detail_description"]//tr/td'
         comment_elem = doc.xpath(xpt)
         if comment_elem:
             comments = unicode(etree.tostring(comment_elem[0]))
             if comments:
                 # cleanup root tag, TODO: remove tags like object/embedded
-                comments = re.sub(r'^<td.+?>|</td>.+?$', u'', comments).strip()
-                if comments:
+                comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
+                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
                     metadata.comments = comments
+                else:
+                    log.debug('HTML book description skipped in favour of search service xml response')
         else:
             log.debug('No book description found in HTML')
 # }}}
@@ -390,10 +416,40 @@ def _translageLanguageToCode(displayLang): # {{{
         u'Итальянский': 'it',
         u'Испанский': 'es',
         u'Китайский': 'zh',
-        u'Японский': 'ja' }
+        u'Японский': 'ja',
+        u'Финский' : 'fi',
+        u'Польский' : 'pl',}
     return langTbl.get(displayLang, None)
 # }}}

+# [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
+def _normalizeAuthorNameWithInitials(name): # {{{
+    res = name
+    if name:
+        re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
+        re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
+        matcher = re.match(re1, unicode(name), re.UNICODE)
+        if not matcher:
+            matcher = re.match(re2, unicode(name), re.UNICODE)
+
+        if matcher:
+            d = matcher.groupdict()
+            res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
+    return res
+# }}}
+
+def toPubdate(log, yearAsString):
+    res = None
+    if yearAsString:
+        try:
+            year = int(yearAsString)
+            # only year is available, so use 1-st of Jan
+            res = datetime.datetime(year, 1, 1)
+        except:
+            log.error('cannot parse to date %s'%yearAsString)
+    return res
+
 if __name__ == '__main__': # tests {{{
     # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
     # comment some touched_fields before run those tests
@@ -403,7 +459,12 @@ if __name__ == '__main__': # tests {{{
     test_identify_plugin(Ozon.name,
         [
+            # (
+            #     {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
+            #      'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
+            #     [title_test(u'Норвежский язык: Практический курс', exact=True),
+            #      authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
+            # ),
             (
                 {'identifiers':{'isbn': '9785916572629'} },
                 [title_test(u'На все четыре стороны', exact=True),
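Editor's note: the two alternations in _normalizeAuthorNameWithInitials accept both 'Lastname F.M.' and 'F.M. Lastname' and emit 'F. M. Lastname'. A runnable demo of the same two patterns (examples taken from the comment above):

    import re
    re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
    re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'

    def normalize(name):
        m = re.match(re1, name, re.UNICODE) or re.match(re2, name, re.UNICODE)
        if m:
            d = m.groupdict()
            return ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
        return name  # no initials found: leave the name alone

    print(normalize(u'Колесников В.П.'))  # -> В. П. Колесников
    print(normalize(u'В.П. Колесников'))  # -> В. П. Колесников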

View File

@@ -55,6 +55,11 @@ class MOBIOutput(OutputFormatPlugin):
                 ' specified directory. If the directory already '
                 'exists, it will be deleted.')
         ),
+        OptionRecommendation(name='share_not_sync', recommended_value=False,
+            help=_('Enable sharing of book content via Facebook etc. '
+                ' on the Kindle. WARNING: Using this feature means that '
+                ' the book will not auto sync its last read position '
+                ' on multiple devices. Complain to Amazon.'))
     ])

     def check_for_periodical(self):

View File

@@ -61,6 +61,13 @@ class MobiWriter(object):

     def __call__(self, oeb, path_or_stream):
         self.log = oeb.log
+        pt = None
+        if oeb.metadata.publication_type:
+            x = unicode(oeb.metadata.publication_type[0]).split(':')
+            if len(x) > 1:
+                pt = x[1].lower()
+        self.publication_type = pt
+
         if hasattr(path_or_stream, 'write'):
             return self.dump_stream(oeb, path_or_stream)
         with open(path_or_stream, 'w+b') as stream:
@@ -346,12 +353,14 @@ class MobiWriter(object):
             bt = 0x002
             if self.primary_index_record_idx is not None:
-                if self.indexer.is_flat_periodical:
+                if False and self.indexer.is_flat_periodical:
+                    # Disabled as setting this to 0x102 causes the Kindle to not
+                    # auto archive the issues
                     bt = 0x102
                 elif self.indexer.is_periodical:
                     # If you change this, remember to change the cdetype in the EXTH
                     # header as well
-                    bt = 0x103
+                    bt = {'newspaper':0x101}.get(self.publication_type, 0x103)

         record0.write(pack(b'>IIIII',
             0xe8, bt, 65001, uid, 6))
@@ -520,20 +529,22 @@ class MobiWriter(object):
         if isinstance(uuid, unicode):
             uuid = uuid.encode('utf-8')
-        exth.write(pack(b'>II', 113, len(uuid) + 8))
-        exth.write(uuid)
-        nrecs += 1
+        if not self.opts.share_not_sync:
+            exth.write(pack(b'>II', 113, len(uuid) + 8))
+            exth.write(uuid)
+            nrecs += 1

         # Write cdetype
-        if self.is_periodical:
-            # If you set the book type header field to 0x101 use NWPR here if
-            # you use 0x103 use MAGZ
-            data = b'MAGZ'
-        else:
-            data = b'EBOK'
-        exth.write(pack(b'>II', 501, len(data)+8))
-        exth.write(data)
-        nrecs += 1
+        if not self.is_periodical:
+            exth.write(pack(b'>II', 501, 12))
+            exth.write(b'EBOK')
+            nrecs += 1
+        else:
+            # Should be b'NWPR' for doc type of 0x101 and b'MAGZ' for doctype
+            # of 0x103 but the old writer didn't write them, and I don't know
+            # what it should be for type 0x102 (b'BLOG'?) so write nothing
+            # instead
+            pass

         # Add a publication date entry
         if oeb.metadata['date']:
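Editor's note: the publication_type parsing added to __call__ keys the book-type byte off the second colon-separated field of the OPF publication type. Traced by hand (the sample value is hypothetical):

    # Sample OPF publication-type string; the writer keeps the second field.
    value = 'periodical:newspaper:USA Today'
    pt = None
    x = value.split(':')
    if len(x) > 1:
        pt = x[1].lower()
    bt = {'newspaper': 0x101}.get(pt, 0x103)
    print(pt, hex(bt))  # newspaper 0x101; anything else falls back to 0x103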

View File

@@ -160,7 +160,9 @@ class Serializer(object):
             buf.write(b'title="')
             self.serialize_text(ref.title, quot=True)
             buf.write(b'" ')
-            if ref.title == 'start':
+            if (ref.title.lower() == 'start' or
+                    (ref.type and ref.type.lower() in ('start',
+                        'other.start'))):
                 self._start_href = ref.href
             self.serialize_href(ref.href)
             # Space required or won't work, I kid you not
@@ -348,8 +350,9 @@ class Serializer(object):
         '''
         buf = self.buf
         id_offsets = self.id_offsets
+        start_href = getattr(self, '_start_href', None)
         for href, hoffs in self.href_offsets.items():
-            is_start = (href and href == getattr(self, '_start_href', None))
+            is_start = (href and href == start_href)
             # Iterate over all filepos items
             if href not in id_offsets:
                 self.logger.warn('Hyperlink target %r not found' % href)

View File

@@ -320,9 +320,11 @@ class CSSFlattener(object):
             if self.context.insert_blank_line:
                 cssdict['margin-top'] = cssdict['margin-bottom'] = \
                         '%fem'%self.context.insert_blank_line_size
-            if (self.context.remove_paragraph_spacing and
+            indent_size = self.context.remove_paragraph_spacing_indent_size
+            keep_indents = indent_size == 0.0 and not self.context.insert_blank_line
+            if (self.context.remove_paragraph_spacing and not keep_indents and
                     cssdict.get('text-align', None) not in ('center', 'right')):
-                cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size
+                cssdict['text-indent'] = "%1.1fem" % indent_size

         if cssdict:
             items = cssdict.items()

View File

@@ -53,7 +53,7 @@ def pdftohtml(output_dir, pdf_path, no_images):
         p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                 stdin=subprocess.PIPE)
     except OSError as err:
-        if err.errno == 2:
+        if err.errno == errno.ENOENT:
             raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
         else:
             raise
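Note: errno.ENOENT documents the intent of the old magic number 2 (no such file or directory). The same pattern in isolation (the command name is illustrative):

    import errno, subprocess

    try:
        subprocess.Popen(['pdftohtml', '--version'])
    except OSError as err:
        if err.errno == errno.ENOENT:   # executable not found on PATH
            print('pdftohtml is not installed')
        else:
            raise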

View File

@@ -11,6 +11,7 @@ Write content to PDF.
 import os
 import shutil

+from calibre import isosx
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.ebooks.pdf.pageoptions import unit, paper_size, \
     orientation
@@ -164,6 +165,12 @@ class PDFWriter(QObject): # {{{
         self.logger.debug('\tRendering item %s as %i.pdf' % (os.path.basename(str(self.view.url().toLocalFile())), len(self.combine_queue)))
         printer = get_pdf_printer(self.opts)
         printer.setOutputFileName(item_path)
+        # We have to set the engine to Native on OS X after the call to set
+        # filename. Setting a filename with .pdf as the extension causes
+        # Qt to set the format to use Qt's PDF engine even if native was
+        # previously set on the printer.
+        if isosx:
+            printer.setOutputFormat(QPrinter.NativeFormat)
         self.view.print_(printer)
         printer.abort()
         self._render_book()
@@ -179,6 +186,8 @@ class PDFWriter(QObject): # {{{
         item_path = os.path.join(self.tmp_path, 'cover.pdf')
         printer = get_pdf_printer(self.opts)
         printer.setOutputFileName(item_path)
+        if isosx:
+            printer.setOutputFormat(QPrinter.NativeFormat)
         self.combine_queue.insert(0, item_path)
         p = QPixmap()
         p.loadFromData(self.cover_data)
@@ -229,6 +238,8 @@ class ImagePDFWriter(object):
     def render_images(self, outpath, mi, items):
         printer = get_pdf_printer(self.opts, for_comic=True)
         printer.setOutputFileName(outpath)
+        if isosx:
+            printer.setOutputFormat(QPrinter.NativeFormat)
         printer.setDocName(mi.title)
         printer.setCreator(u'%s [%s]'%(__appname__, __version__))
         # Seems to be no way to set author
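Note: the order of the two calls is what matters: QPrinter.setOutputFileName() with a '.pdf' suffix implicitly switches the printer to Qt's PDF engine, so the native format must be re-asserted afterwards. A minimal PyQt4 sketch of the workaround (needs a QApplication; the platform test stands in for calibre's isosx):

    import sys
    from PyQt4.Qt import QApplication, QPrinter

    app = QApplication(sys.argv)
    printer = QPrinter()
    printer.setOutputFileName('out.pdf')  # implicitly selects QPrinter.PdfFormat
    if sys.platform == 'darwin':
        printer.setOutputFormat(QPrinter.NativeFormat)  # restore the native engine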

View File

@@ -1,3 +1,8 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
 import re, sys
 from collections import defaultdict
@@ -72,10 +77,15 @@ class Document:
             self.options[k] = v
         self.html = None
         self.log = log
+        self.keep_elements = set()

     def _html(self, force=False):
         if force or self.html is None:
             self.html = self._parse(self.input)
+            path = self.options['keep_elements']
+            if path is not None:
+                self.keep_elements = set(self.html.xpath(path))

         return self.html

     def _parse(self, input):
@@ -152,8 +162,9 @@ class Document:
             append = False
             if sibling is best_elem:
                 append = True
-            sibling_key = sibling #HashableElement(sibling)
-            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+            if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
+                append = True
+            if sibling in self.keep_elements:
                 append = True

             if sibling.tag == "p":
@@ -283,6 +294,8 @@ class Document:
     def remove_unlikely_candidates(self):
         for elem in self.html.iter():
+            if elem in self.keep_elements:
+                continue
             s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
             #self.debug(s)
             if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
@@ -337,7 +350,7 @@ class Document:
         allowed = {}
         # Conditionally clean <table>s, <ul>s, and <div>s
         for el in self.reverse_tags(node, "table", "ul", "div"):
-            if el in allowed:
+            if el in allowed or el in self.keep_elements:
                 continue
             weight = self.class_weight(el)
             if el in candidates:
@@ -450,64 +463,39 @@ class Document:
                 #self.debug("pname %s pweight %.3f" %(pname, pweight))
                 el.drop_tree()

-        for el in ([node] + [n for n in node.iter()]):
-            if not (self.options['attributes']):
-                #el.attrib = {} #FIXME:Checkout the effects of disabling this
-                pass
-
         return clean_attributes(tounicode(node))

+def option_parser():
+    from calibre.utils.config import OptionParser
+    parser = OptionParser(usage='%prog: [options] file')
+    parser.add_option('-v', '--verbose', default=False, action='store_true',
+            dest='verbose',
+            help='Show detailed output information. Useful for debugging')
+    parser.add_option('-k', '--keep-elements', default=None, action='store',
+            dest='keep_elements',
+            help='XPath specifying elements that should not be removed')
+
+    return parser

-class HashableElement():
-    def __init__(self, node):
-        self.node = node
-        self._path = None
-
-    def _get_path(self):
-        if self._path is None:
-            reverse_path = []
-            node = self.node
-            while node is not None:
-                node_id = (node.tag, tuple(node.attrib.items()), node.text)
-                reverse_path.append(node_id)
-                node = node.getparent()
-            self._path = tuple(reverse_path)
-        return self._path
-    path = property(_get_path)
-
-    def __hash__(self):
-        return hash(self.path)
-    def __eq__(self, other):
-        return self.path == other.path
-    def __getattr__(self, tag):
-        return getattr(self.node, tag)

 def main():
-    import logging
-    from optparse import OptionParser
-    parser = OptionParser(usage="%prog: [options] [file]")
-    parser.add_option('-v', '--verbose', action='store_true')
-    parser.add_option('-u', '--url', help="use URL instead of a local file")
-    (options, args) = parser.parse_args()
+    from calibre.utils.logging import default_log
+    parser = option_parser()
+    options, args = parser.parse_args()

-    if not (len(args) == 1 or options.url):
+    if len(args) != 1:
         parser.print_help()
-        sys.exit(1)
-    logging.basicConfig(level=logging.INFO)
+        raise SystemExit(1)
+    with open(args[0], 'rb') as f:
+        raw = f.read()

-    file = None
-    if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
-    else:
-        file = open(args[0], 'rt')
     enc = sys.__stdout__.encoding or 'utf-8'
-    try:
-        print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
-    finally:
-        file.close()
+    if options.verbose:
+        default_log.filter_level = default_log.DEBUG
+    print (Document(raw, default_log,
+            debug=options.verbose,
+            keep_elements=options.keep_elements).summary().encode(enc,
+                'replace'))

 if __name__ == '__main__':
     main()
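Note: the new keep_elements option takes an XPath expression whose matches survive readability's pruning passes. A usage sketch (assuming the module lives at calibre.ebooks.readability.readability in the calibre tree; the file name and XPath are illustrative):

    from calibre.utils.logging import default_log
    from calibre.ebooks.readability.readability import Document

    raw = open('article.html', 'rb').read()
    doc = Document(raw, default_log,
            keep_elements='//div[@id="comments"]')  # never strip this subtree
    print(doc.summary())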

View File

@@ -142,7 +142,7 @@ def _config(): # {{{
     c.add_opt('upload_news_to_device', default=True,
               help=_('Upload downloaded news to device'))
     c.add_opt('delete_news_from_library_on_upload', default=False,
-              help=_('Delete books from library after uploading to device'))
+              help=_('Delete news books from library after uploading to device'))
     c.add_opt('separate_cover_flow', default=False,
               help=_('Show the cover flow in a separate window instead of in the main calibre window'))
     c.add_opt('disable_tray_notification', default=False,

View File

@@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
 import os
 from functools import partial
 from threading import Thread
+from contextlib import closing

 from PyQt4.Qt import QToolButton
@@ -52,7 +53,13 @@ class Worker(Thread): # {{{
     def doit(self):
         from calibre.library.database2 import LibraryDatabase2
-        newdb = LibraryDatabase2(self.loc)
+        newdb = LibraryDatabase2(self.loc, is_second_db=True)
+        with closing(newdb):
+            self._doit(newdb)
+        newdb.break_cycles()
+        del newdb
+
+    def _doit(self, newdb):
         for i, x in enumerate(self.ids):
             mi = self.db.get_metadata(x, index_is_id=True, get_cover=True,
                     cover_as_data=True)
@@ -111,6 +118,7 @@ class Worker(Thread): # {{{
                 os.remove(path)
             except:
                 pass
+
 # }}}

 class CopyToLibraryAction(InterfaceAction):
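Note: contextlib.closing() guarantees newdb.close() runs even if _doit() raises, without LibraryDatabase2 having to implement the context manager protocol itself. The idiom in isolation:

    from contextlib import closing

    class Resource(object):
        def close(self):
            print('closed')

    with closing(Resource()) as r:  # r.close() is called on exit, even on error
        pass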

View File

@@ -23,7 +23,7 @@ class PluginWidget(Widget, Ui_Form):
         Widget.__init__(self, parent,
                 ['prefer_author_sort', 'rescale_images', 'toc_title',
                     'mobi_ignore_margins', 'mobi_toc_at_start',
-                    'dont_compress', 'no_inline_toc',
+                    'dont_compress', 'no_inline_toc', 'share_not_sync',
                     'personal_doc']#, 'mobi_navpoints_only_deepest']
                 )
         self.db, self.book_id = db, book_id

View File

@@ -75,6 +75,13 @@
       </item>
      </layout>
     </item>
+    <item>
+     <widget class="QCheckBox" name="opt_share_not_sync">
+      <property name="text">
+       <string>Enable sharing of book content via Facebook, etc. WARNING: Disables last read syncing</string>
+      </property>
+     </widget>
+    </item>
     <item>
      <spacer name="verticalSpacer">
       <property name="orientation">

View File

@@ -266,7 +266,7 @@ class JobManager(QAbstractTableModel): # {{{
     def kill_multiple_jobs(self, rows, view):
         jobs = [self.jobs[row] for row in rows]
-        devjobs = [j for j in jobs is isinstance(j, DeviceJob)]
+        devjobs = [j for j in jobs if isinstance(j, DeviceJob)]
         if devjobs:
             error_dialog(view, _('Cannot kill job'),
                     _('Cannot kill jobs that communicate with the device')).exec_()

View File

@@ -443,7 +443,13 @@ class Editor(QFrame): # {{{
             return QWidget.keyPressEvent(self, ev)
         button = getattr(self, 'button%d'%which)
         button.setStyleSheet('QPushButton { font-weight: normal}')
-        sequence = QKeySequence(code|(int(ev.modifiers())&~Qt.KeypadModifier))
+        mods = int(ev.modifiers()) & ~Qt.KeypadModifier
+        txt = unicode(ev.text())
+        if txt and txt.lower() == txt.upper():
+            # We have a symbol like ! or > etc. In this case the value of code
+            # already includes Shift, so remove it
+            mods &= ~Qt.ShiftModifier
+        sequence = QKeySequence(code|mods)
         button.setText(sequence.toString(QKeySequence.NativeText))
         self.capture = 0
         dup_desc = self.dup_check(sequence)
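Note: txt.lower() == txt.upper() is a compact test for caseless characters, i.e. symbols such as '!' or '>', for which Qt already folds Shift into the key code. Standalone:

    for ch in u'a', u'A', u'!', u'>':
        is_symbol = ch.lower() == ch.upper()  # True only for caseless characters
        print('%s %s' % (ch, is_symbol))      # a False, A False, ! True, > True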

View File

@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import textwrap, re, os
+import textwrap, re, os, errno

 from PyQt4.Qt import (Qt, QDateEdit, QDate, pyqtSignal, QMessageBox,
     QIcon, QToolButton, QWidget, QLabel, QGridLayout, QApplication,
@@ -98,7 +98,7 @@ class TitleEdit(EnLineEdit):
             getattr(db, 'set_'+ self.TITLE_ATTR)(id_, title, notify=False,
                     commit=False)
         except (IOError, OSError) as err:
-            if getattr(err, 'errno', -1) == 13: # Permission denied
+            if getattr(err, 'errno', -1) == errno.EACCES: # Permission denied
                 import traceback
                 fname = err.filename if err.filename else 'file'
                 error_dialog(self, _('Permission denied'),
@@ -262,7 +262,7 @@ class AuthorsEdit(MultiCompleteComboBox):
             self.books_to_refresh |= db.set_authors(id_, authors, notify=False,
                     allow_case_change=True)
         except (IOError, OSError) as err:
-            if getattr(err, 'errno', -1) == 13: # Permission denied
+            if getattr(err, 'errno', -1) == errno.EACCES: # Permission denied
                 import traceback
                 fname = err.filename if err.filename else 'file'
                 error_dialog(self, _('Permission denied'),
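Note: using the symbolic constant also enables reverse lookups while debugging:

    import errno

    print(errno.EACCES)                   # 13 on POSIX systems
    print(errno.errorcode[errno.EACCES])  # 'EACCES'
    print(errno.errorcode[errno.ENOENT])  # 'ENOENT'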

View File

@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os
+import os, errno
 from functools import partial

 from PyQt4.Qt import (Qt, QVBoxLayout, QHBoxLayout, QWidget, QPushButton,
@@ -427,7 +427,7 @@ class MetadataSingleDialogBase(ResizableDialog):
             self.books_to_refresh |= getattr(widget, 'books_to_refresh',
                     set([]))
         except IOError as err:
-            if err.errno == 13: # Permission denied
+            if err.errno == errno.EACCES: # Permission denied
                 import traceback
                 fname = err.filename if err.filename else 'file'
                 error_dialog(self, _('Permission denied'),

View File

@@ -80,13 +80,15 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
                 doc = html.fromstring(f.read())

                 # example where we are going to find formats
-                # <div class="box">
-                #   ...
-                #   <b>Доступные&nbsp;форматы:</b>
-                #   <div class="vertpadd">.epub, .fb2, .pdf, .pdf, .txt</div>
-                #   ...
-                # </div>
-                xpt = u'normalize-space(//div[@class="box"]//*[contains(normalize-space(text()), "Доступные форматы:")][1]/following-sibling::div[1]/text())'
+                # <div class="l">
+                #   <p>
+                #     Доступно:
+                #   </p>
+                # </div>
+                # <div class="l">
+                #   <p>.epub, .fb2.zip, .pdf</p>
+                # </div>
+                xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
                 formats = doc.xpath(xpt)
                 if formats:
                     result = True
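Note: an XPath wrapped in normalize-space() returns a single whitespace-collapsed string rather than a node list, which is why the result can be tested for truthiness directly. A minimal lxml sketch (the markup is illustrative):

    from lxml import html

    doc = html.fromstring('<div class="l"><p> .epub,  .fb2.zip </p></div>')
    formats = doc.xpath(u'normalize-space(//div[@class="l"]/p)')
    print(formats)  # u'.epub, .fb2.zip' -- an empty string if nothing matched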

View File

@@ -161,7 +161,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         return path and os.path.exists(os.path.join(path, 'metadata.db'))

     def __init__(self, library_path, row_factory=False, default_prefs=None,
-            read_only=False):
+            read_only=False, is_second_db=False):
+        self.is_second_db = is_second_db
         try:
             if isbytestring(library_path):
                 library_path = library_path.decode(filesystem_encoding)
@@ -263,6 +264,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         migrate_preference('user_categories', {})
         migrate_preference('saved_searches', {})
+        if not self.is_second_db:
             set_saved_searches(self, 'saved_searches')

         # migrate grouped_search_terms

View File

@@ -34,7 +34,7 @@ class DispatchController(object): # {{{
     def __init__(self, prefix, wsgi=False):
         self.dispatcher = cherrypy.dispatch.RoutesDispatcher()
         self.funcs = []
-        self.seen = set([])
+        self.seen = set()
         self.prefix = prefix if prefix else ''
         if wsgi:
             self.prefix = ''
@@ -146,6 +146,11 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
         self.config = {}
         self.is_running = False
         self.exception = None
+        #self.config['/'] = {
+        #    'tools.sessions.on' : True,
+        #    'tools.sessions.timeout': 60, # Session times out after 60 minutes
+        #}
         if not wsgi:
             self.setup_loggers()
             cherrypy.engine.bonjour.subscribe()
@@ -154,6 +159,7 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
             'tools.gzip.mime_types': ['text/html', 'text/plain',
                 'text/xml', 'text/javascript', 'text/css'],
         }
+
         if opts.password:
             self.config['/'] = {
                 'tools.digest_auth.on'    : True,

View File

@@ -202,7 +202,7 @@ class ContentServer(object):
                     mode='rb')
         if fmt is None:
             raise cherrypy.HTTPError(404, 'book: %d does not have format: %s'%(id, format))
-        mi = self.db.get_metadata(id, index_is_id=True)
+        mi = newmi = self.db.get_metadata(id, index_is_id=True)

         if format == 'EPUB':
             # Get the original metadata
@@ -214,9 +214,8 @@ class ContentServer(object):
             # Transform the metadata via the plugboard
             newmi = mi.deepcopy_metadata()
             newmi.template_to_attribute(mi, cpb)
-        else:
-            newmi = mi

+        if format in ('MOBI', 'EPUB'):
             # Write the updated file
             from calibre.ebooks.metadata.meta import set_metadata
             set_metadata(fmt, newmi, 'epub')

View File

@@ -277,12 +277,15 @@ class MobileServer(object):
         cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
         cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
         url_base = "/mobile?search=" + search+";order="+order+";sort="+sort+";num="+str(num)

-        return html.tostring(build_index(books, num, search, sort, order,
-                             start, len(ids), url_base, CKEYS,
-                             self.opts.url_prefix),
-                             encoding='utf-8', include_meta_content_type=True,
-                             pretty_print=True)
+        raw = html.tostring(build_index(books, num, search, sort, order,
+                             start, len(ids), url_base, CKEYS,
+                             self.opts.url_prefix),
+                             encoding='utf-8',
+                             pretty_print=True)
+        # tostring's include_meta_content_type is broken
+        raw = raw.replace('<head>', '<head>\n'
+            '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
+        return raw
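Note: since lxml's include_meta_content_type could not be relied on here, the meta tag is spliced in with a plain string replace on the serialized markup. The same trick in isolation:

    raw = '<html><head><title>t</title></head><body></body></html>'
    raw = raw.replace('<head>', '<head>\n'
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
    print(raw)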

View File

@@ -28,6 +28,10 @@ class Browser(B):
         B.set_cookiejar(self, *args, **kwargs)
         self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)

+    @property
+    def cookiejar(self):
+        return self._clone_actions['set_cookiejar'][1][0]
+
     def set_handle_redirect(self, *args, **kwargs):
         B.set_handle_redirect(self, *args, **kwargs)
         self._clone_actions['set_handle_redirect'] = ('set_handle_redirect',

View File

@@ -125,6 +125,7 @@ _extra_lang_codes = {
         'en_HR' : _('English (Croatia)'),
         'en_ID' : _('English (Indonesia)'),
         'en_IL' : _('English (Israel)'),
+        'en_RU' : _('English (Russia)'),
         'en_SG' : _('English (Singapore)'),
         'en_YE' : _('English (Yemen)'),
         'en_IE' : _('English (Ireland)'),

View File

@@ -1,6 +1,6 @@
 # module pyparsing.py
 #
-# Copyright (c) 2003-2010  Paul T. McGuire
+# Copyright (c) 2003-2011  Paul T. McGuire
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -58,8 +58,8 @@ The pyparsing module handles some of the problems that are typically vexing when
  - embedded comments
 """

-__version__ = "1.5.5"
-__versionTime__ = "12 Aug 2010 03:56"
+__version__ = "1.5.6"
+__versionTime__ = "26 June 2011 10:53"
 __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"

 import string
@@ -101,11 +101,12 @@ if _PY3K:
     basestring = str
     unichr = chr
     _ustr = str
-    _str2dict = set
     alphas = string.ascii_lowercase + string.ascii_uppercase
 else:
     _MAX_INT = sys.maxint
     range = xrange
+    set = lambda s : dict( [(c,0) for c in s] )
+    alphas = string.lowercase + string.uppercase

     def _ustr(obj):
         """Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries
@@ -134,9 +135,6 @@ else:
         #return unicode(obj).encode(sys.getdefaultencoding(), 'replace')
         # ...

-    def _str2dict(strg):
-        return dict( [(c,0) for c in strg] )
-
     alphas = string.lowercase + string.uppercase

 # build list of single arg builtins, tolerant of Python version, that can be used as parse actions
@@ -606,10 +604,10 @@ class ParseResults(object):
     def __setstate__(self,state):
         self.__toklist = state[0]
-        self.__tokdict, \
-        par, \
-        inAccumNames, \
-        self.__name = state[1]
+        (self.__tokdict,
+        par,
+        inAccumNames,
+        self.__name) = state[1]
         self.__accumNames = {}
         self.__accumNames.update(inAccumNames)
         if par is not None:
@@ -667,6 +665,35 @@ def nullDebugAction(*args):
     """'Do-nothing' debug action, to suppress debugging output during parsing."""
     pass

+'decorator to trim function calls to match the arity of the target'
+if not _PY3K:
+    def _trim_arity(func, maxargs=2):
+        limit = [0]
+        def wrapper(*args):
+            while 1:
+                try:
+                    return func(*args[limit[0]:])
+                except TypeError:
+                    if limit[0] <= maxargs:
+                        limit[0] += 1
+                        continue
+                    raise
+        return wrapper
+else:
+    def _trim_arity(func, maxargs=2):
+        limit = maxargs
+        def wrapper(*args):
+            #~ nonlocal limit
+            while 1:
+                try:
+                    return func(*args[limit:])
+                except TypeError:
+                    if limit:
+                        limit -= 1
+                        continue
+                    raise
+        return wrapper
+
 class ParserElement(object):
     """Abstract base level parser element class."""
     DEFAULT_WHITE_CHARS = " \n\t\r"
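Note: _trim_arity wraps a parse action so pyparsing can always invoke it as fn(s, loc, toks); on a TypeError it retries with progressively fewer leading arguments until the call fits. The Python 2 branch above, exercised standalone (a sketch, not part of the diff):

    def trim_arity(func, maxargs=2):
        limit = [0]
        def wrapper(*args):
            while 1:
                try:
                    return func(*args[limit[0]:])  # drop leading args until the call fits
                except TypeError:
                    if limit[0] <= maxargs:
                        limit[0] += 1
                        continue
                    raise
        return wrapper

    action = trim_arity(lambda toks: toks)  # a one-argument parse action
    print(action('s', 0, ['tok']))          # called as f(s, loc, toks) -> ['tok']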
@@ -731,6 +758,9 @@ class ParserElement(object):
            see L{I{__call__}<__call__>}.
         """
         newself = self.copy()
+        if name.endswith("*"):
+            name = name[:-1]
+            listAllMatches=True
         newself.resultsName = name
         newself.modalResults = not listAllMatches
         return newself
@@ -753,104 +783,6 @@ class ParserElement(object):
             self._parse = self._parse._originalParseMethod
         return self

-    def _normalizeParseActionArgs( f ):
-        """Internal method used to decorate parse actions that take fewer than 3 arguments,
-           so that all parse actions can be called as C{f(s,l,t)}."""
-        STAR_ARGS = 4
-
-        # special handling for single-argument builtins
-        if (f in singleArgBuiltins):
-            numargs = 1
-        else:
-            try:
-                restore = None
-                if isinstance(f,type):
-                    restore = f
-                    f = f.__init__
-                if not _PY3K:
-                    codeObj = f.func_code
-                else:
-                    codeObj = f.code
-                if codeObj.co_flags & STAR_ARGS:
-                    return f
-                numargs = codeObj.co_argcount
-                if not _PY3K:
-                    if hasattr(f,"im_self"):
-                        numargs -= 1
-                else:
-                    if hasattr(f,"__self__"):
-                        numargs -= 1
-                if restore:
-                    f = restore
-            except AttributeError:
-                try:
-                    if not _PY3K:
-                        call_im_func_code = f.__call__.im_func.func_code
-                    else:
-                        call_im_func_code = f.__code__
-
-                    # not a function, must be a callable object, get info from the
-                    # im_func binding of its bound __call__ method
-                    if call_im_func_code.co_flags & STAR_ARGS:
-                        return f
-                    numargs = call_im_func_code.co_argcount
-                    if not _PY3K:
-                        if hasattr(f.__call__,"im_self"):
-                            numargs -= 1
-                    else:
-                        if hasattr(f.__call__,"__self__"):
-                            numargs -= 0
-                except AttributeError:
-                    if not _PY3K:
-                        call_func_code = f.__call__.func_code
-                    else:
-                        call_func_code = f.__call__.__code__
-                    # not a bound method, get info directly from __call__ method
-                    if call_func_code.co_flags & STAR_ARGS:
-                        return f
-                    numargs = call_func_code.co_argcount
-                    if not _PY3K:
-                        if hasattr(f.__call__,"im_self"):
-                            numargs -= 1
-                    else:
-                        if hasattr(f.__call__,"__self__"):
-                            numargs -= 1
-
-        #~ print ("adding function %s with %d args" % (f.func_name,numargs))
-        if numargs == 3:
-            return f
-        else:
-            if numargs > 3:
-                def tmp(s,l,t):
-                    return f(f.__call__.__self__, s,l,t)
-            if numargs == 2:
-                def tmp(s,l,t):
-                    return f(l,t)
-            elif numargs == 1:
-                def tmp(s,l,t):
-                    return f(t)
-            else: #~ numargs == 0:
-                def tmp(s,l,t):
-                    return f()
-            try:
-                tmp.__name__ = f.__name__
-            except (AttributeError,TypeError):
-                # no need for special handling if attribute doesnt exist
-                pass
-            try:
-                tmp.__doc__ = f.__doc__
-            except (AttributeError,TypeError):
-                # no need for special handling if attribute doesnt exist
-                pass
-            try:
-                tmp.__dict__.update(f.__dict__)
-            except (AttributeError,TypeError):
-                # no need for special handling if attribute doesnt exist
-                pass
-            return tmp
-    _normalizeParseActionArgs = staticmethod(_normalizeParseActionArgs)
     def setParseAction( self, *fns, **kwargs ):
         """Define action to perform when successfully matching parse element definition.
            Parse action fn is a callable method with 0-3 arguments, called as C{fn(s,loc,toks)},
@@ -868,13 +800,13 @@ class ParserElement(object):
            consistent view of the parsed string, the parse location, and line and column
            positions within the parsed string.
         """
-        self.parseAction = list(map(self._normalizeParseActionArgs, list(fns)))
+        self.parseAction = list(map(_trim_arity, list(fns)))
         self.callDuringTry = ("callDuringTry" in kwargs and kwargs["callDuringTry"])
         return self

     def addParseAction( self, *fns, **kwargs ):
         """Add parse action to expression's list of parse actions. See L{I{setParseAction}<setParseAction>}."""
-        self.parseAction += list(map(self._normalizeParseActionArgs, list(fns)))
+        self.parseAction += list(map(_trim_arity, list(fns)))
         self.callDuringTry = self.callDuringTry or ("callDuringTry" in kwargs and kwargs["callDuringTry"])
         return self
@@ -1012,9 +944,9 @@ class ParserElement(object):
             lookup = (self,instring,loc,callPreParse,doActions)
             if lookup in ParserElement._exprArgCache:
                 value = ParserElement._exprArgCache[ lookup ]
-                if isinstance(value,Exception):
+                if isinstance(value, Exception):
                     raise value
-                return value
+                return (value[0],value[1].copy())
             else:
                 try:
                     value = self._parseNoCache( instring, loc, doActions, callPreParse )
@@ -1088,8 +1020,8 @@ class ParserElement(object):
         try:
             loc, tokens = self._parse( instring, 0 )
             if parseAll:
-                #loc = self.preParse( instring, loc )
-                se = StringEnd()
+                loc = self.preParse( instring, loc )
+                se = Empty() + StringEnd()
                 se._parse( instring, loc )
         except ParseBaseException:
             if ParserElement.verbose_stacktrace:
@@ -1101,10 +1033,11 @@ class ParserElement(object):
         else:
             return tokens

-    def scanString( self, instring, maxMatches=_MAX_INT ):
+    def scanString( self, instring, maxMatches=_MAX_INT, overlap=False ):
         """Scan the input string for expression matches.  Each match will return the
            matching tokens, start location, and end location.  May be called with optional
-           C{maxMatches} argument, to clip scanning after 'n' matches are found.
+           C{maxMatches} argument, to clip scanning after 'n' matches are found.  If
+           C{overlap} is specified, then overlapping matches will be reported.

            Note that the start and end locations are reported relative to the string
            being parsed.  See L{I{parseString}<parseString>} for more information on parsing
@@ -1133,6 +1066,13 @@ class ParserElement(object):
                     if nextLoc > loc:
                         matches += 1
                         yield tokens, preloc, nextLoc
+                        if overlap:
+                            nextloc = preparseFn( instring, loc )
+                            if nextloc > loc:
+                                loc = nextLoc
+                            else:
+                                loc += 1
+                        else:
                             loc = nextLoc
                     else:
                         loc = preloc+1
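Note: overlap=True re-scans from one character past the previous match start instead of jumping to the match end. A sketch of the difference (requires pyparsing 1.5.6 or later):

    from pyparsing import Word, alphas

    pair = Word(alphas, exact=2)
    print([t[0] for t, s, e in pair.scanString('abcd')])                # ['ab', 'cd']
    print([t[0] for t, s, e in pair.scanString('abcd', overlap=True)])  # ['ab', 'bc', 'cd']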
@@ -1168,6 +1108,7 @@ class ParserElement(object):
                     out.append(t)
                 lastE = e
             out.append(instring[lastE:])
+            out = [o for o in out if o]
             return "".join(map(_ustr,_flatten(out)))
         except ParseBaseException:
             if ParserElement.verbose_stacktrace:
@@ -1372,6 +1313,9 @@ class ParserElement(object):
              userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno")
            could be written as::
              userdata = Word(alphas)("name") + Word(nums+"-")("socsecno")
+
+           If C{name} is given with a trailing C{'*'} character, then C{listAllMatches} will be
+           passed as C{True}.
           """
        return self.setResultsName(name)
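Note: with this change a results name ending in '*' is shorthand for listAllMatches=True. A quick example (requires pyparsing 1.5.6 or later):

    from pyparsing import OneOrMore, Word, alphas

    token = Word(alphas)("words*")   # same as .setResultsName("words", listAllMatches=True)
    result = OneOrMore(token).parseString("a b c")
    print(result.words)              # -> ['a', 'b', 'c']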
@@ -1398,9 +1342,9 @@ class ParserElement(object):
         return self

     def parseWithTabs( self ):
-        """Overrides default behavior to expand <TAB>s to spaces before parsing the input string.
+        """Overrides default behavior to expand C{<TAB>}s to spaces before parsing the input string.
            Must be called before C{parseString} when the input grammar contains elements that
-           match <TAB> characters."""
+           match C{<TAB>} characters."""
         self.keepTabs = True
         return self
@@ -1508,12 +1452,10 @@ class Token(ParserElement):
     """Abstract C{ParserElement} subclass, for defining atomic matching patterns."""
     def __init__( self ):
         super(Token,self).__init__( savelist=False )
-        #self.myException = ParseException("",0,"",self)

     def setName(self, name):
         s = super(Token,self).setName(name)
         self.errmsg = "Expected " + self.name
-        #s.myException.msg = self.errmsg
         return s
@@ -1534,7 +1476,6 @@ class NoMatch(Token):
         self.mayReturnEmpty = True
         self.mayIndexError = False
         self.errmsg = "Unmatchable token"
-        #self.myException.msg = self.errmsg

     def parseImpl( self, instring, loc, doActions=True ):
         exc = self.myException
@@ -1558,7 +1499,6 @@ class Literal(Token):
         self.name = '"%s"' % _ustr(self.match)
         self.errmsg = "Expected " + self.name
         self.mayReturnEmpty = False
-        #self.myException.msg = self.errmsg
         self.mayIndexError = False

     # Performance tuning: this routine gets called a *lot*
@@ -1579,12 +1519,12 @@ _L = Literal
 class Keyword(Token):
     """Token to exactly match a specified string as a keyword, that is, it must be
        immediately followed by a non-keyword character.  Compare with C{Literal}::
-         Literal("if") will match the leading 'if' in 'ifAndOnlyIf'.
-         Keyword("if") will not; it will only match the leading 'if in 'if x=1', or 'if(y==2)'
+         Literal("if") will match the leading C{'if'} in C{'ifAndOnlyIf'}.
+         Keyword("if") will not; it will only match the leading C{'if'} in C{'if x=1'}, or C{'if(y==2)'}
        Accepts two optional constructor arguments in addition to the keyword string:
        C{identChars} is a string of characters that would be valid identifier characters,
-       defaulting to all alphanumerics + "_" and "$"; C{caseless} allows case-insensitive
-       matching, default is False.
+       defaulting to all alphanumerics + "_" and "$"; C{caseless} allows case-insensitive
+       matching, default is C{False}.
     """
     DEFAULT_KEYWORD_CHARS = alphanums+"_$"
@@ -1600,13 +1540,12 @@ class Keyword(Token):
         self.name = '"%s"' % self.match
         self.errmsg = "Expected " + self.name
         self.mayReturnEmpty = False
-        #self.myException.msg = self.errmsg
         self.mayIndexError = False
         self.caseless = caseless
         if caseless:
             self.caselessmatch = matchString.upper()
             identChars = identChars.upper()
-        self.identChars = _str2dict(identChars)
+        self.identChars = set(identChars)

     def parseImpl( self, instring, loc, doActions=True ):
         if self.caseless:
if self.caseless: if self.caseless:
@@ -1648,7 +1587,6 @@ class CaselessLiteral(Literal):
         self.returnString = matchString
         self.name = "'%s'" % self.returnString
         self.errmsg = "Expected " + self.name
-        #self.myException.msg = self.errmsg

     def parseImpl( self, instring, loc, doActions=True ):
         if instring[ loc:loc+self.matchLen ].upper() == self.match:
@@ -1680,18 +1618,25 @@ class Word(Token):
        defaults to the initial character set), and an optional minimum,
        maximum, and/or exact length.  The default value for C{min} is 1 (a
        minimum value < 1 is not valid); the default values for C{max} and C{exact}
-       are 0, meaning no maximum or exact length restriction.
+       are 0, meaning no maximum or exact length restriction. An optional
+       C{exclude} parameter can list characters that might be found in
+       the input C{bodyChars} string; useful to define a word of all printables
+       except for one or two characters, for instance.
     """
-    def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False ):
+    def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False, excludeChars=None ):
         super(Word,self).__init__()
+        if excludeChars:
+            initChars = ''.join([c for c in initChars if c not in excludeChars])
+            if bodyChars:
+                bodyChars = ''.join([c for c in bodyChars if c not in excludeChars])
         self.initCharsOrig = initChars
-        self.initChars = _str2dict(initChars)
+        self.initChars = set(initChars)
         if bodyChars :
             self.bodyCharsOrig = bodyChars
-            self.bodyChars = _str2dict(bodyChars)
+            self.bodyChars = set(bodyChars)
         else:
             self.bodyCharsOrig = initChars
-            self.bodyChars = _str2dict(initChars)
+            self.bodyChars = set(initChars)

         self.maxSpecified = max > 0
@@ -1711,7 +1656,6 @@ class Word(Token):
         self.name = _ustr(self)
         self.errmsg = "Expected " + self.name
-        #self.myException.msg = self.errmsg
         self.mayIndexError = False
         self.asKeyword = asKeyword
@@ -1743,7 +1687,7 @@ class Word(Token):
                 raise exc

             loc = result.end()
-            return loc,result.group()
+            return loc, result.group()

         if not(instring[ loc ] in self.initChars):
             #~ raise ParseException( instring, loc, self.errmsg )
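Note: excludeChars subtracts characters from the initChars/bodyChars sets at construction time, which is cheaper than spelling out a near-complete printables set by hand. For example (pyparsing 1.5.6+):

    from pyparsing import Word, printables

    csv_value = Word(printables, excludeChars=',')
    print(csv_value.parseString('abc,def'))  # -> ['abc']; the comma ends the word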
@@ -1807,7 +1751,7 @@ class Regex(Token):
     """
     compiledREtype = type(re.compile("[A-Z]"))
     def __init__( self, pattern, flags=0):
-        """The parameters pattern and flags are passed to the re.compile() function as-is. See the Python re module for an explanation of the acceptable patterns and flags."""
+        """The parameters C{pattern} and C{flags} are passed to the C{re.compile()} function as-is. See the Python C{re} module for an explanation of the acceptable patterns and flags."""
         super(Regex,self).__init__()

         if isinstance(pattern, basestring):
@@ -1837,7 +1781,6 @@ class Regex(Token):

         self.name = _ustr(self)
         self.errmsg = "Expected " + self.name
-        #self.myException.msg = self.errmsg
         self.mayIndexError = False
         self.mayReturnEmpty = True
@@ -1929,7 +1872,8 @@ class QuotedString(Token):
             self.pattern += (r'|(?:%s)' % re.escape(escQuote))
         if escChar:
             self.pattern += (r'|(?:%s.)' % re.escape(escChar))
-            self.escCharReplacePattern = re.escape(self.escChar)+"(.)"
+            charset = ''.join(set(self.quoteChar[0]+self.endQuoteChar[0])).replace('^',r'\^').replace('-',r'\-')
+            self.escCharReplacePattern = re.escape(self.escChar)+("([%s])" % charset)
         self.pattern += (r')*%s' % re.escape(self.endQuoteChar))

         try:
@@ -1942,7 +1886,6 @@ class QuotedString(Token):

         self.name = _ustr(self)
         self.errmsg = "Expected " + self.name
-        #self.myException.msg = self.errmsg
         self.mayIndexError = False
         self.mayReturnEmpty = True
@@ -2014,7 +1957,6 @@ class CharsNotIn(Token):
         self.name = _ustr(self)
         self.errmsg = "Expected " + self.name
         self.mayReturnEmpty = ( self.minLen == 0 )
-        #self.myException.msg = self.errmsg
         self.mayIndexError = False

     def parseImpl( self, instring, loc, doActions=True ):
@@ -2077,7 +2019,6 @@ class White(Token):
         self.name = ("".join([White.whiteStrs[c] for c in self.matchWhite]))
         self.mayReturnEmpty = True
         self.errmsg = "Expected " + self.name
-        #self.myException.msg = self.errmsg

         self.minLen = min
@@ -2150,7 +2091,6 @@ class LineStart(_PositionToken):
         super(LineStart,self).__init__()
         self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") )
         self.errmsg = "Expected start of line"
-        #self.myException.msg = self.errmsg

     def preParse( self, instring, loc ):
         preloc = super(LineStart,self).preParse(instring,loc)
@@ -2175,7 +2115,6 @@ class LineEnd(_PositionToken):
         super(LineEnd,self).__init__()
         self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") )
         self.errmsg = "Expected end of line"
-        #self.myException.msg = self.errmsg

     def parseImpl( self, instring, loc, doActions=True ):
         if loc<len(instring):
@@ -2200,7 +2139,6 @@ class StringStart(_PositionToken):
     def __init__( self ):
         super(StringStart,self).__init__()
         self.errmsg = "Expected start of text"
-        #self.myException.msg = self.errmsg

     def parseImpl( self, instring, loc, doActions=True ):
         if loc != 0:
@@ -2218,7 +2156,6 @@ class StringEnd(_PositionToken):
     def __init__( self ):
         super(StringEnd,self).__init__()
         self.errmsg = "Expected end of text"
-        #self.myException.msg = self.errmsg

     def parseImpl( self, instring, loc, doActions=True ):
         if loc < len(instring):
@@ -2239,14 +2176,14 @@ class StringEnd(_PositionToken):
 class WordStart(_PositionToken):
     """Matches if the current position is at the beginning of a Word, and
-       is not preceded by any character in a given set of wordChars
+       is not preceded by any character in a given set of C{wordChars}
        (default=C{printables}). To emulate the C{\b} behavior of regular expressions,
        use C{WordStart(alphanums)}. C{WordStart} will also match at the beginning of
        the string being parsed, or at the beginning of a line.
     """
     def __init__(self, wordChars = printables):
         super(WordStart,self).__init__()
-        self.wordChars = _str2dict(wordChars)
+        self.wordChars = set(wordChars)
         self.errmsg = "Not at the start of a word"

     def parseImpl(self, instring, loc, doActions=True ):
@@ -2261,14 +2198,14 @@ class WordStart(_PositionToken):
 class WordEnd(_PositionToken):
     """Matches if the current position is at the end of a Word, and
-       is not followed by any character in a given set of wordChars
+       is not followed by any character in a given set of C{wordChars}
        (default=C{printables}). To emulate the C{\b} behavior of regular expressions,
        use C{WordEnd(alphanums)}. C{WordEnd} will also match at the end of
        the string being parsed, or at the end of a line.
     """
     def __init__(self, wordChars = printables):
         super(WordEnd,self).__init__()
-        self.wordChars = _str2dict(wordChars)
+        self.wordChars = set(wordChars)
         self.skipWhitespace = False
         self.errmsg = "Not at the end of a word"
@@ -2309,7 +2246,7 @@ class ParseExpression(ParserElement):
         return self

     def leaveWhitespace( self ):
-        """Extends leaveWhitespace defined in base class, and also invokes leaveWhitespace on
+        """Extends C{leaveWhitespace} defined in base class, and also invokes C{leaveWhitespace} on
            all contained expressions."""
         self.skipWhitespace = False
         self.exprs = [ e.copy() for e in self.exprs ]
@@ -2381,10 +2318,15 @@ class ParseExpression(ParserElement):
                 e.validate(tmp)
         self.checkRecursion( [] )

+    def copy(self):
+        ret = super(ParseExpression,self).copy()
+        ret.exprs = [e.copy() for e in self.exprs]
+        return ret
+
 class And(ParseExpression):
-    """Requires all given C{ParseExpressions} to be found in the given order.
+    """Requires all given C{ParseExpression}s to be found in the given order.
        Expressions may be separated by whitespace.
-       May be constructed using the '+' operator.
+       May be constructed using the C{'+'} operator.
     """

     class _ErrorStop(Empty):
@@ -2453,7 +2395,7 @@ class And(ParseExpression):
 class Or(ParseExpression):
     """Requires that at least one C{ParseExpression} is found.
        If two expressions match, the expression that matches the longest string will be used.
-       May be constructed using the '^' operator.
+       May be constructed using the C{'^'} operator.
     """
     def __init__( self, exprs, savelist = False ):
         super(Or,self).__init__(exprs, savelist)
@@ -2515,7 +2457,7 @@ class Or(ParseExpression):
 class MatchFirst(ParseExpression):
     """Requires that at least one C{ParseExpression} is found.
        If two expressions match, the first one listed is the one that will match.
-       May be constructed using the '|' operator.
+       May be constructed using the C{'|'} operator.
     """
     def __init__( self, exprs, savelist = False ):
         super(MatchFirst,self).__init__(exprs, savelist)
@@ -2572,9 +2514,9 @@ class MatchFirst(ParseExpression):
 class Each(ParseExpression):
-    """Requires all given C{ParseExpressions} to be found, but in any order.
+    """Requires all given C{ParseExpression}s to be found, but in any order.
        Expressions may be separated by whitespace.
-       May be constructed using the '&' operator.
+       May be constructed using the C{'&'} operator.
     """
     def __init__( self, exprs, savelist = True ):
         super(Each,self).__init__(exprs, savelist)
@@ -2757,7 +2699,6 @@ class NotAny(ParseElementEnhance):
         self.skipWhitespace = False  # do NOT use self.leaveWhitespace(), don't want to propagate to exprs
         self.mayReturnEmpty = True
         self.errmsg = "Found unwanted token, "+_ustr(self.expr)
-        #self.myException = ParseException("",0,self.errmsg,self)

     def parseImpl( self, instring, loc, doActions=True ):
         try:
@@ -2916,7 +2857,6 @@ class SkipTo(ParseElementEnhance):
         else:
             self.failOn = failOn
         self.errmsg = "No match found for "+_ustr(self.expr)
-        #self.myException = ParseException("",0,self.errmsg,self)

     def parseImpl( self, instring, loc, doActions=True ):
         startLoc = loc
@@ -3040,7 +2980,7 @@ class _ForwardNoRecurse(Forward):
         return "..."

 class TokenConverter(ParseElementEnhance):
-    """Abstract subclass of ParseExpression, for converting parsed results."""
+    """Abstract subclass of C{ParseExpression}, for converting parsed results."""
     def __init__( self, expr, savelist=False ):
         super(TokenConverter,self).__init__( expr )#, savelist )
         self.saveAsList = False
@@ -3089,7 +3029,7 @@ class Combine(TokenConverter):
         return retToks

 class Group(TokenConverter):
-    """Converter to return the matched tokens as a list - useful for returning tokens of ZeroOrMore and OneOrMore expressions."""
+    """Converter to return the matched tokens as a list - useful for returning tokens of C{ZeroOrMore} and C{OneOrMore} expressions."""
     def __init__( self, expr ):
         super(Group,self).__init__( expr )
         self.saveAsList = True
@@ -3143,7 +3083,7 @@ class Suppress(TokenConverter):
 class OnlyOnce(object):
     """Wrapper for parse actions, to ensure they are only called once."""
     def __init__(self, methodCall):
-        self.callable = ParserElement._normalizeParseActionArgs(methodCall)
+        self.callable = _trim_arity(methodCall)
         self.called = False
     def __call__(self,s,l,t):
         if not self.called:
@@ -3156,7 +3096,7 @@ class OnlyOnce(object):

 def traceParseAction(f):
     """Decorator for debugging parse actions."""
-    f = ParserElement._normalizeParseActionArgs(f)
+    f = _trim_arity(f)
     def z(*paArgs):
         thisFunc = f.func_name
         s,l,t = paArgs[-3:]
@@ -3194,7 +3134,7 @@ def delimitedList( expr, delim=",", combine=False ):
     else:
         return ( expr + ZeroOrMore( Suppress( delim ) + expr ) ).setName(dlName)

-def countedArray( expr ):
+def countedArray( expr, intExpr=None ):
     """Helper to define a counted list of expressions.
        This helper defines a pattern of the form::
            integer expr expr expr...
@@ -3203,15 +3143,25 @@ def countedArray( expr ):
     """
     arrayExpr = Forward()
     def countFieldParseAction(s,l,t):
-        n = int(t[0])
+        n = t[0]
         arrayExpr << (n and Group(And([expr]*n)) or Group(empty))
         return []
-    return ( Word(nums).setName("arrayLen").setParseAction(countFieldParseAction, callDuringTry=True) + arrayExpr )
+    if intExpr is None:
+        intExpr = Word(nums).setParseAction(lambda t:int(t[0]))
+    else:
+        intExpr = intExpr.copy()
+    intExpr.setName("arrayLen")
+    intExpr.addParseAction(countFieldParseAction, callDuringTry=True)
+    return ( intExpr + arrayExpr )

 def _flatten(L):
-    if type(L) is not list: return [L]
-    if L == []: return L
-    return _flatten(L[0]) + _flatten(L[1:])
+    ret = []
+    for i in L:
+        if isinstance(i,list):
+            ret.extend(_flatten(i))
+        else:
+            ret.append(i)
+    return ret

 def matchPreviousLiteral(expr):
     """Helper to define an expression that is indirectly defined from
@@ -3346,15 +3296,15 @@ def originalTextFor(expr, asString=True):
     """Helper to return the original, untokenized text for a given expression.  Useful to
        restore the parsed fields of an HTML start tag into the raw tag text itself, or to
        revert separate tokens with intervening whitespace back to the original matching
-       input text. Simpler to use than the parse action C{keepOriginalText}, and does not
+       input text. Simpler to use than the parse action C{L{keepOriginalText}}, and does not
        require the inspect module to chase up the call stack.  By default, returns a
        string containing the original parsed text.

-       If the optional C{asString} argument is passed as False, then the return value is a
+       If the optional C{asString} argument is passed as C{False}, then the return value is a
        C{ParseResults} containing any results names that were originally matched, and a
        single token containing the original matched text from the input string.  So if
-       the expression passed to C{originalTextFor} contains expressions with defined
-       results names, you must set C{asString} to False if you want to preserve those
+       the expression passed to C{L{originalTextFor}} contains expressions with defined
+       results names, you must set C{asString} to C{False} if you want to preserve those
        results name values."""
     locMarker = Empty().setParseAction(lambda s,loc,t: loc)
     endlocMarker = locMarker.copy()
@ -3371,6 +3321,11 @@ def originalTextFor(expr, asString=True):
matchExpr.setParseAction(extractText) matchExpr.setParseAction(extractText)
return matchExpr return matchExpr
def ungroup(expr):
"""Helper to undo pyparsing's default grouping of And expressions, even
if all but one are non-empty."""
return TokenConverter(expr).setParseAction(lambda t:t[0])
# convenience constants for positional expressions # convenience constants for positional expressions
empty = Empty().setName("empty") empty = Empty().setName("empty")
lineStart = LineStart().setName("lineStart") lineStart = LineStart().setName("lineStart")
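
A short sketch of originalTextFor as described in the docstring above; the input string is illustrative:

from calibre.utils.pyparsing import makeHTMLTags, SkipTo, originalTextFor

b_start, b_end = makeHTMLTags("b")
bold = originalTextFor(b_start + SkipTo(b_end) + b_end)
for t in bold.searchString("plain <b>bold <i>nested</i></b> plain"):
    print(t[0])  # -> <b>bold <i>nested</i></b>
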
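And the new ungroup helper, which strips one level of grouping from a result:

from calibre.utils.pyparsing import Word, nums, Group, ungroup

grouped = Group(Word(nums))
print(grouped.parseString("123"))           # -> [['123']]
print(ungroup(grouped).parseString("123"))  # -> ['123']
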
@@ -3380,8 +3335,8 @@ stringEnd = StringEnd().setName("stringEnd")
 _escapedPunc = Word( _bslash, r"\[]-*.$+^?()~ ", exact=2 ).setParseAction(lambda s,l,t:t[0][1])
 _printables_less_backslash = "".join([ c for c in printables if c not in r"\]" ])
-_escapedHexChar = Combine( Suppress(_bslash + "0x") + Word(hexnums) ).setParseAction(lambda s,l,t:unichr(int(t[0],16)))
-_escapedOctChar = Combine( Suppress(_bslash) + Word("0","01234567") ).setParseAction(lambda s,l,t:unichr(int(t[0],8)))
+_escapedHexChar = Regex(r"\\0?[xX][0-9a-fA-F]+").setParseAction(lambda s,l,t:unichr(int(t[0][1:],16)))
+_escapedOctChar = Regex(r"\\0[0-7]+").setParseAction(lambda s,l,t:unichr(int(t[0][1:],8)))
 _singleChar = _escapedPunc | _escapedHexChar | _escapedOctChar | Word(_printables_less_backslash,exact=1)
 _charRange = Group(_singleChar + Suppress("-") + _singleChar)
 _reBracketExpr = Literal("[") + Optional("^").setResultsName("negate") + Group( OneOrMore( _charRange | _singleChar ) ).setResultsName("body") + "]"
@@ -3399,7 +3354,8 @@ def srange(s):
     The values enclosed in the []'s may be::
        a single character
        an escaped character with a leading backslash (such as \- or \])
-       an escaped hex character with a leading '\0x' (\0x21, which is a '!' character)
+       an escaped hex character with a leading '\x' (\x21, which is a '!' character)
+            (\0x## is also supported for backwards compatibility)
        an escaped octal character with a leading '\0' (\041, which is a '!' character)
        a range of any of the above, separated by a dash ('a-z', etc.)
        any combination of the above ('aeiouy', 'a-zA-Z0-9_$', etc.)
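
A quick sketch of srange; outputs shown as comments:

from calibre.utils.pyparsing import Word, srange

print(srange("[a-f0-9]"))  # -> 'abcdef0123456789'
hex_word = Word(srange("[0-9a-fA-F]"))
print(hex_word.parseString("deadBEEF"))  # -> ['deadBEEF']
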
@@ -3486,7 +3442,7 @@ def _makeTags(tagStr, xml):
     else:
         printablesLessRAbrack = "".join( [ c for c in printables if c not in ">" ] )
         tagAttrValue = quotedString.copy().setParseAction( removeQuotes ) | Word(printablesLessRAbrack)
-        openTag = Suppress("<") + tagStr + \
+        openTag = Suppress("<") + tagStr("tag") + \
                 Dict(ZeroOrMore(Group( tagAttrName.setParseAction(downcaseTokens) + \
                 Optional( Suppress("=") + tagAttrValue ) ))) + \
                 Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">")
@@ -3508,19 +3464,21 @@ def makeXMLTags(tagStr):
 def withAttribute(*args,**attrDict):
     """Helper to create a validating parse action to be used with start tags created
-    with makeXMLTags or makeHTMLTags. Use withAttribute to qualify a starting tag
+    with C{makeXMLTags} or C{makeHTMLTags}. Use C{withAttribute} to qualify a starting tag
     with a required attribute value, to avoid false matches on common tags such as
-    <TD> or <DIV>.
+    C{<TD>} or C{<DIV>}.

-    Call withAttribute with a series of attribute names and values. Specify the list
+    Call C{withAttribute} with a series of attribute names and values. Specify the list
     of filter attributes names and values as:
-     - keyword arguments, as in (class="Customer",align="right"), or
+     - keyword arguments, as in C{(align="right")}, or
+     - as an explicit dict with C{**} operator, when an attribute name is also a Python
+       reserved word, as in C{**{"class":"Customer", "align":"right"}}
      - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") )
     For attribute names with a namespace prefix, you must use the second form. Attribute
     names are matched insensitive to upper/lower case.

     To verify that the attribute exists, but without specifying a value, pass
-    withAttribute.ANY_VALUE as the value.
+    C{withAttribute.ANY_VALUE} as the value.
     """
     if args:
         attrs = args[:]
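
A sketch of the three calling forms described above; the attribute values are illustrative:

from calibre.utils.pyparsing import makeHTMLTags, withAttribute

div_start, div_end = makeHTMLTags("div")

# keyword form: match only <div id="article" ...>
article = div_start.copy().setParseAction(withAttribute(id="article"))

# explicit-dict form, needed because "class" is a Python reserved word
customer = div_start.copy().setParseAction(withAttribute(**{"class": "Customer"}))

# require the attribute to be present, with any value
aligned = div_start.copy().setParseAction(withAttribute(align=withAttribute.ANY_VALUE))
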
@@ -3631,12 +3589,12 @@ def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.cop
     expression will capture all whitespace-delimited content between delimiters
     as a list of separate values.

-    Use the ignoreExpr argument to define expressions that may contain
+    Use the C{ignoreExpr} argument to define expressions that may contain
     opening or closing characters that should not be treated as opening
     or closing characters for nesting, such as quotedString or a comment
-    expression. Specify multiple expressions using an Or or MatchFirst.
-    The default is quotedString, but if no expressions are to be ignored,
-    then pass None for this argument.
+    expression. Specify multiple expressions using an C{L{Or}} or C{L{MatchFirst}}.
+    The default is L{quotedString}, but if no expressions are to be ignored,
+    then pass C{None} for this argument.
     """
     if opener == closer:
         raise ValueError("opening and closing strings cannot be the same")
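
For context, the default behaviour being documented; the input is illustrative:

from calibre.utils.pyparsing import nestedExpr

parens = nestedExpr()  # opener='(', closer=')', quoted strings ignored for nesting
print(parens.parseString("(a (b c) d)"))  # -> [['a', ['b', 'c'], 'd']]
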
@@ -3683,7 +3641,7 @@ def indentedBlock(blockStatementExpr, indentStack, indent=True):
         the current level; set to False for block of left-most statements
         (default=True)

-    A valid block must contain at least one blockStatement.
+    A valid block must contain at least one C{blockStatement}.
     """
     def checkPeerIndent(s,l,t):
         if l >= len(s): return
View File
@@ -16,11 +16,11 @@ methods :method:`SearchQueryParser.universal_set` and
 If this module is run, it will perform a series of unit tests.
 '''

-import sys, operator
+import sys, operator, weakref

-from calibre.utils.pyparsing import CaselessKeyword, Group, Forward, \
-        CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral, \
-        Optional, NoMatch, ParseException, QuotedString
+from calibre.utils.pyparsing import (CaselessKeyword, Group, Forward,
+        CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral,
+        Optional, NoMatch, ParseException, QuotedString)
 from calibre.constants import preferred_encoding
 from calibre.utils.icu import sort_key
 from calibre import prints
@@ -37,11 +37,19 @@ class SavedSearchQueries(object):
     def __init__(self, db, _opt_name):
         self.opt_name = _opt_name;
-        self.db = db
         if db is not None:
             self.queries = db.prefs.get(self.opt_name, {})
         else:
             self.queries = {}
+        try:
+            self._db = weakref.ref(db)
+        except:
+            # db could be None
+            self._db = lambda : None
+
+    @property
+    def db(self):
+        return self._db()

     def force_unicode(self, x):
         if not isinstance(x, unicode):
@@ -49,21 +57,27 @@ class SavedSearchQueries(object):
         return x

     def add(self, name, value):
-        self.queries[self.force_unicode(name)] = self.force_unicode(value).strip()
-        self.db.prefs[self.opt_name] = self.queries
+        db = self.db
+        if db is not None:
+            self.queries[self.force_unicode(name)] = self.force_unicode(value).strip()
+            db.prefs[self.opt_name] = self.queries

     def lookup(self, name):
         return self.queries.get(self.force_unicode(name), None)

     def delete(self, name):
-        self.queries.pop(self.force_unicode(name), False)
-        self.db.prefs[self.opt_name] = self.queries
+        db = self.db
+        if db is not None:
+            self.queries.pop(self.force_unicode(name), False)
+            db.prefs[self.opt_name] = self.queries

     def rename(self, old_name, new_name):
-        self.queries[self.force_unicode(new_name)] = \
-                self.queries.get(self.force_unicode(old_name), None)
-        self.queries.pop(self.force_unicode(old_name), False)
-        self.db.prefs[self.opt_name] = self.queries
+        db = self.db
+        if db is not None:
+            self.queries[self.force_unicode(new_name)] = \
+                    self.queries.get(self.force_unicode(old_name), None)
+            self.queries.pop(self.force_unicode(old_name), False)
+            db.prefs[self.opt_name] = self.queries

     def names(self):
         return sorted(self.queries.keys(),key=sort_key)
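
The change above swaps a hard reference to the db for a weakref-backed property, so a SavedSearchQueries instance can no longer keep a closed database alive. A minimal sketch of the pattern, with generic names rather than calibre API:

import weakref

class Holder(object):
    def __init__(self, obj):
        try:
            self._ref = weakref.ref(obj)  # does not keep obj alive
        except TypeError:
            # weakref.ref(None) raises TypeError; mimic a dead reference
            self._ref = lambda: None

    @property
    def obj(self):
        return self._ref()  # None once the referent is garbage collected
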
View File
@@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe):
     #: manually (though manual cleanup will always be superior).
     auto_cleanup = False

+    #: Specify elements that the auto cleanup algorithm should never remove
+    #: The syntax is a XPath expression. For example::
+    #:
+    #:    auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
+    #:    id="article-image"
+    #:    auto_cleanup_keep = '//*[@class="important"]' will keep all elements
+    #:    with class="important"
+    #:    auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
+    #:    will keep all divs with id="article-image" and spans
+    #:    with class="important"
+    auto_cleanup_keep = None
+
     #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
     #: It will be inserted into `<style>` tags, just before the closing
     #: `</head>` tag thereby overriding all :term:`CSS` except that which is
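
A hedged sketch of a recipe using the new attribute; the title and feed URL are placeholders:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    title = 'Example'
    auto_cleanup = True
    # keep the lead image even when the cleanup heuristics would drop it
    auto_cleanup_keep = '//div[@id="article-image"]'
    feeds = [('News', 'http://example.com/feed')]  # placeholder feed
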
@@ -552,7 +564,8 @@ class BasicNewsRecipe(Recipe):
         from lxml.html import (fragment_fromstring, tostring,
                 document_fromstring)

-        doc = readability.Document(html, self.log, url=url)
+        doc = readability.Document(html, self.log, url=url,
+                keep_elements=self.auto_cleanup_keep)
         article_html = doc.summary()
         extracted_title = doc.title()
View File
@@ -22,7 +22,7 @@ E = ElementMaker(namespace=NS, nsmap={None:NS})
 def iterate_over_builtin_recipe_files():
     exclude = ['craigslist', 'iht', 'toronto_sun',
-            'india_today', 'livemint']
+            'livemint']
     d = os.path.dirname
     base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'recipes')
     for f in os.listdir(base):
View File
@@ -75,7 +75,7 @@ MD5_SESS = "MD5-sess"
 AUTH = "auth"
 AUTH_INT = "auth-int"

-SUPPORTED_ALGORITHM = ('md5', MD5, MD5_SESS)
+SUPPORTED_ALGORITHM = ('md5', MD5, MD5_SESS) # Changed by Kovid
 SUPPORTED_QOP = (AUTH, AUTH_INT)

 ################################################################################
@@ -83,7 +83,7 @@ SUPPORTED_QOP = (AUTH, AUTH_INT)
 #
 DIGEST_AUTH_ENCODERS = {
     MD5: lambda val: md5(val).hexdigest(),
-    'md5': lambda val:md5(val).hexdigest(),
+    'md5': lambda val:md5(val).hexdigest(), # Added by Kovid
     MD5_SESS: lambda val: md5(val).hexdigest(),
 #    SHA: lambda val: sha(val).hexdigest(),
 }
@@ -225,7 +225,7 @@ def _A1(params, password):
     algorithm = params.get ("algorithm", MD5)
     H = DIGEST_AUTH_ENCODERS[algorithm]

-    if algorithm in (MD5, 'md5'):
+    if algorithm in (MD5, 'md5'): # Changed by Kovid
         # If the "algorithm" directive's value is "MD5" or is
         # unspecified, then A1 is:
         # A1 = unq(username-value) ":" unq(realm-value) ":" passwd
View File
@@ -671,8 +671,9 @@ def set_response_cookie(path=None, path_header=None, name='session_id',
     # save it to disk and the session is lost if people close
     # the browser. So we have to use the old "expires" ... sigh ...
     ## cookie[name]['max-age'] = timeout * 60
-    if timeout:
-        cookie[name]['expires'] = http.HTTPDate(time.time() + (timeout * 60))
+    if False and timeout: # Changed by Kovid, we want the user to have to
+        # re-authenticate on browser restart
+        cookie[name]['expires'] = http.HTTPDate(time.time() + timeout)
     if domain is not None:
         cookie[name]['domain'] = domain
     if secure:
View File
@@ -241,10 +241,10 @@ def wait_for_free_port(host, port):
     for trial in xrange(50):
         try:
             # we are expecting a free port, so reduce the timeout
-            check_port(host, port, timeout=0.2)
+            check_port(host, port, timeout=0.2) # Changed by Kovid
         except IOError:
             # Give the old server thread time to free the port.
-            time.sleep(0.2)
+            time.sleep(0.2) # Changed by Kovid
         else:
             return