Sync to trunk.

John Schember 2011-09-14 18:53:08 -04:00
commit 869fa05db3
43 changed files with 789 additions and 582 deletions

View File

@ -1,93 +1,105 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__copyright__ = '2008 Kovid Goyal kovid@kovidgoyal.net, 2010 Darko Miletic <darko.miletic at gmail.com>'
'''
businessweek.com
www.businessweek.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BusinessWeek(BasicNewsRecipe):
title = 'Business Week'
description = 'Business News, Stock Market and Financial Advice'
__author__ = 'ChuckEggDotCom and Sujata Raman'
language = 'en'
title = 'Business Week'
__author__ = 'Kovid Goyal and Darko Miletic'
description = 'Read the latest international business news & stock market news. Get updated company profiles, financial advice, global economy and technology news.'
publisher = 'Bloomberg L.P.'
category = 'Business, business news, stock market, stock market news, financial advice, company profiles, financial advice, global economy, technology news'
oldest_article = 7
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'magazine'
cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'
masthead_url = 'http://assets.businessweek.com/images/bw-logo.png'
extra_css = """
body{font-family: Helvetica,Arial,sans-serif }
img{margin-bottom: 0.4em; display:block}
.tagline{color: gray; font-style: italic}
.photoCredit{font-size: small; color: gray}
"""
oldest_article = 7
max_articles_per_feed = 10
no_stylesheets = True
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
recursions = 1
match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
extra_css = '''
h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
.news_story_title{font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;}
h2{font-family :Arial,Helvetica,sans-serif; font-size:medium;color:#666666;}
h3{text-transform:uppercase;font-family :Arial,Helvetica,sans-serif; font-size:large;font-weight:bold;}
h4{font-family :Arial,Helvetica,sans-serif; font-size:small;font-weight:bold;}
p{font-family :Arial,Helvetica,sans-serif; }
#lede600{font-size:x-small;}
#storybody{font-size:x-small;}
p{font-family :Arial,Helvetica,sans-serif;}
.strap{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#064599;}
.byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.postedBy{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
.trackback{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
.date{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
.wrapper{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.photoCredit{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
.tagline{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
.pageCount{color:#666666;font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.note{font-family :Arial,Helvetica,sans-serif; font-size:small;color:#666666;font-style:italic;}
.highlight{font-family :Arial,Helvetica,sans-serif; font-size:small;background-color:#FFF200;}
.annotation{font-family :Arial,Helvetica,sans-serif; font-size:x-small;color:#666666;}
'''
remove_tags = [
dict(attrs={'class':'inStory'})
,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td'])
,dict(attrs={'id':['inset','videoDisplay']})
]
keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody','article_body','articleBody']})]
remove_attributes = ['lang']
match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
remove_tags = [ dict(name='div', attrs={'id':["log","feedback","footer","secondarynav","secondnavbar","header","email","bw2-header","column2","wrapper-bw2-footer","wrapper-mgh-footer","inset","commentForm","commentDisplay","bwExtras","bw2-umbrella","readerComments","leg","rightcol"]}),
dict(name='div', attrs={'class':["menu",'sponsorbox smallertext',"TopNavTile","graybottom leaderboard"]}),
dict(name='img', alt ="News"),
dict(name='td', width ="1"),
]
feeds = [
(u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'),
(u'Top News', u'http://www.businessweek.com/rss/bwdaily.rss'),
(u'Asia', u'http://www.businessweek.com/rss/asia.rss'),
(u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'),
(u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'),
(u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'),
(u'Europe', u'http://www.businessweek.com/rss/europe.rss'),
(u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'),
(u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'),
(u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'),
(u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'),
(u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'),
(u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'),
(u'Technology', u'http://www.businessweek.com/rss/technology.rss'),
(u'Investing', u'http://rss.businessweek.com/bw_rss/investor'),
(u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'),
(u'Careers', u'http://rss.businessweek.com/bw_rss/careers'),
(u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'),
(u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'),
(u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'),
]
feeds = [
(u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'),
(u'Top News' , u'http://www.businessweek.com/rss/bwdaily.rss' ),
(u'Asia', u'http://www.businessweek.com/rss/asia.rss'),
(u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'),
(u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'),
(u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'),
(u'Europe', u'http://www.businessweek.com/rss/europe.rss'),
(u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'),
(u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'),
(u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'),
(u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'),
(u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'),
(u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'),
(u'Technology', u'http://www.businessweek.com/rss/technology.rss'),
(u'Investing', u'http://rss.businessweek.com/bw_rss/investor'),
(u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'),
(u'Careers', u'http://rss.businessweek.com/bw_rss/careers'),
(u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'),
(u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'),
(u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'),
]
def get_article_url(self, article):
url = article.get('guid', None)
if 'podcasts' in url:
return None
if 'surveys' in url:
return None
if 'images' in url:
return None
if 'feedroom' in url:
return None
if '/magazine/toc/' in url:
return None
rurl, sep, rest = url.rpartition('?')
if rurl:
return rurl
return rest
if 'podcasts' in url or 'surveys' in url:
url = None
return url
def postprocess_html(self, soup, first):
for tag in soup.findAll(name=['ul','li','table','td','tr','span']):
tag.name = 'div'
for tag in soup.findAll(name= 'div',attrs={ 'id':'pageNav'}):
tag.extract()
return soup
def print_version(self, url):
if '/news/' in url or '/blog/' in url:
return url
if '/magazine' in url:
rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/printer/')
else:
rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/')
return rurl.replace('/investing/','/investor/')
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
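
For context, the print_version() hook above only rewrites article URLs into their printer-friendly form. A minimal standalone sketch of that mapping, with hypothetical example URLs (illustrative only, not calibre code):

# Sketch of the URL rewriting done by BusinessWeek.print_version above.
def print_version(url):
    if '/news/' in url or '/blog/' in url:
        return url
    if '/magazine' in url:
        rurl = url.replace('http://www.businessweek.com/',
                           'http://www.businessweek.com/printer/')
    else:
        rurl = url.replace('http://www.businessweek.com/',
                           'http://www.businessweek.com/print/')
    return rurl.replace('/investing/', '/investor/')

print(print_version('http://www.businessweek.com/magazine/some-story'))
# -> http://www.businessweek.com/printer/magazine/some-story
print(print_version('http://www.businessweek.com/investing/some-story'))
# -> http://www.businessweek.com/print/investor/some-story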

View File

@ -4,95 +4,73 @@ __copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
www.businessworld.in
'''
from calibre import strftime
import re
from calibre.web.feeds.news import BasicNewsRecipe
class BusinessWorldMagazine(BasicNewsRecipe):
title = 'Business World Magazine'
__author__ = 'Darko Miletic'
__author__ = 'Kovid Goyal'
description = 'News from India'
publisher = 'ABP Pvt Ltd Publication'
category = 'news, politics, finances, India, Asia'
delay = 1
no_stylesheets = True
INDEX = 'http://www.businessworld.in/bw/Magazine_Current_Issue'
INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php'
ROOT = 'http://www.businessworld.in'
use_embedded_content = False
encoding = 'utf-8'
language = 'en_IN'
extra_css = """
img{display: block; margin-bottom: 0.5em}
body{font-family: Arial,Helvetica,sans-serif}
h2{color: gray; display: block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
def is_in_list(self,linklist,url):
for litem in linklist:
if litem == url:
return True
return False
auto_cleanup = True
def parse_index(self):
br = self.browser
br.open(self.ROOT)
raw = br.open(br.click_link(text_regex=re.compile('Current.*Issue',
re.I))).read()
soup = self.index_to_soup(raw)
mc = soup.find(attrs={'class':'mag_cover'})
if mc is not None:
img = mc.find('img', src=True)
if img is not None:
self.cover_url = img['src']
feeds = []
current_section = None
articles = []
linklist = []
soup = self.index_to_soup(self.INDEX)
for tag in soup.findAll(['h3', 'h2']):
inner_a = tag.find('a')
if tag.name == 'h3' and inner_a is not None:
continue
if tag.name == 'h2' and (inner_a is None or current_section is
None):
continue
if tag.name == 'h3':
if current_section is not None and articles:
feeds.append((current_section, articles))
current_section = self.tag_to_string(tag)
self.log('Found section:', current_section)
articles = []
elif tag.name == 'h2':
url = inner_a.get('href', None)
if url is None: continue
if url.startswith('/'): url = self.ROOT + url
title = self.tag_to_string(inner_a)
h1 = tag.findPreviousSibling('h1')
if h1 is not None:
title = self.tag_to_string(h1) + title
self.log('\tFound article:', title)
articles.append({'title':title, 'url':url, 'date':'',
'description':''})
if current_section and articles:
feeds.append((current_section, articles))
return feeds
tough = soup.find('div', attrs={'id':'tough'})
if tough:
for item in tough.findAll('h1'):
description = ''
title_prefix = ''
feed_link = item.find('a')
if feed_link and feed_link.has_key('href'):
url = self.ROOT + feed_link['href']
if not self.is_in_list(linklist,url):
title = title_prefix + self.tag_to_string(feed_link)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
linklist.append(url)
for item in soup.findAll('div', attrs={'class':'nametitle'}):
description = ''
title_prefix = ''
feed_link = item.find('a')
if feed_link and feed_link.has_key('href'):
url = self.ROOT + feed_link['href']
if not self.is_in_list(linklist,url):
title = title_prefix + self.tag_to_string(feed_link)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
linklist.append(url)
return [(soup.head.title.string, articles)]
keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})]
remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])]
def print_version(self, url):
return url.replace('/bw/','/bw/storyContent/')
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('img',attrs={'class':'toughbor'})
if cover_item:
cover_url = self.ROOT + cover_item['src']
return cover_url

128
recipes/cio_magazine.recipe Normal file
View File

@ -0,0 +1,128 @@
# The first comments describe the problems I ran into with Python
# When you get a UTF-8 error, check the comments (accented characters). In Notepad++: Search, Goto, position, and you will see it.
# Edit with Notepad++. If a '-' ends up where it should not be, the indentation is wrong... Edit - Blank operations - tab to space
# I have figured out what 'from' means... they are paths inside pylib.zip...
# With 'from' you import only one symbol... with 'import', the whole library
from calibre.web.feeds.news import BasicNewsRecipe
# sys is not needed... I tried to use it to write to stderr
from calibre import strftime
# To convert the article's timestamp
import string, re
# To use regular expressions
# Seen in pylib.zip... the first letter is uppercase
# These last two were a half-hearted attempt at setting a cookie (not used)
class CIO_Magazine(BasicNewsRecipe):
title = 'CIO Magazine'
oldest_article = 14
max_articles_per_feed = 100
auto_cleanup = True
__author__ = 'Julio Map'
description = "CIO is the leading information brand for today's busy Chief Information Officer - CIO Magazine bi-monthly"
language = 'en'
encoding = 'utf8'
cover_url = 'http://www.cio.com/homepage/images/hp-cio-logo-linkedin.png'
remove_tags_before = dict(name='div', attrs={'id':'container'})
# Absolutely unnecessary... in the end I found a print_version (see further below)
# Within a given magazine issue...
# issue_details contains the title and the sections of this issue
# DetailModule, inside issue_details, contains the URLs and summaries
# Within a given article...
# Article-default-body contains the text. But as I said, I found a print_version
no_stylesheets = True
remove_javascript = True
def print_version(self,url):
# This function is called by the framework... do not call it yourself (it would end up being called twice)
# A printable version of the articles exists by changing
# http://www.cio.com/article/<num>/<title> to
# http://www.cio.com/article/print/<num>, which contains all the pages inside the div id=container
if url.startswith('/'):
url = 'http://www.cio.com'+url
segments = url.split('/')
printURL = '/'.join(segments[0:4]) + '/print/' + segments[4] +'#'
return printURL
def parse_index(self):
###########################################################################
# This method should be implemented in recipes that parse a website
# instead of feeds to generate a list of articles. Typical uses are for
# news sources that have a Print Edition webpage that lists all the
# articles in the current print edition. If this function is implemented,
# it will be used in preference to BasicNewsRecipe.parse_feeds().
#
# It must return a list. Each element of the list must be a 2-element
# tuple of the form ('feed title', list of articles).
#
# Each list of articles must contain dictionaries of the form:
#
# {
# 'title' : article title,
# 'url' : URL of print version,
# 'date' : The publication date of the article as a string,
# 'description' : A summary of the article
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
#
# For an example, see the recipe for downloading The Atlantic.
# In addition, you can add 'author' for the author of the article.
###############################################################################
# First we find out which is the most recently created issue
soupinicial = self.index_to_soup('http://www.cio.com/magazine')
# It is the first link inside the DIV with class content_body
a= soupinicial.find(True, attrs={'class':'content_body'}).find('a', href=True)
INDEX = re.sub(r'\?.*', '', a['href'])
# Since cio.com uses relative links, we prepend the domain name.
if INDEX.startswith('/'): # guarding against them no longer using relative links
INDEX = 'http://www.cio.com'+INDEX
# And we check in the logs that we are doing it right
print ("INDEX in parse_index: ", INDEX)
# Now we know which issue it is... let's process it.
soup = self.index_to_soup(INDEX)
articles = {}
key = None
feeds = []
# To start we keep only two DIVs, 'heading' and 'issue_item'
# From the first we get the categories (key) and from the second the URLs and summaries
for div in soup.findAll(True,
attrs={'class':['heading', 'issue_item']}):
if div['class'] == 'heading':
key = string.capwords(self.tag_to_string(div.span))
print ("Key: ",key) # Esto es para depurar
articles[key] = []
feeds.append(key)
elif div['class'] == 'issue_item':
a = div.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
print("url: ",url) # Esto es para depurar
title = self.tag_to_string(a, use_alt=True).strip() # Ya para nota, quitar al final las dos ultimas palabras
pubdate = strftime('%a, %d %b') # No es la fecha de publicacion sino la de colecta
summary = div.find('p') # Dentro de la div 'issue_item' el unico parrafo que hay es el resumen
description = '' # Si hay summary la description sera el summary... si no, la dejamos en blanco
if summary:
description = self.tag_to_string(summary, use_alt=False)
print ("Description = ", description)
feed = key if key is not None else 'Uncategorized' # This is copied from the NY Times recipe
if not articles.has_key(feed):
articles[feed] = []
if not 'podcasts' in url:
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
feeds = [(key, articles[key]) for key in feeds if articles.has_key(key)]
return feeds
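
For context, the parse_index() contract described in the comments above boils down to returning a list of (feed title, article list) tuples. A minimal sketch of that structure, with made-up titles and URLs:

# Sketch of the value parse_index() is expected to return, per the comments above.
example_feeds = [
    ('Magazine Section', [
        {'title': 'A sample article',                       # made-up values
         'url': 'http://www.cio.com/article/print/12345#',  # print-version URL
         'date': 'Mon, 01 Jan',
         'description': 'One-line summary of the article',
         'content': ''},   # may be empty; used by FullContentProfile
    ]),
]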

View File

@ -15,8 +15,10 @@ class Guardian(BasicNewsRecipe):
title = u'The Guardian and The Observer'
if date.today().weekday() == 6:
base_url = "http://www.guardian.co.uk/theobserver"
cover_pic = 'Observer digital edition'
else:
base_url = "http://www.guardian.co.uk/theguardian"
cover_pic = 'Guardian digital edition'
__author__ = 'Seabound and Sujata Raman'
language = 'en_GB'
@ -79,7 +81,7 @@ class Guardian(BasicNewsRecipe):
# soup = self.index_to_soup("http://www.guardian.co.uk/theobserver")
soup = self.index_to_soup(self.base_url)
# find cover pic
img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'})
img = soup.find( 'img',attrs ={'alt':self.cover_pic})
if img is not None:
self.cover_url = img['src']
# end find cover pic

View File

@ -0,0 +1,29 @@
from calibre.web.feeds.news import BasicNewsRecipe
class HindustanTimes(BasicNewsRecipe):
title = u'Hindustan Times'
language = 'en_IN'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('News',
'http://feeds.hindustantimes.com/HT-NewsSectionPage-Topstories'),
('Views',
'http://feeds.hindustantimes.com/HT-ViewsSectionpage-Topstories'),
('Cricket',
'http://feeds.hindustantimes.com/HT-Cricket-TopStories'),
('Business',
'http://feeds.hindustantimes.com/HT-BusinessSectionpage-TopStories'),
('Entertainment',
'http://feeds.hindustantimes.com/HT-HomePage-Entertainment'),
('Lifestyle',
'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
]

recipes/icons/rtnews.png Normal file
(new binary icon files, images not shown: 1.2 KiB, 606 B, 200 B)

View File

@ -1,76 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class IndiaToday(BasicNewsRecipe):
title = 'India Today'
__author__ = 'Kovid Goyal'
language = 'en_IN'
timefmt = ' [%d %m, %Y]'
oldest_article = 700
max_articles_per_feed = 10
title = u'India Today'
language = 'en_IN'
__author__ = 'Krittika Goyal'
oldest_article = 15 #days
max_articles_per_feed = 25
no_stylesheets = True
auto_cleanup = True
remove_tags_before = dict(id='content_story_title')
remove_tags_after = dict(id='rightblockdiv')
remove_tags = [dict(id=['rightblockdiv', 'share_links'])]
extra_css = '#content_story_title { font-size: 170%; font-weight: bold;}'
conversion_options = { 'linearize_tables': True }
def it_get_index(self):
soup = self.index_to_soup('http://indiatoday.intoday.in/site/archive')
a = soup.find('a', href=lambda x: x and 'issueId=' in x)
url = 'http://indiatoday.intoday.in/site/'+a.get('href')
img = a.find('img')
self.cover_url = img.get('src')
return self.index_to_soup(url)
def parse_index(self):
soup = self.it_get_index()
feeds, current_section, current_articles = [], None, []
for x in soup.findAll(name=['h1', 'a']):
if x.name == 'h1':
if current_section and current_articles:
feeds.append((current_section, current_articles))
current_section = self.tag_to_string(x)
current_articles = []
self.log('\tFound section:', current_section)
elif x.name == 'a' and 'Story' in x.get('href', ''):
title = self.tag_to_string(x)
url = x.get('href')
url = url.replace(' ', '%20')
if not url.startswith('/'):
url = 'http://indiatoday.intoday.in/site/' + url
if title and url:
url += '?complete=1'
self.log('\tFound article:', title)
self.log('\t\t', url)
desc = ''
h3 = x.parent.findNextSibling('h3')
if h3 is not None:
desc = 'By ' + self.tag_to_string(h3)
h4 = h3.findNextSibling('h4')
if h4 is not None:
desc = self.tag_to_string(h4) + ' ' + desc
if desc:
self.log('\t\t', desc)
current_articles.append({'title':title, 'description':desc,
'url':url, 'date':''})
if current_section and current_articles:
feeds.append((current_section, current_articles))
return feeds
def postprocess_html(self, soup, first):
a = soup.find(text='Print')
if a is not None:
tr = a.findParent('tr')
if tr is not None:
tr.extract()
return soup
feeds = [
('Latest News', 'http://indiatoday.intoday.in/rss/article.jsp?sid=4'),
('Cover Story', 'http://indiatoday.intoday.in/rss/article.jsp?sid=30'),
('Nation', 'http://indiatoday.intoday.in/rss/article.jsp?sid=36'),
('States', 'http://indiatoday.intoday.in/rss/article.jsp?sid=21'),
('Economy', 'http://indiatoday.intoday.in/rss/article.jsp?sid=34'),
('World', 'http://indiatoday.intoday.in/rss/article.jsp?sid=61'),
('Sport', 'http://indiatoday.intoday.in/rss/article.jsp?sid=41'),
]

View File

@ -7,56 +7,33 @@ www.inquirer.net
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class InquirerNet(BasicNewsRecipe):
title = 'Inquirer.net'
__author__ = 'Darko Miletic'
__author__ = 'Krittika Goyal'
description = 'News from the Philippines'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
encoding = 'utf8'
publisher = 'inquirer.net'
category = 'news, politics, philippines'
lang = 'en'
language = 'en'
extra_css = ' .fontheadline{font-size: x-large} .fontsubheadline{font-size: large} .fontkick{font-size: medium}'
use_embedded_content = False
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
remove_tags = [dict(name=['object','link','script','iframe','form'])]
no_stylesheets = True
auto_cleanup = True
feeds = [
(u'Breaking news', u'http://services.inquirer.net/rss/breakingnews.xml' )
,(u'Top stories' , u'http://services.inquirer.net/rss/topstories.xml' )
,(u'Sports' , u'http://services.inquirer.net/rss/brk_breakingnews.xml' )
,(u'InfoTech' , u'http://services.inquirer.net/rss/infotech_tech.xml' )
,(u'InfoTech' , u'http://services.inquirer.net/rss/infotech_tech.xml' )
,(u'Business' , u'http://services.inquirer.net/rss/inq7money_breaking_news.xml' )
,(u'Editorial' , u'http://services.inquirer.net/rss/opinion_editorial.xml' )
,(u'Global Nation', u'http://services.inquirer.net/rss/globalnation_breakingnews.xml')
(u'Inquirer', u'http://www.inquirer.net/fullfeed')
]
def preprocess_html(self, soup):
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
br.set_handle_gzip(True)
return br
def print_version(self, url):
rest, sep, art = url.rpartition('/view/')
art_id, sp, rrest = art.partition('/')
return 'http://services.inquirer.net/print/print.php?article_id=' + art_id

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
japantimes.co.jp
'''
@ -9,24 +7,61 @@ japantimes.co.jp
from calibre.web.feeds.news import BasicNewsRecipe
class JapanTimes(BasicNewsRecipe):
title = u'The Japan Times'
title = 'The Japan Times'
__author__ = 'Darko Miletic'
description = 'News from Japan'
language = 'en'
oldest_article = 7
max_articles_per_feed = 100
description = "Daily news and features on Japan from the most widely read English-language newspaper in Japan. Coverage includes national news, business news, sports news, commentary and features on living in Japan, entertainment, the arts, education and more."
language = 'en_JP'
category = 'news, politics, japan'
publisher = 'The Japan Times'
oldest_article = 5
max_articles_per_feed = 150
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publication_type = 'newspaper'
masthead_url = 'http://search.japantimes.co.jp/images/header_title.gif'
extra_css = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}'
keep_only_tags = [ dict(name='div', attrs={'id':'searchresult'}) ]
remove_tags_after = [ dict(name='div', attrs={'id':'mainbody' }) ]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
keep_only_tags = [dict(name='div', attrs={'id':'printresult'})]
remove_tags = [
dict(name='div' , attrs={'id':'ads' })
,dict(name='table', attrs={'width':470})
dict(name=['iframe','meta','link','embed','object','base'])
,dict(attrs={'id':'searchfooter'})
]
feeds = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')]
remove_attributes = ['border']
def get_article_url(self, article):
rurl = BasicNewsRecipe.get_article_url(self, article)
return rurl.partition('?')[0]
feeds = [
(u'The Japan Times', u'http://feedproxy.google.com/japantimes')
]
def print_version(self, url):
return url.replace('/cgi-bin/','/print/')
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
for item in soup.findAll('photo'):
item.name = 'div'
for item in soup.head.findAll('paragraph'):
item.extract()
for item in soup.findAll('wwfilename'):
item.extract()
for item in soup.findAll('jtcategory'):
item.extract()
for item in soup.findAll('nomooter'):
item.extract()
for item in soup.body.findAll('paragraph'):
item.name = 'p'
return soup

View File

@ -14,54 +14,11 @@ class PeopleMag(BasicNewsRecipe):
use_embedded_content = False
oldest_article = 2
max_articles_per_feed = 50
use_embedded_content = False
extra_css = '''
h1{font-family:verdana,arial,helvetica,sans-serif; font-size: large;}
h2{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
.body-content{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
.byline {font-size: small; color: #666666; font-style:italic; }
.lastline {font-size: small; color: #666666; font-style:italic;}
.contact {font-size: small; color: #666666;}
.contact p {font-size: small; color: #666666;}
.photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
.photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
.article_timestamp{font-size:x-small; color:#666666;}
a {font-family:verdana,arial,helvetica,sans-serif; font-size: x-small;}
'''
keep_only_tags = [
dict(name='div', attrs={'class': 'panel_news_article_main'}),
dict(name='div', attrs={'class':'article_content'}),
dict(name='div', attrs={'class': 'headline'}),
dict(name='div', attrs={'class': 'post'}),
dict(name='div', attrs={'class': 'packageheadlines'}),
dict(name='div', attrs={'class': 'snap_preview'}),
dict(name='div', attrs={'id': 'articlebody'})
]
remove_tags = [
dict(name='div', attrs={'class':'share_comments'}),
dict(name='p', attrs={'class':'twitter_facebook'}),
dict(name='div', attrs={'class':'share_comments_bottom'}),
dict(name='h2', attrs={'id':'related_content'}),
dict(name='div', attrs={'class':'next_article'}),
dict(name='div', attrs={'class':'prev_article'}),
dict(name='ul', attrs={'id':'sharebar'}),
dict(name='div', attrs={'class':'sharelinkcont'}),
dict(name='div', attrs={'class':'categories'}),
dict(name='ul', attrs={'class':'categories'}),
dict(name='div', attrs={'class':'related_content'}),
dict(name='div', attrs={'id':'promo'}),
dict(name='div', attrs={'class':'linksWrapper'}),
dict(name='p', attrs={'class':'tag tvnews'}),
dict(name='p', attrs={'class':'tag movienews'}),
dict(name='p', attrs={'class':'tag musicnews'}),
dict(name='p', attrs={'class':'tag couples'}),
dict(name='p', attrs={'class':'tag gooddeeds'}),
dict(name='p', attrs={'class':'tag weddings'}),
dict(name='p', attrs={'class':'tag health'})
]
no_stylesheets = True
auto_cleanup = True
auto_cleanup_keep = '//div[@id="article-image"]'
feeds = [
@ -69,26 +26,4 @@ class PeopleMag(BasicNewsRecipe):
('US Headlines', 'http://www.usmagazine.com/celebrity_news/rss')
]
def get_article_url(self, article):
ans = article.link
try:
self.log('Looking for full story link in', ans)
soup = self.index_to_soup(ans)
x = soup.find(text="View All")
if x is not None:
ans = ans + '?viewAll=y'
self.log('Found full story link', ans)
except:
pass
return ans
def postprocess_html(self, soup,first):
for tag in soup.findAll(name='div',attrs={'class':"container_ate_qandatitle"}):
tag.extract()
for tag in soup.findAll(name='br'):
tag.extract()
return soup

64
recipes/rtnews.recipe Normal file
View File

@ -0,0 +1,64 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
rt.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class RT_eng(BasicNewsRecipe):
title = 'RT in English'
__author__ = 'Darko Miletic'
description = 'RT is the first Russian 24/7 English-language news channel which brings the Russian view on global news.'
publisher = 'Autonomous Nonprofit Organization "TV-Novosti"'
category = 'news, politics, economy, finances, Russia, world'
oldest_article = 2
no_stylesheets = True
encoding = 'utf8'
masthead_url = 'http://rt.com/s/css/img/printlogo.gif'
use_embedded_content = False
remove_empty_feeds = True
language = 'en_RU'
publication_type = 'newsportal'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
h1{font-family: Georgia,"Times New Roman",Times,serif}
.grey{color: gray}
.fs12{font-size: small}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':'all'})]
remove_tags = [
dict(name=['object','link','embed','iframe','meta','link'])
,dict(attrs={'class':'crumbs oh'})
]
remove_attributes = ['clear']
feeds = [
(u'Politics' , u'http://rt.com/politics/rss/' )
,(u'USA' , u'http://rt.com/usa/news/rss/' )
,(u'Business' , u'http://rt.com/business/news/rss/' )
,(u'Sport' , u'http://rt.com/sport/rss/' )
,(u'Art&Culture', u'http://rt.com/art-and-culture/news/rss/')
]
def print_version(self, url):
return url + 'print/'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
str = item.string
if str is None:
str = self.tag_to_string(item)
item.replaceWith(str)
return soup

View File

@ -1,12 +1,9 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
twitchfilm.net/site/
twitchfilm.net/news/
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Twitchfilm(BasicNewsRecipe):
title = 'Twitch Films'
@ -15,29 +12,46 @@ class Twitchfilm(BasicNewsRecipe):
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
use_embedded_content = False
encoding = 'utf-8'
publisher = 'Twitch'
masthead_url = 'http://twitchfilm.com/img/logo.png'
category = 'twitch, twitchfilm, movie news, movie reviews, cult cinema, independent cinema, anime, foreign cinema, geek talk'
language = 'en'
lang = 'en-US'
language = 'en'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
remove_tags = [dict(name='div', attrs={'class':'feedflare'})]
keep_only_tags=[dict(attrs={'class':'asset-header'})]
remove_tags_after=dict(attrs={'class':'asset-body'})
remove_tags = [ dict(name='div', attrs={'class':['social','categories']})
, dict(attrs={'id':'main-asset'})
, dict(name=['meta','link','iframe','embed','object'])
]
feeds = [(u'News', u'http://feedproxy.google.com/TwitchEverything')]
feeds = [(u'News', u'http://feeds.twitchfilm.net/TwitchEverything')]
def preprocess_html(self, soup):
mtag = Tag(soup,'meta',[('http-equiv','Content-Type'),('context','text/html; charset=utf-8')])
soup.head.insert(0,mtag)
soup.html['lang'] = self.lang
return self.adeify_images(soup)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup

View File

@ -13,6 +13,7 @@ class USAToday(BasicNewsRecipe):
title = 'USA Today'
__author__ = 'Kovid Goyal'
oldest_article = 1
publication_type = 'newspaper'
timefmt = ''
max_articles_per_feed = 20
language = 'en'

View File

@ -94,9 +94,11 @@ class WallStreetJournal(BasicNewsRecipe):
if date is not None:
self.timefmt = ' [%s]'%self.tag_to_string(date)
cov = soup.find('a', attrs={'class':'icon pdf'}, href=True)
cov = soup.find('div', attrs={'class':'itpSectionHeaderPdf'})
if cov is not None:
self.cover_url = cov['href']
a = cov.find('a', href=True)
if a is not None:
self.cover_url = a['href']
feeds = []
div = soup.find('div', attrs={'class':'itpHeader'})

View File

@ -61,7 +61,7 @@ authors_completer_append_separator = False
# selecting 'manage authors', and pressing 'Recalculate all author sort values'.
# The author name suffixes are words that are ignored when they occur at the
# end of an author name. The case of the suffix is ignored and trailing
# periods are automatically handled.
# periods are automatically handled. The same is true for prefixes.
# The author name copy words are a set of words which if they occur in an
# author name cause the automatically generated author sort string to be
# identical to the author name. This means that the sort for a string like Acme

View File

@ -653,6 +653,15 @@ class KOBO(USBMS):
debug_print(' Commit: Set FavouritesIndex')
def update_device_database_collections(self, booklists, collections_attributes, oncard):
# Only process categories in this list
supportedcategories = {
"Im_Reading":1,
"Read":2,
"Closed":3,
"Shortlist":4,
# "Preview":99, # Unsupported as we don't want to change it
}
# Define lists for the ReadStatus
readstatuslist = {
"Im_Reading":1,
@ -692,6 +701,7 @@ class KOBO(USBMS):
# Process any collections that exist
for category, books in collections.items():
if category in supportedcategories:
debug_print("Category: ", category, " id = ", readstatuslist.get(category))
for book in books:
debug_print(' Title:', book.title, 'category: ', category)

View File

@ -368,7 +368,10 @@ OptionRecommendation(name='remove_paragraph_spacing_indent_size',
recommended_value=1.5, level=OptionRecommendation.LOW,
help=_('When calibre removes blank lines between paragraphs, it automatically '
'sets a paragraph indent, to ensure that paragraphs can be easily '
'distinguished. This option controls the width of that indent (in em).')
'distinguished. This option controls the width of that indent (in em). '
'If you set this value to 0, then the indent specified in the input '
'document is used, unless you also set the insert line between '
'paragraphs option.')
),
OptionRecommendation(name='prefer_metadata_cover',
@ -394,8 +397,9 @@ OptionRecommendation(name='insert_blank_line_size',
OptionRecommendation(name='remove_first_image',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Remove the first image from the input ebook. Useful if the '
'first image in the source file is a cover and you are specifying '
'an external cover.'
'input document has a cover image that is not identified as a cover. '
'In this case, if you set a cover in calibre, the output document will '
'end up with two cover images if you do not specify this option.'
)
),

View File

@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
Input plugin for HTML or OPF ebooks.
'''
import os, re, sys, uuid, tempfile
import os, re, sys, uuid, tempfile, errno
from urlparse import urlparse, urlunparse
from urllib import unquote
from functools import partial
@ -75,7 +75,7 @@ class IgnoreFile(Exception):
def __init__(self, msg, errno):
Exception.__init__(self, msg)
self.doesnt_exist = errno == 2
self.doesnt_exist = errno == errno.ENOENT
self.errno = errno
class HTMLFile(object):

View File

@ -65,20 +65,27 @@ def author_to_author_sort(author, method=None):
suffixes = set([x.lower() for x in tweaks['author_name_suffixes']])
suffixes |= set([x+u'.' for x in suffixes])
last = tokens[-1].lower()
suffix = None
if last in suffixes:
suffix = tokens[-1]
tokens = tokens[:-1]
suffix = u''
while True:
if not tokens:
return author
last = tokens[-1].lower()
if last in suffixes:
suffix = tokens[-1] + ' ' + suffix
tokens = tokens[:-1]
else:
break
suffix = suffix.strip()
if method == u'comma' and u',' in u''.join(tokens):
return author
atokens = tokens[-1:] + tokens[:-1]
num_toks = len(atokens)
if suffix:
atokens.append(suffix)
if method != u'nocomma' and len(atokens) > 1:
if method != u'nocomma' and num_toks > 1:
atokens[0] += u','
return u' '.join(atokens)
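
In short, the change above makes suffix stripping iterative, so several trailing suffixes ('Jr.', 'PhD') are all collected before the surname is rotated to the front, and the comma is decided by the token count before the suffix is re-appended. A standalone sketch of the same logic, using an assumed suffix set instead of the 'author_name_suffixes' tweak:

def author_sort(author, suffixes=('jr', 'sr', 'phd')):
    # The suffix set here is an assumption; calibre reads it from the
    # 'author_name_suffixes' tweak and also accepts trailing periods.
    sfx = set(s.lower() for s in suffixes)
    sfx |= set(s + '.' for s in sfx)
    tokens = author.split()
    suffix = ''
    while tokens and tokens[-1].lower() in sfx:
        suffix = tokens[-1] + ' ' + suffix
        tokens = tokens[:-1]
    if not tokens:
        return author                     # every token was a suffix: leave untouched
    atokens = tokens[-1:] + tokens[:-1]   # rotate the surname to the front
    num_toks = len(atokens)
    if suffix:
        atokens.append(suffix.strip())
    if num_toks > 1:                      # comma only when there was a first name
        atokens[0] += ','
    return ' '.join(atokens)

print(author_sort('John Smith Jr. PhD'))   # -> Smith, John Jr. PhD
print(author_sort('Plato'))                # -> Plato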

View File

@ -61,6 +61,13 @@ class MobiWriter(object):
def __call__(self, oeb, path_or_stream):
self.log = oeb.log
pt = None
if oeb.metadata.publication_type:
x = unicode(oeb.metadata.publication_type[0]).split(':')
if len(x) > 1:
pt = x[1].lower()
self.publication_type = pt
if hasattr(path_or_stream, 'write'):
return self.dump_stream(oeb, path_or_stream)
with open(path_or_stream, 'w+b') as stream:
@ -346,12 +353,14 @@ class MobiWriter(object):
bt = 0x002
if self.primary_index_record_idx is not None:
if self.indexer.is_flat_periodical:
if False and self.indexer.is_flat_periodical:
# Disabled as setting this to 0x102 causes the Kindle to not
# auto archive the issues
bt = 0x102
elif self.indexer.is_periodical:
# If you change this, remember to change the cdetype in the EXTH
# header as well
bt = 0x103
bt = {'newspaper':0x101}.get(self.publication_type, 0x103)
record0.write(pack(b'>IIIII',
0xe8, bt, 65001, uid, 6))
@ -525,15 +534,16 @@ class MobiWriter(object):
nrecs += 1
# Write cdetype
if self.is_periodical:
# If you set the book type header field to 0x101 use NWPR here if
# you use 0x103 use MAGZ
data = b'MAGZ'
if not self.is_periodical:
exth.write(pack(b'>II', 501, 12))
exth.write(b'EBOK')
nrecs += 1
else:
data = b'EBOK'
exth.write(pack(b'>II', 501, len(data)+8))
exth.write(data)
nrecs += 1
# Should be b'NWPR' for doc type of 0x101 and b'MAGZ' for doctype
# of 0x103 but the old writer didn't write them, and I don't know
# what it should be for type 0x102 (b'BLOG'?) so write nothing
# instead
pass
# Add a publication date entry
if oeb.metadata['date']:

View File

@ -160,7 +160,9 @@ class Serializer(object):
buf.write(b'title="')
self.serialize_text(ref.title, quot=True)
buf.write(b'" ')
if ref.title == 'start':
if (ref.title.lower() == 'start' or
(ref.type and ref.type.lower() in ('start',
'other.start'))):
self._start_href = ref.href
self.serialize_href(ref.href)
# Space required or won't work, I kid you not
@ -348,8 +350,9 @@ class Serializer(object):
'''
buf = self.buf
id_offsets = self.id_offsets
start_href = getattr(self, '_start_href', None)
for href, hoffs in self.href_offsets.items():
is_start = (href and href == getattr(self, '_start_href', None))
is_start = (href and href == start_href)
# Iterate over all filepos items
if href not in id_offsets:
self.logger.warn('Hyperlink target %r not found' % href)

View File

@ -320,9 +320,11 @@ class CSSFlattener(object):
if self.context.insert_blank_line:
cssdict['margin-top'] = cssdict['margin-bottom'] = \
'%fem'%self.context.insert_blank_line_size
if (self.context.remove_paragraph_spacing and
indent_size = self.context.remove_paragraph_spacing_indent_size
keep_indents = indent_size == 0.0 and not self.context.insert_blank_line
if (self.context.remove_paragraph_spacing and not keep_indents and
cssdict.get('text-align', None) not in ('center', 'right')):
cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size
cssdict['text-indent'] = "%1.1fem" % indent_size
if cssdict:
items = cssdict.items()
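
In other words, the new keep_indents test above preserves the indent coming from the input document only when the indent size option is 0 and inserted blank lines are not requested; a tiny illustrative sketch of that decision:

def keep_document_indents(indent_size, insert_blank_line):
    # True means the text-indent from the input document is left alone
    return indent_size == 0.0 and not insert_blank_line

print(keep_document_indents(0.0, False))  # True: input document indent preserved
print(keep_document_indents(0.0, True))   # False: calibre sets its own indent
print(keep_document_indents(1.5, False))  # False: calibre sets a 1.5em indent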

View File

@ -53,7 +53,7 @@ def pdftohtml(output_dir, pdf_path, no_images):
p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
stdin=subprocess.PIPE)
except OSError as err:
if err.errno == 2:
if err.errno == errno.ENOENT:
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
else:
raise
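
This hunk, like the related ones below, replaces magic errno numbers (2, 13) with the symbolic constants from the errno module; an illustrative standalone sketch of the pattern:

import errno

def classify(err):
    # compare against symbolic constants instead of magic numbers like 2 or 13
    if err.errno == errno.ENOENT:
        return 'does not exist'
    if err.errno == errno.EACCES:
        return 'permission denied'
    return 'other OS error'

try:
    open('/no/such/directory/file.txt')
except (IOError, OSError) as err:
    print(classify(err))   # -> does not exist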

View File

@ -1,3 +1,8 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
import re, sys
from collections import defaultdict
@ -72,10 +77,15 @@ class Document:
self.options[k] = v
self.html = None
self.log = log
self.keep_elements = set()
def _html(self, force=False):
if force or self.html is None:
self.html = self._parse(self.input)
path = self.options['keep_elements']
if path is not None:
self.keep_elements = set(self.html.xpath(path))
return self.html
def _parse(self, input):
@ -152,8 +162,9 @@ class Document:
append = False
if sibling is best_elem:
append = True
sibling_key = sibling #HashableElement(sibling)
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
append = True
if sibling in self.keep_elements:
append = True
if sibling.tag == "p":
@ -283,6 +294,8 @@ class Document:
def remove_unlikely_candidates(self):
for elem in self.html.iter():
if elem in self.keep_elements:
continue
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
@ -337,7 +350,7 @@ class Document:
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"):
if el in allowed:
if el in allowed or el in self.keep_elements:
continue
weight = self.class_weight(el)
if el in candidates:
@ -450,64 +463,39 @@ class Document:
#self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
for el in ([node] + [n for n in node.iter()]):
if not (self.options['attributes']):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
return clean_attributes(tounicode(node))
def option_parser():
from calibre.utils.config import OptionParser
parser = OptionParser(usage='%prog: [options] file')
parser.add_option('-v', '--verbose', default=False, action='store_true',
dest='verbose',
help='Show detailed output information. Useful for debugging')
parser.add_option('-k', '--keep-elements', default=None, action='store',
dest='keep_elements',
help='XPath specifying elements that should not be removed')
class HashableElement():
def __init__(self, node):
self.node = node
self._path = None
def _get_path(self):
if self._path is None:
reverse_path = []
node = self.node
while node is not None:
node_id = (node.tag, tuple(node.attrib.items()), node.text)
reverse_path.append(node_id)
node = node.getparent()
self._path = tuple(reverse_path)
return self._path
path = property(_get_path)
def __hash__(self):
return hash(self.path)
def __eq__(self, other):
return self.path == other.path
def __getattr__(self, tag):
return getattr(self.node, tag)
return parser
def main():
import logging
from optparse import OptionParser
parser = OptionParser(usage="%prog: [options] [file]")
parser.add_option('-v', '--verbose', action='store_true')
parser.add_option('-u', '--url', help="use URL instead of a local file")
(options, args) = parser.parse_args()
from calibre.utils.logging import default_log
parser = option_parser()
options, args = parser.parse_args()
if not (len(args) == 1 or options.url):
if len(args) != 1:
parser.print_help()
sys.exit(1)
logging.basicConfig(level=logging.INFO)
raise SystemExit(1)
with open(args[0], 'rb') as f:
raw = f.read()
file = None
if options.url:
import urllib
file = urllib.urlopen(options.url)
else:
file = open(args[0], 'rt')
enc = sys.__stdout__.encoding or 'utf-8'
try:
print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
finally:
file.close()
if options.verbose:
default_log.filter_level = default_log.DEBUG
print (Document(raw, default_log,
debug=options.verbose,
keep_elements=options.keep_elements).summary().encode(enc,
'replace'))
if __name__ == '__main__':
main()
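
The refactored module can also be driven directly with the new keep_elements option; a sketch under the assumption that the module lives at calibre.ebooks.readability.readability and that article.html exists locally:

from calibre.utils.logging import default_log
from calibre.ebooks.readability.readability import Document  # assumed module path

with open('article.html', 'rb') as f:   # hypothetical local file
    raw = f.read()
doc = Document(raw, default_log, keep_elements='//div[@id="article-image"]')
print(doc.summary())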

View File

@ -142,7 +142,7 @@ def _config(): # {{{
c.add_opt('upload_news_to_device', default=True,
help=_('Upload downloaded news to device'))
c.add_opt('delete_news_from_library_on_upload', default=False,
help=_('Delete books from library after uploading to device'))
help=_('Delete news books from library after uploading to device'))
c.add_opt('separate_cover_flow', default=False,
help=_('Show the cover flow in a separate window instead of in the main calibre window'))
c.add_opt('disable_tray_notification', default=False,

View File

@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
import os
from functools import partial
from threading import Thread
from contextlib import closing
from PyQt4.Qt import QToolButton
@ -52,7 +53,13 @@ class Worker(Thread): # {{{
def doit(self):
from calibre.library.database2 import LibraryDatabase2
newdb = LibraryDatabase2(self.loc)
newdb = LibraryDatabase2(self.loc, is_second_db=True)
with closing(newdb):
self._doit(newdb)
newdb.break_cycles()
del newdb
def _doit(self, newdb):
for i, x in enumerate(self.ids):
mi = self.db.get_metadata(x, index_is_id=True, get_cover=True,
cover_as_data=True)
@ -111,6 +118,7 @@ class Worker(Thread): # {{{
os.remove(path)
except:
pass
# }}}
class CopyToLibraryAction(InterfaceAction):

View File

@ -443,7 +443,13 @@ class Editor(QFrame): # {{{
return QWidget.keyPressEvent(self, ev)
button = getattr(self, 'button%d'%which)
button.setStyleSheet('QPushButton { font-weight: normal}')
sequence = QKeySequence(code|(int(ev.modifiers())&~Qt.KeypadModifier))
mods = int(ev.modifiers()) & ~Qt.KeypadModifier
txt = unicode(ev.text())
if txt and txt.lower() == txt.upper():
# We have a symbol like ! or > etc. In this case the value of code
# already includes Shift, so remove it
mods &= ~Qt.ShiftModifier
sequence = QKeySequence(code|mods)
button.setText(sequence.toString(QKeySequence.NativeText))
self.capture = 0
dup_desc = self.dup_check(sequence)

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import textwrap, re, os
import textwrap, re, os, errno
from PyQt4.Qt import (Qt, QDateEdit, QDate, pyqtSignal, QMessageBox,
QIcon, QToolButton, QWidget, QLabel, QGridLayout, QApplication,
@ -98,7 +98,7 @@ class TitleEdit(EnLineEdit):
getattr(db, 'set_'+ self.TITLE_ATTR)(id_, title, notify=False,
commit=False)
except (IOError, OSError) as err:
if getattr(err, 'errno', -1) == 13: # Permission denied
if getattr(err, 'errno', -1) == errno.EACCES: # Permission denied
import traceback
fname = err.filename if err.filename else 'file'
error_dialog(self, _('Permission denied'),
@ -262,7 +262,7 @@ class AuthorsEdit(MultiCompleteComboBox):
self.books_to_refresh |= db.set_authors(id_, authors, notify=False,
allow_case_change=True)
except (IOError, OSError) as err:
if getattr(err, 'errno', -1) == 13: # Permission denied
if getattr(err, 'errno', -1) == errno.EACCES: # Permission denied
import traceback
fname = err.filename if err.filename else 'file'
error_dialog(self, _('Permission denied'),

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
import os, errno
from functools import partial
from PyQt4.Qt import (Qt, QVBoxLayout, QHBoxLayout, QWidget, QPushButton,
@ -427,7 +427,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.books_to_refresh |= getattr(widget, 'books_to_refresh',
set([]))
except IOError as err:
if err.errno == 13: # Permission denied
if err.errno == errno.EACCES: # Permission denied
import traceback
fname = err.filename if err.filename else 'file'
error_dialog(self, _('Permission denied'),

View File

@ -161,7 +161,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
return path and os.path.exists(os.path.join(path, 'metadata.db'))
def __init__(self, library_path, row_factory=False, default_prefs=None,
read_only=False):
read_only=False, is_second_db=False):
self.is_second_db = is_second_db
try:
if isbytestring(library_path):
library_path = library_path.decode(filesystem_encoding)
@ -263,7 +264,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
migrate_preference('user_categories', {})
migrate_preference('saved_searches', {})
set_saved_searches(self, 'saved_searches')
if not self.is_second_db:
set_saved_searches(self, 'saved_searches')
# migrate grouped_search_terms
if self.prefs.get('grouped_search_terms', None) is None:

View File

@ -34,7 +34,7 @@ class DispatchController(object): # {{{
def __init__(self, prefix, wsgi=False):
self.dispatcher = cherrypy.dispatch.RoutesDispatcher()
self.funcs = []
self.seen = set([])
self.seen = set()
self.prefix = prefix if prefix else ''
if wsgi:
self.prefix = ''
@ -146,6 +146,11 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
self.config = {}
self.is_running = False
self.exception = None
#self.config['/'] = {
# 'tools.sessions.on' : True,
# 'tools.sessions.timeout': 60, # Session times out after 60 minutes
#}
if not wsgi:
self.setup_loggers()
cherrypy.engine.bonjour.subscribe()
@ -154,6 +159,7 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
'tools.gzip.mime_types': ['text/html', 'text/plain',
'text/xml', 'text/javascript', 'text/css'],
}
if opts.password:
self.config['/'] = {
'tools.digest_auth.on' : True,

View File

@ -202,7 +202,7 @@ class ContentServer(object):
mode='rb')
if fmt is None:
raise cherrypy.HTTPError(404, 'book: %d does not have format: %s'%(id, format))
mi = self.db.get_metadata(id, index_is_id=True)
mi = newmi = self.db.get_metadata(id, index_is_id=True)
if format == 'EPUB':
# Get the original metadata
@ -214,9 +214,8 @@ class ContentServer(object):
# Transform the metadata via the plugboard
newmi = mi.deepcopy_metadata()
newmi.template_to_attribute(mi, cpb)
else:
newmi = mi
if format in ('MOBI', 'EPUB'):
# Write the updated file
from calibre.ebooks.metadata.meta import set_metadata
set_metadata(fmt, newmi, 'epub')

View File

@ -277,12 +277,15 @@ class MobileServer(object):
cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
url_base = "/mobile?search=" + search+";order="+order+";sort="+sort+";num="+str(num)
return html.tostring(build_index(books, num, search, sort, order,
raw = html.tostring(build_index(books, num, search, sort, order,
start, len(ids), url_base, CKEYS,
self.opts.url_prefix),
encoding='utf-8', include_meta_content_type=True,
encoding='utf-8',
pretty_print=True)
# tostring's include_meta_content_type is broken
raw = raw.replace('<head>', '<head>\n'
'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
return raw
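
The workaround above serializes with lxml and then injects the charset <meta> tag by hand, because tostring's include_meta_content_type proved unreliable; a standalone sketch of the same trick:

from lxml import html

root = html.fromstring('<html><head><title>t</title></head>'
                       '<body><p>hello</p></body></html>')
raw = html.tostring(root, encoding='utf-8', pretty_print=True)
# splice the charset meta tag in manually
raw = raw.replace(b'<head>',
    b'<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
print(raw.decode('utf-8'))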

View File

@ -28,6 +28,10 @@ class Browser(B):
B.set_cookiejar(self, *args, **kwargs)
self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
@property
def cookiejar(self):
return self._clone_actions['set_cookiejar'][1][0]
def set_handle_redirect(self, *args, **kwargs):
B.set_handle_redirect(self, *args, **kwargs)
self._clone_actions['set_handle_redirect'] = ('set_handle_redirect',

View File

@ -125,6 +125,7 @@ _extra_lang_codes = {
'en_HR' : _('English (Croatia)'),
'en_ID' : _('English (Indonesia)'),
'en_IL' : _('English (Israel)'),
'en_RU' : _('English (Russia)'),
'en_SG' : _('English (Singapore)'),
'en_YE' : _('English (Yemen)'),
'en_IE' : _('English (Ireland)'),

View File

@ -16,11 +16,11 @@ methods :method:`SearchQueryParser.universal_set` and
If this module is run, it will perform a series of unit tests.
'''
import sys, operator
import sys, operator, weakref
from calibre.utils.pyparsing import CaselessKeyword, Group, Forward, \
CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral, \
Optional, NoMatch, ParseException, QuotedString
from calibre.utils.pyparsing import (CaselessKeyword, Group, Forward,
CharsNotIn, Suppress, OneOrMore, MatchFirst, CaselessLiteral,
Optional, NoMatch, ParseException, QuotedString)
from calibre.constants import preferred_encoding
from calibre.utils.icu import sort_key
from calibre import prints
@ -37,11 +37,19 @@ class SavedSearchQueries(object):
def __init__(self, db, _opt_name):
self.opt_name = _opt_name;
self.db = db
if db is not None:
self.queries = db.prefs.get(self.opt_name, {})
else:
self.queries = {}
try:
self._db = weakref.ref(db)
except:
# db could be None
self._db = lambda : None
@property
def db(self):
return self._db()
def force_unicode(self, x):
if not isinstance(x, unicode):
@ -49,21 +57,27 @@ class SavedSearchQueries(object):
return x
def add(self, name, value):
self.queries[self.force_unicode(name)] = self.force_unicode(value).strip()
self.db.prefs[self.opt_name] = self.queries
db = self.db
if db is not None:
self.queries[self.force_unicode(name)] = self.force_unicode(value).strip()
db.prefs[self.opt_name] = self.queries
def lookup(self, name):
return self.queries.get(self.force_unicode(name), None)
def delete(self, name):
self.queries.pop(self.force_unicode(name), False)
self.db.prefs[self.opt_name] = self.queries
db = self.db
if db is not None:
self.queries.pop(self.force_unicode(name), False)
db.prefs[self.opt_name] = self.queries
def rename(self, old_name, new_name):
self.queries[self.force_unicode(new_name)] = \
self.queries.get(self.force_unicode(old_name), None)
self.queries.pop(self.force_unicode(old_name), False)
self.db.prefs[self.opt_name] = self.queries
db = self.db
if db is not None:
self.queries[self.force_unicode(new_name)] = \
self.queries.get(self.force_unicode(old_name), None)
self.queries.pop(self.force_unicode(old_name), False)
db.prefs[self.opt_name] = self.queries
def names(self):
return sorted(self.queries.keys(),key=sort_key)
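
The db attribute is now held through weakref.ref, so a SavedSearchQueries instance no longer keeps a closed database alive: the property returns None once the database object is gone, and each mutator checks for that. A tiny standalone demonstration of the weakref behaviour (not calibre code):

import weakref

class FakeDB(object):
    pass

db = FakeDB()
ref = weakref.ref(db)
print(ref() is db)   # True while the object is alive
del db               # on CPython the object is freed immediately
print(ref())         # None once the referent has been collected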

View File

@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe):
#: manually (though manual cleanup will always be superior).
auto_cleanup = False
#: Specify elements that the auto cleanup algorithm should never remove
#: The syntax is an XPath expression. For example::
#:
#: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
#: id="article-image"
#: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
#: with class="important"
#: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
#: will keep all divs with id="article-image" and spans
#: with class="important"
auto_cleanup_keep = None
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
#: It will be inserted into `<style>` tags, just before the closing
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
@ -552,7 +564,8 @@ class BasicNewsRecipe(Recipe):
from lxml.html import (fragment_fromstring, tostring,
document_fromstring)
doc = readability.Document(html, self.log, url=url)
doc = readability.Document(html, self.log, url=url,
keep_elements=self.auto_cleanup_keep)
article_html = doc.summary()
extracted_title = doc.title()
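
As the hunk above shows, auto_cleanup_keep is handed straight to readability as keep_elements. A minimal sketch of a recipe using it, with a made-up class name, feed title and URL:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    title = 'Example News'
    auto_cleanup = True
    # elements matching this XPath survive the auto cleanup algorithm
    auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
    feeds = [('News', 'http://example.com/rss.xml')]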

View File

@ -22,7 +22,7 @@ E = ElementMaker(namespace=NS, nsmap={None:NS})
def iterate_over_builtin_recipe_files():
exclude = ['craigslist', 'iht', 'toronto_sun',
'india_today', 'livemint']
'livemint']
d = os.path.dirname
base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'recipes')
for f in os.listdir(base):

View File

@ -75,7 +75,7 @@ MD5_SESS = "MD5-sess"
AUTH = "auth"
AUTH_INT = "auth-int"
SUPPORTED_ALGORITHM = ('md5', MD5, MD5_SESS)
SUPPORTED_ALGORITHM = ('md5', MD5, MD5_SESS) # Changed by Kovid
SUPPORTED_QOP = (AUTH, AUTH_INT)
################################################################################
@ -83,7 +83,7 @@ SUPPORTED_QOP = (AUTH, AUTH_INT)
#
DIGEST_AUTH_ENCODERS = {
MD5: lambda val: md5(val).hexdigest(),
'md5': lambda val:md5(val).hexdigest(),
'md5': lambda val:md5(val).hexdigest(), # Added by Kovid
MD5_SESS: lambda val: md5(val).hexdigest(),
# SHA: lambda val: sha(val).hexdigest(),
}
@ -225,7 +225,7 @@ def _A1(params, password):
algorithm = params.get ("algorithm", MD5)
H = DIGEST_AUTH_ENCODERS[algorithm]
if algorithm in (MD5, 'md5'):
if algorithm in (MD5, 'md5'): # Changed by Kovid
# If the "algorithm" directive's value is "MD5" or is
# unspecified, then A1 is:
# A1 = unq(username-value) ":" unq(realm-value) ":" passwd

View File

@ -671,8 +671,9 @@ def set_response_cookie(path=None, path_header=None, name='session_id',
# save it to disk and the session is lost if people close
# the browser. So we have to use the old "expires" ... sigh ...
## cookie[name]['max-age'] = timeout * 60
if timeout:
cookie[name]['expires'] = http.HTTPDate(time.time() + (timeout * 60))
if False and timeout: # Changed by Kovid, we want the user to have to
# re-authenticate on browser restart
cookie[name]['expires'] = http.HTTPDate(time.time() + timeout)
if domain is not None:
cookie[name]['domain'] = domain
if secure:

View File

@ -241,10 +241,10 @@ def wait_for_free_port(host, port):
for trial in xrange(50):
try:
# we are expecting a free port, so reduce the timeout
check_port(host, port, timeout=0.2)
check_port(host, port, timeout=0.2) # Changed by Kovid
except IOError:
# Give the old server thread time to free the port.
time.sleep(0.2)
time.sleep(0.2) # Changed by Kovid
else:
return