Merge from trunk

commit f9b6ce9470
Author: Charles Haley
Date:   2011-12-16 09:24:15 +01:00

156 changed files with 54033 additions and 20129 deletions


@@ -2,6 +2,8 @@
 .check-cache.pickle
 src/calibre/plugins
 resources/images.qrc
+src/calibre/ebooks/oeb/display/test/*.js
+resources/display/*.js
 src/calibre/manual/.build/
 src/calibre/manual/cli/
 src/calibre/manual/template_ref.rst


@@ -19,6 +19,65 @@
 # new recipes:
 #  - title:

+- version: 0.8.31
+  date: 2011-12-16
+
+  new features:
+    - title: "Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness."
+      tickets: [901466]
+
+    - title: "Driver for PocketBook 611 and Lenovo IdeaPad"
+
+    - title: "Allow customization of the order in which custom column editing is performed in the edit metadata dialog. Setting is available via Preferences->Tweaks."
+      tickets: [902731]
+
+    - title: "MOBI news download: Allow recipes to set a thumbnail for entries in the periodical table of contents. Currently used by the NYTimes, WSJ, Independent, Guardian and Globe and Mail recipes"
+      tickets: [900130]
+
+    - title: "E-book viewer: Add an option to the right click menu to search for the currently selected word"
+
+    - title: "Automatically hide the no internet connection available error message if the connection is restored before the user clicks OK"
+
+  bug fixes:
+    - title: "Fix comments not being hidden in the Book details panel when they are turned off via Preferences->Look & Feel->Book Details"
+
+    - title: "E-book viewer: Do not popup an error message if the user tries to use the mouse wheel to scroll before a document is loaded."
+      tickets: [903449]
+
+    - title: "Add docx to the list of ebook extensions."
+      tickets: [903452]
+
+    - title: "When downloading metadata from non-English Amazon websites, do not correct the case of book titles."
+
+    - title: "Fix regression in 0.8.30 that broke bulk conversion of a single book."
+      tickets: [902506]
+
+    - title: "When minimized to system tray do not display the no internet connection error as a dialog box, instead use a system tray notification"
+
+    - title: "Catalog generation: Include the series_index field for custom series columns as well"
+
+    - title: "Comic Input: Do not rescale images when using the Tablet output profile (or any output profile with a screen size larger than 3000x3000)"
+
+    - title: "HTML Input: Ignore unparseable URLs instead of crashing on them."
+      tickets: [902372]
+
+  improved recipes:
+    - La Republica
+    - CND
+    - Berliner Zeitung
+    - Zaman Gazetesi
+
+  new recipes:
+    - title: CND Weekly
+      author: Derek Liang
+
+    - title: descopera.org
+      author: Marius Ignatescu
+
+    - title: Rynek Zdrowia
+      author: spi630

 - version: 0.8.30
   date: 2011-12-09
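Several recipe hunks below wire the new TOC-thumbnail feature up with the same pattern: override populate_article_metadata and hand the first usable <img> on the page to add_toc_thumbnail. A minimal sketch of the pattern, extracted from those hunks (the class name here is illustrative; the hasattr guard keeps the recipe loadable on older calibre builds that lack the new API):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):

        def populate_article_metadata(self, article, soup, first):
            # Only the first page of an article picks the thumbnail, and
            # only when this calibre build supports TOC thumbnails at all
            if first and hasattr(self, 'add_toc_thumbnail'):
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])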


@@ -1,61 +1,44 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-import re
+
+'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''

 class SportsIllustratedRecipe(BasicNewsRecipe) :
-    __author__    = 'ape'
-    __copyright__ = 'ape'
+    __author__    = 'a.peter'
+    __copyright__ = 'a.peter'
     __license__   = 'GPL v3'
     language      = 'de'
-    description   = 'Berliner Zeitung'
-    version       = 2
+    description   = 'Berliner Zeitung RSS'
+    version       = 4
     title         = u'Berliner Zeitung'
     timefmt       = ' [%d.%m.%Y]'
-    #oldest_article = 7.0
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = False
     publication_type = 'newspaper'

-    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
+    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
+    remove_tags_after = [dict(id='article_text')]

-    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'
-
-    def parse_index(self):
-        base = 'http://www.berlinonline.de'
-        answer = []
-        articles = {}
-        more = 1
-
-        soup = self.index_to_soup(self.INDEX)
-
-        # Get list of links to ressorts from index page
-        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
-        for ressort in ressort_list[0].findAll('a'):
-            feed_title = ressort.string
-            print 'Analyzing', feed_title
-            if not articles.has_key(feed_title):
-                articles[feed_title] = []
-            answer.append(feed_title)
-            # Load ressort page.
-            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
-            # find mainbar div which contains the list of all articles
-            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
-                # iterate over all articles
-                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
-                    # extract title of article
-                    if article_teaser.h3 != None:
-                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
-                        articles[feed_title].append(article)
-                    else:
-                        # Skip teasers for missing photos
-                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
-                            continue
-                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
-                        articles[feed_title].append(article)
-                        more += 1
-        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
-        return answer
+    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
+             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
+             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
+             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
+             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
+             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
+             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
+             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
+             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
+             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
+             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
+             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
+             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
+             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
+             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]

     def get_masthead_url(self):
-        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
+        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
+
+    def print_version(self, url):
+        return url.replace('.html', ',view,printVersion.html')


@@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
     remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
     no_stylesheets = True

-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
+                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
+                         ]

     def print_version(self, url):
         if url.find('news/article.php') >= 0:
@@ -46,13 +48,15 @@ class TheCND(BasicNewsRecipe):
             title = self.tag_to_string(a)
             self.log('\tFound article: ', title, 'at', url)
             date = a.nextSibling
+            if re.search('cm', date):
+                continue
             if (date is not None) and len(date)>2:
                 if not articles.has_key(date):
                     articles[date] = []
                 articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                 self.log('\t\tAppend to : ', date)

-        self.log('log articles', articles)
+        #self.log('log articles', articles)
         mostCurrent = sorted(articles).pop()
         self.title = 'CND ' + mostCurrent

recipes/cnd_weekly.recipe (new file, 72 lines)

@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
+'''
+cnd.org
+'''
+
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheCND(BasicNewsRecipe):
+
+    title = 'CND Weekly'
+    __author__ = 'Derek Liang'
+    description = ''
+    INDEX = 'http://cnd.org'
+    language = 'zh'
+    conversion_options = {'linearize_tables':True}
+
+    remove_tags_before = dict(name='div', id='articleHead')
+    remove_tags_after  = dict(id='copyright')
+    remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
+    no_stylesheets = True
+
+    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
+                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
+                         ]
+
+    def print_version(self, url):
+        if url.find('news/article.php') >= 0:
+            return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
+        else:
+            return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.INDEX)
+
+        feeds = []
+        articles = {}
+
+        for a in soup.findAll('a', attrs={'target':'_cnd'}):
+            url = a['href']
+            if url.find('article.php') < 0 :
+                continue
+            if url.startswith('/'):
+                url = 'http://cnd.org'+url
+            title = self.tag_to_string(a)
+            date = a.nextSibling
+            if not re.search('cm', date):
+                continue
+            self.log('\tFound article: ', title, 'at', url, '@', date)
+            if (date is not None) and len(date)>2:
+                if not articles.has_key(date):
+                    articles[date] = []
+                articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
+                self.log('\t\tAppend to : ', date)
+
+        sorted_articles = sorted(articles)
+        while sorted_articles:
+            mostCurrent = sorted_articles.pop()
+            self.title = 'CND ' + mostCurrent
+            feeds.append((self.title, articles[mostCurrent]))
+
+        return feeds
+
+    def populate_article_metadata(self, article, soup, first):
+        header = soup.find('h3')
+        self.log('header: ' + self.tag_to_string(header))
+        pass
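The only functional difference from the daily CND recipe above is the date filter: this weekly variant keeps only the entries whose date marker matches 'cm', while the daily recipe was just changed to skip exactly those. A tiny illustration (the sample date strings are made up, and what 'cm' denotes on cnd.org is an assumption here):

    import re

    for date in ('12/16/11', '12/16/11 cm'):
        # mirrors the re.search('cm', date) test used by both recipes
        print date, '->', 'weekly digest' if re.search('cm', date) else 'daily'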


@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 '''
 descopera.org
 '''


@@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
                     {'class':['articleTools', 'pagination', 'Ads', 'topad',
                               'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     #Use the mobile version rather than the web version
     def print_version(self, url):
         return url.rpartition('?')[0] + '?service=mobile'


@@ -79,6 +79,12 @@ class Guardian(BasicNewsRecipe):
                 url = None
         return url

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def preprocess_html(self, soup):
         # multiple html sections in soup, useful stuff in the first


@@ -104,6 +104,13 @@ class TheIndependentNew(BasicNewsRecipe):
                 url = None
         return url

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def preprocess_html(self, soup):
         #remove 'advertorial articles'


@@ -1,13 +1,12 @@
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
 __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
-description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
+description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

 '''
 http://www.repubblica.it/
 '''

-import re
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe

@@ -33,12 +32,6 @@ class LaRepubblica(BasicNewsRecipe):
     remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']

-    preprocess_regexps = [
-        (re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
-        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
-    ]
-
     def get_article_url(self, article):
         link = BasicNewsRecipe.get_article_url(self, article)
         if link and not '.repubblica.it/' in link:

@@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
     remove_tags = [
                    dict(name=['object','link','meta','iframe','embed']),
                    dict(name='span',attrs={'class':'linkindice'}),
-                   dict(name='div', attrs={'class':'bottom-mobile'}),
-                   dict(name='div', attrs={'id':['rssdiv','blocco']}),
-                   dict(name='div', attrs={'class':'utility'}),
+                   dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
+                   dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
+                   dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
                    dict(name='div', attrs={'class':'generalbox'}),
                    dict(name='ul', attrs={'id':'hystory'})
                   ]

     feeds = [
-             (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
+             (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
              (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
              (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
              (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),

@@ -110,3 +103,5 @@ class LaRepubblica(BasicNewsRecipe):
                 del item['style']
         return soup
+
+    def preprocess_raw_html(self, raw, url):
+        return '<html><head>'+raw[raw.find('</head>'):]
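The new preprocess_raw_html replaces the three head-cleaning regexps removed above: rather than patching up the messy markup around <title> piecemeal, it discards everything up to the closing </head>. A quick demonstration of the effect (sample markup invented; note the title is thrown away too):

    raw = '<html><head><junk/><title>t</title></head><body>x</body></html>'
    print '<html><head>' + raw[raw.find('</head>'):]
    # -> <html><head></head><body>x</body></html>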


@@ -1,5 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''

@@ -707,6 +707,16 @@ class NYTimes(BasicNewsRecipe):
         return soup

     def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
+            if idxdiv is not None:
+                if idxdiv.img:
+                    self.add_toc_thumbnail(article, idxdiv.img['src'])
+            else:
+                img = soup.find('img')
+                if img is not None:
+                    self.add_toc_thumbnail(article, img['src'])
         shortparagraph = ""
         try:
             if len(article.text_summary.strip()) == 0:


@@ -855,6 +855,16 @@ class NYTimes(BasicNewsRecipe):
         return soup

     def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
+            if idxdiv is not None:
+                if idxdiv.img:
+                    self.add_toc_thumbnail(article, idxdiv.img['src'])
+            else:
+                img = soup.find('img')
+                if img is not None:
+                    self.add_toc_thumbnail(article, img['src'])
         shortparagraph = ""
         try:
             if len(article.text_summary.strip()) == 0:


@@ -12,39 +12,39 @@ class Sueddeutsche(BasicNewsRecipe):
     title = u'sueddeutsche.de'
     description = 'News from Germany'
-    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-11-25
+    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-12-16
     use_embedded_content = False
     timefmt = ' [%d %b %Y]'
-    oldest_article = 7
-    max_articles_per_feed = 50
+    oldest_article = 1#7
+    max_articles_per_feed = 2#50
     no_stylesheets = True
     language = 'de'
+    auto_cleanup = True
     encoding = 'utf-8'
     remove_javascript = True
-    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1219199.1322239289!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-11-25 AGe
+    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1236175.1323967473!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-12-16 AGe

-    remove_tags = [ dict(name='link'), dict(name='iframe'),
-                    dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
-                                                  "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),
-                    dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
-                                                     "pages closed","basebox right narrow","headslot galleried"]}),
-                    dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
-                                                     "item","videoBigButton","articlefooter full-column",
-                                                     "bildbanderolle full-column","footerCopy padleft5"]}),
-                    dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
-                    dict(name='div', attrs={'style':["position:relative;"]}),
-                    dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
-                    dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
-                    dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
-                    dict(name='td', attrs={'class':["artikelDruckenRight"]}),
-                    dict(name='p', text = "ANZEIGE")
-                  ]
-    remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]
+#   remove_tags = [ dict(name='link'), dict(name='iframe'),
+#                   dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
+#                                                 "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),
+#
+#                   dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
+#                                                    "pages closed","basebox right narrow","headslot galleried"]}),
+#
+#                   dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
+#                                                    "item","videoBigButton","articlefooter full-column",
+#                                                    "bildbanderolle full-column","footerCopy padleft5"]}),
+#
+#                   dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
+#                   dict(name='div', attrs={'style':["position:relative;"]}),
+#                   dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
+#                   dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
+#                   dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
+#                   dict(name='td', attrs={'class':["artikelDruckenRight"]}),
+#                   dict(name='p', text = "ANZEIGE")
+#                 ]
+#   remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]
+#
     extra_css = '''
         h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;}
         a{font-family:Arial,Helvetica,sans-serif; font-style:italic;}

@@ -53,30 +53,45 @@ class Sueddeutsche(BasicNewsRecipe):
         .artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; }
         body{font-family:Arial,Helvetica,sans-serif; }
         .photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} '''
+#
     feeds = [
-             (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'),
-             (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'),
-             (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'),
-             (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'),
-             (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
-             (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
-             (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
-             (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
-             (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
-             (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),
-             (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'),
-             (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'),
-             (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'),
-             (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'),
-             (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'),
-             (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only
-             (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only
-             (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only
-             (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only
-             (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only
+#            (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
+#            (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'), #AGe 2011-12-16 deactivated
+#            (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+#            (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
+             (u'Politik', u'http://www.sueddeutsche.de/app/service/rss/ressort/politik/rss.xml'),
+             (u'Wirtschaft', u'http://www.sueddeutsche.de/app/service/rss/ressort/wirtschaft/rss.xml'),
+             (u'Geld', u'http://www.sueddeutsche.de/app/service/rss/ressort/finanzen/rss.xml'),
+             (u'Kultur', u'http://www.sueddeutsche.de/app/service/rss/ressort/kultur/rss.xml'),
+             (u'Sport', u'http://www.sueddeutsche.de/app/service/rss/ressort/sport/rss.xml'),
+             (u'Leben', u'http://www.sueddeutsche.de/app/service/rss/ressort/leben/rss.xml'),
+             (u'Karriere', u'http://www.sueddeutsche.de/app/service/rss/ressort/karriere/rss.xml'),
+             (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'),
+             (u'Bayern', u'http://www.sueddeutsche.de/app/service/rss/ressort/bayern/rss.xml'),
+             (u'Medien', u'http://www.sueddeutsche.de/app/service/rss/ressort/medien/rss.xml'),
+             (u'Digital', u'http://www.sueddeutsche.de/app/service/rss/ressort/computerwissen/rss.xml'),
+             (u'Auto', u'http://www.sueddeutsche.de/app/service/rss/ressort/autoreise/rss.xml'),
+             (u'Wissen', u'http://www.sueddeutsche.de/app/service/rss/ressort/wissen/rss.xml'),
+             (u'Panorama', u'http://www.sueddeutsche.de/app/service/rss/ressort/panorama/rss.xml'),
+             (u'Reise', u'http://www.sueddeutsche.de/app/service/rss/ressort/reise/rss.xml'),
            ]

-    def print_version(self, url):
-        main, sep, id = url.rpartition('/')
-        return main + '/2.220/' + id
+#   def print_version(self, url):                  #AGe 2011-12-16 deactivated
+#       main, sep, id = url.rpartition('/')        #AGe 2011-12-16 deactivated
+#       return main + '/2.220/' + id               #AGe 2011-12-16 deactivated


@@ -59,6 +59,11 @@ class TelegraphUK(BasicNewsRecipe):
             ,(u'Travel', u'http://www.telegraph.co.uk/travel/rss' )
             ,(u'How about that?', u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss' )
            ]

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def get_article_url(self, article):
         url = article.get('link', None)


@@ -57,6 +57,12 @@ class WallStreetJournal(BasicNewsRecipe):
                 'username and password')
         return br

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name=['table', 'tr', 'td']):
             tag.name = 'div'


@@ -44,6 +44,12 @@ class WallStreetJournal(BasicNewsRecipe):
         ]
     remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name=['table', 'tr', 'td']):
             tag.name = 'div'


@@ -1,5 +1,5 @@
 " Project wide builtins
-let g:pyflakes_builtins = ["_", "dynamic_property", "__", "P", "I", "lopen", "icu_lower", "icu_upper", "icu_title", "ngettext"]
+let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext"

 python << EOFPY
 import os, sys


@@ -11,7 +11,7 @@ __all__ = [
         'build', 'build_pdf2xml', 'server',
         'gui',
         'develop', 'install',
-        'kakasi', 'resources',
+        'kakasi', 'coffee', 'resources',
        'check',
        'sdist',
        'manual', 'tag_release',

@@ -49,9 +49,10 @@ gui = GUI()
 from setup.check import Check
 check = Check()

-from setup.resources import Resources, Kakasi
+from setup.resources import Resources, Kakasi, Coffee
 resources = Resources()
 kakasi = Kakasi()
+coffee = Coffee()

 from setup.publish import Manual, TagRelease, Stage1, Stage2, \
         Stage3, Stage4, Stage5, Publish


@@ -12,14 +12,14 @@ msgstr ""
 "Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
 "devel@lists.alioth.debian.org>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2011-11-22 16:45+0000\n"
+"PO-Revision-Date: 2011-12-14 19:48+0000\n"
 "Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
 "Language-Team: Catalan <linux@softcatala.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2011-11-26 05:10+0000\n"
-"X-Generator: Launchpad (build 14381)\n"
+"X-Launchpad-Export-Date: 2011-12-15 05:18+0000\n"
+"X-Generator: Launchpad (build 14487)\n"
 "Language: ca\n"

 #. name for aaa
@@ -9348,7 +9348,7 @@ msgstr "Seit-Kaitetu"

 #. name for hil
 msgid "Hiligaynon"
-msgstr ""
+msgstr "Hiligainon"

 #. name for hin
 msgid "Hindi"

@@ -9356,39 +9356,39 @@ msgstr "Hindi"

 #. name for hio
 msgid "Tsoa"
-msgstr ""
+msgstr "Tsoa"

 #. name for hir
 msgid "Himarimã"
-msgstr ""
+msgstr "Himarimà"

 #. name for hit
 msgid "Hittite"
-msgstr ""
+msgstr "Hittita"

 #. name for hiw
 msgid "Hiw"
-msgstr ""
+msgstr "Hiw"

 #. name for hix
 msgid "Hixkaryána"
-msgstr ""
+msgstr "Hishkaryana"

 #. name for hji
 msgid "Haji"
-msgstr ""
+msgstr "Aji"

 #. name for hka
 msgid "Kahe"
-msgstr ""
+msgstr "Kahe"

 #. name for hke
 msgid "Hunde"
-msgstr ""
+msgstr "Hunde"

 #. name for hkk
 msgid "Hunjara-Kaina Ke"
-msgstr ""
+msgstr "Hunjara"

 #. name for hks
 msgid "Hong Kong Sign Language"

@@ -9396,27 +9396,27 @@ msgstr "Llenguatge de signes de Hong Kong"

 #. name for hla
 msgid "Halia"
-msgstr ""
+msgstr "Halia"

 #. name for hlb
 msgid "Halbi"
-msgstr ""
+msgstr "Halbi"

 #. name for hld
 msgid "Halang Doan"
-msgstr ""
+msgstr "Halang Doan"

 #. name for hle
 msgid "Hlersu"
-msgstr ""
+msgstr "Sansu"

 #. name for hlt
 msgid "Nga La"
-msgstr ""
+msgstr "Nga La"

 #. name for hlu
 msgid "Luwian; Hieroglyphic"
-msgstr ""
+msgstr "Luvi; jeroglífic"

 #. name for hma
 msgid "Miao; Southern Mashan"

@@ -9424,7 +9424,7 @@ msgstr "Miao; Mashan meridional"

 #. name for hmb
 msgid "Songhay; Humburi Senni"
-msgstr ""
+msgstr "Songhai; central"

 #. name for hmc
 msgid "Miao; Central Huishui"

@@ -9440,11 +9440,11 @@ msgstr "Miao; Huishui oriental"

 #. name for hmf
 msgid "Hmong Don"
-msgstr ""
+msgstr "Miao; Don"

 #. name for hmg
 msgid "Hmong; Southwestern Guiyang"
-msgstr ""
+msgstr "Miao; Guiyang sudoccidental"

 #. name for hmh
 msgid "Miao; Southwestern Huishui"

@@ -9456,11 +9456,11 @@ msgstr "Miao; Huishui septentrional"

 #. name for hmj
 msgid "Ge"
-msgstr ""
+msgstr "Ge"

 #. name for hmk
 msgid "Maek"
-msgstr ""
+msgstr "Maek"

 #. name for hml
 msgid "Miao; Luopohe"

@@ -9472,11 +9472,11 @@ msgstr "Miao; Mashan central"

 #. name for hmn
 msgid "Hmong"
-msgstr ""
+msgstr "Hmong (macrollengua)"

 #. name for hmo
 msgid "Hiri Motu"
-msgstr ""
+msgstr "Hiri Motu"

 #. name for hmp
 msgid "Miao; Northern Mashan"

@@ -9488,7 +9488,7 @@ msgstr "Miao; Qiandong oriental"

 #. name for hmr
 msgid "Hmar"
-msgstr ""
+msgstr "Hmar"

 #. name for hms
 msgid "Miao; Southern Qiandong"

@@ -9496,15 +9496,15 @@ msgstr "Miao; Qiandong meridional"

 #. name for hmt
 msgid "Hamtai"
-msgstr ""
+msgstr "Hamtai"

 #. name for hmu
 msgid "Hamap"
-msgstr ""
+msgstr "Hamap"

 #. name for hmv
 msgid "Hmong Dô"
-msgstr ""
+msgstr "Miao; Do"

 #. name for hmw
 msgid "Miao; Western Mashan"

@@ -9520,19 +9520,19 @@ msgstr "Miao; Shua"

 #. name for hna
 msgid "Mina (Cameroon)"
-msgstr ""
+msgstr "Mina (Camerun)"

 #. name for hnd
 msgid "Hindko; Southern"
-msgstr ""
+msgstr "Hindko; meridional"

 #. name for hne
 msgid "Chhattisgarhi"
-msgstr ""
+msgstr "Chattisgarbi"

 #. name for hnh
 msgid "//Ani"
-msgstr ""
+msgstr "Ani"

 #. name for hni
 msgid "Hani"

@@ -9540,7 +9540,7 @@ msgstr ""

 #. name for hnj
 msgid "Hmong Njua"
-msgstr ""
+msgstr "Miao; Hmong Njua"

 #. name for hnn
 msgid "Hanunoo"

@@ -9548,7 +9548,7 @@ msgstr ""

 #. name for hno
 msgid "Hindko; Northern"
-msgstr ""
+msgstr "Hindko; septentrional"

 #. name for hns
 msgid "Hindustani; Caribbean"

@@ -11800,7 +11800,7 @@ msgstr ""

 #. name for khq
 msgid "Songhay; Koyra Chiini"
-msgstr ""
+msgstr "Songhai; Koyra"

 #. name for khr
 msgid "Kharia"

@@ -17288,7 +17288,7 @@ msgstr ""

 #. name for mww
 msgid "Hmong Daw"
-msgstr ""
+msgstr "Miao; blanc"

 #. name for mwx
 msgid "Mediak"

@@ -28680,7 +28680,7 @@ msgstr ""

 #. name for xlu
 msgid "Luwian; Cuneiform"
-msgstr ""
+msgstr "Luvi; cuneïforme"

 #. name for xly
 msgid "Elymian"


@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, cPickle, re, shutil, marshal, zipfile, glob
+import os, cPickle, re, shutil, marshal, zipfile, glob, subprocess, time
 from zlib import compress

 from setup import Command, basenames, __appname__

@@ -23,7 +23,70 @@ def get_opts_from_parser(parser):
         for o in g.option_list:
             for x in do_opt(o): yield x

-class Kakasi(Command):
+class Coffee(Command): # {{{
+
+    description = 'Compile coffeescript files into javascript'
+
+    COFFEE_DIRS = {'ebooks/oeb/display': 'display'}
+
+    def add_options(self, parser):
+        parser.add_option('--watch', '-w', action='store_true', default=False,
+                help='Autocompile when .coffee files are changed')
+        parser.add_option('--show-js', action='store_true', default=False,
+                help='Display the generated javascript')
+
+    def run(self, opts):
+        self.do_coffee_compile(opts)
+        if opts.watch:
+            try:
+                while True:
+                    time.sleep(0.5)
+                    self.do_coffee_compile(opts, timestamp=True,
+                            ignore_errors=True)
+            except KeyboardInterrupt:
+                pass
+
+    def show_js(self, jsfile):
+        from pygments.lexers import JavascriptLexer
+        from pygments.formatters import TerminalFormatter
+        from pygments import highlight
+        with open(jsfile, 'rb') as f:
+            raw = f.read()
+        print highlight(raw, JavascriptLexer(), TerminalFormatter())
+
+    def do_coffee_compile(self, opts, timestamp=False, ignore_errors=False):
+        for toplevel, dest in self.COFFEE_DIRS.iteritems():
+            dest = self.j(self.RESOURCES, dest)
+            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
+                js = self.j(dest, os.path.basename(x.rpartition('.')[0]+'.js'))
+                if self.newer(js, x):
+                    print ('\t%sCompiling %s'%(time.strftime('[%H:%M:%S] ') if
+                        timestamp else '', os.path.basename(x)))
+                    try:
+                        subprocess.check_call(['coffee', '-c', '-o', dest, x])
+                    except:
+                        print ('\n\tCompilation of %s failed'%os.path.basename(x))
+                        if ignore_errors:
+                            with open(js, 'wb') as f:
+                                f.write('# Compilation from coffeescript failed')
+                        else:
+                            raise SystemExit(1)
+                else:
+                    if opts.show_js:
+                        self.show_js(js)
+                        print ('#'*80)
+                        print ('#'*80)
+
+    def clean(self):
+        for toplevel, dest in self.COFFEE_DIRS.iteritems():
+            dest = self.j(self.RESOURCES, dest)
+            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
+                x = x.rpartition('.')[0] + '.js'
+                x = self.j(dest, os.path.basename(x))
+                if os.path.exists(x):
+                    os.remove(x)
+# }}}
+
+class Kakasi(Command): # {{{

     description = 'Compile resources for unihandecode'

@@ -62,9 +125,6 @@ class Kakasi(Command):
         self.info('\tGenerating kanadict')
         self.mkkanadict(src, dest)

-        return
-
     def mkitaiji(self, src, dst):
         dic = {}
         for line in open(src, "r"):

@@ -125,11 +185,12 @@ class Kakasi(Command):
         kakasi = self.j(self.RESOURCES, 'localization', 'pykakasi')
         if os.path.exists(kakasi):
             shutil.rmtree(kakasi)
+# }}}

-class Resources(Command):
+class Resources(Command): # {{{

     description = 'Compile various needed calibre resources'
-    sub_commands = ['kakasi']
+    sub_commands = ['kakasi', 'coffee']

     def run(self, opts):
         scripts = {}

@@ -223,13 +284,13 @@ class Resources(Command):
             x = self.j(self.RESOURCES, x+'.pickle')
             if os.path.exists(x):
                 os.remove(x)
-        from setup.commands import kakasi
+        from setup.commands import kakasi, coffee
         kakasi.clean()
+        coffee.clean()
         for x in ('builtin_recipes.xml', 'builtin_recipes.zip',
                 'template-functions.json'):
             x = self.j(self.RESOURCES, x)
             if os.path.exists(x):
                 os.remove(x)
+# }}}
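At its core the new command is an ordinary compile-if-newer loop. A standalone sketch of that logic, stripped of the setup.Command plumbing (the directory arguments and helper name are made up, and the 'coffee' binary from the CoffeeScript package is assumed to be on PATH):

    import os, glob, subprocess

    def compile_coffee(src_dir, dest_dir):
        for src in glob.glob(os.path.join(src_dir, '*.coffee')):
            js = os.path.join(dest_dir,
                    os.path.basename(src).rpartition('.')[0] + '.js')
            # recompile only when the .coffee source is newer than its .js output
            if not os.path.exists(js) or os.path.getmtime(src) > os.path.getmtime(js):
                subprocess.check_call(['coffee', '-c', '-o', dest_dir, src])

With the registration in setup/commands.py above, the command would be invoked as ./setup.py coffee, optionally with --watch to recompile whenever a .coffee file changes.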


@@ -215,32 +215,34 @@ class GetTranslations(Translations): # {{{
     description = 'Get updated translations from Launchpad'
     BRANCH = 'lp:~kovid/calibre/translations'

-    @classmethod
-    def modified_translations(cls):
-        raw = subprocess.Popen(['bzr', 'status'],
+    @property
+    def modified_translations(self):
+        raw = subprocess.Popen(['bzr', 'status', '-S', self.PATH],
                 stdout=subprocess.PIPE).stdout.read().strip()
+        ans = []
         for line in raw.splitlines():
             line = line.strip()
-            if line.startswith(cls.PATH) and line.endswith('.po'):
-                yield line
+            if line.startswith('M') and line.endswith('.po'):
+                ans.append(line.split()[-1])
+        return ans

     def run(self, opts):
-        if len(list(self.modified_translations())) == 0:
+        if not self.modified_translations:
             subprocess.check_call(['bzr', 'merge', self.BRANCH])
-            if len(list(self.modified_translations())) == 0:
-                print 'No updated translations available'
-            else:
-                subprocess.check_call(['bzr', 'commit', '-m',
-                    'IGN:Updated translations', self.PATH])
         self.check_for_errors()

-    @classmethod
-    def check_for_errors(cls):
+        if self.modified_translations:
+            subprocess.check_call(['bzr', 'commit', '-m',
+                'IGN:Updated translations', self.PATH])
+        else:
+            print('No updated translations available')
+
+    def check_for_errors(self):
         errors = os.path.join(tempfile.gettempdir(), 'calibre-translation-errors')
         if os.path.exists(errors):
             shutil.rmtree(errors)
         os.mkdir(errors)
-        pofilter = ('pofilter', '-i', cls.PATH, '-o', errors,
+        pofilter = ('pofilter', '-i', self.PATH, '-o', errors,
                 '-t', 'accelerators', '-t', 'escapes', '-t', 'variables',
                 #'-t', 'xmltags',
                 #'-t', 'brackets',

@@ -253,23 +255,20 @@ class GetTranslations(Translations): # {{{
                 '-t', 'printf')
         subprocess.check_call(pofilter)
         errfiles = glob.glob(errors+os.sep+'*.po')
-        subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
-        for f in errfiles:
-            with open(f, 'r+b') as f:
-                raw = f.read()
-                raw = re.sub(r'# \(pofilter\).*', '', raw)
-                f.seek(0)
-                f.truncate()
-                f.write(raw)
-
-        subprocess.check_call(['pomerge', '-t', cls.PATH, '-i', errors, '-o',
-            cls.PATH])
-        if len(list(cls.modified_translations())) > 0:
-            subprocess.call(['bzr', 'diff', cls.PATH])
-            yes = raw_input('Merge corrections? [y/n]: ').strip()
-            if yes in ['', 'y']:
-                subprocess.check_call(['bzr', 'commit', '-m',
-                    'IGN:Translation corrections', cls.PATH])
+        if errfiles:
+            subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
+            for f in errfiles:
+                with open(f, 'r+b') as f:
+                    raw = f.read()
+                    raw = re.sub(r'# \(pofilter\).*', '', raw)
+                    f.seek(0)
+                    f.truncate()
+                    f.write(raw)
+            subprocess.check_call(['pomerge', '-t', self.PATH, '-i', errors, '-o',
+                self.PATH])
+            return True
+        return False
 # }}}
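The refactored modified_translations leans on bzr's short status format, which prints a one-or-two-letter state code followed by the path. A quick sketch of the parsing it now does (the sample output is invented):

    raw = ' M  src/calibre/translations/ca.po\n?   scratch.txt'
    mods = [line.split()[-1] for line in raw.splitlines()
            if line.strip().startswith('M') and line.endswith('.po')]
    print mods   # -> ['src/calibre/translations/ca.po']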


@@ -558,11 +558,11 @@ xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = {
                 '>' : '&gt;',
                 '&' : '&amp;'})

-def replace_entities(raw):
-    return _ent_pat.sub(entity_to_unicode, raw)
+def replace_entities(raw, encoding='cp1252'):
+    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)

-def xml_replace_entities(raw):
-    return _ent_pat.sub(xml_entity_to_unicode, raw)
+def xml_replace_entities(raw, encoding='cp1252'):
+    return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw)

 def prepare_string_for_xml(raw, attribute=False):
     raw = _ent_pat.sub(entity_to_unicode, raw)
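Why the encoding parameter matters: numeric character references in the 0x80-0x9f range are technically C1 control characters in Unicode, but real-world HTML almost always means the Windows-1252 glyphs at those positions. A self-contained sketch of the idea (this is an illustration, not calibre's actual entity_to_unicode):

    import re
    _num_ent = re.compile(r'&#(\d+);')

    def replace_numeric_entities(raw, encoding='cp1252'):
        def sub(m):
            num = int(m.group(1))
            if 0x80 <= num <= 0x9f:
                # interpret the C1 range as a code page byte instead
                return chr(num).decode(encoding)
            return unichr(num)
        return _num_ent.sub(sub, raw)

    print replace_numeric_entities(u'It&#146;s')  # -> It's, with a cp1252 right quote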


@@ -4,7 +4,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = u'calibre'
-numeric_version = (0, 8, 30)
+numeric_version = (0, 8, 31)
 __version__   = u'.'.join(map(unicode, numeric_version))
 __author__    = u"Kovid Goyal <kovid@kovidgoyal.net>"


@@ -173,8 +173,9 @@ class INVESBOOK(EB600):
     FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'pdf', 'rtf', 'txt']
     BCD = [0x110, 0x323]

-    VENDOR_NAME = ['INVES_E6', 'INVES-WI']
-    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK']
+    VENDOR_NAME = ['INVES_E6', 'INVES-WI', 'POCKETBO']
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK',
+            'OK_POCKET_611_61']

 class BOOQ(EB600):
     name = 'Booq Device Interface'


@@ -30,7 +30,7 @@ BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'ht
         'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
         'epub', 'fb2', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
         'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
-        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi']
+        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx']

 class HTMLRenderer(object):


@@ -229,7 +229,10 @@ class EPUBOutput(OutputFormatPlugin):
         if opts.extract_to is not None:
             from calibre.utils.zipfile import ZipFile
             if os.path.exists(opts.extract_to):
-                shutil.rmtree(opts.extract_to)
+                if os.path.isdir(opts.extract_to):
+                    shutil.rmtree(opts.extract_to)
+                else:
+                    os.remove(opts.extract_to)
             os.mkdir(opts.extract_to)
             with ZipFile(output_path) as zf:
                 zf.extractall(path=opts.extract_to)


@@ -16,7 +16,8 @@ from lxml.html import tostring

 from calibre import as_unicode
 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import Source, Option
+from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
+        fixauthors)
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.book.base import Metadata

@@ -509,6 +510,15 @@ class Amazon(Source):

         return domain

+    def clean_downloaded_metadata(self, mi):
+        if mi.title and self.domain in ('com', 'uk'):
+            mi.title = fixcase(mi.title)
+        mi.authors = fixauthors(mi.authors)
+        if self.domain in ('com', 'uk'):
+            mi.tags = list(map(fixcase, mi.tags))
+        mi.isbn = check_isbn(mi.isbn)
+
     def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
             domain=None):
         if domain is None:


@@ -31,7 +31,7 @@ class TOC(list):

     def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
                  base_path=os.getcwd(), type='unknown', author=None,
-                 description=None):
+                 description=None, toc_thumbnail=None):
         self.href = href
         self.fragment = fragment
         if not self.fragment:

@@ -43,6 +43,7 @@ class TOC(list):
         self.type = type
         self.author = author
         self.description = description
+        self.toc_thumbnail = toc_thumbnail

     def __str__(self):
         lines = ['TOC: %s#%s'%(self.href, self.fragment)]

@@ -72,12 +73,12 @@ class TOC(list):
             entry.parent = None

     def add_item(self, href, fragment, text, play_order=None, type='unknown',
-                 author=None, description=None):
+                 author=None, description=None, toc_thumbnail=None):
         if play_order is None:
             play_order = (self[-1].play_order if len(self) else self.play_order) + 1
         self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                         base_path=self.base_path, play_order=play_order,
-                        type=type, author=author, description=description))
+                        type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
         return self[-1]

     def top_level_items(self):

@@ -269,6 +270,9 @@ class TOC(list):
                 if desc:
                     desc = re.sub(r'\s+', ' ', desc)
                     elem.append(C.meta(desc, name='description'))
+                idx = getattr(np, 'toc_thumbnail', None)
+                if idx:
+                    elem.append(C.meta(idx, name='toc_thumbnail'))
                 parent.append(elem)
             for np2 in np:
                 navpoint(elem, np2)
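End to end, this class is the carrier for the recipe thumbnails: the value handed to add_toc_thumbnail ends up on the TOC node and is serialized into the NCX navPoint as a <meta name="toc_thumbnail"> element. A small sketch of the data flow (the module path is assumed from calibre's source layout; the hrefs are made up):

    from calibre.ebooks.metadata.toc import TOC

    toc = TOC()
    art = toc.add_item('feed_0/article_3/index.html', None, 'Article title',
                       toc_thumbnail='images/thumb_3.jpg')
    print art.toc_thumbnail   # -> images/thumb_3.jpg, later written to the NCX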


@@ -656,11 +656,11 @@ class Tag(object): # {{{
                 ' image record associated with this article',
                 'image_index'),
         70 : ('Description offset in cncx', 'desc_offset'),
-        71 : ('Image attribution offset in cncx',
-                'image_attr_offset'),
+        71 : ('Author offset in cncx', 'author_offset'),
         72 : ('Image caption offset in cncx',
                 'image_caption_offset'),
-        73 : ('Author offset in cncx', 'author_offset'),
+        73 : ('Image attribution offset in cncx',
+                'image_attr_offset'),
     },

     'chapter_with_subchapters' : {

@@ -973,7 +973,8 @@ class MobiReader(object):
                 continue
             processed_records.append(i)
             data = self.sections[i][0]
-            if data[:4] in (b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n'):
+            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
+                    b'RESC', b'BOUN', b'FDST', b'DATP'}:
                 # A FLIS, FCIS, SRCS or EOF record, ignore
                 continue
             buf = cStringIO.StringIO(data)


@@ -136,7 +136,8 @@ class IndexEntry(object):
             'last_child_index': 23,
             'image_index': 69,
             'desc_offset': 70,
-            'author_offset': 73,
+            'author_offset': 71,
     }
     RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}

@@ -754,6 +755,13 @@ class Indexer(object): # {{{
                 normalized_articles.append(article)
                 article.author_offset = self.cncx[art.author]
                 article.desc_offset = self.cncx[art.description]
+                if getattr(art, 'toc_thumbnail', None) is not None:
+                    try:
+                        ii = self.serializer.images[art.toc_thumbnail] - 1
+                        if ii > -1:
+                            article.image_index = ii
+                    except KeyError:
+                        pass # Image not found in serializer

             if normalized_articles:
                 normalized_articles.sort(key=lambda x:x.offset)

@@ -161,7 +161,7 @@ class MobiWriter(object):
         index = 1

         mh_href = None
-        if 'masthead' in oeb.guide:
+        if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
             mh_href = oeb.guide['masthead'].href
             self.image_records.append(None)
             index += 1


@ -16,15 +16,13 @@ from urllib import unquote as urlunquote
from lxml import etree, html from lxml import etree, html
from calibre.constants import filesystem_encoding, __version__ from calibre.constants import filesystem_encoding, __version__
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.conversion.preprocess import CSSPreProcessor from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import isbytestring, as_unicode, get_types_map from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True) namespace, XHTML, parse_html, NotHTML)
XML_NS = 'http://www.w3.org/XML/1998/namespace' XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/' OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS = 'http://www.idpf.org/2007/opf' OPF2_NS = 'http://www.idpf.org/2007/opf'
@@ -55,9 +53,6 @@ OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
 def XML(name):
     return '{%s}%s' % (XML_NS, name)
 
-def XHTML(name):
-    return '{%s}%s' % (XHTML_NS, name)
-
 def OPF(name):
     return '{%s}%s' % (OPF2_NS, name)
@@ -279,22 +274,11 @@ PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
 
 def element(parent, *args, **kwargs):
     if parent is not None:
         return etree.SubElement(parent, *args, **kwargs)
     return etree.Element(*args, **kwargs)
 
-def namespace(name):
-    if '}' in name:
-        return name.split('}', 1)[0][1:]
-    return ''
-
-def barename(name):
-    if '}' in name:
-        return name.split('}', 1)[1]
-    return name
-
 def prefixname(name, nsrmap):
     if not isqname(name):
         return name
@@ -373,25 +357,6 @@ def urlnormalize(href):
     parts = (urlquote(part) for part in parts)
     return urlunparse(parts)
 
-def merge_multiple_html_heads_and_bodies(root, log=None):
-    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
-    if not (len(heads) > 1 or len(bodies) > 1): return root
-    for child in root: root.remove(child)
-    head = root.makeelement(XHTML('head'))
-    body = root.makeelement(XHTML('body'))
-    for h in heads:
-        for x in h:
-            head.append(x)
-    for b in bodies:
-        for x in b:
-            body.append(x)
-    map(root.append, (head, body))
-    if log is not None:
-        log.warn('Merging multiple <head> and <body> sections')
-    return root
-
 class DummyHandler(logging.Handler):
@@ -418,10 +383,6 @@ class OEBError(Exception):
     """Generic OEB-processing error."""
     pass
 
-class NotHTML(OEBError):
-    '''Raised when a file that should be HTML (as per manifest) is not'''
-    pass
-
 class NullContainer(object):
     """An empty container.
@@ -801,7 +762,6 @@ class Manifest(object):
         """
 
         NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
-        META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
 
         def __init__(self, oeb, id, href, media_type,
                      fallback=None, loader=str, data=None):
@@ -830,244 +790,17 @@ class Manifest(object):
                 return None
             return etree.fromstring(data, parser=RECOVER_PARSER)
 
-    def clean_word_doc(self, data):
-        prefixes = []
-        for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
-            prefixes.append(match.group(1))
-        if prefixes:
-            self.oeb.log.warn('Found microsoft markup, cleaning...')
-            # Remove empty tags as they are not rendered by browsers
-            # but can become renderable HTML tags like <p/> if the
-            # document is parsed by an HTML parser
-            pat = re.compile(
-                    r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
-                    re.DOTALL)
-            data = pat.sub('', data)
-            pat = re.compile(
-                    r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
-            data = pat.sub('', data)
-        return data
-
     def _parse_xhtml(self, data):
         orig_data = data
-        self.oeb.log.debug('Parsing', self.href, '...')
-        # Convert to Unicode and normalize line endings
-        data = self.oeb.decode(data)
-        data = strip_encoding_declarations(data)
-        data = self.oeb.html_preprocessor(data)
-        # There could be null bytes in data if it had &#0; entities in it
-        data = data.replace('\0', '')
-        # Remove DOCTYPE declaration as it messes up parsing
-        # In particular, it causes tostring to insert xmlns
-        # declarations, which messes up the coercing logic
-        idx = data.find('<html')
-        if idx == -1:
-            idx = data.find('<HTML')
-        if idx > -1:
-            pre = data[:idx]
-            data = data[idx:]
-            if '<!DOCTYPE' in pre:
-                user_entities = {}
-                for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
-                    val = match.group(2)
-                    if val.startswith('"') and val.endswith('"'):
-                        val = val[1:-1]
-                    user_entities[match.group(1)] = val
-                if user_entities:
-                    pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
-                    data = pat.sub(lambda m:user_entities[m.group(1)], data)
-        # Setting huge_tree=True causes crashes in windows with large files
-        parser = etree.XMLParser(no_network=True)
-        # Try with more & more drastic measures to parse
-        def first_pass(data):
-            try:
-                data = etree.fromstring(data, parser=parser)
-            except etree.XMLSyntaxError as err:
-                self.oeb.log.debug('Initial parse failed, using more'
-                        ' forgiving parsers')
-                repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-                data = ENTITY_RE.sub(repl, data)
-                try:
-                    data = etree.fromstring(data, parser=parser)
-                except etree.XMLSyntaxError as err:
-                    self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
-                    if err.args and err.args[0].startswith('Excessive depth'):
-                        from calibre.utils.soupparser import fromstring
-                        data = fromstring(data)
-                    else:
-                        data = html.fromstring(data)
-                    data.attrib.pop('xmlns', None)
-                    for elem in data.iter(tag=etree.Comment):
-                        if elem.text:
-                            elem.text = elem.text.strip('-')
-                    data = etree.tostring(data, encoding=unicode)
-                    try:
-                        data = etree.fromstring(data, parser=parser)
-                    except etree.XMLSyntaxError:
-                        data = etree.fromstring(data, parser=RECOVER_PARSER)
-            return data
-        try:
-            data = self.clean_word_doc(data)
-        except:
-            pass
-        data = first_pass(data)
-
-        if data.tag == 'HTML':
-            # Lower case all tag and attribute names
-            data.tag = data.tag.lower()
-            for x in data.iterdescendants():
-                try:
-                    x.tag = x.tag.lower()
-                    for key, val in list(x.attrib.iteritems()):
-                        del x.attrib[key]
-                        key = key.lower()
-                        x.attrib[key] = val
-                except:
-                    pass
-
-        # Handle weird (non-HTML/fragment) files
-        if barename(data.tag) != 'html':
-            if barename(data.tag) == 'ncx':
-                return self._parse_xml(orig_data)
-            self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
-            nroot = etree.fromstring('<html></html>')
-            has_body = False
-            for child in list(data):
-                if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
-                    has_body = True
-                    break
-            parent = nroot
-            if not has_body:
-                self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
-                nroot = etree.fromstring('<html><body/></html>')
-                parent = nroot[0]
-            for child in list(data.iter()):
-                oparent = child.getparent()
-                if oparent is not None:
-                    oparent.remove(child)
-                parent.append(child)
-            data = nroot
-
-        # Force into the XHTML namespace
-        if not namespace(data.tag):
-            self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
-            data.attrib['xmlns'] = XHTML_NS
-            data = etree.tostring(data, encoding=unicode)
-            try:
-                data = etree.fromstring(data, parser=parser)
-            except:
-                data = data.replace(':=', '=').replace(':>', '>')
-                data = data.replace('<http:/>', '')
-                try:
-                    data = etree.fromstring(data, parser=parser)
-                except etree.XMLSyntaxError:
-                    self.oeb.logger.warn('Stripping comments from %s'%
-                            self.href)
-                    data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
-                            data)
-                    data = data.replace(
-                            "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
-                            '')
-                    data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
-                    try:
-                        data = etree.fromstring(data,
-                                parser=RECOVER_PARSER)
-                    except etree.XMLSyntaxError:
-                        self.oeb.logger.warn('Stripping meta tags from %s'%
-                                self.href)
-                        data = re.sub(r'<meta\s+[^>]+?>', '', data)
-                        data = etree.fromstring(data, parser=RECOVER_PARSER)
-        elif namespace(data.tag) != XHTML_NS:
-            # OEB_DOC_NS, but possibly others
-            ns = namespace(data.tag)
-            attrib = dict(data.attrib)
-            nroot = etree.Element(XHTML('html'),
-                nsmap={None: XHTML_NS}, attrib=attrib)
-            for elem in data.iterdescendants():
-                if isinstance(elem.tag, basestring) and \
-                    namespace(elem.tag) == ns:
-                    elem.tag = XHTML(barename(elem.tag))
-            for elem in data:
-                nroot.append(elem)
-            data = nroot
-
-        data = merge_multiple_html_heads_and_bodies(data, self.oeb.logger)
-        # Ensure has a <head/>
-        head = xpath(data, '/h:html/h:head')
-        head = head[0] if head else None
-        if head is None:
-            self.oeb.logger.warn(
-                'File %r missing <head/> element' % self.href)
-            head = etree.Element(XHTML('head'))
-            data.insert(0, head)
-            title = etree.SubElement(head, XHTML('title'))
-            title.text = self.oeb.translate(__('Unknown'))
-        elif not xpath(data, '/h:html/h:head/h:title'):
-            self.oeb.logger.warn(
-                'File %r missing <title/> element' % self.href)
-            title = etree.SubElement(head, XHTML('title'))
-            title.text = self.oeb.translate(__('Unknown'))
-        # Remove any encoding-specifying <meta/> elements
-        for meta in self.META_XP(data):
-            meta.getparent().remove(meta)
-        etree.SubElement(head, XHTML('meta'),
-            attrib={'http-equiv': 'Content-Type',
-                    'content': '%s; charset=utf-8' % XHTML_NS})
-        # Ensure has a <body/>
-        if not xpath(data, '/h:html/h:body'):
-            body = xpath(data, '//h:body')
-            if body:
-                body = body[0]
-                body.getparent().remove(body)
-                data.append(body)
-            else:
-                self.oeb.logger.warn(
-                    'File %r missing <body/> element' % self.href)
-                etree.SubElement(data, XHTML('body'))
-
-        # Remove microsoft office markup
-        r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
-        for x in r:
-            x.tag = XHTML('span')
-
-        # Remove lang redefinition inserted by the amazing Microsoft Word!
-        body = xpath(data, '/h:html/h:body')[0]
-        for key in list(body.attrib.keys()):
-            if key == 'lang' or key.endswith('}lang'):
-                body.attrib.pop(key)
-
-        def remove_elem(a):
-            p = a.getparent()
-            idx = p.index(a) -1
-            p.remove(a)
-            if a.tail:
-                if idx <= 0:
-                    if p.text is None:
-                        p.text = ''
-                    p.text += a.tail
-                else:
-                    if p[idx].tail is None:
-                        p[idx].tail = ''
-                    p[idx].tail += a.tail
-
-        # Remove hyperlinks with no content as they cause rendering
-        # artifacts in browser based renderers
-        # Also remove empty <b>, <u> and <i> tags
-        for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
-            if a.get('id', None) is None and a.get('name', None) is None \
-                    and len(a) == 0 and not a.text:
-                remove_elem(a)
-
-        # Convert <br>s with content into paragraphs as ADE can't handle
-        # them
-        for br in xpath(data, '//h:br'):
-            if len(br) > 0 or br.text:
-                br.tag = XHTML('div')
-
+        fname = urlunquote(self.href)
+        self.oeb.log.debug('Parsing', fname, '...')
+        try:
+            data = parse_html(data, log=self.oeb.log,
+                    decoder=self.oeb.decode,
+                    preprocessor=self.oeb.html_preprocessor,
+                    filename=fname, non_html_file_tags={'ncx'})
+        except NotHTML:
+            return self._parse_xml(orig_data)
         return data
 
     def _parse_txt(self, data):
@@ -1629,9 +1362,10 @@ class TOC(object):
     :attr:`id`: Option unique identifier for this node.
     :attr:`author`: Optional author attribution for periodicals <mbp:>
     :attr:`description`: Optional description attribute for periodicals <mbp:>
+    :attr:`toc_thumbnail`: Optional toc thumbnail image
     """
     def __init__(self, title=None, href=None, klass=None, id=None,
-            play_order=None, author=None, description=None):
+            play_order=None, author=None, description=None, toc_thumbnail=None):
         self.title = title
         self.href = urlnormalize(href) if href else href
         self.klass = klass
@@ -1643,10 +1377,11 @@ class TOC(object):
         self.play_order = play_order
         self.author = author
         self.description = description
+        self.toc_thumbnail = toc_thumbnail
 
-    def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None):
+    def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None):
         """Create and return a new sub-node of this node."""
-        node = TOC(title, href, klass, id, play_order, author, description)
+        node = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail)
         self.nodes.append(node)
         return node
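
A minimal usage sketch of the extended API (run inside a calibre environment; hrefs and titles are made up, and only arguments visible in this hunk are used):

from calibre.ebooks.oeb.base import TOC

root = TOC()
feed = root.add('News', 'feed_0/index.html')
art = feed.add('First article', 'feed_0/article_0/index.html',
        author='A. Writer', description='Optional description',
        toc_thumbnail='images/thumb_0.jpg')  # the new optional argument
print(art.toc_thumbnail)                     # images/thumb_0.jpg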

View File

@@ -0,0 +1,225 @@
#!/usr/bin/env coffee
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
###
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
Released under the GPLv3 License
###
#
log = (error) ->
if error
if window?.console?.log
window.console.log(error)
else if process?.stdout?.write
process.stdout.write(error + '\n')
# CFI escaping {{{
escape_for_cfi = (raw) ->
if raw
for c in ['^', '[', ']', ',', '(', ')', ';', '~', '@', '-', '!']
raw = raw.replace(c, '^'+c)
raw
unescape_from_cfi = (raw) ->
ans = raw
if raw
dropped = false
ans = []
for c in raw
if not dropped and c == '^'
dropped = true
continue
dropped = false
ans.push(c)
ans = ans.join('')
ans
# }}}
fstr = (d) -> # {{{
# Convert a timestamp floating point number to a string
ans = ""
if ( d < 0 )
ans = "-"
d = -d
n = Math.floor(d)
ans += n
n = Math.round((d-n)*100)
if( n != 0 )
ans += "."
ans += if (n % 10 == 0) then (n/10) else n
ans
# }}}
class CanonicalFragmentIdentifier
# This class is a namespace to expose CFI functions via the window.cfi
# object
constructor: () ->
encode: (doc, node, offset, tail) -> # {{{
cfi = tail or ""
# Handle the offset, if any
switch node.nodeType
when 1 # Element node
if typeof(offset) == 'number'
node = node.childNodes.item(offset)
when 3, 4, 5, 6 # Text/entity/CDATA node
offset or= 0
while true
p = node.previousSibling
if (p?.nodeType not in [3, 4, 5, 6])
break
offset += p.nodeValue.length
node = p
cfi = ":" + offset + cfi
else # Not handled
log("Offsets for nodes of type #{ node.nodeType } are not handled")
# Construct the path to node from root
until node == doc
p = node.parentNode
if not p
if node.nodeType == 9 # Document node (iframe)
win = node.defaultView
if win.frameElement
node = win.frameElement
cfi = "!" + cfi
continue
break
# Increase index by the length of all previous sibling text nodes
index = 0
child = p.firstChild
while true
index |= 1
if child.nodeType in [1, 7]
index++
if child == node
break
child = child.nextSibling
# Add id assertions for robustness where possible
id = node.getAttribute?('id')
idspec = if id then "[#{ escape_for_cfi(id) }]" else ''
cfi = '/' + index + idspec + cfi
node = p
cfi
# }}}
decode: (cfi, doc=window?.document) -> # {{{
simple_node_regex = ///
^/(\d+) # The node count
(\[[^\]]*\])? # The optional id assertion
///
error = null
node = doc
until cfi.length <= 0 or error
if ( (r = cfi.match(simple_node_regex)) isnt null ) # Path step
target = parseInt(r[1])
assertion = r[2]
if assertion
assertion = unescape_from_cfi(assertion.slice(1, assertion.length-1))
index = 0
child = node.firstChild
while true
if not child
if assertion # Try to use the assertion to find the node
child = doc.getElementById(assertion)
if child
node = child
if not child
error = "No matching child found for CFI: " + cfi
break
index |= 1 # Increment index by 1 if it is even
if child.nodeType in [1, 7] # We have an element or a PI
index++
if ( index == target )
cfi = cfi.substr(r[0].length)
node = child
break
child = child.nextSibling
else if cfi[0] == '!' # Indirection
if node.contentDocument
node = node.contentDocument
cfi = cfi.substr(1)
else
error = "Cannot reference #{ node.nodeName }'s content:" + cfi
else
break
if error
log(error)
return null
point = {}
error = null
point
# }}}
at: (x, y, doc=window?.document) -> # {{{
cdoc = doc
target = null
cwin = cdoc.defaultView
tail = ''
offset = null
name = null
# Drill down into iframes, etc.
while true
target = cdoc.elementFromPoint x, y
if not target or target.localName == 'html'
log("No element at (#{ x }, #{ y })")
return null
name = target.localName
if name not in ['iframe', 'embed', 'object']
break
cd = target.contentDocument
if not cd
break
x = x + cwin.pageXOffset - target.offsetLeft
y = y + cwin.pageYOffset - target.offsetTop
cdoc = cd
cwin = cdoc.defaultView
target.normalize()
if name in ['audio', 'video']
tail = "~" + fstr target.currentTime
if name in ['img', 'video']
px = ((x + cwin.scrollX - target.offsetLeft)*100)/target.offsetWidth
py = ((y + cwin.scrollY - target.offsetTop)*100)/target.offsetHeight
tail = "#{ tail }@#{ fstr px },#{ fstr py }"
else if name != 'audio'
if cdoc.caretRangeFromPoint # WebKit
range = cdoc.caretRangeFromPoint(x, y)
if range
target = range.startContainer
offset = range.startOffset
else
# TODO: implement a span bisection algorithm for UAs
# without caretRangeFromPoint (Gecko, IE)
this.encode(doc, target, offset, tail)
# }}}
if window?
window.cfi = new CanonicalFragmentIdentifier()
else if process?
# Some debugging code goes here to be run with the coffee interpreter
cfi = new CanonicalFragmentIdentifier()
t = 'a^!,1'
log(t)
log(escape_for_cfi(t))
log(unescape_from_cfi(escape_for_cfi(t)))

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env coffee
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
###
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
Released under the GPLv3 License
###
viewport_top = (node) ->
$(node).offset().top - window.pageYOffset
viewport_left = (node) ->
$(node).offset().left - window.pageXOffset
window.onload = ->
h1 = document.getElementsByTagName('h1')[0]
x = h1.scrollLeft + 150
y = viewport_top(h1) + h1.offsetHeight/2
e = document.elementFromPoint x, y
if e.getAttribute('id') != 'first-h1'
alert 'Failed to find top h1'
return
alert window.cfi.at x, y

View File

@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<title>Testing CFI functionality</title>
<script type="text/javascript" src="cfi.js"></script>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="cfi-test.js"></script>
</head>
<body>
<h1 id="first-h1" style="border: solid 1px red">Testing CFI functionality</h1>
</body>
</html>

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
try:
from calibre.utils.coffeescript import serve
except ImportError:
import init_calibre
if False: init_calibre, serve
from calibre.utils.coffeescript import serve
def run_devel_server():
os.chdir(os.path.dirname(__file__))
serve(['../cfi.coffee', 'cfi-test.coffee'])
if __name__ == '__main__':
run_devel_server()

View File

@@ -1,256 +0,0 @@
"""
Replacement for htmlentitydefs which uses purely numeric entities.
"""
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
ENTITYDEFS = \
{'AElig': '&#198;',
'Aacute': '&#193;',
'Acirc': '&#194;',
'Agrave': '&#192;',
'Alpha': '&#913;',
'Aring': '&#197;',
'Atilde': '&#195;',
'Auml': '&#196;',
'Beta': '&#914;',
'Ccedil': '&#199;',
'Chi': '&#935;',
'Dagger': '&#8225;',
'Delta': '&#916;',
'ETH': '&#208;',
'Eacute': '&#201;',
'Ecirc': '&#202;',
'Egrave': '&#200;',
'Epsilon': '&#917;',
'Eta': '&#919;',
'Euml': '&#203;',
'Gamma': '&#915;',
'Iacute': '&#205;',
'Icirc': '&#206;',
'Igrave': '&#204;',
'Iota': '&#921;',
'Iuml': '&#207;',
'Kappa': '&#922;',
'Lambda': '&#923;',
'Mu': '&#924;',
'Ntilde': '&#209;',
'Nu': '&#925;',
'OElig': '&#338;',
'Oacute': '&#211;',
'Ocirc': '&#212;',
'Ograve': '&#210;',
'Omega': '&#937;',
'Omicron': '&#927;',
'Oslash': '&#216;',
'Otilde': '&#213;',
'Ouml': '&#214;',
'Phi': '&#934;',
'Pi': '&#928;',
'Prime': '&#8243;',
'Psi': '&#936;',
'Rho': '&#929;',
'Scaron': '&#352;',
'Sigma': '&#931;',
'THORN': '&#222;',
'Tau': '&#932;',
'Theta': '&#920;',
'Uacute': '&#218;',
'Ucirc': '&#219;',
'Ugrave': '&#217;',
'Upsilon': '&#933;',
'Uuml': '&#220;',
'Xi': '&#926;',
'Yacute': '&#221;',
'Yuml': '&#376;',
'Zeta': '&#918;',
'aacute': '&#225;',
'acirc': '&#226;',
'acute': '&#180;',
'aelig': '&#230;',
'agrave': '&#224;',
'alefsym': '&#8501;',
'alpha': '&#945;',
'and': '&#8743;',
'ang': '&#8736;',
'aring': '&#229;',
'asymp': '&#8776;',
'atilde': '&#227;',
'auml': '&#228;',
'bdquo': '&#8222;',
'beta': '&#946;',
'brvbar': '&#166;',
'bull': '&#8226;',
'cap': '&#8745;',
'ccedil': '&#231;',
'cedil': '&#184;',
'cent': '&#162;',
'chi': '&#967;',
'circ': '&#710;',
'clubs': '&#9827;',
'cong': '&#8773;',
'copy': '&#169;',
'crarr': '&#8629;',
'cup': '&#8746;',
'curren': '&#164;',
'dArr': '&#8659;',
'dagger': '&#8224;',
'darr': '&#8595;',
'deg': '&#176;',
'delta': '&#948;',
'diams': '&#9830;',
'divide': '&#247;',
'eacute': '&#233;',
'ecirc': '&#234;',
'egrave': '&#232;',
'empty': '&#8709;',
'emsp': '&#8195;',
'ensp': '&#8194;',
'epsilon': '&#949;',
'equiv': '&#8801;',
'eta': '&#951;',
'eth': '&#240;',
'euml': '&#235;',
'euro': '&#8364;',
'exist': '&#8707;',
'fnof': '&#402;',
'forall': '&#8704;',
'frac12': '&#189;',
'frac14': '&#188;',
'frac34': '&#190;',
'frasl': '&#8260;',
'gamma': '&#947;',
'ge': '&#8805;',
'hArr': '&#8660;',
'harr': '&#8596;',
'hearts': '&#9829;',
'hellip': '&#8230;',
'iacute': '&#237;',
'icirc': '&#238;',
'iexcl': '&#161;',
'igrave': '&#236;',
'image': '&#8465;',
'infin': '&#8734;',
'int': '&#8747;',
'iota': '&#953;',
'iquest': '&#191;',
'isin': '&#8712;',
'iuml': '&#239;',
'kappa': '&#954;',
'lArr': '&#8656;',
'lambda': '&#955;',
'lang': '&#9001;',
'laquo': '&#171;',
'larr': '&#8592;',
'lceil': '&#8968;',
'ldquo': '&#8220;',
'le': '&#8804;',
'lfloor': '&#8970;',
'lowast': '&#8727;',
'loz': '&#9674;',
'lrm': '&#8206;',
'lsaquo': '&#8249;',
'lsquo': '&#8216;',
'macr': '&#175;',
'mdash': '&#8212;',
'micro': '&#181;',
'middot': '&#183;',
'minus': '&#8722;',
'mu': '&#956;',
'nabla': '&#8711;',
'nbsp': '&#160;',
'ndash': '&#8211;',
'ne': '&#8800;',
'ni': '&#8715;',
'not': '&#172;',
'notin': '&#8713;',
'nsub': '&#8836;',
'ntilde': '&#241;',
'nu': '&#957;',
'oacute': '&#243;',
'ocirc': '&#244;',
'oelig': '&#339;',
'ograve': '&#242;',
'oline': '&#8254;',
'omega': '&#969;',
'omicron': '&#959;',
'oplus': '&#8853;',
'or': '&#8744;',
'ordf': '&#170;',
'ordm': '&#186;',
'oslash': '&#248;',
'otilde': '&#245;',
'otimes': '&#8855;',
'ouml': '&#246;',
'para': '&#182;',
'part': '&#8706;',
'permil': '&#8240;',
'perp': '&#8869;',
'phi': '&#966;',
'pi': '&#960;',
'piv': '&#982;',
'plusmn': '&#177;',
'pound': '&#163;',
'prime': '&#8242;',
'prod': '&#8719;',
'prop': '&#8733;',
'psi': '&#968;',
'rArr': '&#8658;',
'radic': '&#8730;',
'rang': '&#9002;',
'raquo': '&#187;',
'rarr': '&#8594;',
'rceil': '&#8969;',
'rdquo': '&#8221;',
'real': '&#8476;',
'reg': '&#174;',
'rfloor': '&#8971;',
'rho': '&#961;',
'rlm': '&#8207;',
'rsaquo': '&#8250;',
'rsquo': '&#8217;',
'sbquo': '&#8218;',
'scaron': '&#353;',
'sdot': '&#8901;',
'sect': '&#167;',
'shy': '&#173;',
'sigma': '&#963;',
'sigmaf': '&#962;',
'sim': '&#8764;',
'spades': '&#9824;',
'sub': '&#8834;',
'sube': '&#8838;',
'sum': '&#8721;',
'sup': '&#8835;',
'sup1': '&#185;',
'sup2': '&#178;',
'sup3': '&#179;',
'supe': '&#8839;',
'szlig': '&#223;',
'tau': '&#964;',
'there4': '&#8756;',
'theta': '&#952;',
'thetasym': '&#977;',
'thinsp': '&#8201;',
'thorn': '&#254;',
'tilde': '&#732;',
'times': '&#215;',
'trade': '&#8482;',
'uArr': '&#8657;',
'uacute': '&#250;',
'uarr': '&#8593;',
'ucirc': '&#251;',
'ugrave': '&#249;',
'uml': '&#168;',
'upsih': '&#978;',
'upsilon': '&#965;',
'uuml': '&#252;',
'weierp': '&#8472;',
'xi': '&#958;',
'yacute': '&#253;',
'yen': '&#165;',
'yuml': '&#255;',
'zeta': '&#950;',
'zwj': '&#8205;',
'zwnj': '&#8204;'}

View File

@@ -0,0 +1,347 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from lxml import etree, html
from calibre import xml_replace_entities, force_unicode
from calibre.constants import filesystem_encoding
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
class NotHTML(Exception):
def __init__(self, root_tag):
Exception.__init__(self, 'Data is not HTML')
self.root_tag = root_tag
def barename(name):
return name.rpartition('}')[-1]
def namespace(name):
if '}' in name:
return name.split('}', 1)[0][1:]
return ''
def XHTML(name):
return '{%s}%s' % (XHTML_NS, name)
def xpath(elem, expr):
return elem.xpath(expr, namespaces={'h':XHTML_NS})
def XPath(expr):
return etree.XPath(expr, namespaces={'h':XHTML_NS})
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
def merge_multiple_html_heads_and_bodies(root, log=None):
heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
if not (len(heads) > 1 or len(bodies) > 1): return root
for child in root: root.remove(child)
head = root.makeelement(XHTML('head'))
body = root.makeelement(XHTML('body'))
for h in heads:
for x in h:
head.append(x)
for b in bodies:
for x in b:
body.append(x)
map(root.append, (head, body))
if log is not None:
log.warn('Merging multiple <head> and <body> sections')
return root
def _html5_parse(data):
import html5lib
data = html5lib.parse(data, treebuilder='lxml').getroot()
html_ns = [ns for ns, val in data.nsmap.iteritems() if (val == XHTML_NS and
ns is not None)]
if html_ns:
# html5lib causes the XHTML namespace to not
# be set as the default namespace
nsmap = dict(data.nsmap)
nsmap[None] = XHTML_NS
for x in html_ns:
nsmap.pop(x)
nroot = etree.Element(data.tag, nsmap=nsmap,
attrib=dict(data.attrib))
nroot.text = data.text
nroot.tail = data.tail
for child in data:
nroot.append(child)
data = nroot
return data
def _html4_parse(data, prefer_soup=False):
if prefer_soup:
from calibre.utils.soupparser import fromstring
data = fromstring(data)
else:
data = html.fromstring(data)
data.attrib.pop('xmlns', None)
for elem in data.iter(tag=etree.Comment):
if elem.text:
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding=unicode)
# Setting huge_tree=True causes crashes in windows with large files
parser = etree.XMLParser(no_network=True)
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER)
return data
def clean_word_doc(data, log):
prefixes = []
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
prefixes.append(match.group(1))
if prefixes:
log.warn('Found microsoft markup, cleaning...')
# Remove empty tags as they are not rendered by browsers
# but can become renderable HTML tags like <p/> if the
# document is parsed by an HTML parser
pat = re.compile(
r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
re.DOTALL)
data = pat.sub('', data)
pat = re.compile(
r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
data = pat.sub('', data)
return data
def parse_html(data, log=None, decoder=None, preprocessor=None,
filename='<string>', non_html_file_tags=frozenset()):
if log is None:
from calibre.utils.logging import default_log
log = default_log
filename = force_unicode(filename, enc=filesystem_encoding)
if not isinstance(data, unicode):
if decoder is not None:
data = decoder(data)
else:
data = xml_to_unicode(data)[0]
data = strip_encoding_declarations(data)
if preprocessor is not None:
data = preprocessor(data)
# There could be null bytes in data if it had &#0; entities in it
data = data.replace('\0', '')
# Remove DOCTYPE declaration as it messes up parsing
# In particular, it causes tostring to insert xmlns
# declarations, which messes up the coercing logic
idx = data.find('<html')
if idx == -1:
idx = data.find('<HTML')
if idx > -1:
pre = data[:idx]
data = data[idx:]
if '<!DOCTYPE' in pre: # Handle user defined entities
user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
val = match.group(2)
if val.startswith('"') and val.endswith('"'):
val = val[1:-1]
user_entities[match.group(1)] = val
if user_entities:
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data)
data = clean_word_doc(data, log)
# Setting huge_tree=True causes crashes in windows with large files
parser = etree.XMLParser(no_network=True)
# Try with more & more drastic measures to parse
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
log.debug('Initial parse failed, using more'
' forgiving parsers')
data = xml_replace_entities(data)
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
log.debug('Parsing %s as HTML' % filename)
try:
data = _html5_parse(data)
except:
log.exception(
'HTML 5 parsing failed, falling back to older parsers')
data = _html4_parse(data)
if data.tag == 'HTML':
# Lower case all tag and attribute names
data.tag = data.tag.lower()
for x in data.iterdescendants():
try:
x.tag = x.tag.lower()
for key, val in list(x.attrib.iteritems()):
del x.attrib[key]
key = key.lower()
x.attrib[key] = val
except:
pass
if barename(data.tag) != 'html':
if barename(data.tag) in non_html_file_tags:
raise NotHTML(data.tag)
log.warn('File %r does not appear to be (X)HTML'%filename)
nroot = etree.fromstring('<html></html>')
has_body = False
for child in list(data):
if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
has_body = True
break
parent = nroot
if not has_body:
log.warn('File %r appears to be a HTML fragment'%filename)
nroot = etree.fromstring('<html><body/></html>')
parent = nroot[0]
for child in list(data.iter()):
oparent = child.getparent()
if oparent is not None:
oparent.remove(child)
parent.append(child)
data = nroot
# Force into the XHTML namespace
if not namespace(data.tag):
log.warn('Forcing', filename, 'into XHTML namespace')
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data, encoding=unicode)
try:
data = etree.fromstring(data, parser=parser)
except:
data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '')
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'%
filename)
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
data)
data = data.replace(
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
'')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
try:
data = etree.fromstring(data,
parser=RECOVER_PARSER)
except etree.XMLSyntaxError:
log.warn('Stripping meta tags from %s'% filename)
data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = etree.fromstring(data, parser=RECOVER_PARSER)
elif namespace(data.tag) != XHTML_NS:
# OEB_DOC_NS, but possibly others
ns = namespace(data.tag)
attrib = dict(data.attrib)
nroot = etree.Element(XHTML('html'),
nsmap={None: XHTML_NS}, attrib=attrib)
for elem in data.iterdescendants():
if isinstance(elem.tag, basestring) and \
namespace(elem.tag) == ns:
elem.tag = XHTML(barename(elem.tag))
for elem in data:
nroot.append(elem)
data = nroot
data = merge_multiple_html_heads_and_bodies(data, log)
# Ensure has a <head/>
head = xpath(data, '/h:html/h:head')
head = head[0] if head else None
if head is None:
log.warn('File %s missing <head/> element' % filename)
head = etree.Element(XHTML('head'))
data.insert(0, head)
title = etree.SubElement(head, XHTML('title'))
title.text = _('Unknown')
elif not xpath(data, '/h:html/h:head/h:title'):
log.warn('File %s missing <title/> element' % filename)
title = etree.SubElement(head, XHTML('title'))
title.text = _('Unknown')
# Remove any encoding-specifying <meta/> elements
for meta in META_XP(data):
meta.getparent().remove(meta)
etree.SubElement(head, XHTML('meta'),
attrib={'http-equiv': 'Content-Type',
'content': '%s; charset=utf-8' % XHTML_NS})
# Ensure has a <body/>
if not xpath(data, '/h:html/h:body'):
body = xpath(data, '//h:body')
if body:
body = body[0]
body.getparent().remove(body)
data.append(body)
else:
log.warn('File %s missing <body/> element' % filename)
etree.SubElement(data, XHTML('body'))
# Remove microsoft office markup
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
for x in r:
x.tag = XHTML('span')
# Remove lang redefinition inserted by the amazing Microsoft Word!
body = xpath(data, '/h:html/h:body')[0]
for key in list(body.attrib.keys()):
if key == 'lang' or key.endswith('}lang'):
body.attrib.pop(key)
def remove_elem(a):
p = a.getparent()
idx = p.index(a) -1
p.remove(a)
if a.tail:
if idx <= 0:
if p.text is None:
p.text = ''
p.text += a.tail
else:
if p[idx].tail is None:
p[idx].tail = ''
p[idx].tail += a.tail
# Remove hyperlinks with no content as they cause rendering
# artifacts in browser based renderers
# Also remove empty <b>, <u> and <i> tags
for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
if a.get('id', None) is None and a.get('name', None) is None \
and len(a) == 0 and not a.text:
remove_elem(a)
# Convert <br>s with content into paragraphs as ADE can't handle
# them
for br in xpath(data, '//h:br'):
if len(br) > 0 or br.text:
br.tag = XHTML('div')
# Remove any stray text in the <head> section and format it nicely
data.text = '\n '
head = xpath(data, '//h:head')
if head:
head = head[0]
head.text = '\n '
head.tail = '\n '
for child in head:
child.tail = '\n '
child.tail = '\n '
return data
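
A usage sketch for the new helper (run inside a calibre environment; the fragment string is made up): parse_html hands back a complete XHTML tree with <head> and <body>, and raises NotHTML only for root tags listed in non_html_file_tags, which is how the _parse_xhtml caller shown earlier falls back to XML parsing for NCX files.

from calibre.ebooks.oeb.parse_utils import parse_html, NotHTML

raw = b'<p>A bare fragment, not even wrapped in an html element'
try:
    root = parse_html(raw, filename='fragment.html',
            non_html_file_tags={'ncx'})
except NotHTML as err:
    print('Not HTML, root tag was ' + err.root_tag)
else:
    print(root.tag)  # {http://www.w3.org/1999/xhtml}html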

View File

@@ -19,16 +19,15 @@ from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
 from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
                                     PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
 from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
-                                    ENTITY_RE, MS_COVER_TYPE, iterlinks
+                                    MS_COVER_TYPE, iterlinks
 from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
                                     urlnormalize, BINARY_MIME, \
                                     OEBError, OEBBook, DirContainer
 from calibre.ebooks.oeb.writer import OEBWriter
-from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.utils.localization import get_lang
 from calibre.ptempfile import TemporaryDirectory
 from calibre.constants import __appname__, __version__
-from calibre import guess_type
+from calibre import guess_type, xml_replace_entities
 
 __all__ = ['OEBReader']
@@ -107,8 +106,7 @@ class OEBReader(object):
         try:
             opf = etree.fromstring(data)
         except etree.XMLSyntaxError:
-            repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-            data = ENTITY_RE.sub(repl, data)
+            data = xml_replace_entities(data, encoding=None)
             try:
                 opf = etree.fromstring(data)
                 self.logger.warn('OPF contains invalid HTML named entities')
@@ -371,8 +369,15 @@ class OEBReader(object):
             else :
                 description = None
 
+            index_image = xpath(child,
+                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
+            toc_thumbnail = (index_image[0].text if index_image else None)
+            if not toc_thumbnail or not toc_thumbnail.strip():
+                toc_thumbnail = None
+
             node = toc.add(title, href, id=id, klass=klass,
-                    play_order=po, description=description, author=author)
+                    play_order=po, description=description, author=author,
+                    toc_thumbnail=toc_thumbnail)
 
             self._toc_from_navpoint(item, node, child)
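
For illustration, the kind of NCX navPoint the new lookup matches (hrefs are made up; the assumption here is that CALIBRE_NS, the calibre metadata namespace constant from calibre.ebooks.oeb.base, is the namespace bound to the calibre: prefix in generated NCX files):

from lxml import etree
from calibre.ebooks.oeb.base import CALIBRE_NS

child = etree.fromstring('''
<navPoint xmlns="http://www.daisy.org/z3986/2005/ncx/"
          xmlns:calibre="%s">
  <navLabel><text>First article</text></navLabel>
  <content src="feed_0/article_0/index.html"/>
  <calibre:meta name="toc_thumbnail">images/thumb_0.jpg</calibre:meta>
</navPoint>''' % CALIBRE_NS)
print(child.xpath('descendant::calibre:meta[@name = "toc_thumbnail"]/text()',
        namespaces={'calibre': CALIBRE_NS}))  # ['images/thumb_0.jpg']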

View File

@@ -56,8 +56,11 @@ def render_html(mi, css, vertical, widget, all_fields=False): # {{{
     </body>
    <html>
    '''%(f, c, css)
+    fm = getattr(mi, 'field_metadata', field_metadata)
+    fl = dict(get_field_list(fm))
+    show_comments = (all_fields or fl.get('comments', True))
    comments = u''
-    if mi.comments:
+    if mi.comments and show_comments:
        comments = comments_to_html(force_unicode(mi.comments))
    right_pane = u'<div id="comments" class="comments">%s</div>'%comments

View File

@@ -429,7 +429,7 @@ def populate_metadata_page(layout, db, book_id, bulk=False, two_column=False, pa
     # The fields named here must be first in the widget list
     tweak_cols = tweaks['metadata_edit_custom_column_order']
     comments_in_tweak = 0
-    for key in tweak_cols:
+    for key in (tweak_cols or ()):
         # Add the key if it really exists in the database
         if key in cols_to_display:
             cols.append(key)

View File

@@ -441,7 +441,7 @@ class Scheduler(QObject):
         self.news_menu.addAction(self.cac)
         self.news_menu.addSeparator()
         self.all_action = self.news_menu.addAction(
-                _('Download all scheduled new sources'),
+                _('Download all scheduled news sources'),
                 self.download_all_scheduled)
 
         self.timer = QTimer(self)

View File

@@ -758,11 +758,12 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
             self.set_page_number(frac)
 
     def next_document(self):
-        if self.current_index < len(self.iterator.spine) - 1:
+        if (hasattr(self, 'current_index') and self.current_index <
+                len(self.iterator.spine) - 1):
             self.load_path(self.iterator.spine[self.current_index+1])
 
     def previous_document(self):
-        if self.current_index > 0:
+        if hasattr(self, 'current_index') and self.current_index > 0:
             self.load_path(self.iterator.spine[self.current_index-1], pos=1.0)
 
     def keyPressEvent(self, event):

The diffs for the remaining changed files were suppressed because they are too large. They are mostly regenerated translation catalogues, including the new file src/calibre/translations/ku.po (19163 lines); some files were not shown because too many files changed in this diff.