Mirror of https://github.com/kovidgoyal/calibre.git

Commit f9b6ce9470: Merge from trunk
@@ -2,6 +2,8 @@
.check-cache.pickle
src/calibre/plugins
resources/images.qrc
src/calibre/ebooks/oeb/display/test/*.js
resources/display/*.js
src/calibre/manual/.build/
src/calibre/manual/cli/
src/calibre/manual/template_ref.rst
@@ -19,6 +19,65 @@
# new recipes:
#   - title:

- version: 0.8.31
  date: 2011-12-16

  new features:
    - title: "Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness."
      tickets: [901466]

    - title: "Driver for PocketBook 611 and Lenovo IdeaPad"

    - title: "Allow customization of the order in which custom column editing is performed in the edit metadata dialog. Setting is available via Preferences->Tweaks."
      tickets: [902731]

    - title: "MOBI news download: Allow recipes to set a thumbnail for entries in the periodical table of contents. Currently used by the NYTimes, WSJ, Independent, Guardian and Globe and Mail recipes"
      tickets: [900130]

    - title: "E-book viewer: Add an option to the right click menu to search for the currently selected word"

    - title: "Automatically hide the no internet connection available error message if the connection is restored before the user clicks OK"

  bug fixes:
    - title: "Fix comments not hidden in Book details panel when they are turned off via Preferences->Look & Feel->Book Details"

    - title: "E-book viewer: Do not popup an error message if the user tries to use the mouse wheel to scroll before a document is loaded."
      tickets: [903449]

    - title: "Add docx to the list of ebook extensions."
      tickets: [903452]

    - title: "When downloading metadata from non-English Amazon websites, do not correct the case of book titles."

    - title: "Fix regression in 0.8.30 that broke bulk conversion of a single book."
      tickets: [902506]

    - title: "When minimized to system tray do not display the no internet connection error as a dialog box, instead use a system tray notification"

    - title: "Catalog generation: Include the series_index field for custom series columns as well"

    - title: "Comic Input: Do not rescale images when using the Tablet output profile (or any output profile with a screen size larger than 3000x3000)"

    - title: "HTML Input: Ignore unparseable URLs instead of crashing on them."
      tickets: [902372]

  improved recipes:
    - La Republica
    - CND
    - Berliner Zeitung
    - Zaman Gazetesi

  new recipes:
    - title: CND Weekly
      author: Derek Liang

    - title: descopera.org
      author: Marius Ignatescu

    - title: Rynek Zdrowia
      author: spi630

- version: 0.8.30
  date: 2011-12-09
@@ -1,61 +1,44 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''

class SportsIllustratedRecipe(BasicNewsRecipe) :
    __author__ = 'ape'
    __copyright__ = 'ape'
    __author__ = 'a.peter'
    __copyright__ = 'a.peter'
    __license__ = 'GPL v3'
    language = 'de'
    description = 'Berliner Zeitung'
    version = 2
    description = 'Berliner Zeitung RSS'
    version = 4
    title = u'Berliner Zeitung'
    timefmt = ' [%d.%m.%Y]'

    #oldest_article = 7.0
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    publication_type = 'newspaper'

    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
    remove_tags_after = [dict(id='article_text')]

    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'

    def parse_index(self):
        base = 'http://www.berlinonline.de'
        answer = []
        articles = {}
        more = 1

        soup = self.index_to_soup(self.INDEX)

        # Get list of links to ressorts from index page
        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
        for ressort in ressort_list[0].findAll('a'):
            feed_title = ressort.string
            print 'Analyzing', feed_title
            if not articles.has_key(feed_title):
                articles[feed_title] = []
                answer.append(feed_title)
            # Load ressort page.
            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
            # find mainbar div which contains the list of all articles
            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
                # iterate over all articles
                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
                    # extract title of article
                    if article_teaser.h3 != None:
                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
                        articles[feed_title].append(article)
                    else:
                        # Skip teasers for missing photos
                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
                            continue
                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
                        articles[feed_title].append(article)
                        more += 1
        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
        return answer
    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]

    def get_masthead_url(self):
        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'

    def print_version(self, url):
        return url.replace('.html', ',view,printVersion.html')
@@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
    remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
    no_stylesheets = True

    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
                         ]

    def print_version(self, url):
        if url.find('news/article.php') >= 0:
@@ -46,16 +48,18 @@ class TheCND(BasicNewsRecipe):
            title = self.tag_to_string(a)
            self.log('\tFound article: ', title, 'at', url)
            date = a.nextSibling
            if re.search('cm', date):
                continue
            if (date is not None) and len(date)>2:
                if not articles.has_key(date):
                    articles[date] = []
                articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                self.log('\t\tAppend to : ', date)

        self.log('log articles', articles)
        #self.log('log articles', articles)
        mostCurrent = sorted(articles).pop()
        self.title = 'CND ' + mostCurrent

        self.title = 'CND ' + mostCurrent

        feeds.append((self.title, articles[mostCurrent]))

        return feeds
recipes/cnd_weekly.recipe (new file, 72 lines)
@@ -0,0 +1,72 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

class TheCND(BasicNewsRecipe):

    title = 'CND Weekly'
    __author__ = 'Derek Liang'
    description = ''
    INDEX = 'http://cnd.org'
    language = 'zh'
    conversion_options = {'linearize_tables':True}

    remove_tags_before = dict(name='div', id='articleHead')
    remove_tags_after = dict(id='copyright')
    remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
    no_stylesheets = True

    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
                         ]

    def print_version(self, url):
        if url.find('news/article.php') >= 0:
            return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
        else:
            return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)

        feeds = []
        articles = {}

        for a in soup.findAll('a', attrs={'target':'_cnd'}):
            url = a['href']
            if url.find('article.php') < 0 :
                continue
            if url.startswith('/'):
                url = 'http://cnd.org'+url
            title = self.tag_to_string(a)
            date = a.nextSibling
            if not re.search('cm', date):
                continue
            self.log('\tFound article: ', title, 'at', url, '@', date)
            if (date is not None) and len(date)>2:
                if not articles.has_key(date):
                    articles[date] = []
                articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                self.log('\t\tAppend to : ', date)

        sorted_articles = sorted(articles)
        while sorted_articles:
            mostCurrent = sorted_articles.pop()
            self.title = 'CND ' + mostCurrent
            feeds.append((self.title, articles[mostCurrent]))

        return feeds

    def populate_article_metadata(self, article, soup, first):
        header = soup.find('h3')
        self.log('header: ' + self.tag_to_string(header))
        pass
@@ -1,27 +1,27 @@
# -*- coding: utf-8 -*-
'''
descopera.org
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Descopera(BasicNewsRecipe):
    title = u'Descoperă.org'
    __author__ = 'Marius Ignătescu'
    description = 'Descoperă. Placerea de a cunoaște'
    publisher = 'descopera.org'
    category = 'science, technology, culture, history, earth'
    language = 'ro'
    oldest_article = 14
    max_articles_per_feed = 100
    encoding = 'utf8'
    no_stylesheets = True
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    keep_only_tags = [dict(name='div', attrs={'class':['post']})]
    remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
    remove_attributes = ['width','height']
    cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
    feeds = [(u'Articles', u'http://www.descopera.org/feed/')]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
@@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
                   {'class':['articleTools', 'pagination', 'Ads', 'topad',
                             'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    #Use the mobile version rather than the web version
    def print_version(self, url):
        return url.rpartition('?')[0] + '?service=mobile'
@@ -79,6 +79,12 @@ class Guardian(BasicNewsRecipe):
                url = None
        return url

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def preprocess_html(self, soup):

        # multiple html sections in soup, useful stuff in the first
@@ -104,6 +104,13 @@ class TheIndependentNew(BasicNewsRecipe):
            url = None
        return url

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def preprocess_html(self, soup):

        #remove 'advertorial articles'
@@ -1,13 +1,12 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

'''
http://www.repubblica.it/
'''

import re
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

@@ -25,27 +24,21 @@ class LaRepubblica(BasicNewsRecipe):
    use_embedded_content = False
    no_stylesheets = True
    publication_type = 'newspaper'
    articles_are_obfuscated = True
    temp_files = []
    articles_are_obfuscated = True
    temp_files = []
    extra_css = """
                   img{display: block}
                """

    remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']

    preprocess_regexps = [
        (re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
    ]

    def get_article_url(self, article):
        link = BasicNewsRecipe.get_article_url(self, article)
        if link and not '.repubblica.it/' in link:
            link2 = article.get('id', article.get('guid', None))
            if link2:
                link = link2
        return link.rpartition('?')[0]
        return link.rpartition('?')[0]

    def get_obfuscated_article(self, url):
        count = 0
@@ -56,12 +49,12 @@ class LaRepubblica(BasicNewsRecipe):
                count = 10
            except:
                print "Retrying download..."
            count += 1
            count += 1
        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    keep_only_tags = [
        dict(attrs={'class':'articolo'}),
        dict(attrs={'class':'body-text'}),
@@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
    remove_tags = [
        dict(name=['object','link','meta','iframe','embed']),
        dict(name='span',attrs={'class':'linkindice'}),
        dict(name='div', attrs={'class':'bottom-mobile'}),
        dict(name='div', attrs={'id':['rssdiv','blocco']}),
        dict(name='div', attrs={'class':'utility'}),
        dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
        dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
        dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
        dict(name='div', attrs={'class':'generalbox'}),
        dict(name='ul', attrs={'id':'hystory'})
    ]

    feeds = [
        (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
        (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
        (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
        (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
        (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
@@ -105,8 +98,10 @@ class LaRepubblica(BasicNewsRecipe):
    def preprocess_html(self, soup):
        for item in soup.findAll(['hgroup','deresponsabilizzazione','per']):
            item.name = 'div'
            item.attrs = []
            item.attrs = []
        for item in soup.findAll(style=True):
            del item['style']
            del item['style']
        return soup

    def preprocess_raw_html(self, raw, url):
        return '<html><head>'+raw[raw.find('</head>'):]
@@ -1,5 +1,5 @@
#!/usr/bin/env python

# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
@@ -707,6 +707,16 @@ class NYTimes(BasicNewsRecipe):
        return soup

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
            if idxdiv is not None:
                if idxdiv.img:
                    self.add_toc_thumbnail(article, idxdiv.img['src'])
            else:
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])

        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:

@@ -855,6 +855,16 @@ class NYTimes(BasicNewsRecipe):

        return soup
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
            if idxdiv is not None:
                if idxdiv.img:
                    self.add_toc_thumbnail(article, idxdiv.img['src'])
            else:
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])

        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
@@ -12,39 +12,39 @@ class Sueddeutsche(BasicNewsRecipe):

    title = u'sueddeutsche.de'
    description = 'News from Germany'
    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-11-25
    __author__ = 'Oliver Niesner and Armin Geller' #AGe 2011-12-16
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    oldest_article = 7
    max_articles_per_feed = 50
    oldest_article = 1#7
    max_articles_per_feed = 2#50
    no_stylesheets = True
    language = 'de'

    auto_cleanup = True
    encoding = 'utf-8'
    remove_javascript = True
    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1219199.1322239289!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-11-25 AGe

    remove_tags = [ dict(name='link'), dict(name='iframe'),
                    dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
                                                  "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),

                    dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
                                                     "pages closed","basebox right narrow","headslot galleried"]}),

                    dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
                                                     "item","videoBigButton","articlefooter full-column",
                                                     "bildbanderolle full-column","footerCopy padleft5"]}),

                    dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
                    dict(name='div', attrs={'style':["position:relative;"]}),
                    dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
                    dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
                    dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
                    dict(name='td', attrs={'class':["artikelDruckenRight"]}),
                    dict(name='p', text = "ANZEIGE")
                  ]
    remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]

    cover_url = 'http://polpix.sueddeutsche.com/polopoly_fs/1.1236175.1323967473!/image/image.jpg_gen/derivatives/860x860/image.jpg' # 2011-12-16 AGe
    # 2011-12-16 AGe
    # remove_tags = [ dict(name='link'), dict(name='iframe'),
    #                 dict(name='div', attrs={'id':["bookmarking","themenbox","artikelfoot","CAD_AD",
    #                                               "SKY_AD","NT1_AD","navbar1","sdesiteheader"]}),
    #
    #                 dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg",
    #                                                  "pages closed","basebox right narrow","headslot galleried"]}),
    #
    #                 dict(name='div', attrs={'class':["articleDistractor","listHeader","listHeader2","hr2",
    #                                                  "item","videoBigButton","articlefooter full-column",
    #                                                  "bildbanderolle full-column","footerCopy padleft5"]}),
    #
    #                 dict(name='p', attrs={'class':["ressortartikeln","artikelFliestext","entry-summary"]}),
    #                 dict(name='div', attrs={'style':["position:relative;"]}),
    #                 dict(name='span', attrs={'class':["nlinkheaderteaserschwarz","artikelLink","r10000000"]}),
    #                 dict(name='table', attrs={'class':["stoerBS","kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
    #                 dict(name='ul', attrs={'class':["breadcrumb","articles","activities","sitenav","actions"]}),
    #                 dict(name='td', attrs={'class':["artikelDruckenRight"]}),
    #                 dict(name='p', text = "ANZEIGE")
    #               ]
    # remove_tags_after = [dict(name='div', attrs={'class':["themenbox full-column"]})]
    #
    extra_css = '''
        h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;}
        a{font-family:Arial,Helvetica,sans-serif; font-style:italic;}
@@ -53,30 +53,45 @@ class Sueddeutsche(BasicNewsRecipe):
        .artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; }
        body{font-family:Arial,Helvetica,sans-serif; }
        .photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} '''

    #
    feeds = [
        (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'),
        (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'),
        (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'),
        (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'),
        (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
        (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
        (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
        (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
        (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
        (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),
        (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'),
        (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'),
        (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'),
        (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'),
        (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'),
        (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only
        (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only
        (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only
        (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only
        (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only
        # (u'Politik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPolitik%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Wirtschaft', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWirtschaft%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Geld', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EGeld%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Kultur', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKultur%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'), # AGe 2011-11-13
        # (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Digital', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EDigital%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Auto', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EAuto%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Wissen', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EWissen%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Panorama', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EPanorama%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Reise', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EReise%24?output=rss'), #AGe 2011-12-16 deactivated
        # (u'Technik', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ETechnik%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Macht', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMacht%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Job', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EJob%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Service', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EService%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        # (u'Verlag', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EVerlag%24?output=rss'), # sometimes only #AGe 2011-12-16 deactivated
        (u'Politik', u'http://www.sueddeutsche.de/app/service/rss/ressort/politik/rss.xml'),
        (u'Wirtschaft', u'http://www.sueddeutsche.de/app/service/rss/ressort/wirtschaft/rss.xml'),
        (u'Geld', u'http://www.sueddeutsche.de/app/service/rss/ressort/finanzen/rss.xml'),
        (u'Kultur', u'http://www.sueddeutsche.de/app/service/rss/ressort/kultur/rss.xml'),
        (u'Sport', u'http://www.sueddeutsche.de/app/service/rss/ressort/sport/rss.xml'),
        (u'Leben', u'http://www.sueddeutsche.de/app/service/rss/ressort/leben/rss.xml'),
        (u'Karriere', u'http://www.sueddeutsche.de/app/service/rss/ressort/karriere/rss.xml'),
        (u'München & Region', u'http://www.sueddeutsche.de/app/service/rss/ressort/muenchen/rss.xml'),
        (u'Bayern', u'http://www.sueddeutsche.de/app/service/rss/ressort/bayern/rss.xml'),
        (u'Medien', u'http://www.sueddeutsche.de/app/service/rss/ressort/medien/rss.xml'),
        (u'Digital', u'http://www.sueddeutsche.de/app/service/rss/ressort/computerwissen/rss.xml'),
        (u'Auto', u'http://www.sueddeutsche.de/app/service/rss/ressort/autoreise/rss.xml'),
        (u'Wissen', u'http://www.sueddeutsche.de/app/service/rss/ressort/wissen/rss.xml'),
        (u'Panorama', u'http://www.sueddeutsche.de/app/service/rss/ressort/panorama/rss.xml'),
        (u'Reise', u'http://www.sueddeutsche.de/app/service/rss/ressort/reise/rss.xml'),
    ]

    def print_version(self, url):
        main, sep, id = url.rpartition('/')
        return main + '/2.220/' + id
    # def print_version(self, url): #AGe 2011-12-16 deactivated
    #     main, sep, id = url.rpartition('/') #AGe 2011-12-16 deactivated
    #     return main + '/2.220/' + id #AGe 2011-12-16 deactivated
@@ -59,6 +59,11 @@ class TelegraphUK(BasicNewsRecipe):
        ,(u'Travel' , u'http://www.telegraph.co.uk/travel/rss' )
        ,(u'How about that?', u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss' )
    ]
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def get_article_url(self, article):
        url = article.get('link', None)
@@ -57,6 +57,12 @@ class WallStreetJournal(BasicNewsRecipe):
                'username and password')
        return br

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def postprocess_html(self, soup, first):
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
@@ -44,6 +44,12 @@ class WallStreetJournal(BasicNewsRecipe):
    ]
    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def postprocess_html(self, soup, first):
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
@ -1,5 +1,5 @@
|
||||
" Project wide builtins
|
||||
let g:pyflakes_builtins = ["_", "dynamic_property", "__", "P", "I", "lopen", "icu_lower", "icu_upper", "icu_title", "ngettext"]
|
||||
let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext"
|
||||
|
||||
python << EOFPY
|
||||
import os, sys
|
||||
|
@@ -11,7 +11,7 @@ __all__ = [
        'build', 'build_pdf2xml', 'server',
        'gui',
        'develop', 'install',
        'kakasi', 'resources',
        'kakasi', 'coffee', 'resources',
        'check',
        'sdist',
        'manual', 'tag_release',
@@ -49,9 +49,10 @@ gui = GUI()
from setup.check import Check
check = Check()

from setup.resources import Resources, Kakasi
from setup.resources import Resources, Kakasi, Coffee
resources = Resources()
kakasi = Kakasi()
coffee = Coffee()

from setup.publish import Manual, TagRelease, Stage1, Stage2, \
        Stage3, Stage4, Stage5, Publish
@@ -12,14 +12,14 @@ msgstr ""
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
"devel@lists.alioth.debian.org>\n"
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
"PO-Revision-Date: 2011-11-22 16:45+0000\n"
"PO-Revision-Date: 2011-12-14 19:48+0000\n"
"Last-Translator: Ferran Rius <frius64@hotmail.com>\n"
"Language-Team: Catalan <linux@softcatala.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Launchpad-Export-Date: 2011-11-26 05:10+0000\n"
"X-Generator: Launchpad (build 14381)\n"
"X-Launchpad-Export-Date: 2011-12-15 05:18+0000\n"
"X-Generator: Launchpad (build 14487)\n"
"Language: ca\n"

#. name for aaa
@@ -9348,7 +9348,7 @@ msgstr "Seit-Kaitetu"

#. name for hil
msgid "Hiligaynon"
msgstr ""
msgstr "Hiligainon"

#. name for hin
msgid "Hindi"
@@ -9356,39 +9356,39 @@ msgstr "Hindi"

#. name for hio
msgid "Tsoa"
msgstr ""
msgstr "Tsoa"

#. name for hir
msgid "Himarimã"
msgstr ""
msgstr "Himarimà"

#. name for hit
msgid "Hittite"
msgstr ""
msgstr "Hittita"

#. name for hiw
msgid "Hiw"
msgstr ""
msgstr "Hiw"

#. name for hix
msgid "Hixkaryána"
msgstr ""
msgstr "Hishkaryana"

#. name for hji
msgid "Haji"
msgstr ""
msgstr "Aji"

#. name for hka
msgid "Kahe"
msgstr ""
msgstr "Kahe"

#. name for hke
msgid "Hunde"
msgstr ""
msgstr "Hunde"

#. name for hkk
msgid "Hunjara-Kaina Ke"
msgstr ""
msgstr "Hunjara"

#. name for hks
msgid "Hong Kong Sign Language"
@@ -9396,27 +9396,27 @@ msgstr "Llenguatge de signes de Hong Kong"

#. name for hla
msgid "Halia"
msgstr ""
msgstr "Halia"

#. name for hlb
msgid "Halbi"
msgstr ""
msgstr "Halbi"

#. name for hld
msgid "Halang Doan"
msgstr ""
msgstr "Halang Doan"

#. name for hle
msgid "Hlersu"
msgstr ""
msgstr "Sansu"

#. name for hlt
msgid "Nga La"
msgstr ""
msgstr "Nga La"

#. name for hlu
msgid "Luwian; Hieroglyphic"
msgstr ""
msgstr "Luvi; jeroglífic"

#. name for hma
msgid "Miao; Southern Mashan"
@@ -9424,7 +9424,7 @@ msgstr "Miao; Mashan meridional"

#. name for hmb
msgid "Songhay; Humburi Senni"
msgstr ""
msgstr "Songhai; central"

#. name for hmc
msgid "Miao; Central Huishui"
@@ -9440,11 +9440,11 @@ msgstr "Miao; Huishui oriental"

#. name for hmf
msgid "Hmong Don"
msgstr ""
msgstr "Miao; Don"

#. name for hmg
msgid "Hmong; Southwestern Guiyang"
msgstr ""
msgstr "Miao; Guiyang sudoccidental"

#. name for hmh
msgid "Miao; Southwestern Huishui"
@@ -9456,11 +9456,11 @@ msgstr "Miao; Huishui septentrional"

#. name for hmj
msgid "Ge"
msgstr ""
msgstr "Ge"

#. name for hmk
msgid "Maek"
msgstr ""
msgstr "Maek"

#. name for hml
msgid "Miao; Luopohe"
@@ -9472,11 +9472,11 @@ msgstr "Miao; Mashan central"

#. name for hmn
msgid "Hmong"
msgstr ""
msgstr "Hmong (macrollengua)"

#. name for hmo
msgid "Hiri Motu"
msgstr ""
msgstr "Hiri Motu"

#. name for hmp
msgid "Miao; Northern Mashan"
@@ -9488,7 +9488,7 @@ msgstr "Miao; Qiandong oriental"

#. name for hmr
msgid "Hmar"
msgstr ""
msgstr "Hmar"

#. name for hms
msgid "Miao; Southern Qiandong"
@@ -9496,15 +9496,15 @@ msgstr "Miao; Qiandong meridional"

#. name for hmt
msgid "Hamtai"
msgstr ""
msgstr "Hamtai"

#. name for hmu
msgid "Hamap"
msgstr ""
msgstr "Hamap"

#. name for hmv
msgid "Hmong Dô"
msgstr ""
msgstr "Miao; Do"

#. name for hmw
msgid "Miao; Western Mashan"
@@ -9520,19 +9520,19 @@ msgstr "Miao; Shua"

#. name for hna
msgid "Mina (Cameroon)"
msgstr ""
msgstr "Mina (Camerun)"

#. name for hnd
msgid "Hindko; Southern"
msgstr ""
msgstr "Hindko; meridional"

#. name for hne
msgid "Chhattisgarhi"
msgstr ""
msgstr "Chattisgarbi"

#. name for hnh
msgid "//Ani"
msgstr ""
msgstr "Ani"

#. name for hni
msgid "Hani"
@@ -9540,7 +9540,7 @@ msgstr ""

#. name for hnj
msgid "Hmong Njua"
msgstr ""
msgstr "Miao; Hmong Njua"

#. name for hnn
msgid "Hanunoo"
@@ -9548,7 +9548,7 @@ msgstr ""

#. name for hno
msgid "Hindko; Northern"
msgstr ""
msgstr "Hindko; septentrional"

#. name for hns
msgid "Hindustani; Caribbean"
@@ -11800,7 +11800,7 @@ msgstr ""

#. name for khq
msgid "Songhay; Koyra Chiini"
msgstr ""
msgstr "Songhai; Koyra"

#. name for khr
msgid "Kharia"
@@ -17288,7 +17288,7 @@ msgstr ""

#. name for mww
msgid "Hmong Daw"
msgstr ""
msgstr "Miao; blanc"

#. name for mwx
msgid "Mediak"
@@ -28680,7 +28680,7 @@ msgstr ""

#. name for xlu
msgid "Luwian; Cuneiform"
msgstr ""
msgstr "Luvi; cuneïforme"

#. name for xly
msgid "Elymian"
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, cPickle, re, shutil, marshal, zipfile, glob
import os, cPickle, re, shutil, marshal, zipfile, glob, subprocess, time
from zlib import compress

from setup import Command, basenames, __appname__
@@ -23,7 +23,70 @@ def get_opts_from_parser(parser):
    for o in g.option_list:
        for x in do_opt(o): yield x

class Kakasi(Command):
class Coffee(Command): # {{{

    description = 'Compile coffeescript files into javascript'
    COFFEE_DIRS = {'ebooks/oeb/display': 'display'}

    def add_options(self, parser):
        parser.add_option('--watch', '-w', action='store_true', default=False,
                help='Autocompile when .coffee files are changed')
        parser.add_option('--show-js', action='store_true', default=False,
                help='Display the generated javascript')

    def run(self, opts):
        self.do_coffee_compile(opts)
        if opts.watch:
            try:
                while True:
                    time.sleep(0.5)
                    self.do_coffee_compile(opts, timestamp=True,
                            ignore_errors=True)
            except KeyboardInterrupt:
                pass

    def show_js(self, jsfile):
        from pygments.lexers import JavascriptLexer
        from pygments.formatters import TerminalFormatter
        from pygments import highlight
        with open(jsfile, 'rb') as f:
            raw = f.read()
        print highlight(raw, JavascriptLexer(), TerminalFormatter())

    def do_coffee_compile(self, opts, timestamp=False, ignore_errors=False):
        for toplevel, dest in self.COFFEE_DIRS.iteritems():
            dest = self.j(self.RESOURCES, dest)
            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
                js = self.j(dest, os.path.basename(x.rpartition('.')[0]+'.js'))
                if self.newer(js, x):
                    print ('\t%sCompiling %s'%(time.strftime('[%H:%M:%S] ') if
                        timestamp else '', os.path.basename(x)))
                    try:
                        subprocess.check_call(['coffee', '-c', '-o', dest, x])
                    except:
                        print ('\n\tCompilation of %s failed'%os.path.basename(x))
                        if ignore_errors:
                            with open(js, 'wb') as f:
                                f.write('# Compilation from coffeescript failed')
                        else:
                            raise SystemExit(1)
                    else:
                        if opts.show_js:
                            self.show_js(js)
                            print ('#'*80)
                            print ('#'*80)

    def clean(self):
        for toplevel, dest in self.COFFEE_DIRS.iteritems():
            dest = self.j(self.RESOURCES, dest)
            for x in glob.glob(self.j(self.SRC, __appname__, toplevel, '*.coffee')):
                x = x.rpartition('.')[0] + '.js'
                x = self.j(dest, os.path.basename(x))
                if os.path.exists(x):
                    os.remove(x)
# }}}
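The new Coffee command is registered as 'coffee' in setup/commands.py above and, per the sub_commands change further below, also runs automatically before the resources build. Assuming the standard CoffeeScript compiler is on PATH, each out-of-date source file is handed to it roughly as in this standalone sketch (the file name is invented for illustration; the real command derives paths from COFFEE_DIRS under src/calibre):

import subprocess

src = 'src/calibre/ebooks/oeb/display/example.coffee'  # hypothetical source file
dest = 'resources/display'                             # matches the new .gitignore entry
subprocess.check_call(['coffee', '-c', '-o', dest, src])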
class Kakasi(Command): # {{{

    description = 'Compile resources for unihandecode'

@@ -62,9 +125,6 @@ class Kakasi(Command):
        self.info('\tGenerating kanadict')
        self.mkkanadict(src, dest)

        return

    def mkitaiji(self, src, dst):
        dic = {}
        for line in open(src, "r"):
@@ -125,11 +185,12 @@ class Kakasi(Command):
        kakasi = self.j(self.RESOURCES, 'localization', 'pykakasi')
        if os.path.exists(kakasi):
            shutil.rmtree(kakasi)
# }}}

class Resources(Command):
class Resources(Command): # {{{

    description = 'Compile various needed calibre resources'
    sub_commands = ['kakasi']
    sub_commands = ['kakasi', 'coffee']

    def run(self, opts):
        scripts = {}
@@ -223,13 +284,13 @@ class Resources(Command):
            x = self.j(self.RESOURCES, x+'.pickle')
            if os.path.exists(x):
                os.remove(x)
        from setup.commands import kakasi
        from setup.commands import kakasi, coffee
        kakasi.clean()
        coffee.clean()
        for x in ('builtin_recipes.xml', 'builtin_recipes.zip',
                'template-functions.json'):
            x = self.j(self.RESOURCES, x)
            if os.path.exists(x):
                os.remove(x)

# }}}
@@ -215,32 +215,34 @@ class GetTranslations(Translations): # {{{
    description = 'Get updated translations from Launchpad'
    BRANCH = 'lp:~kovid/calibre/translations'

    @classmethod
    def modified_translations(cls):
        raw = subprocess.Popen(['bzr', 'status'],
    @property
    def modified_translations(self):
        raw = subprocess.Popen(['bzr', 'status', '-S', self.PATH],
                stdout=subprocess.PIPE).stdout.read().strip()
        ans = []
        for line in raw.splitlines():
            line = line.strip()
            if line.startswith(cls.PATH) and line.endswith('.po'):
                yield line
            if line.startswith('M') and line.endswith('.po'):
                ans.append(line.split()[-1])
        return ans

    def run(self, opts):
        if len(list(self.modified_translations())) == 0:
        if not self.modified_translations:
            subprocess.check_call(['bzr', 'merge', self.BRANCH])
        if len(list(self.modified_translations())) == 0:
            print 'No updated translations available'
        else:
            subprocess.check_call(['bzr', 'commit', '-m',
                'IGN:Updated translations', self.PATH])
        self.check_for_errors()

    @classmethod
    def check_for_errors(cls):
        if self.modified_translations:
            subprocess.check_call(['bzr', 'commit', '-m',
                'IGN:Updated translations', self.PATH])
        else:
            print('No updated translations available')

    def check_for_errors(self):
        errors = os.path.join(tempfile.gettempdir(), 'calibre-translation-errors')
        if os.path.exists(errors):
            shutil.rmtree(errors)
        os.mkdir(errors)
        pofilter = ('pofilter', '-i', cls.PATH, '-o', errors,
        pofilter = ('pofilter', '-i', self.PATH, '-o', errors,
                '-t', 'accelerators', '-t', 'escapes', '-t', 'variables',
                #'-t', 'xmltags',
                #'-t', 'brackets',
@@ -253,23 +255,20 @@ class GetTranslations(Translations): # {{{
                '-t', 'printf')
        subprocess.check_call(pofilter)
        errfiles = glob.glob(errors+os.sep+'*.po')
        subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
        for f in errfiles:
            with open(f, 'r+b') as f:
                raw = f.read()
                raw = re.sub(r'# \(pofilter\).*', '', raw)
                f.seek(0)
                f.truncate()
                f.write(raw)
        if errfiles:
            subprocess.check_call(['gvim', '-f', '-p', '--']+errfiles)
            for f in errfiles:
                with open(f, 'r+b') as f:
                    raw = f.read()
                    raw = re.sub(r'# \(pofilter\).*', '', raw)
                    f.seek(0)
                    f.truncate()
                    f.write(raw)

        subprocess.check_call(['pomerge', '-t', cls.PATH, '-i', errors, '-o',
            cls.PATH])
        if len(list(cls.modified_translations())) > 0:
            subprocess.call(['bzr', 'diff', cls.PATH])
            yes = raw_input('Merge corrections? [y/n]: ').strip()
            if yes in ['', 'y']:
                subprocess.check_call(['bzr', 'commit', '-m',
                    'IGN:Translation corrections', cls.PATH])
            subprocess.check_call(['pomerge', '-t', self.PATH, '-i', errors, '-o',
                self.PATH])
            return True
        return False

# }}}
@@ -558,11 +558,11 @@ xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = {
    '>' : '&gt;',
    '&' : '&amp;'})

def replace_entities(raw):
    return _ent_pat.sub(entity_to_unicode, raw)
def replace_entities(raw, encoding='cp1252'):
    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)

def xml_replace_entities(raw):
    return _ent_pat.sub(xml_entity_to_unicode, raw)
def xml_replace_entities(raw, encoding='cp1252'):
    return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw)

def prepare_string_for_xml(raw, attribute=False):
    raw = _ent_pat.sub(entity_to_unicode, raw)
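Both helpers keep cp1252 as the default, so existing callers are unchanged; the new keyword only adds a way to override how entity_to_unicode decodes entities. A small usage sketch, assuming these functions are imported from the calibre package where the hunk above lives (the sample strings are invented):

from calibre import replace_entities, xml_replace_entities

print replace_entities('Caf&eacute; &amp; more')      # named entities resolved to unicode text
print xml_replace_entities('Caf&eacute; &amp; more')  # '&amp;' stays escaped so the result remains XML-safe
print replace_entities('&#233;l&#233;gant', encoding='cp1252')  # the new keyword makes the decoding choice explicit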
@@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = u'calibre'
numeric_version = (0, 8, 30)
numeric_version = (0, 8, 31)
__version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
@@ -173,8 +173,9 @@ class INVESBOOK(EB600):
    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'pdf', 'rtf', 'txt']
    BCD = [0x110, 0x323]

    VENDOR_NAME = ['INVES_E6', 'INVES-WI']
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK']
    VENDOR_NAME = ['INVES_E6', 'INVES-WI', 'POCKETBO']
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['00INVES_E600', 'INVES-WIBOOK',
            'OK_POCKET_611_61']

class BOOQ(EB600):
    name = 'Booq Device Interface'
@@ -30,7 +30,7 @@ BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'ht
        'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
        'epub', 'fb2', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
        'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi']
        'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx']

class HTMLRenderer(object):
@@ -229,7 +229,10 @@ class EPUBOutput(OutputFormatPlugin):
        if opts.extract_to is not None:
            from calibre.utils.zipfile import ZipFile
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            if os.path.isdir(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            else:
                os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
@@ -16,7 +16,8 @@ from lxml.html import tostring

from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source, Option
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
        fixauthors)
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.book.base import Metadata
@@ -509,6 +510,15 @@ class Amazon(Source):

        return domain

    def clean_downloaded_metadata(self, mi):
        if mi.title and self.domain in ('com', 'uk'):
            mi.title = fixcase(mi.title)
        mi.authors = fixauthors(mi.authors)
        if self.domain in ('com', 'uk'):
            mi.tags = list(map(fixcase, mi.tags))
        mi.isbn = check_isbn(mi.isbn)

    def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
            domain=None):
        if domain is None:
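This is the code behind the changelog entry about non-English Amazon sites: fixcase() is only applied to titles and tags when the query went to amazon.com or amazon.co.uk. A hedged illustration of that branch (the sample titles and domains are invented, and the exact output of fixcase() is not shown in this diff):

# Illustration only; mirrors the domain check in clean_downloaded_metadata() above.
from calibre.ebooks.metadata.sources.base import fixcase

for domain, title in (('com', 'the moonstone'), ('de', 'der prozess')):
    if domain in ('com', 'uk'):
        title = fixcase(title)  # English storefronts get their title case normalised
    # Any other storefront (here 'de') keeps the title exactly as Amazon supplied it.
    print domain, '->', title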
@@ -31,7 +31,7 @@ class TOC(list):

    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
                 base_path=os.getcwd(), type='unknown', author=None,
                 description=None):
                 description=None, toc_thumbnail=None):
        self.href = href
        self.fragment = fragment
        if not self.fragment:
@@ -43,6 +43,7 @@ class TOC(list):
        self.type = type
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def __str__(self):
        lines = ['TOC: %s#%s'%(self.href, self.fragment)]
@@ -72,12 +73,12 @@ class TOC(list):
            entry.parent = None

    def add_item(self, href, fragment, text, play_order=None, type='unknown',
            author=None, description=None):
            author=None, description=None, toc_thumbnail=None):
        if play_order is None:
            play_order = (self[-1].play_order if len(self) else self.play_order) + 1
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
            base_path=self.base_path, play_order=play_order,
            type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
        return self[-1]

    def top_level_items(self):
@@ -269,6 +270,9 @@ class TOC(list):
            if desc:
                desc = re.sub(r'\s+', ' ', desc)
                elem.append(C.meta(desc, name='description'))
            idx = getattr(np, 'toc_thumbnail', None)
            if idx:
                elem.append(C.meta(idx, name='toc_thumbnail'))
            parent.append(elem)
            for np2 in np:
                navpoint(elem, np2)
@@ -656,11 +656,11 @@ class Tag(object): # {{{
                ' image record associated with this article',
                'image_index'),
            70 : ('Description offset in cncx', 'desc_offset'),
            71 : ('Image attribution offset in cncx',
                'image_attr_offset'),
            71 : ('Author offset in cncx', 'author_offset'),
            72 : ('Image caption offset in cncx',
                'image_caption_offset'),
            73 : ('Author offset in cncx', 'author_offset'),
            73 : ('Image attribution offset in cncx',
                'image_attr_offset'),
        },

        'chapter_with_subchapters' : {
@@ -973,7 +973,8 @@ class MobiReader(object):
                continue
            processed_records.append(i)
            data = self.sections[i][0]
            if data[:4] in (b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n'):
            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                    b'RESC', b'BOUN', b'FDST', b'DATP'}:
                # A FLIS, FCIS, SRCS or EOF record, ignore
                continue
            buf = cStringIO.StringIO(data)
@@ -136,7 +136,8 @@ class IndexEntry(object):
        'last_child_index': 23,
        'image_index': 69,
        'desc_offset': 70,
        'author_offset': 73,
        'author_offset': 71,

    }
    RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}
@@ -754,6 +755,13 @@ class Indexer(object): # {{{
                normalized_articles.append(article)
                article.author_offset = self.cncx[art.author]
                article.desc_offset = self.cncx[art.description]
                if getattr(art, 'toc_thumbnail', None) is not None:
                    try:
                        ii = self.serializer.images[art.toc_thumbnail] - 1
                        if ii > -1:
                            article.image_index = ii
                    except KeyError:
                        pass # Image not found in serializer

            if normalized_articles:
                normalized_articles.sort(key=lambda x:x.offset)
@@ -161,7 +161,7 @@ class MobiWriter(object):
        index = 1

        mh_href = None
        if 'masthead' in oeb.guide:
        if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
            mh_href = oeb.guide['masthead'].href
            self.image_records.append(None)
            index += 1
@@ -16,15 +16,13 @@ from urllib import unquote as urlunquote
from lxml import etree, html
from calibre.constants import filesystem_encoding, __version__
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import isbytestring, as_unicode, get_types_map

RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
        namespace, XHTML, parse_html, NotHTML)

XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS = 'http://www.idpf.org/2007/opf'
@@ -55,9 +53,6 @@ OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
def XML(name):
    return '{%s}%s' % (XML_NS, name)

def XHTML(name):
    return '{%s}%s' % (XHTML_NS, name)

def OPF(name):
    return '{%s}%s' % (OPF2_NS, name)

@@ -279,22 +274,11 @@ PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')

def element(parent, *args, **kwargs):
    if parent is not None:
        return etree.SubElement(parent, *args, **kwargs)
    return etree.Element(*args, **kwargs)

def namespace(name):
    if '}' in name:
        return name.split('}', 1)[0][1:]
    return ''

def barename(name):
    if '}' in name:
        return name.split('}', 1)[1]
    return name

def prefixname(name, nsrmap):
    if not isqname(name):
        return name
@@ -373,25 +357,6 @@ def urlnormalize(href):
    parts = (urlquote(part) for part in parts)
    return urlunparse(parts)

def merge_multiple_html_heads_and_bodies(root, log=None):
    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
    if not (len(heads) > 1 or len(bodies) > 1): return root
    for child in root: root.remove(child)
    head = root.makeelement(XHTML('head'))
    body = root.makeelement(XHTML('body'))
    for h in heads:
        for x in h:
            head.append(x)
    for b in bodies:
        for x in b:
            body.append(x)
    map(root.append, (head, body))
    if log is not None:
        log.warn('Merging multiple <head> and <body> sections')
    return root

class DummyHandler(logging.Handler):
@@ -418,10 +383,6 @@ class OEBError(Exception):
    """Generic OEB-processing error."""
    pass

class NotHTML(OEBError):
    '''Raised when a file that should be HTML (as per manifest) is not'''
    pass

class NullContainer(object):
    """An empty container.

@@ -801,7 +762,6 @@ class Manifest(object):
    """

    NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
    META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')

    def __init__(self, oeb, id, href, media_type,
            fallback=None, loader=str, data=None):
@@ -830,244 +790,17 @@ class Manifest(object):
            return None
        return etree.fromstring(data, parser=RECOVER_PARSER)

    def clean_word_doc(self, data):
|
||||
prefixes = []
|
||||
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
|
||||
prefixes.append(match.group(1))
|
||||
if prefixes:
|
||||
self.oeb.log.warn('Found microsoft markup, cleaning...')
|
||||
# Remove empty tags as they are not rendered by browsers
|
||||
# but can become renderable HTML tags like <p/> if the
|
||||
# document is parsed by an HTML parser
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
|
||||
re.DOTALL)
|
||||
data = pat.sub('', data)
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
|
||||
data = pat.sub('', data)
|
||||
return data
|
||||
|
||||
def _parse_xhtml(self, data):
|
||||
orig_data = data
|
||||
self.oeb.log.debug('Parsing', self.href, '...')
|
||||
# Convert to Unicode and normalize line endings
|
||||
data = self.oeb.decode(data)
|
||||
data = strip_encoding_declarations(data)
|
||||
data = self.oeb.html_preprocessor(data)
|
||||
# There could be null bytes in data if it had � entities in it
|
||||
data = data.replace('\0', '')
|
||||
|
||||
# Remove DOCTYPE declaration as it messes up parsing
|
||||
# In particular, it causes tostring to insert xmlns
|
||||
# declarations, which messes up the coercing logic
|
||||
idx = data.find('<html')
|
||||
if idx == -1:
|
||||
idx = data.find('<HTML')
|
||||
if idx > -1:
|
||||
pre = data[:idx]
|
||||
data = data[idx:]
|
||||
if '<!DOCTYPE' in pre:
|
||||
user_entities = {}
|
||||
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
|
||||
val = match.group(2)
|
||||
if val.startswith('"') and val.endswith('"'):
|
||||
val = val[1:-1]
|
||||
user_entities[match.group(1)] = val
|
||||
if user_entities:
|
||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||
|
||||
# Setting huge_tree=True causes crashes in windows with large files
|
||||
parser = etree.XMLParser(no_network=True)
|
||||
# Try with more & more drastic measures to parse
|
||||
def first_pass(data):
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError as err:
|
||||
self.oeb.log.debug('Initial parse failed, using more'
|
||||
' forgiving parsers')
|
||||
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
||||
data = ENTITY_RE.sub(repl, data)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError as err:
|
||||
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
|
||||
if err.args and err.args[0].startswith('Excessive depth'):
|
||||
from calibre.utils.soupparser import fromstring
|
||||
data = fromstring(data)
|
||||
else:
|
||||
data = html.fromstring(data)
|
||||
data.attrib.pop('xmlns', None)
|
||||
for elem in data.iter(tag=etree.Comment):
|
||||
if elem.text:
|
||||
elem.text = elem.text.strip('-')
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
return data
|
||||
fname = urlunquote(self.href)
|
||||
self.oeb.log.debug('Parsing', fname, '...')
|
||||
try:
|
||||
data = self.clean_word_doc(data)
|
||||
except:
|
||||
pass
|
||||
data = first_pass(data)
|
||||
|
||||
if data.tag == 'HTML':
|
||||
# Lower case all tag and attribute names
|
||||
data.tag = data.tag.lower()
|
||||
for x in data.iterdescendants():
|
||||
try:
|
||||
x.tag = x.tag.lower()
|
||||
for key, val in list(x.attrib.iteritems()):
|
||||
del x.attrib[key]
|
||||
key = key.lower()
|
||||
x.attrib[key] = val
|
||||
except:
|
||||
pass
|
||||
|
||||
# Handle weird (non-HTML/fragment) files
|
||||
if barename(data.tag) != 'html':
|
||||
if barename(data.tag) == 'ncx':
|
||||
return self._parse_xml(orig_data)
|
||||
self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
|
||||
nroot = etree.fromstring('<html></html>')
|
||||
has_body = False
|
||||
for child in list(data):
|
||||
if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
|
||||
has_body = True
|
||||
break
|
||||
parent = nroot
|
||||
if not has_body:
|
||||
self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
|
||||
nroot = etree.fromstring('<html><body/></html>')
|
||||
parent = nroot[0]
|
||||
for child in list(data.iter()):
|
||||
oparent = child.getparent()
|
||||
if oparent is not None:
|
||||
oparent.remove(child)
|
||||
parent.append(child)
|
||||
data = nroot
|
||||
|
||||
|
||||
# Force into the XHTML namespace
|
||||
if not namespace(data.tag):
|
||||
self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
|
||||
data.attrib['xmlns'] = XHTML_NS
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except:
|
||||
data = data.replace(':=', '=').replace(':>', '>')
|
||||
data = data.replace('<http:/>', '')
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
self.oeb.logger.warn('Stripping comments from %s'%
|
||||
self.href)
|
||||
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
|
||||
data)
|
||||
data = data.replace(
|
||||
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
||||
'')
|
||||
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
||||
try:
|
||||
data = etree.fromstring(data,
|
||||
parser=RECOVER_PARSER)
|
||||
except etree.XMLSyntaxError:
|
||||
self.oeb.logger.warn('Stripping meta tags from %s'%
|
||||
self.href)
|
||||
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
elif namespace(data.tag) != XHTML_NS:
|
||||
# OEB_DOC_NS, but possibly others
|
||||
ns = namespace(data.tag)
|
||||
attrib = dict(data.attrib)
|
||||
nroot = etree.Element(XHTML('html'),
|
||||
nsmap={None: XHTML_NS}, attrib=attrib)
|
||||
for elem in data.iterdescendants():
|
||||
if isinstance(elem.tag, basestring) and \
|
||||
namespace(elem.tag) == ns:
|
||||
elem.tag = XHTML(barename(elem.tag))
|
||||
for elem in data:
|
||||
nroot.append(elem)
|
||||
data = nroot
|
||||
|
||||
data = merge_multiple_html_heads_and_bodies(data, self.oeb.logger)
|
||||
# Ensure has a <head/>
|
||||
head = xpath(data, '/h:html/h:head')
|
||||
head = head[0] if head else None
|
||||
if head is None:
|
||||
self.oeb.logger.warn(
|
||||
'File %r missing <head/> element' % self.href)
|
||||
head = etree.Element(XHTML('head'))
|
||||
data.insert(0, head)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = self.oeb.translate(__('Unknown'))
|
||||
elif not xpath(data, '/h:html/h:head/h:title'):
|
||||
self.oeb.logger.warn(
|
||||
'File %r missing <title/> element' % self.href)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = self.oeb.translate(__('Unknown'))
|
||||
# Remove any encoding-specifying <meta/> elements
|
||||
for meta in self.META_XP(data):
|
||||
meta.getparent().remove(meta)
|
||||
etree.SubElement(head, XHTML('meta'),
|
||||
attrib={'http-equiv': 'Content-Type',
|
||||
'content': '%s; charset=utf-8' % XHTML_NS})
|
||||
# Ensure has a <body/>
|
||||
if not xpath(data, '/h:html/h:body'):
|
||||
body = xpath(data, '//h:body')
|
||||
if body:
|
||||
body = body[0]
|
||||
body.getparent().remove(body)
|
||||
data.append(body)
|
||||
else:
|
||||
self.oeb.logger.warn(
|
||||
'File %r missing <body/> element' % self.href)
|
||||
etree.SubElement(data, XHTML('body'))
|
||||
|
||||
# Remove microsoft office markup
|
||||
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
|
||||
for x in r:
|
||||
x.tag = XHTML('span')
|
||||
|
||||
# Remove lang redefinition inserted by the amazing Microsoft Word!
|
||||
body = xpath(data, '/h:html/h:body')[0]
|
||||
for key in list(body.attrib.keys()):
|
||||
if key == 'lang' or key.endswith('}lang'):
|
||||
body.attrib.pop(key)
|
||||
|
||||
def remove_elem(a):
|
||||
p = a.getparent()
|
||||
idx = p.index(a) -1
|
||||
p.remove(a)
|
||||
if a.tail:
|
||||
if idx <= 0:
|
||||
if p.text is None:
|
||||
p.text = ''
|
||||
p.text += a.tail
|
||||
else:
|
||||
if p[idx].tail is None:
|
||||
p[idx].tail = ''
|
||||
p[idx].tail += a.tail
|
||||
|
||||
# Remove hyperlinks with no content as they cause rendering
|
||||
# artifacts in browser based renderers
|
||||
# Also remove empty <b>, <u> and <i> tags
|
||||
for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
|
||||
if a.get('id', None) is None and a.get('name', None) is None \
|
||||
and len(a) == 0 and not a.text:
|
||||
remove_elem(a)
|
||||
|
||||
# Convert <br>s with content into paragraphs as ADE can't handle
|
||||
# them
|
||||
for br in xpath(data, '//h:br'):
|
||||
if len(br) > 0 or br.text:
|
||||
br.tag = XHTML('div')
|
||||
|
||||
data = parse_html(data, log=self.oeb.log,
|
||||
decoder=self.oeb.decode,
|
||||
preprocessor=self.oeb.html_preprocessor,
|
||||
filename=fname, non_html_file_tags={'ncx'})
|
||||
except NotHTML:
|
||||
return self._parse_xml(orig_data)
|
||||
return data
|
||||
|
||||
def _parse_txt(self, data):
|
||||
@ -1629,9 +1362,10 @@ class TOC(object):
|
||||
:attr:`id`: Option unique identifier for this node.
|
||||
:attr:`author`: Optional author attribution for periodicals <mbp:>
|
||||
:attr:`description`: Optional description attribute for periodicals <mbp:>
|
||||
:attr:`toc_thumbnail`: Optional toc thumbnail image
|
||||
"""
|
||||
def __init__(self, title=None, href=None, klass=None, id=None,
|
||||
play_order=None, author=None, description=None):
|
||||
play_order=None, author=None, description=None, toc_thumbnail=None):
|
||||
self.title = title
|
||||
self.href = urlnormalize(href) if href else href
|
||||
self.klass = klass
|
||||
@ -1643,10 +1377,11 @@ class TOC(object):
|
||||
self.play_order = play_order
|
||||
self.author = author
|
||||
self.description = description
|
||||
self.toc_thumbnail = toc_thumbnail
|
||||
|
||||
def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None):
|
||||
def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None):
|
||||
"""Create and return a new sub-node of this node."""
|
||||
node = TOC(title, href, klass, id, play_order, author, description)
|
||||
node = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail)
|
||||
self.nodes.append(node)
|
||||
return node
|
||||
|
||||
|
src/calibre/ebooks/oeb/display/cfi.coffee (new file, 225 lines)
@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env coffee
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
###
|
||||
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
|
||||
Released under the GPLv3 License
|
||||
###
|
||||
#
|
||||
log = (error) ->
|
||||
if error
|
||||
if window?.console?.log
|
||||
window.console.log(error)
|
||||
else if process?.stdout?.write
|
||||
process.stdout.write(error + '\n')
|
||||
|
||||
# CFI escaping {{{
|
||||
escape_for_cfi = (raw) ->
|
||||
if raw
|
||||
for c in ['^', '[', ']', ',', '(', ')', ';', '~', '@', '-', '!']
|
||||
raw = raw.replace(c, '^'+c)
|
||||
raw
|
||||
|
||||
unescape_from_cfi = (raw) ->
|
||||
ans = raw
|
||||
if raw
|
||||
dropped = false
|
||||
ans = []
|
||||
for c in raw
|
||||
if not dropped and c == '^'
|
||||
dropped = true
|
||||
continue
|
||||
dropped = false
|
||||
ans.push(c)
|
||||
ans = ans.join('')
|
||||
ans
|
||||
# }}}
|
||||
|
||||
fstr = (d) -> # {{{
|
||||
# Convert a timestamp floating point number to a string
|
||||
ans = ""
|
||||
if ( d < 0 )
|
||||
ans = "-"
|
||||
d = -d
|
||||
n = Math.floor(d)
|
||||
ans += n
|
||||
n = Math.round((d-n)*100)
|
||||
if( n != 0 )
|
||||
ans += "."
|
||||
ans += if (n % 10 == 0) then (n/10) else n
|
||||
ans
|
||||
# }}}
|
||||
|
||||
class CanonicalFragmentIdentifier
|
||||
|
||||
# This class is a namespace to expose CFI functions via the window.cfi
|
||||
# object
|
||||
|
||||
constructor: () ->
|
||||
|
||||
encode: (doc, node, offset, tail) -> # {{{
|
||||
cfi = tail or ""
|
||||
|
||||
# Handle the offset, if any
|
||||
switch node.nodeType
|
||||
when 1 # Element node
|
||||
if typeof(offset) == 'number'
|
||||
node = node.childNodes.item(offset)
|
||||
when 3, 4, 5, 6 # Text/entity/CDATA node
|
||||
offset or= 0
|
||||
while true
|
||||
p = node.previousSibling
|
||||
if (p?.nodeType not in [3, 4, 5, 6])
|
||||
break
|
||||
offset += p.nodeValue.length
|
||||
node = p
|
||||
cfi = ":" + offset + cfi
|
||||
else # Not handled
|
||||
log("Offsets for nodes of type #{ node.nodeType } are not handled")
|
||||
|
||||
# Construct the path to node from root
|
||||
until node == doc
|
||||
p = node.parentNode
|
||||
if not p
|
||||
if node.nodeType == 9 # Document node (iframe)
|
||||
win = node.defaultView
|
||||
if win.frameElement
|
||||
node = win.frameElement
|
||||
cfi = "!" + cfi
|
||||
continue
|
||||
break
|
||||
# Increase index by the length of all previous sibling text nodes
|
||||
index = 0
|
||||
child = p.firstChild
|
||||
while true
|
||||
index |= 1
|
||||
if child.nodeType in [1, 7]
|
||||
index++
|
||||
if child == node
|
||||
break
|
||||
child = child.nextSibling
|
||||
|
||||
# Add id assertions for robustness where possible
|
||||
id = node.getAttribute?('id')
|
||||
idspec = if id then "[#{ escape_for_cfi(id) }]" else ''
|
||||
cfi = '/' + index + idspec + cfi
|
||||
node = p
|
||||
|
||||
cfi
|
||||
# }}}
|
||||
|
||||
decode: (cfi, doc=window?.document) -> # {{{
|
||||
simple_node_regex = ///
|
||||
^/(\d+) # The node count
|
||||
(\[[^\]]*\])? # The optional id assertion
|
||||
///
|
||||
error = null
|
||||
node = doc
|
||||
|
||||
until cfi.length <= 0 or error
|
||||
if ( (r = cfi.match(simple_node_regex)) is not null ) # Path step
|
||||
target = parseInt(r[1])
|
||||
assertion = r[2]
|
||||
if assertion
|
||||
assertion = unescape_from_cfi(assertion.slice(1, assertion.length-1))
|
||||
index = 0
|
||||
child = node.firstChild
|
||||
|
||||
while true
|
||||
if not child
|
||||
if assertion # Try to use the assertion to find the node
|
||||
child = doc.getElementById(assertion)
|
||||
if child
|
||||
node = child
|
||||
if not child
|
||||
error = "No matching child found for CFI: " + cfi
|
||||
break
|
||||
index |= 1 # Increment index by 1 if it is even
|
||||
if child.nodeType in [1, 7] # We have an element or a PI
|
||||
index++
|
||||
if ( index == target )
|
||||
cfi = cfi.substr(r[0].length)
|
||||
node = child
|
||||
break
|
||||
child = child.nextSibling
|
||||
|
||||
else if cfi[0] == '!' # Indirection
|
||||
if node.contentDocument
|
||||
node = node.contentDocument
|
||||
cfi = cfi.substr(1)
|
||||
else
|
||||
error = "Cannot reference #{ node.nodeName }'s content:" + cfi
|
||||
|
||||
else
|
||||
break
|
||||
|
||||
if error
|
||||
log(error)
|
||||
return null
|
||||
|
||||
point = {}
|
||||
error = null
|
||||
|
||||
point
|
||||
|
||||
# }}}
|
||||
|
||||
at: (x, y, doc=window?.document) -> # {{{
|
||||
cdoc = doc
|
||||
target = null
|
||||
cwin = cdoc.defaultView
|
||||
tail = ''
|
||||
offset = null
|
||||
name = null
|
||||
|
||||
# Drill down into iframes, etc.
|
||||
while true
|
||||
target = cdoc.elementFromPoint x, y
|
||||
if not target or target.localName == 'html'
|
||||
log("No element at (#{ x }, #{ y })")
|
||||
return null
|
||||
|
||||
name = target.localName
|
||||
if name not in ['iframe', 'embed', 'object']
|
||||
break
|
||||
|
||||
cd = target.contentDocument
|
||||
if not cd
|
||||
break
|
||||
|
||||
x = x + cwin.pageXOffset - target.offsetLeft
|
||||
y = y + cwin.pageYOffset - target.offsetTop
|
||||
cdoc = cd
|
||||
cwin = cdoc.defaultView
|
||||
|
||||
target.normalize()
|
||||
|
||||
if name in ['audio', 'video']
|
||||
tail = "~" + fstr target.currentTime
|
||||
|
||||
if name in ['img', 'video']
|
||||
px = ((x + cwin.scrollX - target.offsetLeft)*100)/target.offsetWidth
|
||||
py = ((y + cwin.scrollY - target.offsetTop)*100)/target.offsetHeight
|
||||
tail = "#{ tail }@#{ fstr px },#{ fstr py }"
|
||||
else if name != 'audio'
|
||||
if cdoc.caretRangeFromPoint # WebKit
|
||||
range = cdoc.caretRangeFromPoint(x, y)
|
||||
if range
|
||||
target = range.startContainer
|
||||
offset = range.startOffset
|
||||
else
|
||||
# TODO: implement a span bisection algorithm for UAs
|
||||
# without caretRangeFromPoint (Gecko, IE)
|
||||
|
||||
this.encode(doc, target, offset, tail)
|
||||
# }}}
|
||||
|
||||
if window?
|
||||
window.cfi = new CanonicalFragmentIdentifier()
|
||||
else if process?
|
||||
# Some debugging code goes here to be run with the coffee interpreter
|
||||
cfi = new CanonicalFragmentIdentifier()
|
||||
t = 'a^!,1'
|
||||
log(t)
|
||||
log(escape_for_cfi(t))
|
||||
log(unescape_from_cfi(escape_for_cfi(t)))
|
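The escaping scheme used by cfi.coffee above is small enough to mirror outside the browser. A Python sketch of the same prefix-caret convention, for illustration only (not part of this commit):

    # Illustrative mirror of escape_for_cfi()/unescape_from_cfi() above.
    SPECIAL = set('^[](),;~@-!')

    def escape_for_cfi(raw):
        # Prefix every CFI special character with a caret.
        return ''.join(('^' + c) if c in SPECIAL else c for c in raw)

    def unescape_from_cfi(raw):
        out, dropped = [], False
        for c in raw:
            if not dropped and c == '^':
                dropped = True
                continue
            dropped = False
            out.append(c)
        return ''.join(out)

    assert unescape_from_cfi(escape_for_cfi('a^!,1')) == 'a^!,1'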
src/calibre/ebooks/oeb/display/test/cfi-test.coffee (new file, 24 lines)
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env coffee
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
###
|
||||
Copyright 2011, Kovid Goyal <kovid@kovidgoyal.net>
|
||||
Released under the GPLv3 License
|
||||
###
|
||||
|
||||
viewport_top = (node) ->
|
||||
$(node).offset().top - window.pageYOffset
|
||||
|
||||
viewport_left = (node) ->
|
||||
$(node).offset().left - window.pageXOffset
|
||||
|
||||
window.onload = ->
|
||||
h1 = document.getElementsByTagName('h1')[0]
|
||||
x = h1.scrollLeft + 150
|
||||
y = viewport_top(h1) + h1.offsetHeight/2
|
||||
e = document.elementFromPoint x, y
|
||||
if e.getAttribute('id') != 'first-h1'
|
||||
alert 'Failed to find top h1'
|
||||
return
|
||||
alert window.cfi.at x, y
|
||||
|
src/calibre/ebooks/oeb/display/test/test.html (new file, 14 lines)
@ -0,0 +1,14 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Testing CFI functionality</title>
|
||||
<script type="text/javascript" src="cfi.js"></script>
|
||||
<script type="text/javascript" src="jquery.js"></script>
|
||||
<script type="text/javascript" src="cfi-test.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<h1 id="first-h1" style="border: solid 1px red">Testing CFI functionality</h1>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
src/calibre/ebooks/oeb/display/test/test.py (new file, 26 lines)
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
try:
|
||||
from calibre.utils.coffeescript import serve
|
||||
except ImportError:
|
||||
import init_calibre
|
||||
if False: init_calibre, serve
|
||||
from calibre.utils.coffeescript import serve
|
||||
|
||||
|
||||
def run_devel_server():
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
serve(['../cfi.coffee', 'cfi-test.coffee'])
|
||||
|
||||
if __name__ == '__main__':
|
||||
run_devel_server()
|
||||
|
@ -1,256 +0,0 @@
|
||||
"""
|
||||
Replacement for htmlentitydefs which uses purely numeric entities.
|
||||
"""
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
ENTITYDEFS = \
|
||||
{'AElig': 'Æ',
|
||||
'Aacute': 'Á',
|
||||
'Acirc': 'Â',
|
||||
'Agrave': 'À',
|
||||
'Alpha': 'Α',
|
||||
'Aring': 'Å',
|
||||
'Atilde': 'Ã',
|
||||
'Auml': 'Ä',
|
||||
'Beta': 'Β',
|
||||
'Ccedil': 'Ç',
|
||||
'Chi': 'Χ',
|
||||
'Dagger': '‡',
|
||||
'Delta': 'Δ',
|
||||
'ETH': 'Ð',
|
||||
'Eacute': 'É',
|
||||
'Ecirc': 'Ê',
|
||||
'Egrave': 'È',
|
||||
'Epsilon': 'Ε',
|
||||
'Eta': 'Η',
|
||||
'Euml': 'Ë',
|
||||
'Gamma': 'Γ',
|
||||
'Iacute': 'Í',
|
||||
'Icirc': 'Î',
|
||||
'Igrave': 'Ì',
|
||||
'Iota': 'Ι',
|
||||
'Iuml': 'Ï',
|
||||
'Kappa': 'Κ',
|
||||
'Lambda': 'Λ',
|
||||
'Mu': 'Μ',
|
||||
'Ntilde': 'Ñ',
|
||||
'Nu': 'Ν',
|
||||
'OElig': 'Œ',
|
||||
'Oacute': 'Ó',
|
||||
'Ocirc': 'Ô',
|
||||
'Ograve': 'Ò',
|
||||
'Omega': 'Ω',
|
||||
'Omicron': 'Ο',
|
||||
'Oslash': 'Ø',
|
||||
'Otilde': 'Õ',
|
||||
'Ouml': 'Ö',
|
||||
'Phi': 'Φ',
|
||||
'Pi': 'Π',
|
||||
'Prime': '″',
|
||||
'Psi': 'Ψ',
|
||||
'Rho': 'Ρ',
|
||||
'Scaron': 'Š',
|
||||
'Sigma': 'Σ',
|
||||
'THORN': 'Þ',
|
||||
'Tau': 'Τ',
|
||||
'Theta': 'Θ',
|
||||
'Uacute': 'Ú',
|
||||
'Ucirc': 'Û',
|
||||
'Ugrave': 'Ù',
|
||||
'Upsilon': 'Υ',
|
||||
'Uuml': 'Ü',
|
||||
'Xi': 'Ξ',
|
||||
'Yacute': 'Ý',
|
||||
'Yuml': 'Ÿ',
|
||||
'Zeta': 'Ζ',
|
||||
'aacute': 'á',
|
||||
'acirc': 'â',
|
||||
'acute': '´',
|
||||
'aelig': 'æ',
|
||||
'agrave': 'à',
|
||||
'alefsym': 'ℵ',
|
||||
'alpha': 'α',
|
||||
'and': '∧',
|
||||
'ang': '∠',
|
||||
'aring': 'å',
|
||||
'asymp': '≈',
|
||||
'atilde': 'ã',
|
||||
'auml': 'ä',
|
||||
'bdquo': '„',
|
||||
'beta': 'β',
|
||||
'brvbar': '¦',
|
||||
'bull': '•',
|
||||
'cap': '∩',
|
||||
'ccedil': 'ç',
|
||||
'cedil': '¸',
|
||||
'cent': '¢',
|
||||
'chi': 'χ',
|
||||
'circ': 'ˆ',
|
||||
'clubs': '♣',
|
||||
'cong': '≅',
|
||||
'copy': '©',
|
||||
'crarr': '↵',
|
||||
'cup': '∪',
|
||||
'curren': '¤',
|
||||
'dArr': '⇓',
|
||||
'dagger': '†',
|
||||
'darr': '↓',
|
||||
'deg': '°',
|
||||
'delta': 'δ',
|
||||
'diams': '♦',
|
||||
'divide': '÷',
|
||||
'eacute': 'é',
|
||||
'ecirc': 'ê',
|
||||
'egrave': 'è',
|
||||
'empty': '∅',
|
||||
'emsp': ' ',
|
||||
'ensp': ' ',
|
||||
'epsilon': 'ε',
|
||||
'equiv': '≡',
|
||||
'eta': 'η',
|
||||
'eth': 'ð',
|
||||
'euml': 'ë',
|
||||
'euro': '€',
|
||||
'exist': '∃',
|
||||
'fnof': 'ƒ',
|
||||
'forall': '∀',
|
||||
'frac12': '½',
|
||||
'frac14': '¼',
|
||||
'frac34': '¾',
|
||||
'frasl': '⁄',
|
||||
'gamma': 'γ',
|
||||
'ge': '≥',
|
||||
'hArr': '⇔',
|
||||
'harr': '↔',
|
||||
'hearts': '♥',
|
||||
'hellip': '…',
|
||||
'iacute': 'í',
|
||||
'icirc': 'î',
|
||||
'iexcl': '¡',
|
||||
'igrave': 'ì',
|
||||
'image': 'ℑ',
|
||||
'infin': '∞',
|
||||
'int': '∫',
|
||||
'iota': 'ι',
|
||||
'iquest': '¿',
|
||||
'isin': '∈',
|
||||
'iuml': 'ï',
|
||||
'kappa': 'κ',
|
||||
'lArr': '⇐',
|
||||
'lambda': 'λ',
|
||||
'lang': '〈',
|
||||
'laquo': '«',
|
||||
'larr': '←',
|
||||
'lceil': '⌈',
|
||||
'ldquo': '“',
|
||||
'le': '≤',
|
||||
'lfloor': '⌊',
|
||||
'lowast': '∗',
|
||||
'loz': '◊',
|
||||
'lrm': '‎',
|
||||
'lsaquo': '‹',
|
||||
'lsquo': '‘',
|
||||
'macr': '¯',
|
||||
'mdash': '—',
|
||||
'micro': 'µ',
|
||||
'middot': '·',
|
||||
'minus': '−',
|
||||
'mu': 'μ',
|
||||
'nabla': '∇',
|
||||
'nbsp': ' ',
|
||||
'ndash': '–',
|
||||
'ne': '≠',
|
||||
'ni': '∋',
|
||||
'not': '¬',
|
||||
'notin': '∉',
|
||||
'nsub': '⊄',
|
||||
'ntilde': 'ñ',
|
||||
'nu': 'ν',
|
||||
'oacute': 'ó',
|
||||
'ocirc': 'ô',
|
||||
'oelig': 'œ',
|
||||
'ograve': 'ò',
|
||||
'oline': '‾',
|
||||
'omega': 'ω',
|
||||
'omicron': 'ο',
|
||||
'oplus': '⊕',
|
||||
'or': '∨',
|
||||
'ordf': 'ª',
|
||||
'ordm': 'º',
|
||||
'oslash': 'ø',
|
||||
'otilde': 'õ',
|
||||
'otimes': '⊗',
|
||||
'ouml': 'ö',
|
||||
'para': '¶',
|
||||
'part': '∂',
|
||||
'permil': '‰',
|
||||
'perp': '⊥',
|
||||
'phi': 'φ',
|
||||
'pi': 'π',
|
||||
'piv': 'ϖ',
|
||||
'plusmn': '±',
|
||||
'pound': '£',
|
||||
'prime': '′',
|
||||
'prod': '∏',
|
||||
'prop': '∝',
|
||||
'psi': 'ψ',
|
||||
'rArr': '⇒',
|
||||
'radic': '√',
|
||||
'rang': '〉',
|
||||
'raquo': '»',
|
||||
'rarr': '→',
|
||||
'rceil': '⌉',
|
||||
'rdquo': '”',
|
||||
'real': 'ℜ',
|
||||
'reg': '®',
|
||||
'rfloor': '⌋',
|
||||
'rho': 'ρ',
|
||||
'rlm': '‏',
|
||||
'rsaquo': '›',
|
||||
'rsquo': '’',
|
||||
'sbquo': '‚',
|
||||
'scaron': 'š',
|
||||
'sdot': '⋅',
|
||||
'sect': '§',
|
||||
'shy': '­',
|
||||
'sigma': 'σ',
|
||||
'sigmaf': 'ς',
|
||||
'sim': '∼',
|
||||
'spades': '♠',
|
||||
'sub': '⊂',
|
||||
'sube': '⊆',
|
||||
'sum': '∑',
|
||||
'sup': '⊃',
|
||||
'sup1': '¹',
|
||||
'sup2': '²',
|
||||
'sup3': '³',
|
||||
'supe': '⊇',
|
||||
'szlig': 'ß',
|
||||
'tau': 'τ',
|
||||
'there4': '∴',
|
||||
'theta': 'θ',
|
||||
'thetasym': 'ϑ',
|
||||
'thinsp': ' ',
|
||||
'thorn': 'þ',
|
||||
'tilde': '˜',
|
||||
'times': '×',
|
||||
'trade': '™',
|
||||
'uArr': '⇑',
|
||||
'uacute': 'ú',
|
||||
'uarr': '↑',
|
||||
'ucirc': 'û',
|
||||
'ugrave': 'ù',
|
||||
'uml': '¨',
|
||||
'upsih': 'ϒ',
|
||||
'upsilon': 'υ',
|
||||
'uuml': 'ü',
|
||||
'weierp': '℘',
|
||||
'xi': 'ξ',
|
||||
'yacute': 'ý',
|
||||
'yen': '¥',
|
||||
'yuml': 'ÿ',
|
||||
'zeta': 'ζ',
|
||||
'zwj': '‍',
|
||||
'zwnj': '‌'}
|
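With entitydefs.py removed, named HTML entities are handled by calibre's xml_replace_entities helper, which both the new parse_utils.py below and the updated OEB reader call. A minimal sketch of the idea; the markup string is an invented example:

    from calibre import xml_replace_entities

    markup = '<p>Caf&eacute; &ndash; &frac12; price</p>'
    # Named entities are replaced by the characters they stand for, while the
    # XML-predefined ones (&amp;, &lt;, ...) are assumed to be left alone so
    # the result can still be fed to an XML parser.
    print(xml_replace_entities(markup, encoding=None))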
src/calibre/ebooks/oeb/parse_utils.py (new file, 347 lines)
@ -0,0 +1,347 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from lxml import etree, html
|
||||
|
||||
from calibre import xml_replace_entities, force_unicode
|
||||
from calibre.constants import filesystem_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||
|
||||
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True)
|
||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||
|
||||
class NotHTML(Exception):
|
||||
|
||||
def __init__(self, root_tag):
|
||||
Exception.__init__(self, 'Data is not HTML')
|
||||
self.root_tag = root_tag
|
||||
|
||||
def barename(name):
|
||||
return name.rpartition('}')[-1]
|
||||
|
||||
def namespace(name):
|
||||
if '}' in name:
|
||||
return name.split('}', 1)[0][1:]
|
||||
return ''
|
||||
|
||||
def XHTML(name):
|
||||
return '{%s}%s' % (XHTML_NS, name)
|
||||
|
||||
def xpath(elem, expr):
|
||||
return elem.xpath(expr, namespaces={'h':XHTML_NS})
|
||||
|
||||
def XPath(expr):
|
||||
return etree.XPath(expr, namespaces={'h':XHTML_NS})
|
||||
|
||||
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
|
||||
|
||||
def merge_multiple_html_heads_and_bodies(root, log=None):
|
||||
heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
|
||||
if not (len(heads) > 1 or len(bodies) > 1): return root
|
||||
for child in root: root.remove(child)
|
||||
head = root.makeelement(XHTML('head'))
|
||||
body = root.makeelement(XHTML('body'))
|
||||
for h in heads:
|
||||
for x in h:
|
||||
head.append(x)
|
||||
for b in bodies:
|
||||
for x in b:
|
||||
body.append(x)
|
||||
map(root.append, (head, body))
|
||||
if log is not None:
|
||||
log.warn('Merging multiple <head> and <body> sections')
|
||||
return root
|
||||
|
||||
def _html5_parse(data):
|
||||
import html5lib
|
||||
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
||||
html_ns = [ns for ns, val in data.nsmap.iteritems() if (val == XHTML_NS and
|
||||
ns is not None)]
|
||||
if html_ns:
|
||||
# html5lib causes the XHTML namespace to not
|
||||
# be set as the default namespace
|
||||
nsmap = dict(data.nsmap)
|
||||
nsmap[None] = XHTML_NS
|
||||
for x in html_ns:
|
||||
nsmap.pop(x)
|
||||
nroot = etree.Element(data.tag, nsmap=nsmap,
|
||||
attrib=dict(data.attrib))
|
||||
nroot.text = data.text
|
||||
nroot.tail = data.tail
|
||||
for child in data:
|
||||
nroot.append(child)
|
||||
data = nroot
|
||||
return data
|
||||
|
||||
def _html4_parse(data, prefer_soup=False):
|
||||
if prefer_soup:
|
||||
from calibre.utils.soupparser import fromstring
|
||||
data = fromstring(data)
|
||||
else:
|
||||
data = html.fromstring(data)
|
||||
data.attrib.pop('xmlns', None)
|
||||
for elem in data.iter(tag=etree.Comment):
|
||||
if elem.text:
|
||||
elem.text = elem.text.strip('-')
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
|
||||
# Setting huge_tree=True causes crashes in windows with large files
|
||||
parser = etree.XMLParser(no_network=True)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
return data
|
||||
|
||||
def clean_word_doc(data, log):
|
||||
prefixes = []
|
||||
for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
|
||||
prefixes.append(match.group(1))
|
||||
if prefixes:
|
||||
log.warn('Found microsoft markup, cleaning...')
|
||||
# Remove empty tags as they are not rendered by browsers
|
||||
# but can become renderable HTML tags like <p/> if the
|
||||
# document is parsed by an HTML parser
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
|
||||
re.DOTALL)
|
||||
data = pat.sub('', data)
|
||||
pat = re.compile(
|
||||
r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
|
||||
data = pat.sub('', data)
|
||||
return data
|
||||
|
||||
def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
filename='<string>', non_html_file_tags=frozenset()):
|
||||
if log is None:
|
||||
from calibre.utils.logging import default_log
|
||||
log = default_log
|
||||
|
||||
filename = force_unicode(filename, enc=filesystem_encoding)
|
||||
|
||||
if not isinstance(data, unicode):
|
||||
if decoder is not None:
|
||||
data = decoder(data)
|
||||
else:
|
||||
data = xml_to_unicode(data)[0]
|
||||
|
||||
data = strip_encoding_declarations(data)
|
||||
if preprocessor is not None:
|
||||
data = preprocessor(data)
|
||||
|
||||
# There could be null bytes in data if it had � entities in it
|
||||
data = data.replace('\0', '')
|
||||
|
||||
# Remove DOCTYPE declaration as it messes up parsing
|
||||
# In particular, it causes tostring to insert xmlns
|
||||
# declarations, which messes up the coercing logic
|
||||
idx = data.find('<html')
|
||||
if idx == -1:
|
||||
idx = data.find('<HTML')
|
||||
if idx > -1:
|
||||
pre = data[:idx]
|
||||
data = data[idx:]
|
||||
if '<!DOCTYPE' in pre: # Handle user defined entities
|
||||
user_entities = {}
|
||||
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
|
||||
val = match.group(2)
|
||||
if val.startswith('"') and val.endswith('"'):
|
||||
val = val[1:-1]
|
||||
user_entities[match.group(1)] = val
|
||||
if user_entities:
|
||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||
|
||||
data = clean_word_doc(data, log)
|
||||
|
||||
# Setting huge_tree=True causes crashes in windows with large files
|
||||
parser = etree.XMLParser(no_network=True)
|
||||
|
||||
# Try with more & more drastic measures to parse
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
log.debug('Initial parse failed, using more'
|
||||
' forgiving parsers')
|
||||
data = xml_replace_entities(data)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
log.debug('Parsing %s as HTML' % filename)
|
||||
try:
|
||||
data = _html5_parse(data)
|
||||
except:
|
||||
log.exception(
|
||||
'HTML 5 parsing failed, falling back to older parsers')
|
||||
data = _html4_parse(data)
|
||||
|
||||
if data.tag == 'HTML':
|
||||
# Lower case all tag and attribute names
|
||||
data.tag = data.tag.lower()
|
||||
for x in data.iterdescendants():
|
||||
try:
|
||||
x.tag = x.tag.lower()
|
||||
for key, val in list(x.attrib.iteritems()):
|
||||
del x.attrib[key]
|
||||
key = key.lower()
|
||||
x.attrib[key] = val
|
||||
except:
|
||||
pass
|
||||
|
||||
if barename(data.tag) != 'html':
|
||||
if barename(data.tag) in non_html_file_tags:
|
||||
raise NotHTML(data.tag)
|
||||
log.warn('File %r does not appear to be (X)HTML'%filename)
|
||||
nroot = etree.fromstring('<html></html>')
|
||||
has_body = False
|
||||
for child in list(data):
|
||||
if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
|
||||
has_body = True
|
||||
break
|
||||
parent = nroot
|
||||
if not has_body:
|
||||
log.warn('File %r appears to be a HTML fragment'%filename)
|
||||
nroot = etree.fromstring('<html><body/></html>')
|
||||
parent = nroot[0]
|
||||
for child in list(data.iter()):
|
||||
oparent = child.getparent()
|
||||
if oparent is not None:
|
||||
oparent.remove(child)
|
||||
parent.append(child)
|
||||
data = nroot
|
||||
|
||||
# Force into the XHTML namespace
|
||||
if not namespace(data.tag):
|
||||
log.warn('Forcing', filename, 'into XHTML namespace')
|
||||
data.attrib['xmlns'] = XHTML_NS
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except:
|
||||
data = data.replace(':=', '=').replace(':>', '>')
|
||||
data = data.replace('<http:/>', '')
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
log.warn('Stripping comments from %s'%
|
||||
filename)
|
||||
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
|
||||
data)
|
||||
data = data.replace(
|
||||
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
||||
'')
|
||||
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
||||
try:
|
||||
data = etree.fromstring(data,
|
||||
parser=RECOVER_PARSER)
|
||||
except etree.XMLSyntaxError:
|
||||
log.warn('Stripping meta tags from %s'% filename)
|
||||
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
elif namespace(data.tag) != XHTML_NS:
|
||||
# OEB_DOC_NS, but possibly others
|
||||
ns = namespace(data.tag)
|
||||
attrib = dict(data.attrib)
|
||||
nroot = etree.Element(XHTML('html'),
|
||||
nsmap={None: XHTML_NS}, attrib=attrib)
|
||||
for elem in data.iterdescendants():
|
||||
if isinstance(elem.tag, basestring) and \
|
||||
namespace(elem.tag) == ns:
|
||||
elem.tag = XHTML(barename(elem.tag))
|
||||
for elem in data:
|
||||
nroot.append(elem)
|
||||
data = nroot
|
||||
|
||||
data = merge_multiple_html_heads_and_bodies(data, log)
|
||||
# Ensure has a <head/>
|
||||
head = xpath(data, '/h:html/h:head')
|
||||
head = head[0] if head else None
|
||||
if head is None:
|
||||
log.warn('File %s missing <head/> element' % filename)
|
||||
head = etree.Element(XHTML('head'))
|
||||
data.insert(0, head)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = _('Unknown')
|
||||
elif not xpath(data, '/h:html/h:head/h:title'):
|
||||
log.warn('File %s missing <title/> element' % filename)
|
||||
title = etree.SubElement(head, XHTML('title'))
|
||||
title.text = _('Unknown')
|
||||
# Remove any encoding-specifying <meta/> elements
|
||||
for meta in META_XP(data):
|
||||
meta.getparent().remove(meta)
|
||||
etree.SubElement(head, XHTML('meta'),
|
||||
attrib={'http-equiv': 'Content-Type',
|
||||
'content': '%s; charset=utf-8' % XHTML_NS})
|
||||
# Ensure has a <body/>
|
||||
if not xpath(data, '/h:html/h:body'):
|
||||
body = xpath(data, '//h:body')
|
||||
if body:
|
||||
body = body[0]
|
||||
body.getparent().remove(body)
|
||||
data.append(body)
|
||||
else:
|
||||
log.warn('File %s missing <body/> element' % filename)
|
||||
etree.SubElement(data, XHTML('body'))
|
||||
|
||||
# Remove microsoft office markup
|
||||
r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
|
||||
for x in r:
|
||||
x.tag = XHTML('span')
|
||||
|
||||
# Remove lang redefinition inserted by the amazing Microsoft Word!
|
||||
body = xpath(data, '/h:html/h:body')[0]
|
||||
for key in list(body.attrib.keys()):
|
||||
if key == 'lang' or key.endswith('}lang'):
|
||||
body.attrib.pop(key)
|
||||
|
||||
def remove_elem(a):
|
||||
p = a.getparent()
|
||||
idx = p.index(a) -1
|
||||
p.remove(a)
|
||||
if a.tail:
|
||||
if idx <= 0:
|
||||
if p.text is None:
|
||||
p.text = ''
|
||||
p.text += a.tail
|
||||
else:
|
||||
if p[idx].tail is None:
|
||||
p[idx].tail = ''
|
||||
p[idx].tail += a.tail
|
||||
|
||||
# Remove hyperlinks with no content as they cause rendering
|
||||
# artifacts in browser based renderers
|
||||
# Also remove empty <b>, <u> and <i> tags
|
||||
for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
|
||||
if a.get('id', None) is None and a.get('name', None) is None \
|
||||
and len(a) == 0 and not a.text:
|
||||
remove_elem(a)
|
||||
|
||||
# Convert <br>s with content into paragraphs as ADE can't handle
|
||||
# them
|
||||
for br in xpath(data, '//h:br'):
|
||||
if len(br) > 0 or br.text:
|
||||
br.tag = XHTML('div')
|
||||
|
||||
# Remove any stray text in the <head> section and format it nicely
|
||||
data.text = '\n '
|
||||
head = xpath(data, '//h:head')
|
||||
if head:
|
||||
head = head[0]
|
||||
head.text = '\n '
|
||||
head.tail = '\n '
|
||||
for child in head:
|
||||
child.tail = '\n '
|
||||
child.tail = '\n '
|
||||
|
||||
return data
|
||||
|
||||
|
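The module above gives the conversion pipeline a single parse_html() entry point. A quick way to exercise it on its own, assuming a calibre development environment and using an invented fragment:

    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import parse_html, XHTML_NS

    root = parse_html('<p>Broken &nbsp; fragment with <b>no</b> html wrapper')
    # Fragments are wrapped, entities resolved, and the result is forced into
    # the XHTML namespace with <head>, <title> and <body> guaranteed to exist.
    assert root.tag == '{%s}html' % XHTML_NS
    print(etree.tostring(root, pretty_print=True))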
@ -19,16 +19,15 @@ from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
ENTITY_RE, MS_COVER_TYPE, iterlinks
MS_COVER_TYPE, iterlinks
from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
urlnormalize, BINARY_MIME, \
OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.utils.localization import get_lang
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from calibre import guess_type
from calibre import guess_type, xml_replace_entities

__all__ = ['OEBReader']

@ -107,8 +106,7 @@ class OEBReader(object):
try:
opf = etree.fromstring(data)
except etree.XMLSyntaxError:
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = ENTITY_RE.sub(repl, data)
data = xml_replace_entities(data, encoding=None)
try:
opf = etree.fromstring(data)
self.logger.warn('OPF contains invalid HTML named entities')

@ -371,8 +369,15 @@ class OEBReader(object):
else :
description = None

index_image = xpath(child,
'descendant::calibre:meta[@name = "toc_thumbnail"]')
toc_thumbnail = (index_image[0].text if index_image else None)
if not toc_thumbnail or not toc_thumbnail.strip():
toc_thumbnail = None

node = toc.add(title, href, id=id, klass=klass,
play_order=po, description=description, author=author)
play_order=po, description=description, author=author,
toc_thumbnail=toc_thumbnail)

self._toc_from_navpoint(item, node, child)
@ -56,8 +56,11 @@ def render_html(mi, css, vertical, widget, all_fields=False): # {{{
</body>
<html>
'''%(f, c, css)
fm = getattr(mi, 'field_metadata', field_metadata)
fl = dict(get_field_list(fm))
show_comments = (all_fields or fl.get('comments', True))
comments = u''
if mi.comments:
if mi.comments and show_comments:
comments = comments_to_html(force_unicode(mi.comments))
right_pane = u'<div id="comments" class="comments">%s</div>'%comments
@ -429,7 +429,7 @@ def populate_metadata_page(layout, db, book_id, bulk=False, two_column=False, pa
# The fields named here must be first in the widget list
tweak_cols = tweaks['metadata_edit_custom_column_order']
comments_in_tweak = 0
for key in tweak_cols:
for key in (tweak_cols or ()):
# Add the key if it really exists in the database
if key in cols_to_display:
cols.append(key)
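The (tweak_cols or ()) guard above keeps the loop safe when the tweak is unset or empty. In Preferences->Tweaks the order is specified with an entry along these lines (the column names are invented):

    # Custom columns named here are edited first, in this order, in the
    # edit metadata dialog.
    metadata_edit_custom_column_order = ['#genre', '#myseries', '#read']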
@ -441,7 +441,7 @@ class Scheduler(QObject):
self.news_menu.addAction(self.cac)
self.news_menu.addSeparator()
self.all_action = self.news_menu.addAction(
_('Download all scheduled new sources'),
_('Download all scheduled news sources'),
self.download_all_scheduled)

self.timer = QTimer(self)
@ -758,11 +758,12 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
self.set_page_number(frac)

def next_document(self):
if self.current_index < len(self.iterator.spine) - 1:
if (hasattr(self, 'current_index') and self.current_index <
len(self.iterator.spine) - 1):
self.load_path(self.iterator.spine[self.current_index+1])

def previous_document(self):
if self.current_index > 0:
if hasattr(self, 'current_index') and self.current_index > 0:
self.load_path(self.iterator.spine[self.current_index-1], pos=1.0)

def keyPressEvent(self, event):
(The remaining file diffs, including src/calibre/translations/ku.po added as a new file with 19163 lines, are suppressed because they are too large, and some changed files are not shown at all because too many files changed in this commit.)