mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to trunk.
This commit is contained in:
commit
679857b342
@ -19,6 +19,83 @@
|
|||||||
# new recipes:
|
# new recipes:
|
||||||
# - title:
|
# - title:
|
||||||
|
|
||||||
|
- version: 0.8.1
|
||||||
|
date: 2011-05-13
|
||||||
|
|
||||||
|
new features:
|
||||||
|
- title: "Add Amazon DE, Beam EBooks, Beam DE, Weightless Books, Wizards Tower Books to the list of ebook stores searched by Get Books"
|
||||||
|
|
||||||
|
- title: "TXT output: All new Textile output with much greater preservation of formatting from the input document"
|
||||||
|
|
||||||
|
- title: "Migrate metadata plugin for Douban Books to the 0.8 API"
|
||||||
|
|
||||||
|
- title: "Driver for Dell Streak on windows"
|
||||||
|
|
||||||
|
- title: "Add menu items to Get Books action to search by title and author of current book"
|
||||||
|
|
||||||
|
- title: "Add title_sort as available field to CSV/XML catalogs"
|
||||||
|
|
||||||
|
- title: "Add a context menu to the manage authors dialog"
|
||||||
|
|
||||||
|
- title: "Add a button to paste isbn into the identifiers field in the edit metadata dialog automatically"
|
||||||
|
|
||||||
|
bug fixes:
|
||||||
|
- title: "Amazon metadata download plugin: Fix links being stripped from comments. Also fix ratings/isbn not being parsed from kindle edition pages."
|
||||||
|
tickets: [782012]
|
||||||
|
|
||||||
|
- title: "Fix one source of segfaults on shutdown in the linux binary builds."
|
||||||
|
|
||||||
|
- title: "Allow the use of condensed/expanded fonts as interface fonts"
|
||||||
|
|
||||||
|
- title: "EPUB Input: Ignore missing cover file when converting, instead of erroring out."
|
||||||
|
tickets: [781848]
|
||||||
|
|
||||||
|
- title: "Fix custom identifier being erased by metadata download"
|
||||||
|
tickets: [781759]
|
||||||
|
|
||||||
|
- title: "Fix regression that broke various things when using Japanese language calibre on windows"
|
||||||
|
tickets: [780804]
|
||||||
|
|
||||||
|
- title: "RTF Input: Handle null color codes correctly"
|
||||||
|
tickets: [780728]
|
||||||
|
|
||||||
|
- title: "ODT Input: Handle inline special styles defined on <text:span> tags."
|
||||||
|
tickets: [780250]
|
||||||
|
|
||||||
|
- title: "Fix error when pressing next previous button with an empty search in the Plugins preferences"
|
||||||
|
tickets: [781135]
|
||||||
|
|
||||||
|
- title: "Ignore 'Unknown' author when downloading metadata."
|
||||||
|
tickets: [779348]
|
||||||
|
|
||||||
|
- title: "Fix timezone bug when setting dates in the edit metadata dialog"
|
||||||
|
tickets: [779497]
|
||||||
|
|
||||||
|
- title: "Fix ebook-convert not recognizing output paths starting with .."
|
||||||
|
tickets: [779322]
|
||||||
|
|
||||||
|
improved recipes:
|
||||||
|
- "Strategy+Business"
|
||||||
|
- Readers Digest
|
||||||
|
- Ming Pao
|
||||||
|
- Telepolis
|
||||||
|
- Fronda
|
||||||
|
- Rzeczpospolita
|
||||||
|
|
||||||
|
new recipes:
|
||||||
|
- title: "Various Taiwanese news sources"
|
||||||
|
author: Eddie Lau
|
||||||
|
|
||||||
|
- title: Replica Vedetelor, Ziua Veche
|
||||||
|
author: Silviu Cotoara
|
||||||
|
|
||||||
|
- title: Welt der Physik
|
||||||
|
author: schuster
|
||||||
|
|
||||||
|
- title: Korea Herald
|
||||||
|
author: Seongkyoun Yoo
|
||||||
|
|
||||||
|
|
||||||
- version: 0.8.0
|
- version: 0.8.0
|
||||||
date: 2010-05-06
|
date: 2010-05-06
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ class Economist(BasicNewsRecipe):
|
|||||||
INDEX = 'http://www.economist.com/printedition'
|
INDEX = 'http://www.economist.com/printedition'
|
||||||
description = ('Global news and current affairs from a European'
|
description = ('Global news and current affairs from a European'
|
||||||
' perspective. Best downloaded on Friday mornings (GMT)')
|
' perspective. Best downloaded on Friday mornings (GMT)')
|
||||||
|
extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
|
||||||
oldest_article = 7.0
|
oldest_article = 7.0
|
||||||
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
|
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
|
64
recipes/financialsense.recipe
Normal file
64
recipes/financialsense.recipe
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
'''
|
||||||
|
www.financialsense.com
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class FinancialSense(BasicNewsRecipe):
|
||||||
|
title = 'Financial Sense'
|
||||||
|
__author__ = 'Darko Miletic'
|
||||||
|
description = 'Uncommon News & Views for the Wise Investor'
|
||||||
|
publisher = 'Financial Sense'
|
||||||
|
category = 'news, finances, politics, USA'
|
||||||
|
oldest_article = 2
|
||||||
|
max_articles_per_feed = 200
|
||||||
|
no_stylesheets = True
|
||||||
|
encoding = 'utf8'
|
||||||
|
use_embedded_content = False
|
||||||
|
language = 'en'
|
||||||
|
remove_empty_feeds = True
|
||||||
|
publication_type = 'newsportal'
|
||||||
|
masthead_url = 'http://www.financialsense.com/sites/default/files/logo.jpg'
|
||||||
|
extra_css = """
|
||||||
|
body{font-family: Arial,"Helvetica Neue",Helvetica,sans-serif }
|
||||||
|
img{margin-bottom: 0.4em; display:block}
|
||||||
|
h2{color: gray}
|
||||||
|
.name{margin-right: 5em}
|
||||||
|
"""
|
||||||
|
|
||||||
|
conversion_options = {
|
||||||
|
'comment' : description
|
||||||
|
, 'tags' : category
|
||||||
|
, 'publisher' : publisher
|
||||||
|
, 'language' : language
|
||||||
|
}
|
||||||
|
|
||||||
|
remove_tags =[dict(name=['meta','link','base','object','embed','iframe'])]
|
||||||
|
remove_tags_after=dict(attrs={'class':'vcard'})
|
||||||
|
keep_only_tags =[dict(attrs={'class':['title','post-meta','content','item-title','vcard']})]
|
||||||
|
remove_attributes=['lang','type']
|
||||||
|
|
||||||
|
|
||||||
|
feeds = [(u'Articles', u'http://feeds.feedburner.com/fso')]
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for item in soup.findAll(style=True):
|
||||||
|
del item['style']
|
||||||
|
for item in soup.findAll('a'):
|
||||||
|
limg = item.find('img')
|
||||||
|
if item.string is not None:
|
||||||
|
str = item.string
|
||||||
|
item.replaceWith(str)
|
||||||
|
else:
|
||||||
|
if limg:
|
||||||
|
item.name = 'div'
|
||||||
|
item.attrs = []
|
||||||
|
else:
|
||||||
|
str = self.tag_to_string(item)
|
||||||
|
item.replaceWith(str)
|
||||||
|
for item in soup.findAll('img'):
|
||||||
|
if not item.has_key('alt'):
|
||||||
|
item['alt'] = 'image'
|
||||||
|
return soup
|
BIN
recipes/icons/financialsense.png
Normal file
BIN
recipes/icons/financialsense.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 702 B |
BIN
recipes/icons/iprofesional.png
Normal file
BIN
recipes/icons/iprofesional.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.1 KiB |
79
recipes/iprofesional.recipe
Normal file
79
recipes/iprofesional.recipe
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
'''
|
||||||
|
www.iprofesional.com
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class iProfesional(BasicNewsRecipe):
|
||||||
|
title = 'iProfesional.com'
|
||||||
|
__author__ = 'Darko Miletic'
|
||||||
|
description = 'Las ultimas noticias sobre profesionales'
|
||||||
|
publisher = 'Emprendimientos Corporativos S.A.'
|
||||||
|
category = 'news, IT, impuestos, negocios, politics, Argentina'
|
||||||
|
oldest_article = 2
|
||||||
|
max_articles_per_feed = 200
|
||||||
|
no_stylesheets = True
|
||||||
|
encoding = 'utf8'
|
||||||
|
use_embedded_content = False
|
||||||
|
language = 'es_AR'
|
||||||
|
remove_empty_feeds = True
|
||||||
|
publication_type = 'nesportal'
|
||||||
|
masthead_url = 'http://www.iprofesional.com/img/logo-iprofesional.png'
|
||||||
|
extra_css = """
|
||||||
|
body{font-family: Arial,Helvetica,sans-serif }
|
||||||
|
img{margin-bottom: 0.4em; display:block}
|
||||||
|
.titulo-interior{font-family: Georgia,"Times New Roman",Times,serif}
|
||||||
|
.autor-nota{font-size: small; font-weight: bold; font-style: italic; color: gray}
|
||||||
|
"""
|
||||||
|
|
||||||
|
conversion_options = {
|
||||||
|
'comment' : description
|
||||||
|
, 'tags' : category
|
||||||
|
, 'publisher' : publisher
|
||||||
|
, 'language' : language
|
||||||
|
}
|
||||||
|
|
||||||
|
keep_only_tags = [dict(attrs={'class':['fecha','interior-nota']})]
|
||||||
|
|
||||||
|
remove_tags = [
|
||||||
|
dict(name=['meta','link','base','embed','object','iframe'])
|
||||||
|
,dict(attrs={'class':['menu-imprimir','guardarNota','IN-widget','fin','permalink']})
|
||||||
|
]
|
||||||
|
remove_attributes=['lang','xmlns:og','xmlns:fb']
|
||||||
|
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
(u'Ultimas noticias' , u'http://feeds.feedburner.com/iprofesional-principales-noticias')
|
||||||
|
,(u'Finanzas' , u'http://feeds.feedburner.com/iprofesional-finanzas' )
|
||||||
|
,(u'Impuestos' , u'http://feeds.feedburner.com/iprofesional-impuestos' )
|
||||||
|
,(u'Negocios' , u'http://feeds.feedburner.com/iprofesional-economia' )
|
||||||
|
,(u'Comercio Exterior' , u'http://feeds.feedburner.com/iprofesional-comercio-exterior' )
|
||||||
|
,(u'Tecnologia' , u'http://feeds.feedburner.com/iprofesional-tecnologia' )
|
||||||
|
,(u'Management' , u'http://feeds.feedburner.com/iprofesional-managment' )
|
||||||
|
,(u'Marketing' , u'http://feeds.feedburner.com/iprofesional-marketing' )
|
||||||
|
,(u'Legales' , u'http://feeds.feedburner.com/iprofesional-legales' )
|
||||||
|
,(u'Autos' , u'http://feeds.feedburner.com/iprofesional-autos' )
|
||||||
|
,(u'Vinos' , u'http://feeds.feedburner.com/iprofesional-vinos-bodegas' )
|
||||||
|
]
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for item in soup.findAll(style=True):
|
||||||
|
del item['style']
|
||||||
|
for item in soup.findAll('a'):
|
||||||
|
limg = item.find('img')
|
||||||
|
if item.string is not None:
|
||||||
|
str = item.string
|
||||||
|
item.replaceWith(str)
|
||||||
|
else:
|
||||||
|
if limg:
|
||||||
|
item.name = 'div'
|
||||||
|
item.attrs = []
|
||||||
|
else:
|
||||||
|
str = self.tag_to_string(item)
|
||||||
|
item.replaceWith(str)
|
||||||
|
for item in soup.findAll('img'):
|
||||||
|
if not item.has_key('alt'):
|
||||||
|
item['alt'] = 'image'
|
||||||
|
return soup
|
@ -30,11 +30,12 @@ int report_libc_error(const char *msg) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int pyobject_to_int(PyObject *res) {
|
int pyobject_to_int(PyObject *res) {
|
||||||
int ret; PyObject *tmp;
|
int ret = 0; PyObject *tmp;
|
||||||
|
if (res != NULL) {
|
||||||
tmp = PyNumber_Int(res);
|
tmp = PyNumber_Int(res);
|
||||||
if (tmp == NULL) ret = (PyObject_IsTrue(res)) ? 1 : 0;
|
if (tmp == NULL) ret = (PyObject_IsTrue(res)) ? 1 : 0;
|
||||||
else ret = (int)PyInt_AS_LONG(tmp);
|
else ret = (int)PyInt_AS_LONG(tmp);
|
||||||
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
__appname__ = u'calibre'
|
__appname__ = u'calibre'
|
||||||
numeric_version = (0, 8, 0)
|
numeric_version = (0, 8, 1)
|
||||||
__version__ = u'.'.join(map(unicode, numeric_version))
|
__version__ = u'.'.join(map(unicode, numeric_version))
|
||||||
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
|
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
|
||||||
|
|
||||||
|
@ -628,8 +628,9 @@ from calibre.ebooks.metadata.sources.amazon import Amazon
|
|||||||
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
|
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
|
||||||
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
|
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
|
||||||
from calibre.ebooks.metadata.sources.overdrive import OverDrive
|
from calibre.ebooks.metadata.sources.overdrive import OverDrive
|
||||||
|
from calibre.ebooks.metadata.sources.douban import Douban
|
||||||
|
|
||||||
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
|
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
@ -92,7 +92,7 @@ def restore_plugin_state_to_default(plugin_or_name):
|
|||||||
config['enabled_plugins'] = ep
|
config['enabled_plugins'] = ep
|
||||||
|
|
||||||
default_disabled_plugins = set([
|
default_disabled_plugins = set([
|
||||||
'Overdrive',
|
'Overdrive', 'Douban Books',
|
||||||
])
|
])
|
||||||
|
|
||||||
def is_disabled(plugin):
|
def is_disabled(plugin):
|
||||||
|
@ -83,6 +83,7 @@ class ArchiveExtract(FileTypePlugin):
|
|||||||
return of.name
|
return of.name
|
||||||
|
|
||||||
def get_comic_book_info(d, mi):
|
def get_comic_book_info(d, mi):
|
||||||
|
# See http://code.google.com/p/comicbookinfo/wiki/Example
|
||||||
series = d.get('series', '')
|
series = d.get('series', '')
|
||||||
if series.strip():
|
if series.strip():
|
||||||
mi.series = series
|
mi.series = series
|
||||||
@ -111,6 +112,7 @@ def get_comic_book_info(d, mi):
|
|||||||
|
|
||||||
|
|
||||||
def get_cbz_metadata(stream):
|
def get_cbz_metadata(stream):
|
||||||
|
# See http://code.google.com/p/comicbookinfo/wiki/Example
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
import json
|
import json
|
||||||
|
@ -16,7 +16,7 @@ from lxml.html import soupparser, tostring
|
|||||||
|
|
||||||
from calibre import as_unicode
|
from calibre import as_unicode
|
||||||
from calibre.ebooks.metadata import check_isbn
|
from calibre.ebooks.metadata import check_isbn
|
||||||
from calibre.ebooks.metadata.sources.base import Source
|
from calibre.ebooks.metadata.sources.base import Source, Option
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
@ -37,6 +37,92 @@ class Worker(Thread): # Get details {{{
|
|||||||
self.relevance, self.plugin = relevance, plugin
|
self.relevance, self.plugin = relevance, plugin
|
||||||
self.browser = browser.clone_browser()
|
self.browser = browser.clone_browser()
|
||||||
self.cover_url = self.amazon_id = self.isbn = None
|
self.cover_url = self.amazon_id = self.isbn = None
|
||||||
|
self.domain = self.plugin.domain
|
||||||
|
|
||||||
|
months = {
|
||||||
|
'de': {
|
||||||
|
1 : ['jän'],
|
||||||
|
3 : ['märz'],
|
||||||
|
5 : ['mai'],
|
||||||
|
6 : ['juni'],
|
||||||
|
7 : ['juli'],
|
||||||
|
10: ['okt'],
|
||||||
|
12: ['dez']
|
||||||
|
},
|
||||||
|
'it': {
|
||||||
|
1: ['enn'],
|
||||||
|
2: ['febbr'],
|
||||||
|
5: ['magg'],
|
||||||
|
6: ['giugno'],
|
||||||
|
7: ['luglio'],
|
||||||
|
8: ['ag'],
|
||||||
|
9: ['sett'],
|
||||||
|
10: ['ott'],
|
||||||
|
12: ['dic'],
|
||||||
|
},
|
||||||
|
'fr': {
|
||||||
|
1: ['janv'],
|
||||||
|
2: ['févr'],
|
||||||
|
3: ['mars'],
|
||||||
|
4: ['avril'],
|
||||||
|
5: ['mai'],
|
||||||
|
6: ['juin'],
|
||||||
|
7: ['juil'],
|
||||||
|
8: ['août'],
|
||||||
|
9: ['sept'],
|
||||||
|
12: ['déc'],
|
||||||
|
},
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
self.english_months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
|
||||||
|
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||||
|
self.months = months.get(self.domain, {})
|
||||||
|
|
||||||
|
self.pd_xpath = '''
|
||||||
|
//h2[text()="Product Details" or \
|
||||||
|
text()="Produktinformation" or \
|
||||||
|
text()="Dettagli prodotto" or \
|
||||||
|
text()="Product details" or \
|
||||||
|
text()="Détails sur le produit"]/../div[@class="content"]
|
||||||
|
'''
|
||||||
|
self.publisher_xpath = '''
|
||||||
|
descendant::*[starts-with(text(), "Publisher:") or \
|
||||||
|
starts-with(text(), "Verlag:") or \
|
||||||
|
starts-with(text(), "Editore:") or \
|
||||||
|
starts-with(text(), "Editeur")]
|
||||||
|
'''
|
||||||
|
self.language_xpath = '''
|
||||||
|
descendant::*[
|
||||||
|
starts-with(text(), "Language:") \
|
||||||
|
or text() = "Language" \
|
||||||
|
or text() = "Sprache:" \
|
||||||
|
or text() = "Lingua:" \
|
||||||
|
or starts-with(text(), "Langue") \
|
||||||
|
]
|
||||||
|
'''
|
||||||
|
self.ratings_pat = re.compile(
|
||||||
|
r'([0-9.]+) (out of|von|su|étoiles sur) (\d+)( (stars|Sternen|stelle)){0,1}')
|
||||||
|
|
||||||
|
lm = {
|
||||||
|
'en': ('English', 'Englisch'),
|
||||||
|
'fr': ('French', 'Français'),
|
||||||
|
'it': ('Italian', 'Italiano'),
|
||||||
|
'de': ('German', 'Deutsch'),
|
||||||
|
}
|
||||||
|
self.lang_map = {}
|
||||||
|
for code, names in lm.iteritems():
|
||||||
|
for name in names:
|
||||||
|
self.lang_map[name] = code
|
||||||
|
|
||||||
|
def delocalize_datestr(self, raw):
|
||||||
|
if not self.months:
|
||||||
|
return raw
|
||||||
|
ans = raw.lower()
|
||||||
|
for i, vals in self.months.iteritems():
|
||||||
|
for x in vals:
|
||||||
|
ans = ans.replace(x, self.english_months[i])
|
||||||
|
return ans
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
@ -132,7 +218,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
self.log.exception('Error parsing cover for url: %r'%self.url)
|
self.log.exception('Error parsing cover for url: %r'%self.url)
|
||||||
mi.has_cover = bool(self.cover_url)
|
mi.has_cover = bool(self.cover_url)
|
||||||
|
|
||||||
pd = root.xpath('//h2[text()="Product Details"]/../div[@class="content"]')
|
pd = root.xpath(self.pd_xpath)
|
||||||
if pd:
|
if pd:
|
||||||
pd = pd[0]
|
pd = pd[0]
|
||||||
|
|
||||||
@ -194,30 +280,42 @@ class Worker(Thread): # Get details {{{
|
|||||||
def parse_authors(self, root):
|
def parse_authors(self, root):
|
||||||
x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
|
x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
|
||||||
aname = root.xpath(x)
|
aname = root.xpath(x)
|
||||||
|
if not aname:
|
||||||
|
aname = root.xpath('''
|
||||||
|
//h1[@class="parseasinTitle"]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
|
||||||
|
''')
|
||||||
for x in aname:
|
for x in aname:
|
||||||
x.tail = ''
|
x.tail = ''
|
||||||
authors = [tostring(x, encoding=unicode, method='text').strip() for x
|
authors = [tostring(x, encoding=unicode, method='text').strip() for x
|
||||||
in aname]
|
in aname]
|
||||||
|
authors = [a for a in authors if a]
|
||||||
return authors
|
return authors
|
||||||
|
|
||||||
def parse_rating(self, root):
|
def parse_rating(self, root):
|
||||||
ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
|
ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
|
||||||
pat = re.compile(r'([0-9.]+) out of (\d+) stars')
|
if not ratings:
|
||||||
|
ratings = root.xpath('//div[@class="buying"]/descendant::span[@class="asinReviewsSummary"]')
|
||||||
|
if not ratings:
|
||||||
|
ratings = root.xpath('//span[@class="crAvgStars"]/descendant::span[@class="asinReviewsSummary"]')
|
||||||
if ratings:
|
if ratings:
|
||||||
for elem in ratings[0].xpath('descendant::*[@title]'):
|
for elem in ratings[0].xpath('descendant::*[@title]'):
|
||||||
t = elem.get('title').strip()
|
t = elem.get('title').strip()
|
||||||
m = pat.match(t)
|
m = self.ratings_pat.match(t)
|
||||||
if m is not None:
|
if m is not None:
|
||||||
return float(m.group(1))/float(m.group(2)) * 5
|
return float(m.group(1))/float(m.group(3)) * 5
|
||||||
|
|
||||||
def parse_comments(self, root):
|
def parse_comments(self, root):
|
||||||
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
|
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
|
||||||
if desc:
|
if desc:
|
||||||
desc = desc[0]
|
desc = desc[0]
|
||||||
for c in desc.xpath('descendant::*[@class="seeAll" or'
|
for c in desc.xpath('descendant::*[@class="seeAll" or'
|
||||||
' @class="emptyClear" or @href]'):
|
' @class="emptyClear"]'):
|
||||||
c.getparent().remove(c)
|
c.getparent().remove(c)
|
||||||
|
for a in desc.xpath('descendant::a[@href]'):
|
||||||
|
del a.attrib['href']
|
||||||
|
a.tag = 'span'
|
||||||
desc = tostring(desc, method='html', encoding=unicode).strip()
|
desc = tostring(desc, method='html', encoding=unicode).strip()
|
||||||
|
|
||||||
# Encoding bug in Amazon data U+fffd (replacement char)
|
# Encoding bug in Amazon data U+fffd (replacement char)
|
||||||
# in some examples it is present in place of '
|
# in some examples it is present in place of '
|
||||||
desc = desc.replace('\ufffd', "'")
|
desc = desc.replace('\ufffd', "'")
|
||||||
@ -246,41 +344,44 @@ class Worker(Thread): # Get details {{{
|
|||||||
return ('/'.join(parts[:-1]))+'/'+bn
|
return ('/'.join(parts[:-1]))+'/'+bn
|
||||||
|
|
||||||
def parse_isbn(self, pd):
|
def parse_isbn(self, pd):
|
||||||
for x in reversed(pd.xpath(
|
items = pd.xpath(
|
||||||
'descendant::*[starts-with(text(), "ISBN")]')):
|
'descendant::*[starts-with(text(), "ISBN")]')
|
||||||
|
if not items:
|
||||||
|
items = pd.xpath(
|
||||||
|
'descendant::b[contains(text(), "ISBN:")]')
|
||||||
|
for x in reversed(items):
|
||||||
if x.tail:
|
if x.tail:
|
||||||
ans = check_isbn(x.tail.strip())
|
ans = check_isbn(x.tail.strip())
|
||||||
if ans:
|
if ans:
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
def parse_publisher(self, pd):
|
def parse_publisher(self, pd):
|
||||||
for x in reversed(pd.xpath(
|
for x in reversed(pd.xpath(self.publisher_xpath)):
|
||||||
'descendant::*[starts-with(text(), "Publisher:")]')):
|
|
||||||
if x.tail:
|
if x.tail:
|
||||||
ans = x.tail.partition(';')[0]
|
ans = x.tail.partition(';')[0]
|
||||||
return ans.partition('(')[0].strip()
|
return ans.partition('(')[0].strip()
|
||||||
|
|
||||||
def parse_pubdate(self, pd):
|
def parse_pubdate(self, pd):
|
||||||
for x in reversed(pd.xpath(
|
for x in reversed(pd.xpath(self.publisher_xpath)):
|
||||||
'descendant::*[starts-with(text(), "Publisher:")]')):
|
|
||||||
if x.tail:
|
if x.tail:
|
||||||
ans = x.tail
|
ans = x.tail
|
||||||
date = ans.partition('(')[-1].replace(')', '').strip()
|
date = ans.partition('(')[-1].replace(')', '').strip()
|
||||||
|
date = self.delocalize_datestr(date)
|
||||||
return parse_date(date, assume_utc=True)
|
return parse_date(date, assume_utc=True)
|
||||||
|
|
||||||
def parse_language(self, pd):
|
def parse_language(self, pd):
|
||||||
for x in reversed(pd.xpath(
|
for x in reversed(pd.xpath(self.language_xpath)):
|
||||||
'descendant::*[starts-with(text(), "Language:")]')):
|
|
||||||
if x.tail:
|
if x.tail:
|
||||||
ans = x.tail.strip()
|
ans = x.tail.strip()
|
||||||
if ans == 'English':
|
ans = self.lang_map.get(ans, None)
|
||||||
return 'en'
|
if ans:
|
||||||
|
return ans
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
class Amazon(Source):
|
class Amazon(Source):
|
||||||
|
|
||||||
name = 'Amazon.com'
|
name = 'Amazon.com'
|
||||||
description = _('Downloads metadata from Amazon')
|
description = _('Downloads metadata and covers from Amazon')
|
||||||
|
|
||||||
capabilities = frozenset(['identify', 'cover'])
|
capabilities = frozenset(['identify', 'cover'])
|
||||||
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
|
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
|
||||||
@ -294,8 +395,15 @@ class Amazon(Source):
|
|||||||
'fr' : _('France'),
|
'fr' : _('France'),
|
||||||
'de' : _('Germany'),
|
'de' : _('Germany'),
|
||||||
'uk' : _('UK'),
|
'uk' : _('UK'),
|
||||||
|
'it' : _('Italy'),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
options = (
|
||||||
|
Option('domain', 'choices', 'com', _('Amazon website to use:'),
|
||||||
|
_('Metadata from Amazon will be fetched using this '
|
||||||
|
'country\'s Amazon website.'), choices=AMAZON_DOMAINS),
|
||||||
|
)
|
||||||
|
|
||||||
def get_book_url(self, identifiers): # {{{
|
def get_book_url(self, identifiers): # {{{
|
||||||
asin = identifiers.get('amazon', None)
|
asin = identifiers.get('amazon', None)
|
||||||
if asin is None:
|
if asin is None:
|
||||||
@ -304,8 +412,16 @@ class Amazon(Source):
|
|||||||
return ('amazon', asin, 'http://amzn.com/%s'%asin)
|
return ('amazon', asin, 'http://amzn.com/%s'%asin)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
@property
|
||||||
|
def domain(self):
|
||||||
|
domain = self.prefs['domain']
|
||||||
|
if domain not in self.AMAZON_DOMAINS:
|
||||||
|
domain = 'com'
|
||||||
|
|
||||||
|
return domain
|
||||||
|
|
||||||
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
||||||
domain = self.prefs.get('domain', 'com')
|
domain = self.domain
|
||||||
|
|
||||||
# See the amazon detailed search page to get all options
|
# See the amazon detailed search page to get all options
|
||||||
q = { 'search-alias' : 'aps',
|
q = { 'search-alias' : 'aps',
|
||||||
@ -345,6 +461,8 @@ class Amazon(Source):
|
|||||||
latin1q = dict([(x.encode('latin1', 'ignore'), y.encode('latin1',
|
latin1q = dict([(x.encode('latin1', 'ignore'), y.encode('latin1',
|
||||||
'ignore')) for x, y in
|
'ignore')) for x, y in
|
||||||
q.iteritems()])
|
q.iteritems()])
|
||||||
|
if domain == 'uk':
|
||||||
|
domain = 'co.uk'
|
||||||
url = 'http://www.amazon.%s/s/?'%domain + urlencode(latin1q)
|
url = 'http://www.amazon.%s/s/?'%domain + urlencode(latin1q)
|
||||||
return url
|
return url
|
||||||
|
|
||||||
@ -516,11 +634,19 @@ if __name__ == '__main__': # tests {{{
|
|||||||
# src/calibre/ebooks/metadata/sources/amazon.py
|
# src/calibre/ebooks/metadata/sources/amazon.py
|
||||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||||
title_test, authors_test)
|
title_test, authors_test)
|
||||||
test_identify_plugin(Amazon.name,
|
com_tests = [ # {{{
|
||||||
[
|
|
||||||
|
|
||||||
( # An e-book ISBN not on Amazon, one of the authors is
|
( # Description has links
|
||||||
# unknown to Amazon, so no popup wrapper
|
{'identifiers':{'isbn': '9780671578275'}},
|
||||||
|
[title_test('A Civil Campaign: A Comedy of Biology and Manners',
|
||||||
|
exact=True), authors_test(['Lois McMaster Bujold'])
|
||||||
|
]
|
||||||
|
|
||||||
|
),
|
||||||
|
|
||||||
|
( # An e-book ISBN not on Amazon, the title/author search matches
|
||||||
|
# the Kindle edition, which has different markup for ratings and
|
||||||
|
# isbn
|
||||||
{'identifiers':{'isbn': '9780307459671'},
|
{'identifiers':{'isbn': '9780307459671'},
|
||||||
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
|
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
|
||||||
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
|
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
|
||||||
@ -556,6 +682,38 @@ if __name__ == '__main__': # tests {{{
|
|||||||
|
|
||||||
),
|
),
|
||||||
|
|
||||||
])
|
] # }}}
|
||||||
|
|
||||||
|
de_tests = [ # {{{
|
||||||
|
(
|
||||||
|
{'identifiers':{'isbn': '3548283519'}},
|
||||||
|
[title_test('Wer Wind sät',
|
||||||
|
exact=True), authors_test(['Nele Neuhaus'])
|
||||||
|
]
|
||||||
|
|
||||||
|
),
|
||||||
|
] # }}}
|
||||||
|
|
||||||
|
it_tests = [ # {{{
|
||||||
|
(
|
||||||
|
{'identifiers':{'isbn': '8838922195'}},
|
||||||
|
[title_test('La briscola in cinque',
|
||||||
|
exact=True), authors_test(['Marco Malvaldi'])
|
||||||
|
]
|
||||||
|
|
||||||
|
),
|
||||||
|
] # }}}
|
||||||
|
|
||||||
|
fr_tests = [ # {{{
|
||||||
|
(
|
||||||
|
{'identifiers':{'isbn': '2221116798'}},
|
||||||
|
[title_test('L\'étrange voyage de Monsieur Daldry',
|
||||||
|
exact=True), authors_test(['Marc Levy'])
|
||||||
|
]
|
||||||
|
|
||||||
|
),
|
||||||
|
] # }}}
|
||||||
|
|
||||||
|
test_identify_plugin(Amazon.name, com_tests)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
@ -145,10 +145,13 @@ class Option(object):
|
|||||||
:param default: The default value for this option
|
:param default: The default value for this option
|
||||||
:param label: A short (few words) description of this option
|
:param label: A short (few words) description of this option
|
||||||
:param desc: A longer description of this option
|
:param desc: A longer description of this option
|
||||||
:param choices: A list of possible values, used only if type='choices'
|
:param choices: A dict of possible values, used only if type='choices'.
|
||||||
|
dict is of the form {key:human readable label, ...}
|
||||||
'''
|
'''
|
||||||
self.name, self.type, self.default, self.label, self.desc = (name,
|
self.name, self.type, self.default, self.label, self.desc = (name,
|
||||||
type_, default, label, desc)
|
type_, default, label, desc)
|
||||||
|
if choices and not isinstance(choices, dict):
|
||||||
|
choices = dict([(x, x) for x in choices])
|
||||||
self.choices = choices
|
self.choices = choices
|
||||||
|
|
||||||
class Source(Plugin):
|
class Source(Plugin):
|
||||||
|
347
src/calibre/ebooks/metadata/sources/douban.py
Normal file
347
src/calibre/ebooks/metadata/sources/douban.py
Normal file
@ -0,0 +1,347 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import time
|
||||||
|
from urllib import urlencode
|
||||||
|
from functools import partial
|
||||||
|
from Queue import Queue, Empty
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre.ebooks.metadata import check_isbn
|
||||||
|
from calibre.ebooks.metadata.sources.base import Source
|
||||||
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
from calibre.utils.date import parse_date, utcnow
|
||||||
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
from calibre import as_unicode
|
||||||
|
|
||||||
|
NAMESPACES = {
|
||||||
|
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
|
||||||
|
'atom' : 'http://www.w3.org/2005/Atom',
|
||||||
|
'db': 'http://www.douban.com/xmlns/',
|
||||||
|
'gd': 'http://schemas.google.com/g/2005'
|
||||||
|
}
|
||||||
|
XPath = partial(etree.XPath, namespaces=NAMESPACES)
|
||||||
|
total_results = XPath('//openSearch:totalResults')
|
||||||
|
start_index = XPath('//openSearch:startIndex')
|
||||||
|
items_per_page = XPath('//openSearch:itemsPerPage')
|
||||||
|
entry = XPath('//atom:entry')
|
||||||
|
entry_id = XPath('descendant::atom:id')
|
||||||
|
title = XPath('descendant::atom:title')
|
||||||
|
description = XPath('descendant::atom:summary')
|
||||||
|
publisher = XPath("descendant::db:attribute[@name='publisher']")
|
||||||
|
isbn = XPath("descendant::db:attribute[@name='isbn13']")
|
||||||
|
date = XPath("descendant::db:attribute[@name='pubdate']")
|
||||||
|
creator = XPath("descendant::db:attribute[@name='author']")
|
||||||
|
booktag = XPath("descendant::db:tag/attribute::name")
|
||||||
|
rating = XPath("descendant::gd:rating/attribute::average")
|
||||||
|
cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
|
||||||
|
|
||||||
|
def get_details(browser, url, timeout): # {{{
|
||||||
|
try:
|
||||||
|
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||||
|
except Exception as e:
|
||||||
|
gc = getattr(e, 'getcode', lambda : -1)
|
||||||
|
if gc() != 403:
|
||||||
|
raise
|
||||||
|
# Douban is throttling us, wait a little
|
||||||
|
time.sleep(2)
|
||||||
|
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||||
|
|
||||||
|
return raw
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def to_metadata(browser, log, entry_, timeout): # {{{
|
||||||
|
def get_text(extra, x):
|
||||||
|
try:
|
||||||
|
ans = x(extra)
|
||||||
|
if ans:
|
||||||
|
ans = ans[0].text
|
||||||
|
if ans and ans.strip():
|
||||||
|
return ans.strip()
|
||||||
|
except:
|
||||||
|
log.exception('Programming error:')
|
||||||
|
return None
|
||||||
|
|
||||||
|
id_url = entry_id(entry_)[0].text
|
||||||
|
douban_id = id_url.split('/')[-1]
|
||||||
|
title_ = ': '.join([x.text for x in title(entry_)]).strip()
|
||||||
|
authors = [x.text.strip() for x in creator(entry_) if x.text]
|
||||||
|
if not authors:
|
||||||
|
authors = [_('Unknown')]
|
||||||
|
if not id_url or not title:
|
||||||
|
# Silently discard this entry
|
||||||
|
return None
|
||||||
|
|
||||||
|
mi = Metadata(title_, authors)
|
||||||
|
mi.identifiers = {'douban':douban_id}
|
||||||
|
try:
|
||||||
|
raw = get_details(browser, id_url, timeout)
|
||||||
|
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||||
|
strip_encoding_pats=True)[0])
|
||||||
|
extra = entry(feed)[0]
|
||||||
|
except:
|
||||||
|
log.exception('Failed to get additional details for', mi.title)
|
||||||
|
return mi
|
||||||
|
mi.comments = get_text(extra, description)
|
||||||
|
mi.publisher = get_text(extra, publisher)
|
||||||
|
|
||||||
|
# ISBN
|
||||||
|
isbns = []
|
||||||
|
for x in [t.text for t in isbn(extra)]:
|
||||||
|
if check_isbn(x):
|
||||||
|
isbns.append(x)
|
||||||
|
if isbns:
|
||||||
|
mi.isbn = sorted(isbns, key=len)[-1]
|
||||||
|
mi.all_isbns = isbns
|
||||||
|
|
||||||
|
# Tags
|
||||||
|
try:
|
||||||
|
btags = [x for x in booktag(extra) if x]
|
||||||
|
tags = []
|
||||||
|
for t in btags:
|
||||||
|
atags = [y.strip() for y in t.split('/')]
|
||||||
|
for tag in atags:
|
||||||
|
if tag not in tags:
|
||||||
|
tags.append(tag)
|
||||||
|
except:
|
||||||
|
log.exception('Failed to parse tags:')
|
||||||
|
tags = []
|
||||||
|
if tags:
|
||||||
|
mi.tags = [x.replace(',', ';') for x in tags]
|
||||||
|
|
||||||
|
# pubdate
|
||||||
|
pubdate = get_text(extra, date)
|
||||||
|
if pubdate:
|
||||||
|
try:
|
||||||
|
default = utcnow().replace(day=15)
|
||||||
|
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
|
||||||
|
except:
|
||||||
|
log.error('Failed to parse pubdate %r'%pubdate)
|
||||||
|
|
||||||
|
# Ratings
|
||||||
|
if rating(extra):
|
||||||
|
try:
|
||||||
|
mi.rating = float(rating(extra)[0]) / 2.0
|
||||||
|
except:
|
||||||
|
log.exception('Failed to parse rating')
|
||||||
|
mi.rating = 0
|
||||||
|
|
||||||
|
# Cover
|
||||||
|
mi.has_douban_cover = None
|
||||||
|
u = cover_url(extra)
|
||||||
|
if u:
|
||||||
|
u = u[0].replace('/spic/', '/lpic/');
|
||||||
|
# If URL contains "book-default", the book doesn't have a cover
|
||||||
|
if u.find('book-default') == -1:
|
||||||
|
mi.has_douban_cover = u
|
||||||
|
return mi
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
class Douban(Source):
|
||||||
|
|
||||||
|
name = 'Douban Books'
|
||||||
|
author = 'Li Fanxi'
|
||||||
|
version = (2, 0, 0)
|
||||||
|
|
||||||
|
description = _('Downloads metadata and covers from Douban.com')
|
||||||
|
|
||||||
|
capabilities = frozenset(['identify', 'cover'])
|
||||||
|
touched_fields = frozenset(['title', 'authors', 'tags',
|
||||||
|
'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
|
||||||
|
'identifier:douban']) # language currently disabled
|
||||||
|
supports_gzip_transfer_encoding = True
|
||||||
|
cached_cover_url_is_reliable = True
|
||||||
|
|
||||||
|
DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
|
||||||
|
DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
|
||||||
|
|
||||||
|
def get_book_url(self, identifiers): # {{{
|
||||||
|
db = identifiers.get('douban', None)
|
||||||
|
if db is not None:
|
||||||
|
return ('douban', db, self.DOUBAN_BOOK_URL%db)
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
||||||
|
SEARCH_URL = 'http://api.douban.com/book/subjects?'
|
||||||
|
ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
|
||||||
|
SUBJECT_URL = 'http://api.douban.com/book/subject/'
|
||||||
|
|
||||||
|
q = ''
|
||||||
|
t = None
|
||||||
|
isbn = check_isbn(identifiers.get('isbn', None))
|
||||||
|
subject = identifiers.get('douban', None)
|
||||||
|
if isbn is not None:
|
||||||
|
q = isbn
|
||||||
|
t = 'isbn'
|
||||||
|
elif subject is not None:
|
||||||
|
q = subject
|
||||||
|
t = 'subject'
|
||||||
|
elif title or authors:
|
||||||
|
def build_term(prefix, parts):
|
||||||
|
return ' '.join(x for x in parts)
|
||||||
|
title_tokens = list(self.get_title_tokens(title))
|
||||||
|
if title_tokens:
|
||||||
|
q += build_term('title', title_tokens)
|
||||||
|
author_tokens = self.get_author_tokens(authors,
|
||||||
|
only_first_author=True)
|
||||||
|
if author_tokens:
|
||||||
|
q += ((' ' if q != '' else '') +
|
||||||
|
build_term('author', author_tokens))
|
||||||
|
t = 'search'
|
||||||
|
q = q.strip()
|
||||||
|
if isinstance(q, unicode):
|
||||||
|
q = q.encode('utf-8')
|
||||||
|
if not q:
|
||||||
|
return None
|
||||||
|
url = None
|
||||||
|
if t == "isbn":
|
||||||
|
url = ISBN_URL + q
|
||||||
|
elif t == 'subject':
|
||||||
|
url = SUBJECT_URL + q
|
||||||
|
else:
|
||||||
|
url = SEARCH_URL + urlencode({
|
||||||
|
'q': q,
|
||||||
|
})
|
||||||
|
if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
|
||||||
|
url = url + "?apikey=" + self.DOUBAN_API_KEY
|
||||||
|
return url
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def download_cover(self, log, result_queue, abort, # {{{
|
||||||
|
title=None, authors=None, identifiers={}, timeout=30):
|
||||||
|
cached_url = self.get_cached_cover_url(identifiers)
|
||||||
|
if cached_url is None:
|
||||||
|
log.info('No cached cover found, running identify')
|
||||||
|
rq = Queue()
|
||||||
|
self.identify(log, rq, abort, title=title, authors=authors,
|
||||||
|
identifiers=identifiers)
|
||||||
|
if abort.is_set():
|
||||||
|
return
|
||||||
|
results = []
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
results.append(rq.get_nowait())
|
||||||
|
except Empty:
|
||||||
|
break
|
||||||
|
results.sort(key=self.identify_results_keygen(
|
||||||
|
title=title, authors=authors, identifiers=identifiers))
|
||||||
|
for mi in results:
|
||||||
|
cached_url = self.get_cached_cover_url(mi.identifiers)
|
||||||
|
if cached_url is not None:
|
||||||
|
break
|
||||||
|
if cached_url is None:
|
||||||
|
log.info('No cover found')
|
||||||
|
return
|
||||||
|
|
||||||
|
if abort.is_set():
|
||||||
|
return
|
||||||
|
br = self.browser
|
||||||
|
log('Downloading cover from:', cached_url)
|
||||||
|
try:
|
||||||
|
cdata = br.open_novisit(cached_url, timeout=timeout).read()
|
||||||
|
if cdata:
|
||||||
|
result_queue.put((self, cdata))
|
||||||
|
except:
|
||||||
|
log.exception('Failed to download cover from:', cached_url)
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def get_cached_cover_url(self, identifiers): # {{{
|
||||||
|
url = None
|
||||||
|
db = identifiers.get('douban', None)
|
||||||
|
if db is None:
|
||||||
|
isbn = identifiers.get('isbn', None)
|
||||||
|
if isbn is not None:
|
||||||
|
db = self.cached_isbn_to_identifier(isbn)
|
||||||
|
if db is not None:
|
||||||
|
url = self.cached_identifier_to_cover_url(db)
|
||||||
|
|
||||||
|
return url
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def get_all_details(self, br, log, entries, abort, # {{{
|
||||||
|
result_queue, timeout):
|
||||||
|
for relevance, i in enumerate(entries):
|
||||||
|
try:
|
||||||
|
ans = to_metadata(br, log, i, timeout)
|
||||||
|
if isinstance(ans, Metadata):
|
||||||
|
ans.source_relevance = relevance
|
||||||
|
db = ans.identifiers['douban']
|
||||||
|
for isbn in getattr(ans, 'all_isbns', []):
|
||||||
|
self.cache_isbn_to_identifier(isbn, db)
|
||||||
|
if ans.has_douban_cover:
|
||||||
|
self.cache_identifier_to_cover_url(db,
|
||||||
|
ans.has_douban_cover)
|
||||||
|
self.clean_downloaded_metadata(ans)
|
||||||
|
result_queue.put(ans)
|
||||||
|
except:
|
||||||
|
log.exception(
|
||||||
|
'Failed to get metadata for identify entry:',
|
||||||
|
etree.tostring(i))
|
||||||
|
if abort.is_set():
|
||||||
|
break
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
||||||
|
identifiers={}, timeout=30):
|
||||||
|
query = self.create_query(log, title=title, authors=authors,
|
||||||
|
identifiers=identifiers)
|
||||||
|
if not query:
|
||||||
|
log.error('Insufficient metadata to construct query')
|
||||||
|
return
|
||||||
|
br = self.browser
|
||||||
|
try:
|
||||||
|
raw = br.open_novisit(query, timeout=timeout).read()
|
||||||
|
except Exception as e:
|
||||||
|
log.exception('Failed to make identify query: %r'%query)
|
||||||
|
return as_unicode(e)
|
||||||
|
try:
|
||||||
|
parser = etree.XMLParser(recover=True, no_network=True)
|
||||||
|
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||||
|
strip_encoding_pats=True)[0], parser=parser)
|
||||||
|
entries = entry(feed)
|
||||||
|
except Exception as e:
|
||||||
|
log.exception('Failed to parse identify results')
|
||||||
|
return as_unicode(e)
|
||||||
|
if not entries and identifiers and title and authors and \
|
||||||
|
not abort.is_set():
|
||||||
|
return self.identify(log, result_queue, abort, title=title,
|
||||||
|
authors=authors, timeout=timeout)
|
||||||
|
|
||||||
|
# There is no point running these queries in threads as douban
|
||||||
|
# throttles requests returning 403 Forbidden errors
|
||||||
|
self.get_all_details(br, log, entries, abort, result_queue, timeout)
|
||||||
|
|
||||||
|
return None
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
if __name__ == '__main__': # tests {{{
|
||||||
|
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
|
||||||
|
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||||
|
title_test, authors_test)
|
||||||
|
test_identify_plugin(Douban.name,
|
||||||
|
[
|
||||||
|
|
||||||
|
|
||||||
|
(
|
||||||
|
{'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
|
||||||
|
'authors':['刘慈欣']},
|
||||||
|
[title_test('三体', exact=True),
|
||||||
|
authors_test(['刘慈欣'])]
|
||||||
|
),
|
||||||
|
|
||||||
|
(
|
||||||
|
{'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
|
||||||
|
[title_test('Linux内核修炼之道', exact=False)]
|
||||||
|
),
|
||||||
|
])
|
||||||
|
# }}}
|
||||||
|
|
@ -157,7 +157,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
|||||||
class GoogleBooks(Source):
|
class GoogleBooks(Source):
|
||||||
|
|
||||||
name = 'Google'
|
name = 'Google'
|
||||||
description = _('Downloads metadata from Google Books')
|
description = _('Downloads metadata and covers from Google Books')
|
||||||
|
|
||||||
capabilities = frozenset(['identify', 'cover'])
|
capabilities = frozenset(['identify', 'cover'])
|
||||||
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
|
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
|
||||||
|
@ -30,7 +30,7 @@ base_url = 'http://search.overdrive.com/'
|
|||||||
class OverDrive(Source):
|
class OverDrive(Source):
|
||||||
|
|
||||||
name = 'Overdrive'
|
name = 'Overdrive'
|
||||||
description = _('Downloads metadata from Overdrive\'s Content Reserve')
|
description = _('Downloads metadata and covers from Overdrive\'s Content Reserve')
|
||||||
|
|
||||||
capabilities = frozenset(['identify', 'cover'])
|
capabilities = frozenset(['identify', 'cover'])
|
||||||
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
|
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
|
||||||
|
@ -191,7 +191,11 @@ class OEBReader(object):
|
|||||||
if not scheme and href not in known:
|
if not scheme and href not in known:
|
||||||
new.add(href)
|
new.add(href)
|
||||||
elif item.media_type in OEB_STYLES:
|
elif item.media_type in OEB_STYLES:
|
||||||
for url in cssutils.getUrls(item.data):
|
try:
|
||||||
|
urls = list(cssutils.getUrls(item.data))
|
||||||
|
except:
|
||||||
|
urls = []
|
||||||
|
for url in urls:
|
||||||
href, _ = urldefrag(url)
|
href, _ = urldefrag(url)
|
||||||
href = item.abshref(urlnormalize(href))
|
href = item.abshref(urlnormalize(href))
|
||||||
scheme = urlparse(href).scheme
|
scheme = urlparse(href).scheme
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <wand/MagickWand.h>
|
#include <wand/MagickWand.h>
|
||||||
|
#include <zlib.h>
|
||||||
|
|
||||||
#include "images.h"
|
#include "images.h"
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
@ -12,7 +12,7 @@ A Humane Web Text Generator
|
|||||||
#__date__ = '2009/12/04'
|
#__date__ = '2009/12/04'
|
||||||
|
|
||||||
__copyright__ = """
|
__copyright__ = """
|
||||||
Copyright (c) 2011, Leigh Parry
|
Copyright (c) 2011, Leigh Parry <leighparry@blueyonder.co.uk>
|
||||||
Copyright (c) 2011, John Schember <john@nachtimwald.com>
|
Copyright (c) 2011, John Schember <john@nachtimwald.com>
|
||||||
Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
|
Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
|
||||||
Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
|
Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
|
||||||
@ -219,14 +219,13 @@ class Textile(object):
|
|||||||
]
|
]
|
||||||
glyph_defaults = [
|
glyph_defaults = [
|
||||||
(re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign
|
(re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign
|
||||||
(re.compile(r'(\d+)\'', re.I), r'\1′'), # prime
|
(re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime
|
||||||
(re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double
|
(re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double
|
||||||
(re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'<acronym title="\2">\1</acronym>'), # 3+ uppercase acronym
|
(re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'<acronym title="\2">\1</acronym>'), # 3+ uppercase acronym
|
||||||
(re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'<span class="caps">\1</span>'), # 3+ uppercase
|
(re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'<span class="caps">\1</span>'), # 3+ uppercase
|
||||||
(re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis
|
(re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis
|
||||||
(re.compile(r'^[\*_-]{3,}$', re.M), r'<hr />'), # <hr> scene-break
|
(re.compile(r'^[\*_-]{3,}$', re.M), r'<hr />'), # <hr> scene-break
|
||||||
(re.compile(r'\b--\b'), r'—'), # em dash
|
(re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash
|
||||||
(re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash
|
|
||||||
(re.compile(r'\s-(?:\s|$)'), r' – '), # en dash
|
(re.compile(r'\s-(?:\s|$)'), r' – '), # en dash
|
||||||
(re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark
|
(re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark
|
||||||
(re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered
|
(re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered
|
||||||
@ -706,6 +705,21 @@ class Textile(object):
|
|||||||
result.append(line)
|
result.append(line)
|
||||||
return ''.join(result)
|
return ''.join(result)
|
||||||
|
|
||||||
|
def macros_only(self, text):
|
||||||
|
# fix: hackish
|
||||||
|
text = re.sub(r'"\Z', '\" ', text)
|
||||||
|
|
||||||
|
result = []
|
||||||
|
for line in re.compile(r'(<.*?>)', re.U).split(text):
|
||||||
|
if not re.search(r'<.*>', line):
|
||||||
|
rules = []
|
||||||
|
if re.search(r'{.+?}', line):
|
||||||
|
rules = self.macro_defaults
|
||||||
|
for s, r in rules:
|
||||||
|
line = s.sub(r, line)
|
||||||
|
result.append(line)
|
||||||
|
return ''.join(result)
|
||||||
|
|
||||||
def vAlign(self, input):
|
def vAlign(self, input):
|
||||||
d = {'^':'top', '-':'middle', '~':'bottom'}
|
d = {'^':'top', '-':'middle', '~':'bottom'}
|
||||||
return d.get(input, '')
|
return d.get(input, '')
|
||||||
@ -814,6 +828,7 @@ class Textile(object):
|
|||||||
'fooobar ... and hello world ...'
|
'fooobar ... and hello world ...'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
text = self.macros_only(text)
|
||||||
punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'
|
punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'
|
||||||
|
|
||||||
pattern = r'''
|
pattern = r'''
|
||||||
@ -1044,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
|
|||||||
return Textile(restricted=True, lite=lite,
|
return Textile(restricted=True, lite=lite,
|
||||||
noimage=noimage).textile(text, rel='nofollow',
|
noimage=noimage).textile(text, rel='nofollow',
|
||||||
html_type=html_type)
|
html_type=html_type)
|
||||||
|
|
||||||
|
@ -66,19 +66,26 @@ class TXTOutput(OutputFormatPlugin):
|
|||||||
help=_('Do not remove image references within the document. This is only ' \
|
help=_('Do not remove image references within the document. This is only ' \
|
||||||
'useful when paired with a txt-output-formatting option that '
|
'useful when paired with a txt-output-formatting option that '
|
||||||
'is not none because links are always removed with plain text output.')),
|
'is not none because links are always removed with plain text output.')),
|
||||||
|
OptionRecommendation(name='keep_color',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Do not remove font color from output. This is only useful when ' \
|
||||||
|
'txt-output-formatting is set to textile. Textile is the only ' \
|
||||||
|
'formatting that supports setting font color. If this option is ' \
|
||||||
|
'not specified font color will not be set and default to the ' \
|
||||||
|
'color displayed by the reader (generally this is black).')),
|
||||||
])
|
])
|
||||||
|
|
||||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||||
if opts.txt_output_formatting.lower() == 'markdown':
|
if opts.txt_output_formatting.lower() == 'markdown':
|
||||||
from calibre.ebooks.txt.markdownml import MarkdownMLizer
|
from calibre.ebooks.txt.markdownml import MarkdownMLizer
|
||||||
writer = MarkdownMLizer(log)
|
self.writer = MarkdownMLizer(log)
|
||||||
elif opts.txt_output_formatting.lower() == 'textile':
|
elif opts.txt_output_formatting.lower() == 'textile':
|
||||||
from calibre.ebooks.txt.textileml import TextileMLizer
|
from calibre.ebooks.txt.textileml import TextileMLizer
|
||||||
writer = TextileMLizer(log)
|
self.writer = TextileMLizer(log)
|
||||||
else:
|
else:
|
||||||
writer = TXTMLizer(log)
|
self.writer = TXTMLizer(log)
|
||||||
|
|
||||||
txt = writer.extract_content(oeb_book, opts)
|
txt = self.writer.extract_content(oeb_book, opts)
|
||||||
txt = clean_ascii_chars(txt)
|
txt = clean_ascii_chars(txt)
|
||||||
|
|
||||||
log.debug('\tReplacing newlines with selected type...')
|
log.debug('\tReplacing newlines with selected type...')
|
||||||
@ -111,17 +118,28 @@ class TXTZOutput(TXTOutput):
|
|||||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||||
with TemporaryDirectory('_txtz_output') as tdir:
|
with TemporaryDirectory('_txtz_output') as tdir:
|
||||||
# TXT
|
# TXT
|
||||||
with TemporaryFile('index.txt') as tf:
|
txt_name = 'index.txt'
|
||||||
|
if opts.txt_output_formatting.lower() == 'textile':
|
||||||
|
txt_name = 'index.text'
|
||||||
|
with TemporaryFile(txt_name) as tf:
|
||||||
TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
|
TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
|
||||||
shutil.copy(tf, os.path.join(tdir, 'index.txt'))
|
shutil.copy(tf, os.path.join(tdir, txt_name))
|
||||||
|
|
||||||
# Images
|
# Images
|
||||||
for item in oeb_book.manifest:
|
for item in oeb_book.manifest:
|
||||||
if item.media_type in OEB_IMAGES:
|
if item.media_type in OEB_IMAGES:
|
||||||
|
if hasattr(self.writer, 'images'):
|
||||||
|
path = os.path.join(tdir, 'images')
|
||||||
|
if item.href in self.writer.images:
|
||||||
|
href = self.writer.images[item.href]
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
path = os.path.join(tdir, os.path.dirname(item.href))
|
path = os.path.join(tdir, os.path.dirname(item.href))
|
||||||
|
href = os.path.basename(item.href)
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
os.makedirs(path)
|
os.makedirs(path)
|
||||||
with open(os.path.join(tdir, item.href), 'wb') as imgf:
|
with open(os.path.join(path, href), 'wb') as imgf:
|
||||||
imgf.write(item.data)
|
imgf.write(item.data)
|
||||||
|
|
||||||
# Metadata
|
# Metadata
|
||||||
|
@ -242,6 +242,8 @@ def detect_formatting_type(txt):
|
|||||||
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
|
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
|
||||||
# Links
|
# Links
|
||||||
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
|
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
|
||||||
|
# paragraph blocks
|
||||||
|
textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))
|
||||||
|
|
||||||
# Decide if either markdown or textile is used in the text
|
# Decide if either markdown or textile is used in the text
|
||||||
# based on the number of unique formatting elements found.
|
# based on the number of unique formatting elements found.
|
||||||
|
@ -1,62 +1,489 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Transform OEB content into Textile formatted plain text
|
Transform OEB content into Textile formatted plain text
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lxml import etree
|
from functools import partial
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import XHTML
|
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||||
from calibre.utils.html2textile import html2textile
|
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||||
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
|
from calibre.ebooks import unit_convert
|
||||||
|
from calibre.ebooks.txt.unsmarten import unsmarten
|
||||||
|
|
||||||
class TextileMLizer(object):
|
class TextileMLizer(OEB2HTML):
|
||||||
|
|
||||||
def __init__(self, log):
|
|
||||||
self.log = log
|
|
||||||
|
|
||||||
def extract_content(self, oeb_book, opts):
|
def extract_content(self, oeb_book, opts):
|
||||||
self.log.info('Converting XHTML to Textile formatted TXT...')
|
self.log.info('Converting XHTML to Textile formatted TXT...')
|
||||||
self.oeb_book = oeb_book
|
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
|
self.in_pre = False
|
||||||
|
self.in_table = False
|
||||||
|
self.links = {}
|
||||||
|
self.list = []
|
||||||
|
self.our_links = []
|
||||||
|
self.in_a_link = False
|
||||||
|
self.our_ids = []
|
||||||
|
self.images = {}
|
||||||
|
self.id_no_text = u''
|
||||||
|
self.style_embed = []
|
||||||
|
self.remove_space_after_newline = False
|
||||||
|
self.base_hrefs = [item.href for item in oeb_book.spine]
|
||||||
|
self.map_resources(oeb_book)
|
||||||
|
|
||||||
return self.mlize_spine()
|
self.style_bold = False
|
||||||
|
self.style_italic = False
|
||||||
|
self.style_under = False
|
||||||
|
self.style_strike = False
|
||||||
|
self.style_smallcap = False
|
||||||
|
|
||||||
def mlize_spine(self):
|
txt = self.mlize_spine(oeb_book)
|
||||||
|
txt = unsmarten(txt)
|
||||||
|
|
||||||
|
# Do some tidying up
|
||||||
|
txt = self.tidy_up(txt)
|
||||||
|
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def mlize_spine(self, oeb_book):
|
||||||
output = [u'']
|
output = [u'']
|
||||||
|
for item in oeb_book.spine:
|
||||||
for item in self.oeb_book.spine:
|
|
||||||
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
|
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
|
||||||
|
self.rewrite_ids(item.data, item)
|
||||||
|
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
||||||
|
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
||||||
|
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
||||||
|
output.append('\n\n')
|
||||||
|
return ''.join(output)
|
||||||
|
|
||||||
html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
def tidy_up(self, text):
|
||||||
|
# May need tweaking and finetuning
|
||||||
|
def check_escaping(text, tests):
|
||||||
|
for t in tests:
|
||||||
|
# I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
|
||||||
|
txt = '%s' % t
|
||||||
|
if txt != '%':
|
||||||
|
text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text)
|
||||||
|
text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
|
||||||
|
text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
|
||||||
|
return text
|
||||||
|
|
||||||
if not self.opts.keep_links:
|
# Now tidyup links and ids - remove ones that don't have a correponding opposite
|
||||||
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
|
if self.opts.keep_links:
|
||||||
if not self.opts.keep_image_references:
|
for i in self.our_links:
|
||||||
html = re.sub(r'<\s*img[^>]*>', '', html)
|
if i[0] == '#':
|
||||||
|
if i not in self.our_ids:
|
||||||
|
text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
|
||||||
|
for i in self.our_ids:
|
||||||
|
if i not in self.our_links:
|
||||||
|
text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
|
||||||
|
|
||||||
text = html2textile(html)
|
# Remove obvious non-needed escaping, add sub/sup-script ones
|
||||||
|
text = check_escaping(text, ['\*', '_', '\*'])
|
||||||
|
# escape the super/sub-scripts if needed
|
||||||
|
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
|
||||||
|
# escape the super/sub-scripts if needed
|
||||||
|
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
|
||||||
|
|
||||||
# Ensure the section ends with at least two new line characters.
|
#remove empty spans
|
||||||
# This is to prevent the last paragraph from a section being
|
text = re.sub(r'%\xa0+', r'%', text)
|
||||||
# combined into the fist paragraph of the next.
|
#remove empty spans - MAY MERGE SOME ?
|
||||||
end_chars = text[-4:]
|
text = re.sub(r'%%', r'', text)
|
||||||
# Convert all newlines to \n
|
#remove spans from tagged output
|
||||||
end_chars = end_chars.replace('\r\n', '\n')
|
text = re.sub(r'%([_+*-]+)%', r'\1', text)
|
||||||
end_chars = end_chars.replace('\r', '\n')
|
#remove spaces before a newline
|
||||||
end_chars = end_chars[-2:]
|
text = re.sub(r' +\n', r'\n', text)
|
||||||
if not end_chars[1] == '\n':
|
#remove newlines at top of file
|
||||||
text += '\n\n'
|
text = re.sub(r'^\n+', r'', text)
|
||||||
if end_chars[1] == '\n' and not end_chars[0] == '\n':
|
#correct blockcode paras
|
||||||
text += '\n'
|
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
|
||||||
|
#correct blockquote paras
|
||||||
|
text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
|
||||||
|
|
||||||
output += text
|
#reduce blank lines
|
||||||
|
text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
|
||||||
|
text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
|
||||||
|
#Check span following blank para
|
||||||
|
text = re.sub(r'\n+ +%', r' %', text)
|
||||||
|
text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
|
||||||
|
# blank paragraph
|
||||||
|
text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
|
||||||
|
# blank paragraph
|
||||||
|
text = re.sub(u'\n\xa0', r'\np. ', text)
|
||||||
|
# blank paragraph
|
||||||
|
text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text)
|
||||||
|
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
|
||||||
|
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
|
||||||
|
#sort out spaces in tables
|
||||||
|
text = re.sub(r' {2,}\|', r' |', text)
|
||||||
|
|
||||||
output = u''.join(output)
|
# Now put back spaces removed earlier as they're needed here
|
||||||
|
text = re.sub(r'\np\.\n', r'\np. \n', text)
|
||||||
|
#reduce blank lines
|
||||||
|
text = re.sub(r' \n\n\n', r' \n\n', text)
|
||||||
|
|
||||||
return output
|
return text
|
||||||
|
|
||||||
|
def remove_newlines(self, text):
|
||||||
|
text = text.replace('\r\n', ' ')
|
||||||
|
text = text.replace('\n', ' ')
|
||||||
|
text = text.replace('\r', ' ')
|
||||||
|
# Condense redundant spaces created by replacing newlines with spaces.
|
||||||
|
text = re.sub(r'[ ]{2,}', ' ', text)
|
||||||
|
text = re.sub(r'\t+', '', text)
|
||||||
|
if self.remove_space_after_newline == True:
|
||||||
|
text = re.sub(r'^ +', '', text)
|
||||||
|
self.remove_space_after_newline = False
|
||||||
|
return text
|
||||||
|
|
||||||
|
def check_styles(self, style):
|
||||||
|
txt = '{'
|
||||||
|
if self.opts.keep_color:
|
||||||
|
if 'color' in style.cssdict() and style['color'] != 'black':
|
||||||
|
txt += 'color:'+style['color']+';'
|
||||||
|
if 'background' in style.cssdict():
|
||||||
|
txt += 'background:'+style['background']+';'
|
||||||
|
txt += '}'
|
||||||
|
if txt == '{}': txt = ''
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def check_halign(self, style):
|
||||||
|
tests = {'left':'<','justify':'<>','center':'=','right':'>'}
|
||||||
|
for i in tests:
|
||||||
|
if style['text-align'] == i:
|
||||||
|
return tests[i]
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def check_valign(self, style):
|
||||||
|
tests = {'top':'^','bottom':'~'} #, 'middle':'-'}
|
||||||
|
for i in tests:
|
||||||
|
if style['vertical-align'] == i:
|
||||||
|
return tests[i]
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def check_padding(self, style, stylizer):
|
||||||
|
txt = ''
|
||||||
|
left_padding_pts = 0
|
||||||
|
left_margin_pts = 0
|
||||||
|
if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto':
|
||||||
|
left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi)
|
||||||
|
if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto':
|
||||||
|
left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi)
|
||||||
|
left = left_margin_pts + left_padding_pts
|
||||||
|
emleft = int(round(left / stylizer.profile.fbase))
|
||||||
|
if emleft >= 1:
|
||||||
|
txt += '(' * emleft
|
||||||
|
right_padding_pts = 0
|
||||||
|
right_margin_pts = 0
|
||||||
|
if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto':
|
||||||
|
right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi)
|
||||||
|
if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto':
|
||||||
|
right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi)
|
||||||
|
right = right_margin_pts + right_padding_pts
|
||||||
|
emright = int(round(right / stylizer.profile.fbase))
|
||||||
|
if emright >= 1:
|
||||||
|
txt += ')' * emright
|
||||||
|
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def check_id_tag(self, attribs):
|
||||||
|
txt = ''
|
||||||
|
if attribs.has_key('id'):
|
||||||
|
txt = '(#'+attribs['id']+ ')'
|
||||||
|
self.our_ids.append('#'+attribs['id'])
|
||||||
|
self.id_no_text = u'\xa0'
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def build_block(self, tag, style, attribs, stylizer):
|
||||||
|
txt = '\n' + tag
|
||||||
|
if self.opts.keep_links:
|
||||||
|
txt += self.check_id_tag(attribs)
|
||||||
|
txt += self.check_padding(style, stylizer)
|
||||||
|
txt += self.check_halign(style)
|
||||||
|
txt += self.check_styles(style)
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def prepare_string_for_textile(self, txt):
|
||||||
|
if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt):
|
||||||
|
return ' ==%s== ' % txt
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def dump_text(self, elem, stylizer):
|
||||||
|
'''
|
||||||
|
@elem: The element in the etree that we are working on.
|
||||||
|
@stylizer: The style information attached to the element.
|
||||||
|
'''
|
||||||
|
|
||||||
|
# We can only processes tags. If there isn't a tag return any text.
|
||||||
|
if not isinstance(elem.tag, basestring) \
|
||||||
|
or namespace(elem.tag) != XHTML_NS:
|
||||||
|
p = elem.getparent()
|
||||||
|
if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
|
||||||
|
and elem.tail:
|
||||||
|
return [elem.tail]
|
||||||
|
return ['']
|
||||||
|
|
||||||
|
# Setup our variables.
|
||||||
|
text = ['']
|
||||||
|
style = stylizer.style(elem)
|
||||||
|
tags = []
|
||||||
|
tag = barename(elem.tag)
|
||||||
|
attribs = elem.attrib
|
||||||
|
|
||||||
|
# Ignore anything that is set to not be displayed.
|
||||||
|
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
||||||
|
or style['visibility'] == 'hidden':
|
||||||
|
return ['']
|
||||||
|
|
||||||
|
# Soft scene breaks.
|
||||||
|
if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
|
||||||
|
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
|
||||||
|
if ems >= 1:
|
||||||
|
text.append(u'\n\n\xa0' * ems)
|
||||||
|
|
||||||
|
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
||||||
|
if tag == 'div':
|
||||||
|
tag = 'p'
|
||||||
|
text.append(self.build_block(tag, style, attribs, stylizer))
|
||||||
|
text.append('. ')
|
||||||
|
tags.append('\n')
|
||||||
|
|
||||||
|
if style['font-style'] == 'italic' or tag in ('i', 'em'):
|
||||||
|
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
|
||||||
|
if self.style_italic == False:
|
||||||
|
if self.in_a_link:
|
||||||
|
text.append('_')
|
||||||
|
tags.append('_')
|
||||||
|
else:
|
||||||
|
text.append('[_')
|
||||||
|
tags.append('_]')
|
||||||
|
self.style_embed.append('_')
|
||||||
|
self.style_italic = True
|
||||||
|
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
|
||||||
|
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
|
||||||
|
if self.style_bold == False:
|
||||||
|
if self.in_a_link:
|
||||||
|
text.append('*')
|
||||||
|
tags.append('*')
|
||||||
|
else:
|
||||||
|
text.append('[*')
|
||||||
|
tags.append('*]')
|
||||||
|
self.style_embed.append('*')
|
||||||
|
self.style_bold = True
|
||||||
|
if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
|
||||||
|
if tag != 'a':
|
||||||
|
if self.style_under == False:
|
||||||
|
text.append('[+')
|
||||||
|
tags.append('+]')
|
||||||
|
self.style_embed.append('+')
|
||||||
|
self.style_under = True
|
||||||
|
if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
|
||||||
|
if self.style_strike == False:
|
||||||
|
text.append('[-')
|
||||||
|
tags.append('-]')
|
||||||
|
self.style_embed.append('-')
|
||||||
|
self.style_strike = True
|
||||||
|
if tag == 'br':
|
||||||
|
for i in reversed(self.style_embed):
|
||||||
|
text.append(i)
|
||||||
|
text.append('\n')
|
||||||
|
for i in self.style_embed:
|
||||||
|
text.append(i)
|
||||||
|
tags.append('')
|
||||||
|
self.remove_space_after_newline = True
|
||||||
|
if tag == 'blockquote':
|
||||||
|
text.append('\nbq. ')
|
||||||
|
tags.append('\n')
|
||||||
|
elif tag in ('abbr', 'acronym'):
|
||||||
|
text.append('')
|
||||||
|
txt = attribs['title']
|
||||||
|
tags.append('(' + txt + ')')
|
||||||
|
elif tag == 'sup':
|
||||||
|
text.append('^')
|
||||||
|
tags.append('^')
|
||||||
|
elif tag == 'sub':
|
||||||
|
text.append('~')
|
||||||
|
tags.append('~')
|
||||||
|
elif tag == 'code':
|
||||||
|
if self.in_pre:
|
||||||
|
text.append('\nbc. ')
|
||||||
|
tags.append('')
|
||||||
|
else:
|
||||||
|
text.append('@')
|
||||||
|
tags.append('@')
|
||||||
|
elif tag == 'cite':
|
||||||
|
text.append('??')
|
||||||
|
tags.append('??')
|
||||||
|
elif tag == 'hr':
|
||||||
|
text.append('\n***')
|
||||||
|
tags.append('\n')
|
||||||
|
elif tag == 'pre':
|
||||||
|
self.in_pre = True
|
||||||
|
text.append('\npre. ')
|
||||||
|
tags.append('pre\n')
|
||||||
|
elif tag == 'a':
|
||||||
|
if self.opts.keep_links:
|
||||||
|
if attribs.has_key('href'):
|
||||||
|
text.append('"')
|
||||||
|
tags.append('a')
|
||||||
|
tags.append('":' + attribs['href'])
|
||||||
|
self.our_links.append(attribs['href'])
|
||||||
|
if attribs.has_key('title'):
|
||||||
|
tags.append('(' + attribs['title'] + ')')
|
||||||
|
self.in_a_link = True
|
||||||
|
else:
|
||||||
|
text.append('%')
|
||||||
|
tags.append('%')
|
||||||
|
elif tag == 'img':
|
||||||
|
if self.opts.keep_image_references:
|
||||||
|
txt = '!' + self.check_halign(style)
|
||||||
|
txt += self.check_valign(style)
|
||||||
|
txt += attribs['src']
|
||||||
|
text.append(txt)
|
||||||
|
if attribs.has_key('alt'):
|
||||||
|
txt = attribs['alt']
|
||||||
|
if txt != '':
|
||||||
|
text.append('(' + txt + ')')
|
||||||
|
tags.append('!')
|
||||||
|
elif tag in ('ol', 'ul'):
|
||||||
|
self.list.append({'name': tag, 'num': 0})
|
||||||
|
text.append('')
|
||||||
|
tags.append(tag)
|
||||||
|
elif tag == 'li':
|
||||||
|
if self.list: li = self.list[-1]
|
||||||
|
else: li = {'name': 'ul', 'num': 0}
|
||||||
|
text.append('\n')
|
||||||
|
if li['name'] == 'ul':
|
||||||
|
text.append('*' * len(self.list) + ' ')
|
||||||
|
elif li['name'] == 'ol':
|
||||||
|
text.append('#' * len(self.list) + ' ')
|
||||||
|
tags.append('')
|
||||||
|
elif tag == 'dl':
|
||||||
|
text.append('\n')
|
||||||
|
tags.append('')
|
||||||
|
elif tag == 'dt':
|
||||||
|
text.append('')
|
||||||
|
tags.append('\n')
|
||||||
|
elif tag == 'dd':
|
||||||
|
text.append(' ')
|
||||||
|
tags.append('')
|
||||||
|
elif tag == 'dd':
|
||||||
|
text.append('')
|
||||||
|
tags.append('\n')
|
||||||
|
elif tag == 'table':
|
||||||
|
txt = self.build_block(tag, style, attribs, stylizer)
|
||||||
|
txt += '. \n'
|
||||||
|
if txt != '\ntable. \n':
|
||||||
|
text.append(txt)
|
||||||
|
else:
|
||||||
|
text.append('\n')
|
||||||
|
tags.append('')
|
||||||
|
elif tag == 'tr':
|
||||||
|
txt = self.build_block('', style, attribs, stylizer)
|
||||||
|
txt += '. '
|
||||||
|
if txt != '\n. ':
|
||||||
|
txt = re.sub ('\n', '', txt)
|
||||||
|
text.append(txt)
|
||||||
|
tags.append('|\n')
|
||||||
|
elif tag == 'td':
|
||||||
|
text.append('|')
|
||||||
|
txt = ''
|
||||||
|
txt += self.check_halign(style)
|
||||||
|
txt += self.check_valign(style)
|
||||||
|
if attribs.has_key ('colspan'):
|
||||||
|
txt += '\\' + attribs['colspan']
|
||||||
|
if attribs.has_key ('rowspan'):
|
||||||
|
txt += '/' + attribs['rowspan']
|
||||||
|
txt += self.check_styles(style)
|
||||||
|
if txt != '':
|
||||||
|
text.append(txt + '. ')
|
||||||
|
tags.append('')
|
||||||
|
elif tag == 'th':
|
||||||
|
text.append('|_. ')
|
||||||
|
tags.append('')
|
||||||
|
elif tag == 'span':
|
||||||
|
if style['font-variant'] == 'small-caps':
|
||||||
|
if self.style_smallcap == False:
|
||||||
|
text.append('&')
|
||||||
|
tags.append('&')
|
||||||
|
self.style_smallcap = True
|
||||||
|
else:
|
||||||
|
if self.in_a_link == False:
|
||||||
|
txt = '%'
|
||||||
|
if self.opts.keep_links:
|
||||||
|
txt += self.check_id_tag(attribs)
|
||||||
|
txt += self.check_styles(style)
|
||||||
|
if txt != '%':
|
||||||
|
text.append(txt)
|
||||||
|
tags.append('%')
|
||||||
|
|
||||||
|
if self.opts.keep_links and attribs.has_key('id'):
|
||||||
|
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
|
||||||
|
text.append(self.check_id_tag(attribs))
|
||||||
|
|
||||||
|
# Process the styles for any that we want to keep
|
||||||
|
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \
|
||||||
|
'span', 'table', 'tr', 'td'):
|
||||||
|
if not self.in_a_link:
|
||||||
|
text.append(self.check_styles(style))
|
||||||
|
|
||||||
|
# Process tags that contain text.
|
||||||
|
if hasattr(elem, 'text') and elem.text:
|
||||||
|
txt = elem.text
|
||||||
|
if not self.in_pre:
|
||||||
|
txt = self.prepare_string_for_textile(self.remove_newlines(txt))
|
||||||
|
text.append(txt)
|
||||||
|
self.id_no_text = u''
|
||||||
|
|
||||||
|
# Recurse down into tags within the tag we are in.
|
||||||
|
for item in elem:
|
||||||
|
text += self.dump_text(item, stylizer)
|
||||||
|
|
||||||
|
# Close all open tags.
|
||||||
|
tags.reverse()
|
||||||
|
for t in tags:
|
||||||
|
if tag in ('pre', 'ul', 'ol', 'li', 'table'):
|
||||||
|
if tag == 'pre':
|
||||||
|
self.in_pre = False
|
||||||
|
elif tag in ('ul', 'ol'):
|
||||||
|
if self.list: self.list.pop()
|
||||||
|
if not self.list: text.append('\n')
|
||||||
|
else:
|
||||||
|
if t == 'a':
|
||||||
|
self.in_a_link = False
|
||||||
|
t = ''
|
||||||
|
text.append(self.id_no_text)
|
||||||
|
self.id_no_text = u''
|
||||||
|
if t in ('*]', '*'):
|
||||||
|
self.style_bold = False
|
||||||
|
elif t in ('_]', '_'):
|
||||||
|
self.style_italic = False
|
||||||
|
elif t == '+]':
|
||||||
|
self.style_under = False
|
||||||
|
elif t == '-]':
|
||||||
|
self.style_strike = False
|
||||||
|
elif t == '&':
|
||||||
|
self.style_smallcap = False
|
||||||
|
if t in ('*]', '_]', '+]', '-]', '*', '_'):
|
||||||
|
txt = self.style_embed.pop()
|
||||||
|
text.append('%s' % t)
|
||||||
|
|
||||||
|
# Soft scene breaks.
|
||||||
|
if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
|
||||||
|
ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
|
||||||
|
if ems >= 1:
|
||||||
|
text.append(u'\n\n\xa0' * ems)
|
||||||
|
|
||||||
|
# Add the text that is outside of the tag.
|
||||||
|
if hasattr(elem, 'tail') and elem.tail:
|
||||||
|
tail = elem.tail
|
||||||
|
if not self.in_pre:
|
||||||
|
tail = self.prepare_string_for_textile(self.remove_newlines(tail))
|
||||||
|
text.append(tail)
|
||||||
|
|
||||||
|
return text
|
||||||
|
108
src/calibre/ebooks/txt/unsmarten.py
Normal file
108
src/calibre/ebooks/txt/unsmarten.py
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""unsmarten : html2textile helper function"""
|
||||||
|
|
||||||
|
__version__ = '0.1'
|
||||||
|
__author__ = 'Leigh Parry'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
def unsmarten(txt):
|
||||||
|
txt = re.sub(u'–|–|–', r'-', txt) # en-dash
|
||||||
|
txt = re.sub(u'—|—|—', r'--', txt) # em-dash
|
||||||
|
txt = re.sub(u'…|…|…', r'...', txt) # ellipsis
|
||||||
|
|
||||||
|
txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote
|
||||||
|
txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe
|
||||||
|
txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote
|
||||||
|
|
||||||
|
txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent
|
||||||
|
txt = re.sub(u'£|£|£', r'{L-}', txt) # pound
|
||||||
|
txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen
|
||||||
|
txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright
|
||||||
|
txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered
|
||||||
|
txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter
|
||||||
|
txt = re.sub(u'½|½|½', r'{1/2}', txt) # half
|
||||||
|
txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter
|
||||||
|
txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave
|
||||||
|
txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute
|
||||||
|
txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex
|
||||||
|
txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde
|
||||||
|
txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut
|
||||||
|
txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring
|
||||||
|
txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE
|
||||||
|
txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla
|
||||||
|
txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave
|
||||||
|
txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute
|
||||||
|
txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex
|
||||||
|
txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut
|
||||||
|
txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave
|
||||||
|
txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute
|
||||||
|
txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex
|
||||||
|
txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut
|
||||||
|
txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH
|
||||||
|
txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde
|
||||||
|
txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave
|
||||||
|
txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute
|
||||||
|
txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex
|
||||||
|
txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde
|
||||||
|
txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut
|
||||||
|
txt = re.sub(u'×|×|×', r'{x}', txt) # dimension
|
||||||
|
txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash
|
||||||
|
txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave
|
||||||
|
txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute
|
||||||
|
txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex
|
||||||
|
txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut
|
||||||
|
txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave
|
||||||
|
txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s
|
||||||
|
txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave
|
||||||
|
txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute
|
||||||
|
txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex
|
||||||
|
txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde
|
||||||
|
txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut
|
||||||
|
txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring
|
||||||
|
txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae
|
||||||
|
txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla
|
||||||
|
txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave
|
||||||
|
txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute
|
||||||
|
txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex
|
||||||
|
txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut
|
||||||
|
txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave
|
||||||
|
txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute
|
||||||
|
txt = re.sub(u'î|î|î', r'{i^}', txt) # i-circumflex
|
||||||
|
txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut
|
||||||
|
txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth
|
||||||
|
txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde
|
||||||
|
txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave
|
||||||
|
txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute
|
||||||
|
txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex
|
||||||
|
txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde
|
||||||
|
txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut
|
||||||
|
txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke
|
||||||
|
txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave
|
||||||
|
txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute
|
||||||
|
txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex
|
||||||
|
txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut
|
||||||
|
txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute
|
||||||
|
txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut
|
||||||
|
txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE
|
||||||
|
txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe
|
||||||
|
txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron
|
||||||
|
txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron
|
||||||
|
txt = re.sub(u'•|•|•', r'{*}', txt) # bullet
|
||||||
|
txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc
|
||||||
|
txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira
|
||||||
|
txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee
|
||||||
|
txt = re.sub(u'€|€|€', r'{C=}', txt) # euro
|
||||||
|
txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark
|
||||||
|
txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade
|
||||||
|
txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club
|
||||||
|
txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart
|
||||||
|
txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond
|
||||||
|
|
||||||
|
# Move into main code?
|
||||||
|
# txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph
|
||||||
|
# txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph
|
||||||
|
# txt = re.sub(u'\n \n', r'\n<br />\n', txt) # blank paragraph - br tag
|
||||||
|
|
||||||
|
return txt
|
@ -625,6 +625,16 @@ class Application(QApplication):
|
|||||||
if s is not None:
|
if s is not None:
|
||||||
font.setStretch(s)
|
font.setStretch(s)
|
||||||
QApplication.setFont(font)
|
QApplication.setFont(font)
|
||||||
|
st = self.style()
|
||||||
|
if st is not None:
|
||||||
|
st = unicode(st.objectName()).lower()
|
||||||
|
if (islinux or isfreebsd) and st in ('windows', 'motif', 'cde'):
|
||||||
|
from PyQt4.Qt import QStyleFactory
|
||||||
|
styles = set(map(unicode, QStyleFactory.keys()))
|
||||||
|
if 'Cleanlooks' in styles:
|
||||||
|
self.setStyle('Cleanlooks')
|
||||||
|
else:
|
||||||
|
self.setStyle('Plastique')
|
||||||
|
|
||||||
def _send_file_open_events(self):
|
def _send_file_open_events(self):
|
||||||
with self._file_open_lock:
|
with self._file_open_lock:
|
||||||
|
@ -20,7 +20,7 @@ class StoreAction(InterfaceAction):
|
|||||||
action_spec = (_('Get books'), 'store.png', None, None)
|
action_spec = (_('Get books'), 'store.png', None, None)
|
||||||
|
|
||||||
def genesis(self):
|
def genesis(self):
|
||||||
self.qaction.triggered.connect(self.search)
|
self.qaction.triggered.connect(self.do_search)
|
||||||
self.store_menu = QMenu()
|
self.store_menu = QMenu()
|
||||||
self.load_menu()
|
self.load_menu()
|
||||||
|
|
||||||
@ -36,6 +36,9 @@ class StoreAction(InterfaceAction):
|
|||||||
self.store_list_menu.addAction(n, partial(self.open_store, p))
|
self.store_list_menu.addAction(n, partial(self.open_store, p))
|
||||||
self.qaction.setMenu(self.store_menu)
|
self.qaction.setMenu(self.store_menu)
|
||||||
|
|
||||||
|
def do_search(self):
|
||||||
|
return self.search()
|
||||||
|
|
||||||
def search(self, query=''):
|
def search(self, query=''):
|
||||||
self.show_disclaimer()
|
self.show_disclaimer()
|
||||||
from calibre.gui2.store.search.search import SearchDialog
|
from calibre.gui2.store.search.search import SearchDialog
|
||||||
@ -52,6 +55,8 @@ class StoreAction(InterfaceAction):
|
|||||||
author = ''
|
author = ''
|
||||||
if self.gui.current_view() is self.gui.library_view:
|
if self.gui.current_view() is self.gui.library_view:
|
||||||
author = self.gui.library_view.model().authors(row)
|
author = self.gui.library_view.model().authors(row)
|
||||||
|
if author:
|
||||||
|
author = author.replace('|', ' ')
|
||||||
else:
|
else:
|
||||||
mi = self.gui.current_view().model().get_book_display_info(row)
|
mi = self.gui.current_view().model().get_book_display_info(row)
|
||||||
author = ' & '.join(mi.authors)
|
author = ' & '.join(mi.authors)
|
||||||
|
@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['newline', 'max_line_length', 'force_max_line_length',
|
['newline', 'max_line_length', 'force_max_line_length',
|
||||||
'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references',
|
'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references',
|
||||||
'txt_output_encoding'])
|
'keep_color', 'txt_output_encoding'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
for x in get_option('newline').option.choices:
|
for x in get_option('newline').option.choices:
|
||||||
self.opt_newline.addItem(x)
|
self.opt_newline.addItem(x)
|
||||||
|
@ -122,6 +122,13 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_keep_color">
|
||||||
|
<property name="text">
|
||||||
|
<string>Keep text color, when possible</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
@ -10,7 +10,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
from PyQt4.Qt import (QWidget, QGridLayout, QGroupBox, QListView, Qt, QSpinBox,
|
from PyQt4.Qt import (QWidget, QGridLayout, QGroupBox, QListView, Qt, QSpinBox,
|
||||||
QDoubleSpinBox, QCheckBox, QLineEdit, QComboBox, QLabel)
|
QDoubleSpinBox, QCheckBox, QLineEdit, QComboBox, QLabel, QVariant)
|
||||||
|
|
||||||
from calibre.gui2.preferences.metadata_sources import FieldsModel as FM
|
from calibre.gui2.preferences.metadata_sources import FieldsModel as FM
|
||||||
|
|
||||||
@ -95,9 +95,9 @@ class ConfigWidget(QWidget):
|
|||||||
widget.setChecked(bool(val))
|
widget.setChecked(bool(val))
|
||||||
elif opt.type == 'choices':
|
elif opt.type == 'choices':
|
||||||
widget = QComboBox(self)
|
widget = QComboBox(self)
|
||||||
for x in opt.choices:
|
for key, label in opt.choices.iteritems():
|
||||||
widget.addItem(x)
|
widget.addItem(label, QVariant(key))
|
||||||
idx = opt.choices.index(val)
|
idx = widget.findData(QVariant(val))
|
||||||
widget.setCurrentIndex(idx)
|
widget.setCurrentIndex(idx)
|
||||||
widget.opt = opt
|
widget.opt = opt
|
||||||
widget.setToolTip(textwrap.fill(opt.desc))
|
widget.setToolTip(textwrap.fill(opt.desc))
|
||||||
@ -124,7 +124,8 @@ class ConfigWidget(QWidget):
|
|||||||
elif isinstance(w, QCheckBox):
|
elif isinstance(w, QCheckBox):
|
||||||
val = w.isChecked()
|
val = w.isChecked()
|
||||||
elif isinstance(w, QComboBox):
|
elif isinstance(w, QComboBox):
|
||||||
val = unicode(w.currentText())
|
idx = w.currentIndex()
|
||||||
|
val = unicode(w.itemData(idx).toString())
|
||||||
self.plugin.prefs[w.opt.name] = val
|
self.plugin.prefs[w.opt.name] = val
|
||||||
|
|
||||||
|
|
||||||
|
@ -71,9 +71,10 @@ class SourcesModel(QAbstractTableModel): # {{{
|
|||||||
plugin.is_configured()):
|
plugin.is_configured()):
|
||||||
return QIcon(I('list_remove.png'))
|
return QIcon(I('list_remove.png'))
|
||||||
elif role == Qt.ToolTipRole:
|
elif role == Qt.ToolTipRole:
|
||||||
|
base = plugin.description + '\n\n'
|
||||||
if plugin.is_configured():
|
if plugin.is_configured():
|
||||||
return _('This source is configured and ready to go')
|
return base + _('This source is configured and ready to go')
|
||||||
return _('This source needs configuration')
|
return base + _('This source needs configuration')
|
||||||
return NONE
|
return NONE
|
||||||
|
|
||||||
def setData(self, index, val, role):
|
def setData(self, index, val, role):
|
||||||
|
@ -29,7 +29,7 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
|
|||||||
detail_item = self.url + detail_item
|
detail_item = self.url + detail_item
|
||||||
|
|
||||||
if external or self.config.get('open_external', False):
|
if external or self.config.get('open_external', False):
|
||||||
open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
|
open_url(QUrl(url_slash_cleaner(detail_item)))
|
||||||
else:
|
else:
|
||||||
d = WebStoreDialog(self.gui, self.url, parent, detail_item)
|
d = WebStoreDialog(self.gui, self.url, parent, detail_item)
|
||||||
d.setWindowTitle(self.name)
|
d.setWindowTitle(self.name)
|
||||||
|
@ -1263,7 +1263,7 @@ class TagsModel(QAbstractItemModel): # {{{
|
|||||||
d['last'] = data[key][cat_len-1]
|
d['last'] = data[key][cat_len-1]
|
||||||
name = eval_formatter.safe_format(collapse_template,
|
name = eval_formatter.safe_format(collapse_template,
|
||||||
d, 'TAG_VIEW', None)
|
d, 'TAG_VIEW', None)
|
||||||
self.beginInsertRows(category_index, 999999, 1) #len(data[key])-1)
|
self.beginInsertRows(category_index, 999998, 999999) #len(data[key])-1)
|
||||||
sub_cat = TagTreeItem(parent=category, data = name,
|
sub_cat = TagTreeItem(parent=category, data = name,
|
||||||
tooltip = None, temporary=True,
|
tooltip = None, temporary=True,
|
||||||
category_icon = category_node.icon,
|
category_icon = category_node.icon,
|
||||||
@ -1296,7 +1296,7 @@ class TagsModel(QAbstractItemModel): # {{{
|
|||||||
key in ['authors', 'publisher', 'news', 'formats', 'rating'] or
|
key in ['authors', 'publisher', 'news', 'formats', 'rating'] or
|
||||||
key not in self.db.prefs.get('categories_using_hierarchy', []) or
|
key not in self.db.prefs.get('categories_using_hierarchy', []) or
|
||||||
len(components) == 1):
|
len(components) == 1):
|
||||||
self.beginInsertRows(category_index, 999999, 1)
|
self.beginInsertRows(category_index, 999998, 999999)
|
||||||
n = TagTreeItem(parent=node_parent, data=tag, tooltip=tt,
|
n = TagTreeItem(parent=node_parent, data=tag, tooltip=tt,
|
||||||
icon_map=self.icon_state_map)
|
icon_map=self.icon_state_map)
|
||||||
if tag.id_set is not None:
|
if tag.id_set is not None:
|
||||||
@ -1332,7 +1332,7 @@ class TagsModel(QAbstractItemModel): # {{{
|
|||||||
t.is_hierarchical = \
|
t.is_hierarchical = \
|
||||||
'5state' if t.category != 'search' else '3state'
|
'5state' if t.category != 'search' else '3state'
|
||||||
t.name = comp
|
t.name = comp
|
||||||
self.beginInsertRows(category_index, 999999, 1)
|
self.beginInsertRows(category_index, 999998, 999999)
|
||||||
node_parent = TagTreeItem(parent=node_parent, data=t,
|
node_parent = TagTreeItem(parent=node_parent, data=t,
|
||||||
tooltip=tt, icon_map=self.icon_state_map)
|
tooltip=tt, icon_map=self.icon_state_map)
|
||||||
child_map[(comp,tag.category)] = node_parent
|
child_map[(comp,tag.category)] = node_parent
|
||||||
|
@ -633,8 +633,8 @@ class LibraryPage(QWizardPage, LibraryUI):
|
|||||||
try:
|
try:
|
||||||
lang = prefs['language'].lower()[:2]
|
lang = prefs['language'].lower()[:2]
|
||||||
metadata_plugins = {
|
metadata_plugins = {
|
||||||
'zh' : ('Douban Books', 'Douban.com covers'),
|
'zh' : ('Douban Books',),
|
||||||
'fr' : ('Nicebooks', 'Nicebooks covers'),
|
'fr' : ('Nicebooks',),
|
||||||
}.get(lang, [])
|
}.get(lang, [])
|
||||||
from calibre.customize.ui import enable_plugin
|
from calibre.customize.ui import enable_plugin
|
||||||
for name in metadata_plugins:
|
for name in metadata_plugins:
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
16269
src/calibre/translations/ltg.po
Normal file
16269
src/calibre/translations/ltg.po
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -869,6 +869,7 @@ class Engine(threading.Thread):
|
|||||||
if DEBUG:
|
if DEBUG:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
except:
|
except:
|
||||||
|
if DEBUG:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
@ -1,209 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
# Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# Redistribution and use in source and binary forms, with or without
|
|
||||||
# modification, are permitted provided that the following conditions are met:
|
|
||||||
# * Redistributions of source code must retain the above copyright
|
|
||||||
# notice, this list of conditions and the following disclaimer.
|
|
||||||
# * Redistributions in binary form must reproduce the above copyright
|
|
||||||
# notice, this list of conditions and the following disclaimer in the
|
|
||||||
# documentation and/or other materials provided with the distribution.
|
|
||||||
# * Neither the name of the <organization> nor the
|
|
||||||
# names of its contributors may be used to endorse or promote products
|
|
||||||
# derived from this software without specific prior written permission.
|
|
||||||
#
|
|
||||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
|
||||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
||||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
||||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
||||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
from calibre.ebooks.oeb.base import barename
|
|
||||||
|
|
||||||
class EchoTarget:
    """lxml parser *target* that converts an (X)HTML event stream into
    Textile markup.

    Feed it to ``etree.XMLParser(target=...)``; after parsing, the
    generated Textile fragments are in ``self.final_output`` and can be
    joined into the final text.

    State:
      final_output -- list of textile fragments, in document order
      block        -- True while inside an <a> element; text is then
                      buffered in ``haystack`` instead of being emitted,
                      so the whole link can be written as one unit
      ol_ident     -- current <ol> nesting depth
      ul_ident     -- current <ul> nesting depth
      list_types   -- stack of open list kinds ('ul'/'ol')
      haystack     -- buffered link text while ``block`` is True
    """

    def __init__(self):
        self.final_output = []
        self.block = False
        self.ol_ident = 0
        self.ul_ident = 0
        self.list_types = []
        self.haystack = []

    def start(self, tag, attrib):
        """Emit the Textile opening markup for *tag*."""
        tag = barename(tag)

        newline = '\n'
        dot = ''
        new_tag = ''

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            new_tag = tag
            dot = '. '
        elif tag == 'p':
            new_tag = ''
            dot = ''
        elif tag == 'blockquote':
            new_tag = 'bq'
            dot = '. '
        elif tag in ('b', 'strong'):
            new_tag = '*'
            newline = ''
        elif tag in ('em', 'i'):
            new_tag = '_'
            newline = ''
        elif tag == 'cite':
            new_tag = '??'
            newline = ''
        elif tag == 'del':
            new_tag = '-'
            newline = ''
        elif tag == 'ins':
            new_tag = '+'
            newline = ''
        elif tag == 'sup':
            new_tag = '^'
            newline = ''
        elif tag == 'sub':
            new_tag = '~'
            newline = ''
        elif tag == 'span':
            new_tag = ''
            newline = ''
        elif tag == 'a':
            # Defer output: buffer everything until </a> so the link can
            # be emitted as a single "text":href unit.
            self.block = True
            if 'title' in attrib:
                self.a_part = {'title':attrib.get('title'),
                        'href':attrib.get('href', '')}
            else:
                self.a_part = {'title':None, 'href':attrib.get('href', '')}
            new_tag = ''
            newline = ''

        elif tag == 'img':
            if 'alt' in attrib:
                # Textile image with alt text: !src(alt)
                # FIX: the original tested for 'alt' but then formatted
                # attrib.get('title'), losing the alt text.
                new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('alt'),)
            else:
                new_tag = ' !%s' % attrib.get('src')
            newline = ''

        elif tag in ('ul', 'ol'):
            new_tag = ''
            newline = ''
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1

        elif tag == 'li':
            # List item marker depth reflects total nesting; '*' for
            # unordered, '#' for ordered (per innermost open list).
            indent = self.ul_ident + self.ol_ident
            if self.list_types[-1] == 'ul':
                new_tag = '*' * indent + ' '
                newline = '\n'
            else:
                new_tag = '#' * indent + ' '
                newline = '\n'


        if tag not in ('ul', 'ol'):
            textile = '%(newline)s%(tag)s%(dot)s' % \
                    {
                        'newline':newline,
                        'tag':new_tag,
                        'dot':dot
                    }
            if not self.block:
                self.final_output.append(textile)
            else:
                self.haystack.append(textile)

    def end(self, tag):
        """Emit the Textile closing markup for *tag*."""
        tag = barename(tag)

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            self.final_output.append('\n')
        elif tag in ('b', 'strong'):
            self.final_output.append('*')
        elif tag in ('em', 'i'):
            self.final_output.append('_')
        elif tag == 'cite':
            self.final_output.append('??')
        elif tag == 'del':
            self.final_output.append('-')
        elif tag == 'ins':
            self.final_output.append('+')
        elif tag == 'sup':
            self.final_output.append('^')
        elif tag == 'sub':
            self.final_output.append('~')
        elif tag == 'span':
            self.final_output.append('')
        elif tag == 'a':
            # Flush the buffered link text collected since <a>.
            if self.a_part['title']:
                textilized = ' "%s (%s)":%s ' % (
                                 ''.join(self.haystack),
                                 self.a_part.get('title'),
                                 self.a_part.get('href'),
                             )
                self.haystack = []
            else:
                textilized = ' "%s":%s ' % (
                                 ''.join(self.haystack),
                                 self.a_part.get('href'),
                             )
                self.haystack = []
            self.final_output.append(textilized)
            self.block = False
        elif tag == 'img':
            self.final_output.append('!')
        elif tag == 'ul':
            self.ul_ident -= 1
            self.list_types.pop()
            if len(self.list_types) == 0:
                # Outermost list closed: separate it from following text.
                self.final_output.append('\n')
        elif tag == 'ol':
            self.ol_ident -= 1
            self.list_types.pop()
            if len(self.list_types) == 0:
                self.final_output.append('\n')

    def data(self, data):
        """Handle text content; strip newlines so markup stays inline."""
        # we dont want any linebreaks inside our tags
        node_data = data.replace('\n','')
        if not self.block:
            self.final_output.append(node_data)
        else:
            self.haystack.append(node_data)

    def comment(self, text):
        # HTML comments produce no Textile output.
        pass

    def close(self):
        # Required by the lxml target interface; return value is what
        # etree.fromstring() returns when parsing with this target.
        return "closed!"
def html2textile(html):
    """Convert an HTML string to Textile markup.

    Two passes: first the (possibly malformed) HTML is parsed leniently
    and re-serialized as XML with ignorable whitespace removed; then the
    cleaned XML is streamed through :class:`EchoTarget`, which emits the
    Textile fragments.

    :param html: HTML source as a string
    :return: the equivalent Textile markup, stripped of surrounding
             whitespace
    """
    # 1st pass: clean the whitespace and convert html to xhtml
    parser = etree.HTMLParser()
    tree = etree.fromstring(html, parser)
    xhtml = etree.tostring(tree, method="xml")
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.XML(xhtml, parser)
    cleaned_html = etree.tostring(root)
    # 2nd pass: build textile via the streaming parser target
    target = EchoTarget()
    parser = etree.XMLParser(target=target)
    etree.fromstring(cleaned_html, parser)
    # .strip() replaces the original's redundant .lstrip().rstrip()
    return ''.join(target.final_output).strip()
|
|
Loading…
x
Reference in New Issue
Block a user