mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge from trunk
This commit is contained in:
commit
3c2b969c5a
BIN
resources/images/news/exiled.png
Normal file
BIN
resources/images/news/exiled.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.3 KiB |
35
resources/recipes/cicero.recipe
Normal file
35
resources/recipes/cicero.recipe
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class Cicero(BasicNewsRecipe):
    """Recipe for Cicero, a German-language magazine ('Magazin fuer politische Kultur').

    Only the two aggregate feeds (whole portfolio and all columns) are
    enabled; the per-section feeds are kept below as commented-out entries so
    they can be switched on individually.
    """

    timefmt = ' [%Y-%m-%d]'
    title = u'Cicero'
    __author__ = 'mad@sharktooth.de'
    description = u'Magazin f\xfcr politische Kultur'
    oldest_article = 7
    language = 'de'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'Ringier Publishing'
    category = 'news, politics, Germany'
    encoding = 'iso-8859-1'
    publication_type = 'magazine'
    masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'

    feeds = [
        (u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
        #(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
        #(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
        #(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
        #(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
        #(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
        #(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
        #(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
        #(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
        #(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
        # Title typo fixed: was 'Kolulmnen'.
        (u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
        #(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
        #(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
    ]

    def print_version(self, url):
        """Return the print-page URL for the article at ``url``.

        The text after the last '?' of ``url`` (its query string) is appended
        to the site's ``page_print.php`` endpoint. If ``url`` contains no '?',
        ``rpartition`` yields the whole string, which is appended unchanged.
        """
        query = url.rpartition('?')[2]
        return 'http://www.cicero.de/page_print.php?' + query
|
@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
|
|||||||
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
|
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
|
||||||
timefmt ='[%a, %d %b, %Y]'
|
timefmt ='[%a, %d %b, %Y]'
|
||||||
encoding ='utf8'
|
encoding ='utf8'
|
||||||
language ='es_ES'
|
language ='es'
|
||||||
remove_javascript =True
|
remove_javascript =True
|
||||||
remove_tags_after =dict(id='Texto')
|
remove_tags_after =dict(id='Texto')
|
||||||
remove_tags_before =dict(id='Texto')
|
remove_tags_before =dict(id='Texto')
|
||||||
|
122
resources/recipes/el_correo.recipe
Normal file
122
resources/recipes/el_correo.recipe
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '08 Januery 2011, desUBIKado'
|
||||||
|
__author__ = 'desUBIKado'
|
||||||
|
__description__ = 'Daily newspaper from Biscay'
|
||||||
|
__version__ = 'v0.08'
|
||||||
|
__date__ = '08, Januery 2011'
|
||||||
|
'''
|
||||||
|
[url]http://www.elcorreo.com/[/url]
|
||||||
|
'''
|
||||||
|
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class heraldo(BasicNewsRecipe):
    """Recipe for El Correo (elcorreo.com), Biscay edition.

    Articles come from the site's RSS feeds. The page is trimmed with
    ``keep_only_tags`` / ``remove_tags`` and then patched with the
    ``preprocess_regexps`` substitutions listed at the end of the class.
    """

    __author__ = 'desUBIKado'
    description = 'Daily newspaper from Biscay'
    title = u'El Correo'
    publisher = 'Vocento'
    category = 'News, politics, culture, economy, general interest'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'iso-8859-1'
    remove_empty_feeds = True
    # Left enabled: the preprocess_regexps below rewrite text found inside
    # the page's inline scripts (e.g. 'var RUTA_IMAGEN').
    remove_javascript = False

    feeds = [
        (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
        (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
        # Scheme typo fixed: the URL used to start with 'hhttp://'.
        (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
        (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
        (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
        (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
        (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
        (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
        (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
        dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
        dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
        dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
        dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
        dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
        dict(name='div', attrs={'id':['articulopina']}),
        dict(name='br', attrs={'class':'clear'}),
        dict(name='form', attrs={'name':'frm_conversor2'})
    ]

    remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
    remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})

    def get_cover_url(self):
        """Return the URL of today's front-page PDF, or the site logo.

        The PDF URL is built from the local date as DDMMYYYY, e.g.
        http://info.elcorreo.com/pdf/06012011-viz.pdf. If opening it fails,
        a message is logged and the newspaper logo URL is returned instead.
        """
        today = time.strftime('%d%m%Y', time.localtime())
        cover = 'http://info.elcorreo.com/pdf/' + today + '-viz.pdf'

        br = self.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Not a bare except: Ctrl-C / SystemExit must still propagate.
            self.log("\nPortada no disponible")
            cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
        return cover

    extra_css = '''
        h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
        h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
        h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
        h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
        img{margin-bottom: 0.4em}
        '''

    preprocess_regexps = [

        # To present the image of the embedded video
        (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
        # The dot is escaped so only a literal '.jpg";' is rewritten.
        (re.compile(r'\.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
        (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),

        # To separate paragraphs with a blank line
        (re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),

        # To put a blank line between the subtitle and the date and time of the news
        (re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),

        # To put a blank line between the intro of the embedded videos and the previous text
        (re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),

        # To view photos from the first when these are presented as a gallery
        (re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),

        # To remove the link of the title
        (re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
        (re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),

    ]
|
||||||
|
|
43
resources/recipes/el_publico.recipe
Normal file
43
resources/recipes/el_publico.recipe
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__author__ = 'Gerardo Diez'
|
||||||
|
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
|
||||||
|
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
'''
|
||||||
|
publico.es
|
||||||
|
'''
|
||||||
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
class Publico(BasicNewsRecipe):
    """Recipe for Publico.es, built from the site's per-section RSS feeds.

    Only the element with id 'main' is kept from each article page; comment
    lists, toolbars and similar blocks inside it are stripped by
    ``remove_tags``.
    """

    title = u'Publico.es'
    __author__ = 'Gerardo Diez'
    publisher = u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
    category = 'news, politics, finances, world, spain, science, catalunya'
    oldest_article = 1
    max_articles_per_feed = 100
    simultaneous_downloads = 10
    cover_url = u'http://imagenes.publico.es/css/img/logo_publico.gif'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'utf8'
    language = 'es'
    remove_javascript = True
    no_stylesheets = True
    # A list of tag specs, like the other recipes in this change; the
    # original assigned a bare dict here.
    keep_only_tags = [dict(id='main')]
    remove_tags = [
        dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
        dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
        dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
        dict(name='h5', attrs={'id':'comentarios'})
    ]
    feeds = [
        (u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
        (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
        (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
        (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
        (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
        (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
        (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
        (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
        (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes'),
    ]
|
||||||
|
|
||||||
|
|
@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es_ES'
|
language = 'es'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
|
masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
'''
|
'''
|
||||||
exiledonline.com
|
exiledonline.com
|
||||||
'''
|
'''
|
||||||
@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
publication_type = 'newsblog'
|
||||||
|
masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
|
||||||
|
extra_css = """
|
||||||
|
body{font-family: Arial,Helvetica,sans-serif}
|
||||||
|
#topslug{font-size: xx-large; font-weight: bold; color: red}
|
||||||
|
"""
|
||||||
|
|
||||||
cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
|
conversion_options = {
|
||||||
|
'comment' : description
|
||||||
html2lrf_options = [
|
, 'tags' : category
|
||||||
'--comment' , description
|
, 'publisher' : publisher
|
||||||
, '--base-font-size', '10'
|
, 'language' : language
|
||||||
, '--category' , category
|
}
|
||||||
, '--publisher' , publisher
|
|
||||||
]
|
|
||||||
|
|
||||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':'main'})]
|
keep_only_tags = [dict(name='div', attrs={'id':'main'})]
|
||||||
|
|
||||||
@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
|
|||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
del item['style']
|
del item['style']
|
||||||
mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
|
for alink in soup.findAll('a'):
|
||||||
soup.head.insert(0,mtag)
|
if alink.string is not None:
|
||||||
|
tstr = alink.string
|
||||||
|
alink.replaceWith(tstr)
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def get_article_url(self, article):
|
def get_article_url(self, article):
|
||||||
raw = article.get('link', None)
|
raw = article.get('link', None)
|
||||||
final = raw + 'all/1/'
|
final = raw + 'all/1/'
|
||||||
return final
|
return final
|
||||||
|
|
||||||
|
@ -3,29 +3,31 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '04 December 2010, desUBIKado'
|
__copyright__ = '04 December 2010, desUBIKado'
|
||||||
__author__ = 'desUBIKado'
|
__author__ = 'desUBIKado'
|
||||||
__description__ = 'Daily newspaper from Aragon'
|
__description__ = 'Daily newspaper from Aragon'
|
||||||
__version__ = 'v0.03'
|
__version__ = 'v0.04'
|
||||||
__date__ = '11, December 2010'
|
__date__ = '6, Januery 2011'
|
||||||
'''
|
'''
|
||||||
[url]http://www.heraldo.es/[/url]
|
[url]http://www.heraldo.es/[/url]
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class heraldo(BasicNewsRecipe):
|
class heraldo(BasicNewsRecipe):
|
||||||
__author__ = 'desUBIKado'
|
__author__ = 'desUBIKado'
|
||||||
description = 'Daily newspaper from Aragon'
|
description = 'Daily newspaper from Aragon'
|
||||||
title = u'Heraldo de Aragon'
|
title = u'Heraldo de Aragon'
|
||||||
publisher = 'OJD Nielsen'
|
publisher = 'OJD Nielsen'
|
||||||
category = 'News, politics, culture, economy, general interest'
|
category = 'News, politics, culture, economy, general interest'
|
||||||
language = 'es'
|
language = 'es'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
oldest_article = 1
|
oldest_article = 2
|
||||||
|
delay = 1
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
recursion = 10
|
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
|
(u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
|
||||||
@ -37,29 +39,39 @@ class heraldo(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
|
remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
|
||||||
dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
|
dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
|
||||||
dict(name='form', attrs={'class':'form'})]
|
dict(name='form', attrs={'class':'form'}),
|
||||||
|
dict(name='ul', attrs={'id':['cont-tags','pag-1']})]
|
||||||
|
|
||||||
remove_tags_before = dict(name='div' , attrs={'id':'dts'})
|
remove_tags_before = dict(name='div' , attrs={'id':'dts'})
|
||||||
remove_tags_after = dict(name='div' , attrs={'id':'com'})
|
remove_tags_after = dict(name='div' , attrs={'id':'com'})
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover = None
|
cover = None
|
||||||
st = time.localtime()
|
st = time.localtime()
|
||||||
year = str(st.tm_year)
|
year = str(st.tm_year)
|
||||||
month = "%.2d" % st.tm_mon
|
month = "%.2d" % st.tm_mon
|
||||||
day = "%.2d" % st.tm_mday
|
day = "%.2d" % st.tm_mday
|
||||||
#[url]http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf[/url]
|
#[url]http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf[/url]
|
||||||
cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
|
cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
try:
|
try:
|
||||||
br.open(cover)
|
br.open(cover)
|
||||||
except:
|
except:
|
||||||
self.log("\nPortada no disponible")
|
self.log("\nPortada no disponible")
|
||||||
cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
|
cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
|
||||||
return cover
|
return cover
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
|
.con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
|
||||||
'''
|
.con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
|
||||||
|
.con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
|
||||||
|
.ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
|
||||||
|
img{margin-bottom: 0.4em}
|
||||||
|
'''
|
||||||
|
|
||||||
|
preprocess_regexps = [
|
||||||
|
|
||||||
|
# To separate the comments with a blank line
|
||||||
|
(re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div id="com"')
|
||||||
|
]
|
||||||
|
@ -117,7 +117,6 @@ if iswindows:
|
|||||||
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
|
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
|
||||||
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
|
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
|
||||||
|
|
||||||
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
|
|
||||||
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
|
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
|
||||||
popplerqt4_lib_dirs = poppler_lib_dirs
|
popplerqt4_lib_dirs = poppler_lib_dirs
|
||||||
poppler_libs = ['poppler']
|
poppler_libs = ['poppler']
|
||||||
@ -131,7 +130,6 @@ elif isosx:
|
|||||||
fc_lib = '/sw/lib'
|
fc_lib = '/sw/lib'
|
||||||
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
|
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
|
||||||
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
|
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
|
||||||
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
|
|
||||||
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
|
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
|
||||||
'/sw/lib')
|
'/sw/lib')
|
||||||
poppler_libs = ['poppler']
|
poppler_libs = ['poppler']
|
||||||
@ -150,9 +148,6 @@ else:
|
|||||||
# Include directories
|
# Include directories
|
||||||
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
|
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
|
||||||
'POPPLER_INC_DIR', '/usr/include/poppler')
|
'POPPLER_INC_DIR', '/usr/include/poppler')
|
||||||
popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
|
|
||||||
if not popplerqt4_inc_dirs:
|
|
||||||
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
|
|
||||||
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
|
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
|
||||||
'/usr/include')
|
'/usr/include')
|
||||||
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
|
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
|
||||||
@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
|
|||||||
poppler_error = \
|
poppler_error = \
|
||||||
('Poppler not found on your system. Various PDF related',
|
('Poppler not found on your system. Various PDF related',
|
||||||
' functionality will not work. Use the POPPLER_INC_DIR and',
|
' functionality will not work. Use the POPPLER_INC_DIR and',
|
||||||
' POPPLER_LIB_DIR environment variables.')
|
' POPPLER_LIB_DIR environment variables. calibre requires '
|
||||||
|
' the poppler XPDF headers. If your distro does not '
|
||||||
popplerqt4_error = None
|
' include them you will have to re-compile poppler '
|
||||||
if not popplerqt4_inc_dirs or not os.path.exists(
|
' by hand with --enable-xpdf-headers')
|
||||||
os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
|
|
||||||
popplerqt4_error = \
|
|
||||||
('Poppler Qt4 bindings not found on your system.')
|
|
||||||
|
|
||||||
magick_error = None
|
magick_error = None
|
||||||
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
|
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
|
||||||
'wand')):
|
'wand')):
|
||||||
magick_error = ('ImageMagick not found on your system. '
|
magick_error = ('ImageMagick not found on your system. '
|
||||||
'Try setting the environment variables MAGICK_INC '
|
'Try setting the environment variables MAGICK_INC '
|
||||||
'and MAGICK_LIB to help calibre locate the inclue and libbrary '
|
'and MAGICK_LIB to help calibre locate the include and library '
|
||||||
'files.')
|
'files.')
|
||||||
|
|
||||||
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
|
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
|
||||||
|
@ -29,7 +29,7 @@ class ANDROID(USBMS):
|
|||||||
# Motorola
|
# Motorola
|
||||||
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
|
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
|
||||||
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
|
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
|
||||||
0x4286 : [0x216] },
|
0x4286 : [0x216], 0x42b3 : [0x216] },
|
||||||
|
|
||||||
# Sony Ericsson
|
# Sony Ericsson
|
||||||
0xfce : { 0xd12e : [0x0100]},
|
0xfce : { 0xd12e : [0x0100]},
|
||||||
|
@ -78,6 +78,8 @@ class DocAnalysis(object):
|
|||||||
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||||
elif format == 'spanned_html':
|
elif format == 'spanned_html':
|
||||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||||
|
elif format == 'txt':
|
||||||
|
linere = re.compile('.*?\n')
|
||||||
self.lines = linere.findall(raw)
|
self.lines = linere.findall(raw)
|
||||||
|
|
||||||
def line_length(self, percent):
|
def line_length(self, percent):
|
||||||
@ -175,7 +177,7 @@ class Dehyphenator(object):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||||
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
|
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
|
||||||
# remove prefixes if the prefix was not already the point of hyphenation
|
# remove prefixes if the prefix was not already the point of hyphenation
|
||||||
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
|
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
|
||||||
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
|
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
|
||||||
@ -197,7 +199,7 @@ class Dehyphenator(object):
|
|||||||
searchresult = self.html.find(lookupword.lower())
|
searchresult = self.html.find(lookupword.lower())
|
||||||
except:
|
except:
|
||||||
return hyphenated
|
return hyphenated
|
||||||
if self.format == 'html_cleanup':
|
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
||||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||||
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
||||||
return dehyphenated
|
return dehyphenated
|
||||||
@ -223,10 +225,15 @@ class Dehyphenator(object):
|
|||||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
|
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
|
||||||
elif format == 'pdf':
|
elif format == 'pdf':
|
||||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||||
|
elif format == 'txt':
|
||||||
|
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||||
elif format == 'individual_words':
|
elif format == 'individual_words':
|
||||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
|
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||||
elif format == 'html_cleanup':
|
elif format == 'html_cleanup':
|
||||||
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||||
|
elif format == 'txt_cleanup':
|
||||||
|
intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
|
||||||
|
|
||||||
|
|
||||||
html = intextmatch.sub(self.dehyphenate, html)
|
html = intextmatch.sub(self.dehyphenate, html)
|
||||||
return html
|
return html
|
||||||
@ -561,8 +568,8 @@ class HTMLPreProcessor(object):
|
|||||||
html = html.replace(start, '<!--')
|
html = html.replace(start, '<!--')
|
||||||
html = html.replace(stop, '-->')
|
html = html.replace(stop, '-->')
|
||||||
# convert ellipsis to entities to prevent wrapping
|
# convert ellipsis to entities to prevent wrapping
|
||||||
html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
|
html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
|
||||||
# convert double dashes to em-dash
|
# convert double dashes to em-dash
|
||||||
html = re.sub('\s--\s', u'\u2014', html)
|
html = re.sub(r'\s--\s', u'\u2014', html)
|
||||||
return substitute_entites(html)
|
return substitute_entites(html)
|
||||||
|
|
||||||
|
@ -154,7 +154,7 @@ class PreProcessor(object):
|
|||||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
|
||||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
||||||
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||||
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
||||||
@ -184,6 +184,21 @@ class PreProcessor(object):
|
|||||||
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def punctuation_unwrap(self, length, content, format):
    '''
    Join lines that were hard-wrapped in the middle of a sentence.

    A line break is removed (replaced by a single space) when the text in
    front of it is at least ``length`` characters long and ends in a
    character that normally does not finish a paragraph: a lowercase
    letter, a comma, a colon, a closing parenthesis, ``I``, ``A`` or a
    semicolon that does not terminate an entity such as ``&nbsp;``.

    @length: Minimum number of characters that must precede the final
    character of the line for the break to be treated as a wrap.
    @content: The text to unwrap.
    @format: 'txt' to match plain newline wraps, any other value to match
    wraps expressed as closing/opening span, p or div tags. (The name
    shadows the builtin, it is kept so existing callers keep working.)

    Returns content with the matching line breaks replaced by spaces.
    '''
    # Every piece is either pure ASCII or an explicit unicode literal, so
    # the assembled pattern is always unicode text. The character class
    # holds non-ASCII letters and must not be a byte string.
    # Despite the regex being anchored on what comes *before* the break,
    # this is a look-behind. (?<!&\w{4}); is a semicolon that is not
    # part of an entity.
    lookbehind = (
        r"(?<=.{" + str(length) + r"}"
        + u"([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)IA]"
        + r"|(?<!&\w{4});))"
    )

    if format == 'txt':
        # One to four newlines, each optionally preceded by spaces/tabs.
        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
        unwrap_regex = lookbehind + txt_line_wrap
    else:
        # Closing tag(s) of the wrapped line.
        line_ending = r"\s*</(span|p|div)>\s*(</(p|span|div)>)?"
        # Up to three empty paragraphs between the two halves.
        blanklines = (
            r"\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*"
            r"(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)"
            r"</(span|p|div)>\s*){0,3}\s*"
        )
        # Opening tag(s) of the continuation line.
        line_opening = r"<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
        unwrap_regex = lookbehind + line_ending + blanklines + line_opening

    unwrap = re.compile(unwrap_regex, re.UNICODE)
    return unwrap.sub(' ', content)
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, html):
|
def __call__(self, html):
|
||||||
@ -194,7 +209,7 @@ class PreProcessor(object):
|
|||||||
totalwords = 0
|
totalwords = 0
|
||||||
totalwords = self.get_word_count(html)
|
totalwords = self.get_word_count(html)
|
||||||
|
|
||||||
if totalwords < 20:
|
if totalwords < 50:
|
||||||
self.log("not enough text, not preprocessing")
|
self.log("not enough text, not preprocessing")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
@ -312,8 +327,7 @@ class PreProcessor(object):
|
|||||||
self.log("Done dehyphenating")
|
self.log("Done dehyphenating")
|
||||||
# Unwrap lines using punctation and line length
|
# Unwrap lines using punctation and line length
|
||||||
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
||||||
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
html = self.punctuation_unwrap(length, html, 'html')
|
||||||
html = unwrap.sub(' ', html)
|
|
||||||
#check any remaining hyphens, but only unwrap if there is a match
|
#check any remaining hyphens, but only unwrap if there is a match
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
@ -343,6 +357,6 @@ class PreProcessor(object):
|
|||||||
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||||
|
|
||||||
# Center separator lines
|
# Center separator lines
|
||||||
html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
|
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
@ -324,14 +324,16 @@ class Metadata(object):
|
|||||||
if metadata is None:
|
if metadata is None:
|
||||||
traceback.print_stack()
|
traceback.print_stack()
|
||||||
return
|
return
|
||||||
metadata = copy.deepcopy(metadata)
|
m = {}
|
||||||
if '#value#' not in metadata:
|
for k in metadata:
|
||||||
if metadata['datatype'] == 'text' and metadata['is_multiple']:
|
m[k] = copy.copy(metadata[k])
|
||||||
metadata['#value#'] = []
|
if '#value#' not in m:
|
||||||
|
if m['datatype'] == 'text' and m['is_multiple']:
|
||||||
|
m['#value#'] = []
|
||||||
else:
|
else:
|
||||||
metadata['#value#'] = None
|
m['#value#'] = None
|
||||||
_data = object.__getattribute__(self, '_data')
|
_data = object.__getattribute__(self, '_data')
|
||||||
_data['user_metadata'][field] = metadata
|
_data['user_metadata'][field] = m
|
||||||
|
|
||||||
def template_to_attribute(self, other, ops):
|
def template_to_attribute(self, other, ops):
|
||||||
'''
|
'''
|
||||||
|
58
src/calibre/ebooks/txt/heuristicprocessor.py
Normal file
58
src/calibre/ebooks/txt/heuristicprocessor.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from calibre import prepare_string_for_xml
|
||||||
|
|
||||||
|
class TXTHeuristicProcessor(object):
    '''
    Convert plain text to HTML, guessing inline formatting on the way.

    Known abbreviations/Latin phrases and text enclosed in common plain
    text emphasis delimiters (_x_, /x/, *x*, ~x~, ...) are wrapped in
    <i> tags, then chapter headings are marked up by the HTML
    PreProcessor.
    '''

    def __init__(self):
        # Abbreviations and phrases that are always set in italics.
        # 'et cetra' is a misspelling kept for backwards compatibility,
        # the correctly spelled phrase is listed next to it.
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'et cetera',
            'n.b.', 'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.',
            'Mdme.', 'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        # Emphasis delimiters. The patterns are applied in list order, so
        # composite delimiters must come before the simple delimiters
        # they are built from; otherwise the simple pattern consumes the
        # outer characters first and leaves stray markers or nested tags.
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
        ]

    def _italicize_words(self, paragraph):
        '''
        Wrap every whole occurrence of an ITALICIZE_WORDS entry in <i> tags.
        '''
        if not self.ITALICIZE_WORDS:
            # An empty alternation would match the empty string everywhere.
            return paragraph
        alternatives = []
        # Longest first so that 'PPS.' wins over its suffix 'PS.'.
        for word in sorted(self.ITALICIZE_WORDS, key=len, reverse=True):
            pat = re.escape(word)
            if word[-1:].isalnum():
                # Phrases ending in a letter must not continue into a word.
                pat += r'(?!\w)'
            alternatives.append(pat)
        # (?<!\w) keeps abbreviations from matching inside ordinary words
        # ('leg.' contains 'eg.', 'die.' contains 'ie.').
        words_re = re.compile(r'(?<!\w)(?:%s)' % '|'.join(alternatives),
                re.UNICODE)
        return words_re.sub(lambda mo: '<i>%s</i>' % mo.group(0), paragraph)

    def process_paragraph(self, paragraph):
        '''
        Apply the italicizing heuristics to a single paragraph.

        @paragraph: The paragraph text. convert passes it in already XML
        escaped.

        Returns the paragraph with <i> tags inserted.
        '''
        paragraph = self._italicize_words(paragraph)
        for pat in self.ITALICIZE_STYLE_PATS:
            paragraph = re.sub(pat, r'<i>\g<words></i>', paragraph)
        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        '''
        Convert txt to an HTML document.

        @txt: The text to convert, paragraphs separated by blank lines.
        @title: Text placed in the <title> of the generated document.
        @epub_split_size_kb: Passed through to split_txt.

        Returns the HTML as a string.
        '''
        # Imported here rather than at module level; the processor module
        # appears to import this class itself, so a top level import
        # would presumably be circular.
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        # A blank line separates paragraphs, newlines inside a paragraph
        # are only wraps and become spaces.
        processed = [
            u'<p>%s</p>' % self.process_paragraph(
                prepare_string_for_xml(block.replace('\n', ' ')))
            for block in txt.split('\n\n')
        ]

        txt = u'\n'.join(processed)
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)

        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)

        return html
|
@ -7,10 +7,12 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||||
|
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||||
from calibre.ebooks.chardet import detect
|
from calibre.ebooks.chardet import detect
|
||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
|
convert_heuristic, normalize_line_endings
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -22,20 +24,24 @@ class TXTInput(InputFormatPlugin):
|
|||||||
|
|
||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||||
choices=['auto', 'block', 'single', 'print'],
|
choices=['auto', 'block', 'single', 'print', 'unformatted'],
|
||||||
help=_('Paragraph structure.\n'
|
help=_('Paragraph structure.\n'
|
||||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
|
||||||
'* auto: Try to auto detect paragraph type.\n'
|
'* auto: Try to auto detect paragraph type.\n'
|
||||||
'* block: Treat a blank line as a paragraph break.\n'
|
'* block: Treat a blank line as a paragraph break.\n'
|
||||||
'* single: Assume every line is a paragraph.\n'
|
'* single: Assume every line is a paragraph.\n'
|
||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||||
'starts a paragraph.')),
|
'starts a paragraph.'
|
||||||
|
'* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
|
||||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||||
choices=['auto', 'none', 'markdown'],
|
choices=['auto', 'none', 'heuristic', 'markdown'],
|
||||||
help=_('Formatting used within the document.'
|
help=_('Formatting used within the document.'
|
||||||
'* auto: Try to auto detect the document formatting.\n'
|
'* auto: Automatically decide which formatting processor to use.\n'
|
||||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
'* none: Do not process the document formatting. Everything is a '
|
||||||
'* markdown: Run the input though the markdown pre-processor. '
|
'paragraph and no styling is applied.\n'
|
||||||
|
'* heuristic: Process using heuristics to determine formatting such '
|
||||||
|
'as chapter headings and italic text.\n'
|
||||||
|
'* markdown: Processing using markdown formatting. '
|
||||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||||
help=_('Normally extra spaces are condensed into a single space. '
|
help=_('Normally extra spaces are condensed into a single space. '
|
||||||
@ -68,6 +74,13 @@ class TXTInput(InputFormatPlugin):
|
|||||||
if options.preserve_spaces:
|
if options.preserve_spaces:
|
||||||
txt = preserve_spaces(txt)
|
txt = preserve_spaces(txt)
|
||||||
|
|
||||||
|
# Normalize line endings
|
||||||
|
txt = normalize_line_endings(txt)
|
||||||
|
|
||||||
|
# Get length for hyphen removal and punctuation unwrap
|
||||||
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
|
length = docanalysis.line_length(.5)
|
||||||
|
|
||||||
if options.formatting_type == 'auto':
|
if options.formatting_type == 'auto':
|
||||||
options.formatting_type = detect_formatting_type(txt)
|
options.formatting_type = detect_formatting_type(txt)
|
||||||
|
|
||||||
@ -88,15 +101,37 @@ class TXTInput(InputFormatPlugin):
|
|||||||
else:
|
else:
|
||||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||||
|
|
||||||
|
# Dehyphenate
|
||||||
|
dehyphenator = Dehyphenator()
|
||||||
|
txt = dehyphenator(txt,'txt', length)
|
||||||
|
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
# single and print at transformed to block for processing.
|
# single and print at transformed to block for processing.
|
||||||
if options.paragraph_type == 'single':
|
|
||||||
|
if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
|
||||||
txt = separate_paragraphs_single_line(txt)
|
txt = separate_paragraphs_single_line(txt)
|
||||||
elif options.paragraph_type == 'print':
|
elif options.paragraph_type == 'print':
|
||||||
txt = separate_paragraphs_print_formatted(txt)
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
|
|
||||||
|
if options.paragraph_type == 'unformatted':
|
||||||
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
|
# get length
|
||||||
|
|
||||||
|
# unwrap lines based on punctuation
|
||||||
|
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
|
||||||
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
|
||||||
|
if options.formatting_type == 'heuristic':
|
||||||
|
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
|
||||||
|
else:
|
||||||
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
|
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
||||||
|
dehyphenator = Dehyphenator()
|
||||||
|
html = dehyphenator(html,'txt_cleanup', length)
|
||||||
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
|
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
html_input = plugin_for_input_format('html')
|
html_input = plugin_for_input_format('html')
|
||||||
|
@ -10,6 +10,8 @@ from calibre import prepare_string_for_xml, isbytestring
|
|||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
||||||
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
@ -17,7 +19,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||||
|
|
||||||
def convert_basic(txt, title='', epub_split_size_kb=0):
|
def clean_txt(txt):
|
||||||
if isbytestring(txt):
|
if isbytestring(txt):
|
||||||
txt = txt.decode('utf-8', 'replace')
|
txt = txt.decode('utf-8', 'replace')
|
||||||
# Strip whitespace from the beginning and end of the line. Also replace
|
# Strip whitespace from the beginning and end of the line. Also replace
|
||||||
@ -34,6 +36,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
txt = re.sub('\n{3,}', '\n\n', txt)
|
txt = re.sub('\n{3,}', '\n\n', txt)
|
||||||
#remove ASCII invalid chars
|
#remove ASCII invalid chars
|
||||||
txt = clean_ascii_chars(txt)
|
txt = clean_ascii_chars(txt)
|
||||||
|
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def split_txt(txt, epub_split_size_kb=0):
|
||||||
#Takes care if there is no point to split
|
#Takes care if there is no point to split
|
||||||
if epub_split_size_kb > 0:
|
if epub_split_size_kb > 0:
|
||||||
if isinstance(txt, unicode):
|
if isinstance(txt, unicode):
|
||||||
@ -48,6 +54,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
if isbytestring(txt):
|
if isbytestring(txt):
|
||||||
txt = txt.decode('utf-8')
|
txt = txt.decode('utf-8')
|
||||||
|
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def convert_basic(txt, title='', epub_split_size_kb=0):
|
||||||
|
txt = clean_txt(txt)
|
||||||
|
txt = split_txt(txt, epub_split_size_kb)
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
# Split into paragraphs based on having a blank line between text.
|
# Split into paragraphs based on having a blank line between text.
|
||||||
for line in txt.split('\n\n'):
|
for line in txt.split('\n\n'):
|
||||||
@ -56,6 +68,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
|
|
||||||
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
||||||
|
|
||||||
|
def convert_heuristic(txt, title='', epub_split_size_kb=0):
    '''
    Convert txt to HTML with a fresh TXTHeuristicProcessor.

    All three arguments are handed unchanged to
    TXTHeuristicProcessor.convert and its result is returned.
    '''
    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
|
||||||
|
|
||||||
def convert_markdown(txt, title='', disable_toc=False):
|
def convert_markdown(txt, title='', disable_toc=False):
|
||||||
md = markdown.Markdown(
|
md = markdown.Markdown(
|
||||||
extensions=['footnotes', 'tables', 'toc'],
|
extensions=['footnotes', 'tables', 'toc'],
|
||||||
@ -63,9 +79,12 @@ def convert_markdown(txt, title='', disable_toc=False):
|
|||||||
safe_mode=False)
|
safe_mode=False)
|
||||||
return HTML_TEMPLATE % (title, md.convert(txt))
|
return HTML_TEMPLATE % (title, md.convert(txt))
|
||||||
|
|
||||||
def separate_paragraphs_single_line(txt):
|
def normalize_line_endings(txt):
|
||||||
txt = txt.replace('\r\n', '\n')
|
txt = txt.replace('\r\n', '\n')
|
||||||
txt = txt.replace('\r', '\n')
|
txt = txt.replace('\r', '\n')
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def separate_paragraphs_single_line(txt):
|
||||||
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
|
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
@ -100,27 +119,42 @@ def detect_paragraph_type(txt):
|
|||||||
single: Each line is a paragraph.
|
single: Each line is a paragraph.
|
||||||
print: Each paragraph starts with a 2+ spaces or a tab
|
print: Each paragraph starts with a 2+ spaces or a tab
|
||||||
and ends when a new paragraph is reached.
|
and ends when a new paragraph is reached.
|
||||||
markdown: Markdown formatting is in the document.
|
unformatted: most lines have hard line breaks, few/no blank lines or indents
|
||||||
|
|
||||||
returns block, single, print, markdown
|
returns block, single, print, unformatted
|
||||||
'''
|
'''
|
||||||
txt = txt.replace('\r\n', '\n')
|
txt = txt.replace('\r\n', '\n')
|
||||||
txt = txt.replace('\r', '\n')
|
txt = txt.replace('\r', '\n')
|
||||||
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
|
||||||
|
|
||||||
# Check for print
|
# Check for hard line breaks - true if 55% of the doc breaks in the same region
|
||||||
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
if tab_line_count / float(txt_line_count) >= .25:
|
hardbreaks = docanalysis.line_histogram(.55)
|
||||||
return 'print'
|
|
||||||
|
|
||||||
# Check for block
|
if hardbreaks:
|
||||||
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
# Determine print percentage
|
||||||
if empty_line_count / float(txt_line_count) >= .25:
|
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
|
||||||
return 'block'
|
print_percent = tab_line_count / float(txt_line_count)
|
||||||
|
|
||||||
# Nothing else matched to assume single.
|
# Determine block percentage
|
||||||
|
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
|
||||||
|
block_percent = empty_line_count / float(txt_line_count)
|
||||||
|
|
||||||
|
# Compare the two types - the type with the larger number of instances wins
|
||||||
|
# in cases where only one or the other represents the vast majority of the document neither wins
|
||||||
|
if print_percent >= block_percent:
|
||||||
|
if .15 <= print_percent <= .75:
|
||||||
|
return 'print'
|
||||||
|
elif .15 <= block_percent <= .75:
|
||||||
|
return 'block'
|
||||||
|
|
||||||
|
# Assume unformatted text with hardbreaks if nothing else matches
|
||||||
|
return 'unformatted'
|
||||||
|
|
||||||
|
# return single if hardbreaks is false
|
||||||
return 'single'
|
return 'single'
|
||||||
|
|
||||||
|
|
||||||
def detect_formatting_type(txt):
|
def detect_formatting_type(txt):
|
||||||
# Check for markdown
|
# Check for markdown
|
||||||
# Headings
|
# Headings
|
||||||
@ -142,4 +176,4 @@ def detect_formatting_type(txt):
|
|||||||
if txt.count('\\'+c) > 10:
|
if txt.count('\\'+c) > 10:
|
||||||
return 'markdown'
|
return 'markdown'
|
||||||
|
|
||||||
return 'none'
|
return 'heuristic'
|
||||||
|
@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
Transform OEB content into plain text
|
Transform OEB content into plain text
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
@ -33,6 +32,15 @@ BLOCK_STYLES = [
|
|||||||
'block',
|
'block',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
HEADING_TAGS = [
|
||||||
|
'h1',
|
||||||
|
'h2',
|
||||||
|
'h3',
|
||||||
|
'h4',
|
||||||
|
'h5',
|
||||||
|
'h6',
|
||||||
|
]
|
||||||
|
|
||||||
SPACE_TAGS = [
|
SPACE_TAGS = [
|
||||||
'td',
|
'td',
|
||||||
'br',
|
'br',
|
||||||
@ -47,6 +55,10 @@ class TXTMLizer(object):
|
|||||||
self.log.info('Converting XHTML to TXT...')
|
self.log.info('Converting XHTML to TXT...')
|
||||||
self.oeb_book = oeb_book
|
self.oeb_book = oeb_book
|
||||||
self.opts = opts
|
self.opts = opts
|
||||||
|
self.toc_ids = []
|
||||||
|
self.last_was_heading = False
|
||||||
|
|
||||||
|
self.create_flat_toc(self.oeb_book.toc)
|
||||||
|
|
||||||
return self.mlize_spine()
|
return self.mlize_spine()
|
||||||
|
|
||||||
@ -58,8 +70,11 @@ class TXTMLizer(object):
|
|||||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||||
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
|
||||||
content = self.remove_newlines(content)
|
content = self.remove_newlines(content)
|
||||||
output += self.dump_text(etree.fromstring(content), stylizer)
|
output += self.dump_text(etree.fromstring(content), stylizer, item)
|
||||||
output = self.cleanup_text(u''.join(output))
|
output += '\n\n\n\n\n\n'
|
||||||
|
output = u''.join(output)
|
||||||
|
output = u'\n'.join(l.rstrip() for l in output.splitlines())
|
||||||
|
output = self.cleanup_text(output)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
@ -68,6 +83,8 @@ class TXTMLizer(object):
|
|||||||
text = text.replace('\r\n', ' ')
|
text = text.replace('\r\n', ' ')
|
||||||
text = text.replace('\n', ' ')
|
text = text.replace('\n', ' ')
|
||||||
text = text.replace('\r', ' ')
|
text = text.replace('\r', ' ')
|
||||||
|
# Condense redundant spaces created by replacing newlines with spaces.
|
||||||
|
text = re.sub(r'[ ]{2,}', ' ', text)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@ -80,6 +97,14 @@ class TXTMLizer(object):
|
|||||||
toc.append(u'* %s\n\n' % item.title)
|
toc.append(u'* %s\n\n' % item.title)
|
||||||
return ''.join(toc)
|
return ''.join(toc)
|
||||||
|
|
||||||
|
def create_flat_toc(self, nodes):
    '''
    Flatten a hierarchical TOC into self.toc_ids.

    The href of every node is appended to self.toc_ids in depth first
    order: a node first, then its children (item.nodes), then its next
    sibling.
    '''
    # Stack of iterators, one per TOC level currently being walked.
    levels = [iter(nodes)]
    while levels:
        for entry in levels[-1]:
            self.toc_ids.append(entry.href)
            # Descend into the children before continuing with siblings.
            levels.append(iter(entry.nodes))
            break
        else:
            # Current level is exhausted, resume the parent level.
            levels.pop()
|
||||||
|
|
||||||
def cleanup_text(self, text):
|
def cleanup_text(self, text):
|
||||||
self.log.debug('\tClean up text...')
|
self.log.debug('\tClean up text...')
|
||||||
# Replace bad characters.
|
# Replace bad characters.
|
||||||
@ -92,7 +117,7 @@ class TXTMLizer(object):
|
|||||||
text = text.replace('\f+', ' ')
|
text = text.replace('\f+', ' ')
|
||||||
|
|
||||||
# Single line paragraph.
|
# Single line paragraph.
|
||||||
text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
|
text = re.sub('(?<=.)\n(?=.)', ' ', text)
|
||||||
|
|
||||||
# Remove multiple spaces.
|
# Remove multiple spaces.
|
||||||
text = re.sub('[ ]{2,}', ' ', text)
|
text = re.sub('[ ]{2,}', ' ', text)
|
||||||
@ -101,14 +126,20 @@ class TXTMLizer(object):
|
|||||||
text = re.sub('\n[ ]+\n', '\n\n', text)
|
text = re.sub('\n[ ]+\n', '\n\n', text)
|
||||||
if self.opts.remove_paragraph_spacing:
|
if self.opts.remove_paragraph_spacing:
|
||||||
text = re.sub('\n{2,}', '\n', text)
|
text = re.sub('\n{2,}', '\n', text)
|
||||||
text = re.sub('(?imu)^(?=.)', '\t', text)
|
text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
|
||||||
|
text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
|
||||||
else:
|
else:
|
||||||
text = re.sub('\n{3,}', '\n\n', text)
|
text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
|
||||||
|
|
||||||
# Replace spaces at the beginning and end of lines
|
# Replace spaces at the beginning and end of lines
|
||||||
|
# We don't replace tabs because those are only added
|
||||||
|
# when remove paragraph spacing is enabled.
|
||||||
text = re.sub('(?imu)^[ ]+', '', text)
|
text = re.sub('(?imu)^[ ]+', '', text)
|
||||||
text = re.sub('(?imu)[ ]+$', '', text)
|
text = re.sub('(?imu)[ ]+$', '', text)
|
||||||
|
|
||||||
|
# Remove empty space and newlines at the beginning of the document.
|
||||||
|
text = re.sub(r'(?u)^[ \n]+', '', text)
|
||||||
|
|
||||||
if self.opts.max_line_length:
|
if self.opts.max_line_length:
|
||||||
max_length = self.opts.max_line_length
|
max_length = self.opts.max_line_length
|
||||||
if self.opts.max_line_length < 25 and not self.opts.force_max_line_length:
|
if self.opts.max_line_length < 25 and not self.opts.force_max_line_length:
|
||||||
@ -145,13 +176,11 @@ class TXTMLizer(object):
|
|||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def dump_text(self, elem, stylizer, end=''):
|
def dump_text(self, elem, stylizer, page):
|
||||||
'''
|
'''
|
||||||
@elem: The element in the etree that we are working on.
|
@elem: The element in the etree that we are working on.
|
||||||
@stylizer: The style information attached to the element.
|
@stylizer: The style information attached to the element.
|
||||||
@end: The last two characters of the text from the previous element.
|
@page: OEB page used to determine absolute urls.
|
||||||
This is used to determine if a blank line is needed when starting
|
|
||||||
a new block element.
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
if not isinstance(elem.tag, basestring) \
|
if not isinstance(elem.tag, basestring) \
|
||||||
@ -170,13 +199,22 @@ class TXTMLizer(object):
|
|||||||
return ['']
|
return ['']
|
||||||
|
|
||||||
tag = barename(elem.tag)
|
tag = barename(elem.tag)
|
||||||
|
tag_id = elem.attrib.get('id', None)
|
||||||
in_block = False
|
in_block = False
|
||||||
|
in_heading = False
|
||||||
|
|
||||||
|
# Are we in a heading?
|
||||||
|
# This can either be a heading tag or a TOC item.
|
||||||
|
if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
|
||||||
|
in_heading = True
|
||||||
|
if not self.last_was_heading:
|
||||||
|
text.append('\n\n\n\n\n\n')
|
||||||
|
|
||||||
# Are we in a paragraph block?
|
# Are we in a paragraph block?
|
||||||
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
|
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
|
||||||
|
if self.opts.remove_paragraph_spacing and not in_heading:
|
||||||
|
text.append(u'\t')
|
||||||
in_block = True
|
in_block = True
|
||||||
if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
|
|
||||||
text.append(u'\n\n')
|
|
||||||
|
|
||||||
if tag in SPACE_TAGS:
|
if tag in SPACE_TAGS:
|
||||||
text.append(u' ')
|
text.append(u' ')
|
||||||
@ -185,14 +223,17 @@ class TXTMLizer(object):
|
|||||||
if hasattr(elem, 'text') and elem.text:
|
if hasattr(elem, 'text') and elem.text:
|
||||||
text.append(elem.text)
|
text.append(elem.text)
|
||||||
|
|
||||||
|
# Recurse down into tags within the tag we are in.
|
||||||
for item in elem:
|
for item in elem:
|
||||||
en = u''
|
text += self.dump_text(item, stylizer, page)
|
||||||
if len(text) >= 2:
|
|
||||||
en = text[-1][-2:]
|
|
||||||
text += self.dump_text(item, stylizer, en)
|
|
||||||
|
|
||||||
if in_block:
|
if in_block:
|
||||||
text.append(u'\n\n')
|
text.append(u'\n\n')
|
||||||
|
if in_heading:
|
||||||
|
text.append(u'\n')
|
||||||
|
self.last_was_heading = True
|
||||||
|
else:
|
||||||
|
self.last_was_heading = False
|
||||||
|
|
||||||
if hasattr(elem, 'tail') and elem.tail:
|
if hasattr(elem, 'tail') and elem.tail:
|
||||||
text.append(elem.tail)
|
text.append(elem.tail)
|
||||||
|
@ -1266,8 +1266,8 @@ class DeviceMixin(object): # {{{
|
|||||||
# Force a reset if the caches are not initialized
|
# Force a reset if the caches are not initialized
|
||||||
if reset or not hasattr(self, 'db_book_title_cache'):
|
if reset or not hasattr(self, 'db_book_title_cache'):
|
||||||
# Build a cache (map) of the library, so the search isn't On**2
|
# Build a cache (map) of the library, so the search isn't On**2
|
||||||
self.db_book_title_cache = {}
|
db_book_title_cache = {}
|
||||||
self.db_book_uuid_cache = {}
|
db_book_uuid_cache = {}
|
||||||
# It might be possible to get here without having initialized the
|
# It might be possible to get here without having initialized the
|
||||||
# library view. In this case, simply give up
|
# library view. In this case, simply give up
|
||||||
try:
|
try:
|
||||||
@ -1278,8 +1278,8 @@ class DeviceMixin(object): # {{{
|
|||||||
for id in db.data.iterallids():
|
for id in db.data.iterallids():
|
||||||
mi = db.get_metadata(id, index_is_id=True)
|
mi = db.get_metadata(id, index_is_id=True)
|
||||||
title = clean_string(mi.title)
|
title = clean_string(mi.title)
|
||||||
if title not in self.db_book_title_cache:
|
if title not in db_book_title_cache:
|
||||||
self.db_book_title_cache[title] = \
|
db_book_title_cache[title] = \
|
||||||
{'authors':{}, 'author_sort':{}, 'db_ids':{}}
|
{'authors':{}, 'author_sort':{}, 'db_ids':{}}
|
||||||
# If there are multiple books in the library with the same title
|
# If there are multiple books in the library with the same title
|
||||||
# and author, then remember the last one. That is OK, because as
|
# and author, then remember the last one. That is OK, because as
|
||||||
@ -1287,12 +1287,14 @@ class DeviceMixin(object): # {{{
|
|||||||
# as another.
|
# as another.
|
||||||
if mi.authors:
|
if mi.authors:
|
||||||
authors = clean_string(authors_to_string(mi.authors))
|
authors = clean_string(authors_to_string(mi.authors))
|
||||||
self.db_book_title_cache[title]['authors'][authors] = mi
|
db_book_title_cache[title]['authors'][authors] = mi
|
||||||
if mi.author_sort:
|
if mi.author_sort:
|
||||||
aus = clean_string(mi.author_sort)
|
aus = clean_string(mi.author_sort)
|
||||||
self.db_book_title_cache[title]['author_sort'][aus] = mi
|
db_book_title_cache[title]['author_sort'][aus] = mi
|
||||||
self.db_book_title_cache[title]['db_ids'][mi.application_id] = mi
|
db_book_title_cache[title]['db_ids'][mi.application_id] = mi
|
||||||
self.db_book_uuid_cache[mi.uuid] = mi
|
db_book_uuid_cache[mi.uuid] = mi
|
||||||
|
self.db_book_title_cache = db_book_title_cache
|
||||||
|
self.db_book_uuid_cache = db_book_uuid_cache
|
||||||
|
|
||||||
# Now iterate through all the books on the device, setting the
|
# Now iterate through all the books on the device, setting the
|
||||||
# in_library field. If the UUID matches a book in the library, then
|
# in_library field. If the UUID matches a book in the library, then
|
||||||
|
@ -5,11 +5,11 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
from PyQt4 import QtGui
|
from PyQt4.Qt import Qt, QLineEdit, QComboBox, SIGNAL, QListWidgetItem
|
||||||
from PyQt4.Qt import Qt
|
|
||||||
|
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog
|
||||||
from calibre.gui2.device import device_name_for_plugboards
|
from calibre.gui2.device import device_name_for_plugboards
|
||||||
|
from calibre.gui2.dialogs.template_dialog import TemplateDialog
|
||||||
from calibre.gui2.preferences import ConfigWidgetBase, test_widget
|
from calibre.gui2.preferences import ConfigWidgetBase, test_widget
|
||||||
from calibre.gui2.preferences.plugboard_ui import Ui_Form
|
from calibre.gui2.preferences.plugboard_ui import Ui_Form
|
||||||
from calibre.customize.ui import metadata_writers, device_plugins
|
from calibre.customize.ui import metadata_writers, device_plugins
|
||||||
@ -17,6 +17,27 @@ from calibre.library.save_to_disk import plugboard_any_format_value, \
|
|||||||
plugboard_any_device_value, plugboard_save_to_disk_value
|
plugboard_any_device_value, plugboard_save_to_disk_value
|
||||||
from calibre.utils.formatter import validation_formatter
|
from calibre.utils.formatter import validation_formatter
|
||||||
|
|
||||||
|
|
||||||
|
class LineEditWithTextBox(QLineEdit):
|
||||||
|
|
||||||
|
'''
|
||||||
|
Extend the context menu of a QLineEdit to include more actions.
|
||||||
|
'''
|
||||||
|
|
||||||
|
def contextMenuEvent(self, event):
|
||||||
|
menu = self.createStandardContextMenu()
|
||||||
|
menu.addSeparator()
|
||||||
|
|
||||||
|
action_open_editor = menu.addAction(_('Open Editor'))
|
||||||
|
|
||||||
|
self.connect(action_open_editor, SIGNAL('triggered()'), self.open_editor)
|
||||||
|
menu.exec_(event.globalPos())
|
||||||
|
|
||||||
|
def open_editor(self):
|
||||||
|
t = TemplateDialog(self, self.text())
|
||||||
|
if t.exec_():
|
||||||
|
self.setText(t.textbox.toPlainText())
|
||||||
|
|
||||||
class ConfigWidget(ConfigWidgetBase, Ui_Form):
|
class ConfigWidget(ConfigWidgetBase, Ui_Form):
|
||||||
|
|
||||||
def genesis(self, gui):
|
def genesis(self, gui):
|
||||||
@ -72,10 +93,10 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
|
|||||||
self.source_widgets = []
|
self.source_widgets = []
|
||||||
self.dest_widgets = []
|
self.dest_widgets = []
|
||||||
for i in range(0, len(self.dest_fields)-1):
|
for i in range(0, len(self.dest_fields)-1):
|
||||||
w = QtGui.QLineEdit(self)
|
w = LineEditWithTextBox(self)
|
||||||
self.source_widgets.append(w)
|
self.source_widgets.append(w)
|
||||||
self.fields_layout.addWidget(w, 5+i, 0, 1, 1)
|
self.fields_layout.addWidget(w, 5+i, 0, 1, 1)
|
||||||
w = QtGui.QComboBox(self)
|
w = QComboBox(self)
|
||||||
self.dest_widgets.append(w)
|
self.dest_widgets.append(w)
|
||||||
self.fields_layout.addWidget(w, 5+i, 1, 1, 1)
|
self.fields_layout.addWidget(w, 5+i, 1, 1, 1)
|
||||||
|
|
||||||
@ -297,7 +318,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
|
|||||||
for op in self.current_plugboards[f][d]:
|
for op in self.current_plugboards[f][d]:
|
||||||
ops.append('([' + op[0] + '] -> ' + op[1] + ')')
|
ops.append('([' + op[0] + '] -> ' + op[1] + ')')
|
||||||
txt = '%s:%s = %s\n'%(f, d, ', '.join(ops))
|
txt = '%s:%s = %s\n'%(f, d, ', '.join(ops))
|
||||||
item = QtGui.QListWidgetItem(txt)
|
item = QListWidgetItem(txt)
|
||||||
item.setData(Qt.UserRole, (f, d))
|
item.setData(Qt.UserRole, (f, d))
|
||||||
self.existing_plugboards.addItem(item)
|
self.existing_plugboards.addItem(item)
|
||||||
self.refilling = False
|
self.refilling = False
|
||||||
|
@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \
|
|||||||
from calibre.gui2.wizard.send_email_ui import Ui_Form
|
from calibre.gui2.wizard.send_email_ui import Ui_Form
|
||||||
from calibre.utils.smtp import config as smtp_prefs
|
from calibre.utils.smtp import config as smtp_prefs
|
||||||
from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog
|
from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog, question_dialog
|
||||||
|
|
||||||
class TestEmail(QDialog, TE_Dialog):
|
class TestEmail(QDialog, TE_Dialog):
|
||||||
|
|
||||||
@ -92,7 +92,10 @@ class SendEmail(QWidget, Ui_Form):
|
|||||||
pa = self.preferred_to_address()
|
pa = self.preferred_to_address()
|
||||||
to_set = pa is not None
|
to_set = pa is not None
|
||||||
if self.set_email_settings(to_set):
|
if self.set_email_settings(to_set):
|
||||||
TestEmail(pa, self).exec_()
|
if question_dialog(self, _('OK to proceed?'),
|
||||||
|
_('This will display your email password on the screen'
|
||||||
|
'. Is it OK to proceed?'), show_copy_button=False):
|
||||||
|
TestEmail(pa, self).exec_()
|
||||||
|
|
||||||
def test_email_settings(self, to):
|
def test_email_settings(self, to):
|
||||||
opts = smtp_prefs().parse()
|
opts = smtp_prefs().parse()
|
||||||
|
@ -181,7 +181,7 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
self.search_restriction = ''
|
self.search_restriction = ''
|
||||||
self.field_metadata = field_metadata
|
self.field_metadata = field_metadata
|
||||||
self.all_search_locations = field_metadata.get_search_terms()
|
self.all_search_locations = field_metadata.get_search_terms()
|
||||||
SearchQueryParser.__init__(self, self.all_search_locations)
|
SearchQueryParser.__init__(self, self.all_search_locations, optimize=True)
|
||||||
self.build_date_relop_dict()
|
self.build_date_relop_dict()
|
||||||
self.build_numeric_relop_dict()
|
self.build_numeric_relop_dict()
|
||||||
|
|
||||||
@ -264,7 +264,7 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
'<=':[2, relop_le]
|
'<=':[2, relop_le]
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_dates_matches(self, location, query):
|
def get_dates_matches(self, location, query, candidates):
|
||||||
matches = set([])
|
matches = set([])
|
||||||
if len(query) < 2:
|
if len(query) < 2:
|
||||||
return matches
|
return matches
|
||||||
@ -274,13 +274,15 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
loc = self.field_metadata[location]['rec_index']
|
loc = self.field_metadata[location]['rec_index']
|
||||||
|
|
||||||
if query == 'false':
|
if query == 'false':
|
||||||
for item in self._data:
|
for id_ in candidates:
|
||||||
|
item = self._data[id_]
|
||||||
if item is None: continue
|
if item is None: continue
|
||||||
if item[loc] is None or item[loc] <= UNDEFINED_DATE:
|
if item[loc] is None or item[loc] <= UNDEFINED_DATE:
|
||||||
matches.add(item[0])
|
matches.add(item[0])
|
||||||
return matches
|
return matches
|
||||||
if query == 'true':
|
if query == 'true':
|
||||||
for item in self._data:
|
for id_ in candidates:
|
||||||
|
item = self._data[id_]
|
||||||
if item is None: continue
|
if item is None: continue
|
||||||
if item[loc] is not None and item[loc] > UNDEFINED_DATE:
|
if item[loc] is not None and item[loc] > UNDEFINED_DATE:
|
||||||
matches.add(item[0])
|
matches.add(item[0])
|
||||||
@ -319,7 +321,8 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
field_count = query.count('-') + 1
|
field_count = query.count('-') + 1
|
||||||
else:
|
else:
|
||||||
field_count = query.count('/') + 1
|
field_count = query.count('/') + 1
|
||||||
for item in self._data:
|
for id_ in candidates:
|
||||||
|
item = self._data[id_]
|
||||||
if item is None or item[loc] is None: continue
|
if item is None or item[loc] is None: continue
|
||||||
if relop(item[loc], qd, field_count):
|
if relop(item[loc], qd, field_count):
|
||||||
matches.add(item[0])
|
matches.add(item[0])
|
||||||
@ -335,7 +338,7 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
'<=':[2, lambda r, q: r <= q]
|
'<=':[2, lambda r, q: r <= q]
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_numeric_matches(self, location, query, val_func = None):
|
def get_numeric_matches(self, location, query, candidates, val_func = None):
|
||||||
matches = set([])
|
matches = set([])
|
||||||
if len(query) == 0:
|
if len(query) == 0:
|
||||||
return matches
|
return matches
|
||||||
@ -381,7 +384,8 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
except:
|
except:
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
for item in self._data:
|
for id_ in candidates:
|
||||||
|
item = self._data[id_]
|
||||||
if item is None:
|
if item is None:
|
||||||
continue
|
continue
|
||||||
v = val_func(item)
|
v = val_func(item)
|
||||||
@ -393,8 +397,13 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
matches.add(item[0])
|
matches.add(item[0])
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def get_matches(self, location, query, allow_recursion=True):
|
def get_matches(self, location, query, allow_recursion=True, candidates=None):
|
||||||
matches = set([])
|
matches = set([])
|
||||||
|
if candidates is None:
|
||||||
|
candidates = self.universal_set()
|
||||||
|
if len(candidates) == 0:
|
||||||
|
return matches
|
||||||
|
|
||||||
if query and query.strip():
|
if query and query.strip():
|
||||||
# get metadata key associated with the search term. Eliminates
|
# get metadata key associated with the search term. Eliminates
|
||||||
# dealing with plurals and other aliases
|
# dealing with plurals and other aliases
|
||||||
@ -476,7 +485,8 @@ class ResultCache(SearchQueryParser): # {{{
|
|||||||
else:
|
else:
|
||||||
q = query
|
q = query
|
||||||
|
|
||||||
for item in self._data:
|
for id_ in candidates:
|
||||||
|
item = self._data[id_]
|
||||||
if item is None: continue
|
if item is None: continue
|
||||||
|
|
||||||
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak
|
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak
|
||||||
|
@ -2861,25 +2861,17 @@ class EPUB_MOBI(CatalogPlugin):
|
|||||||
self.updateProgressMicroStep("Thumbnail %d of %d" % \
|
self.updateProgressMicroStep("Thumbnail %d of %d" % \
|
||||||
(i,len(self.booksByTitle)),
|
(i,len(self.booksByTitle)),
|
||||||
i/float(len(self.booksByTitle)))
|
i/float(len(self.booksByTitle)))
|
||||||
# Check to see if source file exists
|
|
||||||
if 'cover' in title and os.path.isfile(title['cover']):
|
|
||||||
# Add the thumb spec to thumbs[]
|
|
||||||
thumbs.append("thumbnail_%d.jpg" % int(title['id']))
|
|
||||||
|
|
||||||
# Check to see if thumbnail exists
|
thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
|
||||||
thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id']))
|
thumb_generated = True
|
||||||
thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
|
try:
|
||||||
if os.path.isfile(thumb_fp):
|
self.generateThumbnail(title, image_dir, thumb_file)
|
||||||
# Check to see if cover is newer than thumbnail
|
thumbs.append("thumbnail_%d.jpg" % int(title['id']))
|
||||||
# os.path.getmtime() = modified time
|
except:
|
||||||
# os.path.ctime() = creation time
|
thumb_generated = False
|
||||||
cover_timestamp = os.path.getmtime(title['cover'])
|
|
||||||
thumb_timestamp = os.path.getmtime(thumb_fp)
|
|
||||||
if thumb_timestamp < cover_timestamp:
|
if not thumb_generated:
|
||||||
self.generateThumbnail(title, image_dir, thumb_file)
|
|
||||||
else:
|
|
||||||
self.generateThumbnail(title, image_dir, thumb_file)
|
|
||||||
else:
|
|
||||||
# Use default cover
|
# Use default cover
|
||||||
if False and self.verbose:
|
if False and self.verbose:
|
||||||
self.opts.log.warn(" using default cover for '%s'" % \
|
self.opts.log.warn(" using default cover for '%s'" % \
|
||||||
|
@ -151,6 +151,8 @@ class CustomColumns(object):
|
|||||||
return v
|
return v
|
||||||
|
|
||||||
def adapt_number(x, d):
|
def adapt_number(x, d):
|
||||||
|
if x is None:
|
||||||
|
return None
|
||||||
if isinstance(x, (str, unicode, bytes)):
|
if isinstance(x, (str, unicode, bytes)):
|
||||||
if x.lower() == 'none':
|
if x.lower() == 'none':
|
||||||
return None
|
return None
|
||||||
|
@ -256,7 +256,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
|||||||
'pubdate',
|
'pubdate',
|
||||||
'flags',
|
'flags',
|
||||||
'uuid',
|
'uuid',
|
||||||
'has_cover'
|
'has_cover',
|
||||||
|
('au_map', 'authors', 'author', 'aum_sortconcat(link.id, authors.name, authors.sort)')
|
||||||
]
|
]
|
||||||
lines = []
|
lines = []
|
||||||
for col in columns:
|
for col in columns:
|
||||||
@ -273,9 +274,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
|||||||
|
|
||||||
self.FIELD_MAP = {'id':0, 'title':1, 'authors':2, 'timestamp':3,
|
self.FIELD_MAP = {'id':0, 'title':1, 'authors':2, 'timestamp':3,
|
||||||
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8,
|
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8,
|
||||||
'publisher':9, 'series_index':10,
|
'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12,
|
||||||
'sort':11, 'author_sort':12, 'formats':13, 'isbn':14, 'path':15,
|
'formats':13, 'isbn':14, 'path':15, 'lccn':16, 'pubdate':17,
|
||||||
'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20}
|
'flags':18, 'uuid':19, 'cover':20, 'au_map':21}
|
||||||
|
|
||||||
for k,v in self.FIELD_MAP.iteritems():
|
for k,v in self.FIELD_MAP.iteritems():
|
||||||
self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
|
self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
|
||||||
@ -687,9 +688,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
|||||||
Convenience method to return metadata as a :class:`Metadata` object.
|
Convenience method to return metadata as a :class:`Metadata` object.
|
||||||
Note that the list of formats is not verified.
|
Note that the list of formats is not verified.
|
||||||
'''
|
'''
|
||||||
|
row = self.data._data[idx] if index_is_id else self.data[idx]
|
||||||
|
fm = self.FIELD_MAP
|
||||||
|
|
||||||
self.gm_count += 1
|
self.gm_count += 1
|
||||||
mi = self.data.get(idx, self.FIELD_MAP['all_metadata'],
|
mi = row[self.FIELD_MAP['all_metadata']]
|
||||||
row_is_id = index_is_id)
|
|
||||||
if mi is not None:
|
if mi is not None:
|
||||||
if get_cover:
|
if get_cover:
|
||||||
# Always get the cover, because the value can be wrong if the
|
# Always get the cover, because the value can be wrong if the
|
||||||
@ -699,49 +702,46 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
|||||||
|
|
||||||
self.gm_missed += 1
|
self.gm_missed += 1
|
||||||
mi = Metadata(None)
|
mi = Metadata(None)
|
||||||
self.data.set(idx, self.FIELD_MAP['all_metadata'], mi,
|
self.data.set(idx, fm['all_metadata'], mi, row_is_id = index_is_id)
|
||||||
row_is_id = index_is_id)
|
|
||||||
|
|
||||||
aut_list = self.authors_with_sort_strings(idx, index_is_id=index_is_id)
|
aut_list = row[fm['au_map']]
|
||||||
|
aut_list = [p.split(':::') for p in aut_list.split(':#:')]
|
||||||
aum = []
|
aum = []
|
||||||
aus = {}
|
aus = {}
|
||||||
for (author, author_sort) in aut_list:
|
for (author, author_sort) in aut_list:
|
||||||
aum.append(author)
|
aum.append(author)
|
||||||
aus[author] = author_sort
|
aus[author] = author_sort.replace('|', ',')
|
||||||
mi.title = self.title(idx, index_is_id=index_is_id)
|
mi.title = row[fm['title']]
|
||||||
mi.authors = aum
|
mi.authors = aum
|
||||||
mi.author_sort = self.author_sort(idx, index_is_id=index_is_id)
|
mi.author_sort = row[fm['author_sort']]
|
||||||
mi.author_sort_map = aus
|
mi.author_sort_map = aus
|
||||||
mi.comments = self.comments(idx, index_is_id=index_is_id)
|
mi.comments = row[fm['comments']]
|
||||||
mi.publisher = self.publisher(idx, index_is_id=index_is_id)
|
mi.publisher = row[fm['publisher']]
|
||||||
mi.timestamp = self.timestamp(idx, index_is_id=index_is_id)
|
mi.timestamp = row[fm['timestamp']]
|
||||||
mi.pubdate = self.pubdate(idx, index_is_id=index_is_id)
|
mi.pubdate = row[fm['pubdate']]
|
||||||
mi.uuid = self.uuid(idx, index_is_id=index_is_id)
|
mi.uuid = row[fm['uuid']]
|
||||||
mi.title_sort = self.title_sort(idx, index_is_id=index_is_id)
|
mi.title_sort = row[fm['sort']]
|
||||||
mi.formats = self.formats(idx, index_is_id=index_is_id,
|
formats = row[fm['formats']]
|
||||||
verify_formats=False)
|
if not formats:
|
||||||
if hasattr(mi.formats, 'split'):
|
formats = None
|
||||||
mi.formats = mi.formats.split(',')
|
mi.formats = formats
|
||||||
else:
|
tags = row[fm['tags']]
|
||||||
mi.formats = None
|
|
||||||
tags = self.tags(idx, index_is_id=index_is_id)
|
|
||||||
if tags:
|
if tags:
|
||||||
mi.tags = [i.strip() for i in tags.split(',')]
|
mi.tags = [i.strip() for i in tags.split(',')]
|
||||||
mi.series = self.series(idx, index_is_id=index_is_id)
|
mi.series = row[fm['series']]
|
||||||
if mi.series:
|
if mi.series:
|
||||||
mi.series_index = self.series_index(idx, index_is_id=index_is_id)
|
mi.series_index = row[fm['series_index']]
|
||||||
mi.rating = self.rating(idx, index_is_id=index_is_id)
|
mi.rating = row[fm['rating']]
|
||||||
mi.isbn = self.isbn(idx, index_is_id=index_is_id)
|
mi.isbn = row[fm['isbn']]
|
||||||
id = idx if index_is_id else self.id(idx)
|
id = idx if index_is_id else self.id(idx)
|
||||||
mi.application_id = id
|
mi.application_id = id
|
||||||
mi.id = id
|
mi.id = id
|
||||||
for key,meta in self.field_metadata.iteritems():
|
for key, meta in self.field_metadata.custom_iteritems():
|
||||||
if meta['is_custom']:
|
mi.set_user_metadata(key, meta)
|
||||||
mi.set_user_metadata(key, meta)
|
mi.set(key, val=self.get_custom(idx, label=meta['label'],
|
||||||
mi.set(key, val=self.get_custom(idx, label=meta['label'],
|
index_is_id=index_is_id),
|
||||||
index_is_id=index_is_id),
|
extra=self.get_custom_extra(idx, label=meta['label'],
|
||||||
extra=self.get_custom_extra(idx, label=meta['label'],
|
index_is_id=index_is_id))
|
||||||
index_is_id=index_is_id))
|
|
||||||
if get_cover:
|
if get_cover:
|
||||||
mi.cover = self.cover(id, index_is_id=True, as_path=True)
|
mi.cover = self.cover(id, index_is_id=True, as_path=True)
|
||||||
return mi
|
return mi
|
||||||
@ -877,18 +877,17 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
|||||||
|
|
||||||
def formats(self, index, index_is_id=False, verify_formats=True):
|
def formats(self, index, index_is_id=False, verify_formats=True):
|
||||||
''' Return available formats as a comma separated list or None if there are no available formats '''
|
''' Return available formats as a comma separated list or None if there are no available formats '''
|
||||||
id = index if index_is_id else self.id(index)
|
id_ = index if index_is_id else self.id(index)
|
||||||
try:
|
formats = self.data.get(id_, self.FIELD_MAP['formats'], row_is_id=True)
|
||||||
formats = self.conn.get('SELECT format FROM data WHERE book=?', (id,))
|
if not formats:
|
||||||
formats = map(lambda x:x[0], formats)
|
|
||||||
except:
|
|
||||||
return None
|
return None
|
||||||
if not verify_formats:
|
if not verify_formats:
|
||||||
return ','.join(formats)
|
return formats
|
||||||
|
formats = formats.split(',')
|
||||||
ans = []
|
ans = []
|
||||||
for format in formats:
|
for fmt in formats:
|
||||||
if self.format_abspath(id, format, index_is_id=True) is not None:
|
if self.format_abspath(id_, fmt, index_is_id=True) is not None:
|
||||||
ans.append(format)
|
ans.append(fmt)
|
||||||
if not ans:
|
if not ans:
|
||||||
return None
|
return None
|
||||||
return ','.join(ans)
|
return ','.join(ans)
|
||||||
@ -1607,6 +1606,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
|||||||
','.join([a.replace(',', '|') for a in authors]),
|
','.join([a.replace(',', '|') for a in authors]),
|
||||||
row_is_id=True)
|
row_is_id=True)
|
||||||
self.data.set(id, self.FIELD_MAP['author_sort'], ss, row_is_id=True)
|
self.data.set(id, self.FIELD_MAP['author_sort'], ss, row_is_id=True)
|
||||||
|
aum = self.authors_with_sort_strings(id, index_is_id=True)
|
||||||
|
self.data.set(id, self.FIELD_MAP['au_map'],
|
||||||
|
':#:'.join([':::'.join((au.replace(',', '|'), aus)) for (au, aus) in aum]),
|
||||||
|
row_is_id=True)
|
||||||
|
|
||||||
def set_authors(self, id, authors, notify=True, commit=True):
|
def set_authors(self, id, authors, notify=True, commit=True):
|
||||||
'''
|
'''
|
||||||
|
@ -180,6 +180,15 @@ class FieldMetadata(dict):
|
|||||||
'search_terms':['author_sort'],
|
'search_terms':['author_sort'],
|
||||||
'is_custom':False,
|
'is_custom':False,
|
||||||
'is_category':False}),
|
'is_category':False}),
|
||||||
|
('au_map', {'table':None,
|
||||||
|
'column':None,
|
||||||
|
'datatype':'text',
|
||||||
|
'is_multiple':',',
|
||||||
|
'kind':'field',
|
||||||
|
'name':None,
|
||||||
|
'search_terms':[],
|
||||||
|
'is_custom':False,
|
||||||
|
'is_category':False}),
|
||||||
('comments', {'table':None,
|
('comments', {'table':None,
|
||||||
'column':None,
|
'column':None,
|
||||||
'datatype':'text',
|
'datatype':'text',
|
||||||
@ -400,6 +409,12 @@ class FieldMetadata(dict):
|
|||||||
for key in self._tb_cats:
|
for key in self._tb_cats:
|
||||||
yield (key, self._tb_cats[key])
|
yield (key, self._tb_cats[key])
|
||||||
|
|
||||||
|
def custom_iteritems(self):
|
||||||
|
for key in self._tb_cats:
|
||||||
|
fm = self._tb_cats[key]
|
||||||
|
if fm['is_custom']:
|
||||||
|
yield (key, self._tb_cats[key])
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
return list(self.iteritems())
|
return list(self.iteritems())
|
||||||
|
|
||||||
|
@ -756,7 +756,7 @@ class BrowseServer(object):
|
|||||||
sort = self.browse_sort_book_list(items, list_sort)
|
sort = self.browse_sort_book_list(items, list_sort)
|
||||||
ids = [x[0] for x in items]
|
ids = [x[0] for x in items]
|
||||||
html = render_book_list(ids, self.opts.url_prefix,
|
html = render_book_list(ids, self.opts.url_prefix,
|
||||||
suffix=_('in search')+': '+query)
|
suffix=_('in search')+': '+xml(query))
|
||||||
return self.browse_template(sort, category=False, initial_search=query).format(
|
return self.browse_template(sort, category=False, initial_search=query).format(
|
||||||
title=_('Matching books'),
|
title=_('Matching books'),
|
||||||
script='booklist();', main=html)
|
script='booklist();', main=html)
|
||||||
|
@ -87,6 +87,23 @@ class SortedConcatenate(object):
|
|||||||
class SafeSortedConcatenate(SortedConcatenate):
|
class SafeSortedConcatenate(SortedConcatenate):
|
||||||
sep = '|'
|
sep = '|'
|
||||||
|
|
||||||
|
class AumSortedConcatenate(object):
|
||||||
|
'''String concatenation aggregator for the author sort map'''
|
||||||
|
def __init__(self):
|
||||||
|
self.ans = {}
|
||||||
|
|
||||||
|
def step(self, ndx, author, sort):
|
||||||
|
if author is not None:
|
||||||
|
self.ans[ndx] = author + ':::' + sort
|
||||||
|
|
||||||
|
def finalize(self):
|
||||||
|
keys = self.ans.keys()
|
||||||
|
if len(keys) == 0:
|
||||||
|
return None
|
||||||
|
if len(keys) == 1:
|
||||||
|
return self.ans[keys[0]]
|
||||||
|
return ':#:'.join([self.ans[v] for v in sorted(keys)])
|
||||||
|
|
||||||
class Connection(sqlite.Connection):
|
class Connection(sqlite.Connection):
|
||||||
|
|
||||||
def get(self, *args, **kw):
|
def get(self, *args, **kw):
|
||||||
@ -155,6 +172,7 @@ class DBThread(Thread):
|
|||||||
c_ext_loaded = load_c_extensions(self.conn)
|
c_ext_loaded = load_c_extensions(self.conn)
|
||||||
self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
|
self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
|
||||||
self.conn.create_aggregate('concat', 1, Concatenate)
|
self.conn.create_aggregate('concat', 1, Concatenate)
|
||||||
|
self.conn.create_aggregate('aum_sortconcat', 3, AumSortedConcatenate)
|
||||||
if not c_ext_loaded:
|
if not c_ext_loaded:
|
||||||
self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
|
self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
|
||||||
self.conn.create_aggregate('sort_concat', 2, SafeSortedConcatenate)
|
self.conn.create_aggregate('sort_concat', 2, SafeSortedConcatenate)
|
||||||
|
@ -533,17 +533,23 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag
|
|||||||
Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap
|
Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap
|
||||||
paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length
|
paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length
|
||||||
at which a line should be unwrapped. Valid values are a decimal
|
at which a line should be unwrapped. Valid values are a decimal
|
||||||
between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more
|
between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more
|
||||||
text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input.
|
text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
|
||||||
|
|
||||||
Also, they often have headers and footers as part of the document that will become included with the text.
|
Also, they often have headers and footers as part of the document that will become included with the text.
|
||||||
Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
|
Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
|
||||||
removed from the text it can throw off the paragraph unwrapping.
|
removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read
|
||||||
|
:ref:`regexptutorial`.
|
||||||
|
|
||||||
Some limitations of PDF input is complex, multi-column, and image based documents are not supported.
|
Some limitations of PDF input are:
|
||||||
Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to
|
|
||||||
represent double ll or doubfle ff or fi,etc. Conversion of these may or may not work depending on jusy how they are
|
* Complex, multi-column, and image based documents are not supported.
|
||||||
represented internally in the PDF.
|
* Extraction of vector images and tables from within the document is also not supported.
|
||||||
|
* Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
|
||||||
|
* Some PDFs store their images upside down with a rotation instruction, |app| currently doesn't support that instruction, so the images will be rotated in the output as well.
|
||||||
|
|
||||||
|
To re-iterate **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
|
||||||
|
output ranging anywhere from decent to unusable, depending on the input PDF.
|
||||||
|
|
||||||
Comic Book Collections
|
Comic Book Collections
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
msgid ""
|
msgid ""
|
||||||
msgstr ""
|
msgstr ""
|
||||||
"Project-Id-Version: calibre 0.7.38\n"
|
"Project-Id-Version: calibre 0.7.38\n"
|
||||||
"POT-Creation-Date: 2011-01-07 13:12+MST\n"
|
"POT-Creation-Date: 2011-01-08 18:40+MST\n"
|
||||||
"PO-Revision-Date: 2011-01-07 13:12+MST\n"
|
"PO-Revision-Date: 2011-01-08 18:40+MST\n"
|
||||||
"Last-Translator: Automatically generated\n"
|
"Last-Translator: Automatically generated\n"
|
||||||
"Language-Team: LANGUAGE\n"
|
"Language-Team: LANGUAGE\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
@ -2905,28 +2905,29 @@ msgstr ""
|
|||||||
msgid " (Preface)"
|
msgid " (Preface)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:26
|
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:27
|
||||||
msgid ""
|
msgid ""
|
||||||
"Paragraph structure.\n"
|
"Paragraph structure.\n"
|
||||||
"choices are ['auto', 'block', 'single', 'print', 'markdown']\n"
|
"choices are ['auto', 'block', 'single', 'print', 'unformatted']\n"
|
||||||
"* auto: Try to auto detect paragraph type.\n"
|
"* auto: Try to auto detect paragraph type.\n"
|
||||||
"* block: Treat a blank line as a paragraph break.\n"
|
"* block: Treat a blank line as a paragraph break.\n"
|
||||||
"* single: Assume every line is a paragraph.\n"
|
"* single: Assume every line is a paragraph.\n"
|
||||||
"* print: Assume every line starting with 2+ spaces or a tab starts a paragraph."
|
"* print: Assume every line starting with 2+ spaces or a tab starts a paragraph.* unformatted: Most lines have hard line breaks, few/no spaces or indents."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:35
|
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:37
|
||||||
msgid ""
|
msgid ""
|
||||||
"Formatting used within the document.* auto: Try to auto detect the document formatting.\n"
|
"Formatting used within the document.* auto: Automatically decide which formatting processor to use.\n"
|
||||||
"* none: Do not modify the paragraph formatting. Everything is a paragraph.\n"
|
"* none: Do not process the document formatting. Everything is a paragraph and no styling is applied.\n"
|
||||||
"* markdown: Run the input though the markdown pre-processor. To learn more about markdown see"
|
"* heuristic: Process using heuristics to determine formatting such as chapter headings and italic text.\n"
|
||||||
|
"* markdown: Processing using markdown formatting. To learn more about markdown see"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:41
|
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:46
|
||||||
msgid "Normally extra spaces are condensed into a single space. With this option all spaces will be displayed."
|
msgid "Normally extra spaces are condensed into a single space. With this option all spaces will be displayed."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:44
|
#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:49
|
||||||
msgid "Do not insert a Table of Contents into the output text."
|
msgid "Do not insert a Table of Contents into the output text."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
@ -7225,7 +7226,7 @@ msgstr ""
|
|||||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/password_ui.py:65
|
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/password_ui.py:65
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler_ui.py:219
|
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler_ui.py:219
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/preferences/server_ui.py:130
|
#: /home/kovid/work/calibre/src/calibre/gui2/preferences/server_ui.py:130
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:169
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:172
|
||||||
msgid "&Show password"
|
msgid "&Show password"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
@ -10621,48 +10622,56 @@ msgstr ""
|
|||||||
msgid "Mail successfully sent"
|
msgid "Mail successfully sent"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:136
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:95
|
||||||
|
msgid "OK to proceed?"
|
||||||
|
msgstr ""
|
||||||
|
|
||||||
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:96
|
||||||
|
msgid "This will display your email password on the screen. Is it OK to proceed?"
|
||||||
|
msgstr ""
|
||||||
|
|
||||||
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:139
|
||||||
msgid "If you are setting up a new hotmail account, you must log in to it once before you will be able to send mails."
|
msgid "If you are setting up a new hotmail account, you must log in to it once before you will be able to send mails."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:147
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:150
|
||||||
msgid "Setup sending email using"
|
msgid "Setup sending email using"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:149
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:152
|
||||||
msgid "If you don't have an account, you can sign up for a free {name} email account at <a href=\"http://{url}\">http://{url}</a>. {extra}"
|
msgid "If you don't have an account, you can sign up for a free {name} email account at <a href=\"http://{url}\">http://{url}</a>. {extra}"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:156
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:159
|
||||||
msgid "Your %s &email address:"
|
msgid "Your %s &email address:"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:157
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:160
|
||||||
msgid "Your %s &username:"
|
msgid "Your %s &username:"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:158
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:161
|
||||||
msgid "Your %s &password:"
|
msgid "Your %s &password:"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:176
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:179
|
||||||
msgid "If you plan to use email to send books to your Kindle, remember to add the your %s email address to the allowed email addresses in your Amazon.com Kindle management page."
|
msgid "If you plan to use email to send books to your Kindle, remember to add the your %s email address to the allowed email addresses in your Amazon.com Kindle management page."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:183
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:186
|
||||||
msgid "Setup"
|
msgid "Setup"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:198
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:201
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:205
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:208
|
||||||
msgid "Bad configuration"
|
msgid "Bad configuration"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:199
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:202
|
||||||
msgid "You must set the From email address"
|
msgid "You must set the From email address"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:206
|
#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:209
|
||||||
msgid "You must set the username and password for the mail server."
|
msgid "You must set the username and password for the mail server."
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
|
@ -98,9 +98,10 @@ class _Parser(object):
|
|||||||
m = 'Formatter: ' + message + _(' near ')
|
m = 'Formatter: ' + message + _(' near ')
|
||||||
if self.lex_pos > 0:
|
if self.lex_pos > 0:
|
||||||
m = '{0} {1}'.format(m, self.prog[self.lex_pos-1][1])
|
m = '{0} {1}'.format(m, self.prog[self.lex_pos-1][1])
|
||||||
m = '{0} {1}'.format(m, self.prog[self.lex_pos][1])
|
elif self.lex_pos < len(self.prog):
|
||||||
if self.lex_pos < len(self.prog):
|
|
||||||
m = '{0} {1}'.format(m, self.prog[self.lex_pos+1][1])
|
m = '{0} {1}'.format(m, self.prog[self.lex_pos+1][1])
|
||||||
|
else:
|
||||||
|
m = '{0} {1}'.format(m, _('end of program'))
|
||||||
raise ValueError(m)
|
raise ValueError(m)
|
||||||
|
|
||||||
def token(self):
|
def token(self):
|
||||||
|
@ -118,8 +118,9 @@ class SearchQueryParser(object):
|
|||||||
failed.append(test[0])
|
failed.append(test[0])
|
||||||
return failed
|
return failed
|
||||||
|
|
||||||
def __init__(self, locations, test=False):
|
def __init__(self, locations, test=False, optimize=False):
|
||||||
self._tests_failed = False
|
self._tests_failed = False
|
||||||
|
self.optimize = optimize
|
||||||
# Define a token
|
# Define a token
|
||||||
standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
|
standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
|
||||||
locations)
|
locations)
|
||||||
@ -182,38 +183,52 @@ class SearchQueryParser(object):
|
|||||||
# empty the list of searches used for recursion testing
|
# empty the list of searches used for recursion testing
|
||||||
self.recurse_level = 0
|
self.recurse_level = 0
|
||||||
self.searches_seen = set([])
|
self.searches_seen = set([])
|
||||||
return self._parse(query)
|
candidates = self.universal_set()
|
||||||
|
return self._parse(query, candidates)
|
||||||
|
|
||||||
# this parse is used internally because it doesn't clear the
|
# this parse is used internally because it doesn't clear the
|
||||||
# recursive search test list. However, we permit seeing the
|
# recursive search test list. However, we permit seeing the
|
||||||
# same search a few times because the search might appear within
|
# same search a few times because the search might appear within
|
||||||
# another search.
|
# another search.
|
||||||
def _parse(self, query):
|
def _parse(self, query, candidates=None):
|
||||||
self.recurse_level += 1
|
self.recurse_level += 1
|
||||||
res = self._parser.parseString(query)[0]
|
res = self._parser.parseString(query)[0]
|
||||||
t = self.evaluate(res)
|
if candidates is None:
|
||||||
|
candidates = self.universal_set()
|
||||||
|
t = self.evaluate(res, candidates)
|
||||||
self.recurse_level -= 1
|
self.recurse_level -= 1
|
||||||
return t
|
return t
|
||||||
|
|
||||||
def method(self, group_name):
|
def method(self, group_name):
|
||||||
return getattr(self, 'evaluate_'+group_name)
|
return getattr(self, 'evaluate_'+group_name)
|
||||||
|
|
||||||
def evaluate(self, parse_result):
|
def evaluate(self, parse_result, candidates):
|
||||||
return self.method(parse_result.getName())(parse_result)
|
return self.method(parse_result.getName())(parse_result, candidates)
|
||||||
|
|
||||||
def evaluate_and(self, argument):
|
def evaluate_and(self, argument, candidates):
|
||||||
return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
|
# RHS checks only those items matched by LHS
|
||||||
|
# returns result of RHS check: RHmatches(LHmatches(c))
|
||||||
|
# return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
|
||||||
|
l = self.evaluate(argument[0], candidates)
|
||||||
|
return l.intersection(self.evaluate(argument[1], l))
|
||||||
|
|
||||||
def evaluate_or(self, argument):
|
def evaluate_or(self, argument, candidates):
|
||||||
return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
|
# RHS checks only those elements not matched by LHS
|
||||||
|
# returns LHS union RHS: LHmatches(c) + RHmatches(c-LHmatches(c))
|
||||||
|
# return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
|
||||||
|
l = self.evaluate(argument[0], candidates)
|
||||||
|
return l.union(self.evaluate(argument[1], candidates.difference(l)))
|
||||||
|
|
||||||
def evaluate_not(self, argument):
|
def evaluate_not(self, argument, candidates):
|
||||||
return self.universal_set().difference(self.evaluate(argument[0]))
|
# unary op checks only candidates. Result: set of candidates not matching
|
||||||
|
# returns: c - matches(c)
|
||||||
|
# return self.universal_set().difference(self.evaluate(argument[0]))
|
||||||
|
return candidates.difference(self.evaluate(argument[0], candidates))
|
||||||
|
|
||||||
def evaluate_parenthesis(self, argument):
|
def evaluate_parenthesis(self, argument, candidates):
|
||||||
return self.evaluate(argument[0])
|
return self.evaluate(argument[0], candidates)
|
||||||
|
|
||||||
def evaluate_token(self, argument):
|
def evaluate_token(self, argument, candidates):
|
||||||
location = argument[0]
|
location = argument[0]
|
||||||
query = argument[1]
|
query = argument[1]
|
||||||
if location.lower() == 'search':
|
if location.lower() == 'search':
|
||||||
@ -224,17 +239,27 @@ class SearchQueryParser(object):
|
|||||||
raise ParseException(query, len(query), 'undefined saved search', self)
|
raise ParseException(query, len(query), 'undefined saved search', self)
|
||||||
if self.recurse_level > 5:
|
if self.recurse_level > 5:
|
||||||
self.searches_seen.add(query)
|
self.searches_seen.add(query)
|
||||||
return self._parse(saved_searches().lookup(query))
|
return self._parse(saved_searches().lookup(query), candidates)
|
||||||
except: # convert all exceptions (e.g., missing key) to a parse error
|
except: # convert all exceptions (e.g., missing key) to a parse error
|
||||||
raise ParseException(query, len(query), 'undefined saved search', self)
|
raise ParseException(query, len(query), 'undefined saved search', self)
|
||||||
return self.get_matches(location, query)
|
return self._get_matches(location, query, candidates)
|
||||||
|
|
||||||
def get_matches(self, location, query):
|
def _get_matches(self, location, query, candidates):
|
||||||
|
if self.optimize:
|
||||||
|
return self.get_matches(location, query, candidates=candidates)
|
||||||
|
else:
|
||||||
|
return self.get_matches(location, query)
|
||||||
|
|
||||||
|
def get_matches(self, location, query, candidates=None):
|
||||||
'''
|
'''
|
||||||
Should return the set of matches for :param:'location` and :param:`query`.
|
Should return the set of matches for :param:'location` and :param:`query`.
|
||||||
|
|
||||||
|
The search must be performed over all entries if :param:`candidates` is
|
||||||
|
None otherwise only over the items in candidates.
|
||||||
|
|
||||||
:param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
|
:param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
|
||||||
:param:`query` is a string literal.
|
:param:`query` is a string literal.
|
||||||
|
:param:`candidates` is None or a subset of the set returned by :meth:`universal_set`.
|
||||||
'''
|
'''
|
||||||
return set([])
|
return set([])
|
||||||
|
|
||||||
@ -561,7 +586,7 @@ class Tester(SearchQueryParser):
|
|||||||
def universal_set(self):
|
def universal_set(self):
|
||||||
return self._universal_set
|
return self._universal_set
|
||||||
|
|
||||||
def get_matches(self, location, query):
|
def get_matches(self, location, query, candidates=None):
|
||||||
location = location.lower()
|
location = location.lower()
|
||||||
if location in self.fields.keys():
|
if location in self.fields.keys():
|
||||||
getter = operator.itemgetter(self.fields[location])
|
getter = operator.itemgetter(self.fields[location])
|
||||||
@ -573,8 +598,13 @@ class Tester(SearchQueryParser):
|
|||||||
if not query:
|
if not query:
|
||||||
return set([])
|
return set([])
|
||||||
query = query.lower()
|
query = query.lower()
|
||||||
return set(key for key, val in self.texts.items() \
|
if candidates:
|
||||||
if query and query in getattr(getter(val), 'lower', lambda : '')())
|
return set(key for key, val in self.texts.items() \
|
||||||
|
if key in candidates and query and query
|
||||||
|
in getattr(getter(val), 'lower', lambda : '')())
|
||||||
|
else:
|
||||||
|
return set(key for key, val in self.texts.items() \
|
||||||
|
if query and query in getattr(getter(val), 'lower', lambda : '')())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -592,6 +622,7 @@ class Tester(SearchQueryParser):
|
|||||||
|
|
||||||
|
|
||||||
def main(args=sys.argv):
|
def main(args=sys.argv):
|
||||||
|
print 'testing unoptimized'
|
||||||
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
|
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
|
||||||
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
|
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
|
||||||
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
|
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
|
||||||
@ -601,6 +632,16 @@ def main(args=sys.argv):
|
|||||||
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
|
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
print '\n\ntesting optimized'
|
||||||
|
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
|
||||||
|
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
|
||||||
|
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
|
||||||
|
'all', 'search'], test=True, optimize=True)
|
||||||
|
failed = tester.run_tests()
|
||||||
|
if tester._tests_failed or failed:
|
||||||
|
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
|
||||||
|
return 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user