Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit 81d8ac6dbf: KG updates
@@ -26,7 +26,7 @@ var current_library_request = null;
 
 ////////////////////////////// GET BOOK LIST //////////////////////////////
 
-var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds
+var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds
 
 function create_table_headers() {
     var thead = $('table#book_list thead tr');
@@ -114,3 +114,11 @@ add_new_book_tags_when_importing_books = False
 # Set the maximum number of tags to show per book in the content server
 max_content_server_tags_shown=5
 
+# Set the maximum number of sort 'levels' that calibre will use to resort the
+# library after certain operations such as searches or device insertion. Each
+# sort level adds a performance penalty. If the database is large (thousands of
+# books) the penalty might be noticeable. If you are not concerned about multi-
+# level sorts, and if you are seeing a slowdown, reduce the value of this tweak.
+maximum_resort_levels = 5
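The multi-level resort that this tweak bounds can be pictured with Python's stable sort: one single-key pass per retained level, applied from the least to the most significant key, reproduces a multi-key ordering, so each extra level costs another pass over the whole library. A minimal standalone sketch, not calibre's code (the book dicts are made up):

    from operator import itemgetter

    books = [
        {'title': 'B', 'author': 'X'},
        {'title': 'A', 'author': 'Y'},
        {'title': 'A', 'author': 'X'},
    ]

    sort_history = ['title', 'author', 'date']  # newest sort key first
    maximum_resort_levels = 2                   # keep the two newest levels

    # Python's sort is stable, so applying the retained keys from the
    # oldest to the newest yields a proper multi-level ordering.
    for key in reversed(sort_history[:maximum_resort_levels]):
        books.sort(key=itemgetter(key))

    print(books)  # ordered by title, ties broken by author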
@@ -1,12 +1,8 @@
-#!/usr/bin/env python
-
 __license__   = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 infobae.com
 '''
-import re
-import urllib, urlparse
 
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -20,34 +16,23 @@ class Infobae(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets        = True
     use_embedded_content  = False
     language              = 'es'
-    lang                  = 'es-AR'
-
     encoding              = 'cp1252'
-    cover_url             = 'http://www.infobae.com/imgs/header/header.gif'
+    masthead_url          = 'http://www.infobae.com/imgs/header/header.gif'
     remove_javascript     = True
-    preprocess_regexps = [(re.compile(
-        r'<meta name="Description" content="[^"]+">'), lambda m:'')]
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-    extra_css = '''
-        .col-center{font-family:Arial,Helvetica,sans-serif;}
-        h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
-        .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
-    '''
-
-    keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
-        dict(name='a', attrs={'name' : 'comentario',}),
-        dict(name='iframe'),
-        dict(name='img', alt = "Ver galerias de imagenes"),
-    ]
+    remove_empty_feeds    = True
+    extra_css = '''
+        body{font-family:Arial,Helvetica,sans-serif;}
+        .popUpTitulo{color:#0D4261; font-size: xx-large}
+    '''
+
+    conversion_options = {
+        'comment'          : description
+      , 'tags'             : category
+      , 'publisher'        : publisher
+      , 'language'         : language
+      , 'linearize_tables' : True
+    }
 
     feeds = [
@@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
         ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
     ]
 
-    # def print_version(self, url):
-    #     main, sep, article_part = url.partition('contenidos/')
-    #     article_id, rsep, rrest = article_part.partition('-')
-    #     return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
-    def get_article_url(self, article):
-        ans = article.get('link').encode('utf-8')
-        parts = list(urlparse.urlparse(ans))
-        parts[2] = urllib.quote(parts[2])
-        ans = urlparse.urlunparse(parts)
-        return ans.decode('utf-8')
-
-    def preprocess_html(self, soup):
-        for tag in soup.head.findAll('strong'):
-            tag.extract()
-        for tag in soup.findAll('meta'):
-            del tag['content']
-            tag.extract()
-        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
-        soup.head.insert(0,mtag)
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def print_version(self, url):
+        article_part = url.rpartition('/')[2]
+        article_id= article_part.partition('-')[0]
+        return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
 
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name='strong'):
             tag.name = 'b'
         return soup
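The new print_version derives the printer-friendly URL from the article slug alone, with no network round trip. The string handling, traced on a made-up article URL:

    url = 'http://www.infobae.com/notas/510771-Titular-de-la-nota'  # hypothetical

    article_part = url.rpartition('/')[2]        # '510771-Titular-de-la-nota'
    article_id = article_part.partition('-')[0]  # '510771'
    print('http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id)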
@@ -6,6 +6,7 @@ nspm.rs
 
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import NavigableString
 
 class Nspm(BasicNewsRecipe):
     title                 = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
     encoding              = 'utf-8'
     language              = 'sr'
     delay                 = 2
+    remove_empty_feeds    = True
     publication_type      = 'magazine'
     masthead_url          = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
     extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
                        dict(name=['link','object','embed','script','meta','base','iframe'])
                       ,dict(attrs={'class':'buttonheading'})
                      ]
-    remove_tags_after  = dict(attrs={'class':'article_separator'})
-    remove_attributes  = ['width','height']
+    remove_tags_before = dict(attrs={'class':'contentheading'})
+    remove_tags_after  = dict(attrs={'class':'article_separator'})
+    remove_attributes  = ['width','height']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.body.findAll(style=True):
             del item['style']
+        for item in soup.body.findAll('h1'):
+            nh = NavigableString(item.a.string)
+            item.a.extract()
+            item.insert(0,nh)
         return self.adeify_images(soup)
@@ -35,7 +35,7 @@ class XkcdCom(BasicNewsRecipe):
                 'date': item['title'],
                 'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
                 'url': 'http://xkcd.com' + item['href'],
-                'title': self.tag_to_string(item).encode('UTF-8'),
+                'title': self.tag_to_string(item),
                 'description': '',
                 'content': '',
             })
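The timestamp line kept as context here parses the ISO date from each archive entry's title attribute and adds one second so entries sort predictably; the same calls in isolation, with a made-up date:

    import time

    title = '2010-05-01'  # sample title attribute
    timestamp = time.mktime(time.strptime(title, '%Y-%m-%d')) + 1
    print(timestamp)  # seconds since the epoch (local time), plus one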
@@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY
 from calibre.devices.binatone.driver import README
 from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
 from calibre.devices.edge.driver import EDGE
-from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS
+from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS
 from calibre.devices.sne.driver import SNE
 from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI
 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
@@ -557,6 +557,7 @@ plugins += [
    TECLAST_K3,
    NEWSMY,
    IPAPYRUS,
+   SOVOS,
    EDGE,
    SNE,
    ALEX,
@@ -44,16 +44,17 @@ class Book(MetaInformation):
         self.mime = mime
 
         self.size = size # will be set later if None
-        try:
-            if ContentType == '6':
-                self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
-            else:
-                self.datetime = time.gmtime(os.path.getctime(self.path))
-        except:
-            self.datetime = time.gmtime()
 
-        if thumbnail_name is not None:
-            self.thumbnail = ImageWrapper(thumbnail_name)
+        if ContentType == '6':
+            self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
+        else:
+            try:
+                self.datetime = time.gmtime(os.path.getctime(self.path))
+            except:
+                self.datetime = time.gmtime()
+
+        if thumbnail_name is not None:
+            self.thumbnail = ImageWrapper(thumbnail_name)
         self.tags = []
         if other:
             self.smart_update(other)
@@ -106,11 +106,14 @@ class KOBO(USBMS):
                             changed = True
                         bl[idx].device_collections = playlist_map.get(lpath, [])
                     else:
-                        book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
+                        if ContentType == '6':
+                            book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576)
+                        else:
+                            book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
                         # print 'Update booklist'
+                        book.device_collections = playlist_map.get(book.lpath, [])
                         if bl.add_book(book, replace_metadata=False):
                             changed = True
-                        book.device_collections = playlist_map.get(book.lpath, [])
             except: # Probably a path encoding error
                 import traceback
                 traceback.print_exc()
@@ -231,21 +234,9 @@ class KOBO(USBMS):
         path = self.normalize_path(path)
         # print "Delete file normalized path: " + path
         extension =  os.path.splitext(path)[1]
+        ContentType = self.get_content_type_from_extension(extension)
 
-        if extension == '.kobo':
-            # Kobo books do not have book files.  They do have some images though
-            #print "kobo book"
-            ContentType = 6
-            ContentID = self.contentid_from_path(path, ContentType)
-        elif extension == '.pdf' or extension == '.epub':
-            # print "ePub or pdf"
-            ContentType = 16
-            #print "Path: " + path
-            ContentID = self.contentid_from_path(path, ContentType)
-            # print "ContentID: " + ContentID
-        else: # if extension == '.html' or extension == '.txt':
-            ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored
-            ContentID = self.contentid_from_path(path, ContentType)
+        ContentID = self.contentid_from_path(path, ContentType)
 
         ImageID = self.delete_via_sql(ContentID, ContentType)
         #print " We would now delete the Images for" + ImageID
@@ -343,6 +334,17 @@ class KOBO(USBMS):
         ContentID = ContentID.replace("\\", '/')
         return ContentID
 
+    def get_content_type_from_extension(self, extension):
+        if extension == '.kobo':
+            # Kobo books do not have book files.  They do have some images though
+            #print "kobo book"
+            ContentType = 6
+        elif extension == '.pdf' or extension == '.epub':
+            # print "ePub or pdf"
+            ContentType = 16
+        else: # if extension == '.html' or extension == '.txt':
+            ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored
+        return ContentType
+
     def path_from_contentid(self, ContentID, ContentType, oncard):
         path = ContentID
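Factoring the extension test into get_content_type_from_extension gives the delete path, and any future caller, a single extension-to-ContentType mapping. The same logic, table-driven, as a standalone sketch rather than the driver's actual code:

    def get_content_type_from_extension(extension):
        # .kobo entries are database-only books, .epub/.pdf are regular
        # files; everything else falls back to the 999 workaround.
        mapping = {'.kobo': 6, '.pdf': 16, '.epub': 16}
        return mapping.get(extension, 999)

    assert get_content_type_from_extension('.epub') == 16
    assert get_content_type_from_extension('.txt') == 999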
@@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3):
     VENDOR_NAME = 'E_READER'
     WINDOWS_MAIN_MEM = ''
+
+class SOVOS(TECLAST_K3):
+
+    name = 'Sovos device interface'
+    gui_name = 'Sovos'
+    description    = _('Communicate with the Sovos reader.')
+
+    FORMATS = ['epub', 'fb2', 'pdf', 'txt']
+
+    VENDOR_NAME      = 'RK28XX'
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'
@@ -132,7 +132,11 @@ class CHMReader(CHMFile):
         for path in self.Contents():
             lpath = os.path.join(output_dir, path)
             self._ensure_dir(lpath)
-            data = self.GetFile(path)
+            try:
+                data = self.GetFile(path)
+            except:
+                self.log.exception('Failed to extract %s from CHM, ignoring'%path)
+                continue
             if lpath.find(';') != -1:
                 # fix file names with ";<junk>" at the end, see _reformat()
                 lpath = lpath.split(';')[0]
@@ -122,7 +122,7 @@ def add_pipeline_options(parser, plumber):
               'font_size_mapping',
               'line_height',
               'linearize_tables',
-              'extra_css',
+              'extra_css', 'smarten_punctuation',
               'margin_top', 'margin_left', 'margin_right',
               'margin_bottom', 'change_justification',
               'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
@@ -362,6 +362,14 @@ OptionRecommendation(name='preprocess_html',
             )
         ),
 
+OptionRecommendation(name='smarten_punctuation',
+        recommended_value=False, level=OptionRecommendation.LOW,
+        help=_('Convert plain quotes, dashes and ellipsis to their '
+               'typographically correct equivalents. For details, see '
+               'http://daringfireball.net/projects/smartypants'
+            )
+        ),
+
 OptionRecommendation(name='remove_header',
         recommended_value=False, level=OptionRecommendation.LOW,
         help=_('Use a regular expression to try and remove the header.'
@@ -75,6 +75,8 @@ def line_length(format, raw, percent):
         linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
     elif format == 'pdf':
         linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+    elif format == 'spanned_html':
+        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
     lines = linere.findall(raw)
 
     lengths = []
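line_length collects the text of each delimited 'line' and returns the length found at the requested fraction of the distribution (callers mostly pass 0.4, just below the median). A simplified stand-in for the new spanned_html branch, not the real function:

    import re

    def approx_line_length(raw, percent):
        # Gather span-delimited lines, then take the length at the given
        # fraction of the sorted length distribution (0.5 ~ median).
        lines = re.findall('(?<=<span).*?(?=</span>)', raw, re.DOTALL)
        lengths = sorted(len(line) for line in lines if line)
        if not lengths:
            return 0
        index = min(int(len(lengths) * percent), len(lengths) - 1)
        return lengths[index]

    html = '<span>a short line</span><span>a rather longer wrapped line</span>'
    print(approx_line_length(html, 0.4))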
@@ -166,6 +168,17 @@ class HTMLPreProcessor(object):
                   (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                   (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                   (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+
+                  # ` with letter before
+                  (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
+                  (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
+                  (re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
+                  (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
+                  (re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
+                  (re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
+                  (re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
+                  (re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
+                  (re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
+                  (re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
 
                   # ´
                   (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
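These rules repair pdftohtml output where an accent ends up detached from its base letter; the new block covers the letter-before-accent order. The mechanism in miniature, on a made-up fragment:

    import re

    # A grave accent split from its base letter by a <br>, letter first.
    rule = (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à')
    print(rule[0].sub(rule[1], u'voil a<br>`'))  # -> voil à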
@@ -213,30 +226,29 @@ class HTMLPreProcessor(object):
                   (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                   (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
 
+                  # If pdf printed from a browser then the header/footer has a reliable pattern
+                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+
+                  # Center separator lines
+                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
-                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
-                  # Replace <br><br> with <p>
-                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
-
-                  # Remove hyphenation
-                  (re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
+                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'),
 
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
+                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                  # Cover the case where every letter in a chapter title is separated by a space
+                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
 
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
                   # Clean up spaces
                   (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
-                  # Connect paragraphs split by -
-                  (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
                   # Add space before and after italics
                   (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
                   (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
@@ -317,12 +329,29 @@ class HTMLPreProcessor(object):
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()
 
+        # unwrap hyphenation - moved here so it's executed after header/footer removal
+        if is_pdftohtml:
+            # unwrap visible dashes and hyphens - don't delete they are often hyphens for
+            # for compound words, formatting, etc
+            end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens
+            end_rules.append((re.compile(u'[\xad](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens with formatting
+            end_rules.append((re.compile(u'[\xad]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+
+        # Make the more aggressive chapter marking regex optional with the preprocess option to
+        # reduce false positives and move after header/footer removal
+        if getattr(self.extra_opts, 'preprocess_html', None):
+            if is_pdftohtml:
+                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
+
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
             if length:
+                # print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
@@ -372,5 +401,14 @@ class HTMLPreProcessor(object):
         if self.plugin_preprocess:
             html = self.input_plugin_preprocess(html)
 
+        if getattr(self.extra_opts, 'smarten_punctuation', False):
+            html = self.smarten_punctuation(html)
+
         return html
 
+    def smarten_punctuation(self, html):
+        from calibre.utils.smartypants import smartyPants
+        from calibre.ebooks.chardet import substitute_entites
+        html = smartyPants(html)
+        return substitute_entites(html)
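smarten_punctuation is a thin two-step wrapper: SmartyPants rewrites plain ASCII punctuation into numeric entities, and substitute_entites folds those entities back into literal characters. A self-contained sketch of the second step, with a stub standing in for calibre's substitute_entites and a sample of SmartyPants-style output:

    def substitute_entities_stub(html):
        # Stand-in: the real helper decodes any numeric entity.
        for entity, char in {'&#8220;': u'\u201c', '&#8221;': u'\u201d',
                             '&#8212;': u'\u2014', '&#8230;': u'\u2026'}.items():
            html = html.replace(entity, char)
        return html

    smartied = 'He said, &#8220;wait&#8221; &#8212; then left&#8230;'
    print(substitute_entities_stub(smartied))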
src/calibre/ebooks/conversion/utils.py (new file, 173 lines)
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+
+class PreProcessor(object):
+
+    def __init__(self, log=None):
+        self.log = default_log if log is None else log
+        self.html_preprocess_sections = 0
+        self.found_indents = 0
+
+    def chapter_head(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+            return '<h2>'+chap+'</h2>\n'
+        else:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+            return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
+    def chapter_break(self, match):
+        chap = match.group('section')
+        styles = match.group('styles')
+        self.html_preprocess_sections = self.html_preprocess_sections + 1
+        self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+        return '<'+styles+' style="page-break-before:always">'+chap
+
+    def insert_indent(self, match):
+        pstyle = match.group('formatting')
+        span = match.group('span')
+        self.found_indents = self.found_indents + 1
+        if pstyle:
+            if not span:
+                return '<p '+pstyle+' style="text-indent:3%">'
+            else:
+                return '<p '+pstyle+' style="text-indent:3%">'+span
+        else:
+            if not span:
+                return '<p style="text-indent:3%">'
+            else:
+                return '<p style="text-indent:3%">'+span
+
+    def no_markup(self, raw, percent):
+        '''
+        Detects total marked up line endings in the file. raw is the text to
+        inspect. Percent is the minimum percent of line endings which should
+        be marked up to return true.
+        '''
+        htm_end_ere = re.compile('</p>', re.DOTALL)
+        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+        htm_end = htm_end_ere.findall(raw)
+        line_end = line_end_ere.findall(raw)
+        tot_htm_ends = len(htm_end)
+        tot_ln_fds = len(line_end)
+        self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
+
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0
+
+        min_lns = tot_ln_fds * percent
+        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
+        if min_lns > tot_htm_ends:
+            return True
+
+    def __call__(self, html):
+        self.log("********* Preprocessing HTML *********")
+        # Replace series of non-breaking spaces with text-indent
+        txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+        html = txtindent.sub(self.insert_indent, html)
+        if self.found_indents > 1:
+            self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+        # remove remaining non-breaking spaces
+        html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+
+        # some lit files don't have any <p> tags or equivalent (generally just plain text between
+        # <pre> tags), check and mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+            self.log("not enough paragraph markers, adding now")
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+
+        # detect chapters/sections to match xpath or splitting logic
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
+        #
+        # Start with most typical chapter headings, get more aggressive until one works
+        if self.html_preprocess_sections < 10:
+            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+            html = chapdetect.sub(self.chapter_head, html)
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
+
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
+
+        # Unwrap lines
+        #
+        self.log("Unwrapping Lines")
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras = len(paras_reg.findall(html))
+        spans = len(spans_reg.findall(html))
+        if spans > 1:
+            if float(paras) / float(spans) < 0.75:
+                format = 'spanned_html'
+            else:
+                format = 'html'
+        else:
+            format = 'html'
+
+        # Calculate Length
+        length = line_length(format, html, 0.4)
+        self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+        #
+        # Unwrap and/or delete soft-hyphens, hyphens
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+        html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+        # Unwrap lines using punctation if the median length of all lines is less than 200
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        html = unwrap.sub(' ', html)
+
+        # If still no sections after unwrapping mark split points on lines with no punctuation
+        if self.html_preprocess_sections < 10:
+            self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
+            #self.log(html)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_break, html)
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+
+        return html
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
     def preprocess_html(self, html):
-        if not hasattr(self, 'log'):
-            from calibre.utils.logging import default_log
-            self.log = default_log
-        self.log("********* Preprocessing HTML *********")
-        # Detect Chapters to match the xpath in the GUI
-        chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-        html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        #
-        # Insert extra line feeds so the line length regex functions properly
-        html = re.sub(r"</p>", "</p>\n", html)
-        length = line_length('html', html, 0.4)
-        self.log.debug("*** Median length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            html = unwrap.sub(' ', html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)
@@ -6,10 +6,9 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
-
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 
 class LITInput(InputFormatPlugin):
@@ -55,18 +54,6 @@ class LITInput(InputFormatPlugin):
 
     def preprocess_html(self, html):
-        self.log("********* Preprocessing HTML *********")
-        # Detect Chapters to match the xpath in the GUI
-        chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-        html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        #
-        # Insert extra line feeds so the line length regex functions properly
-        html = re.sub(r"</p>", "</p>\n", html)
-        length = line_length('html', html, 0.4)
-        self.log("*** Median length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            html = unwrap.sub(' ', html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)
@@ -3,6 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import re
 from calibre.customize.conversion import InputFormatPlugin
 
 class MOBIInput(InputFormatPlugin):
@@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
                 include_meta_content_type=False))
         accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
+
+    def preprocess_html(self, html):
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        return html
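The heading-demotion regex can be exercised in isolation: when an h1/h2 is immediately followed by another top-level heading, the second becomes an h3 so chapter splitting keeps them together. A quick check on a made-up fragment:

    import re

    doubleheading = re.compile(
        r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)'
        r'<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)

    html = '<h1>Chapter One</h1><h2>The Beginning</h2>'
    print(doubleheading.sub(r'\g<firsthead><h3\g<secondhead></h3>', html))
    # <h1>Chapter One</h1><h3>The Beginning</h3>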
@@ -138,6 +138,7 @@ class CSSFlattener(object):
                       float(self.context.margin_left))
             bs.append('margin-right : %fpt'%\
                       float(self.context.margin_right))
+            bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
             if self.context.change_justification != 'original':
                 bs.append('text-align: '+ self.context.change_justification)
             body.set('style', '; '.join(bs))
@@ -21,7 +21,7 @@ class Reader(FormatReader):
         self.options = options
         setattr(self.options, 'new_pdf_engine', False)
         setattr(self.options, 'no_images', False)
-        setattr(self.options, 'unwrap_factor', 0.5)
+        setattr(self.options, 'unwrap_factor', 0.45)
 
     def extract_content(self, output_dir):
         self.log.info('Extracting PDF...')
@@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
     options = set([
         OptionRecommendation(name='no_images', recommended_value=False,
             help=_('Do not extract images from the document')),
-        OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
+        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
             help=_('Scale used to determine the length at which a line should '
                    'be unwrapped. Valid values are a decimal between 0 and 1. The '
-                   'default is 0.5, this is the median line length.')),
+                   'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
             help=_('Use the new PDF conversion engine.'))
     ])
@@ -207,6 +207,7 @@ class PML_HTMLizer(object):
         while html != old:
             old = html
             html = self.cleanup_html_remove_redundant(html)
+        html = re.sub(r'(?imu)^\s*', '', html)
         return html
 
     def cleanup_html_remove_redundant(self, html):
@@ -216,7 +217,7 @@ class PML_HTMLizer(object):
                 html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
             else:
                 html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
-        html = re.sub(r'<p>\s*</p>', '', html)
+        html = re.sub(r'(?imu)<p>\s*</p>', '', html)
         return html
 
     def start_line(self):
@@ -556,7 +557,7 @@ class PML_HTMLizer(object):
                     text = t
                 else:
                     self.toc.add_item(os.path.basename(self.file_name), id, value)
-                    text = '<span id="%s"></span>%s' % (id, t)
+                    text = '%s<span id="%s"></span>' % (t, id)
             elif c == 'm':
                 empty = False
                 src = self.code_value(line)
@@ -7,7 +7,7 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class InlineClass(etree.XSLTExtension):
@@ -229,16 +229,8 @@ class RTFInput(InputFormatPlugin):
         res = transform.tostring(result)
         res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
         if self.options.preprocess_html:
-            self.log("********* Preprocessing HTML *********")
-            # Detect Chapters to match the xpath in the GUI
-            chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE)
-            res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
-            # Unwrap lines using punctation if the median length of all lines is less than 150
-            length = line_length('html', res, 0.4)
-            self.log("*** Median length is " + str(length) + " ***")
-            unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE)
-            if length < 150:
-                res = unwrap.sub(' ', res)
+            preprocessor = PreProcessor(log=getattr(self, 'log', None))
+            res = preprocessor(res)
         f.write(res)
         self.write_inline_css(inline_class)
         stream.seek(0)
@@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):
 
 def preserve_spaces(txt):
     txt = txt.replace('  ', '&nbsp; ')
-    txt = txt.replace('\t', '&#09;')
+    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
     return txt
 
 def opf_writer(path, opf_name, manifest, spine, mi):
@@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction):
         dest_id, src_books, src_ids = self.books_to_merge(rows)
         if safe_merge:
             if not confirm('<p>'+_(
-                'All book formats and metadata from the selected books '
-                'will be added to the <b>first selected book.</b><br><br> '
+                'Book formats and metadata from the selected books '
+                'will be added to the <b>first selected book.</b> '
+                'ISBN will <i>not</i> be merged.<br><br> '
                 'The second and subsequently selected books will not '
                 'be deleted or changed.<br><br>'
                 'Please confirm you want to proceed.')
@@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction):
             self.merge_metadata(dest_id, src_ids)
         else:
             if not confirm('<p>'+_(
-                'All book formats and metadata from the selected books will be merged '
-                'into the <b>first selected book</b>.<br><br>'
+                'Book formats and metadata from the selected books will be merged '
+                'into the <b>first selected book</b>. '
+                'ISBN will <i>not</i> be merged.<br><br>'
                 'After merger the second and '
                 'subsequently selected books will be <b>deleted</b>. <br><br>'
                 'All book formats of the first selected book will be kept '
@@ -22,7 +22,7 @@ class LookAndFeelWidget(Widget, Ui_Form):
         Widget.__init__(self, parent,
                 ['change_justification', 'extra_css', 'base_font_size',
                     'font_size_mapping', 'line_height',
-                    'linearize_tables',
+                    'linearize_tables', 'smarten_punctuation',
                     'disable_font_rescaling', 'insert_blank_line',
                     'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding',
                     'asciiize', 'keep_ligatures']
@@ -178,7 +178,7 @@
       </property>
      </widget>
     </item>
-    <item row="9" column="0" colspan="4">
+    <item row="10" column="0" colspan="4">
     <widget class="QGroupBox" name="groupBox">
      <property name="title">
       <string>Extra &amp;CSS</string>
@@ -214,6 +214,13 @@
       </property>
      </widget>
     </item>
+    <item row="9" column="0">
+     <widget class="QCheckBox" name="opt_smarten_punctuation">
+      <property name="text">
+       <string>Smarten &amp;punctuation</string>
+      </property>
+     </widget>
+    </item>
    </layout>
   </widget>
   <resources>
@@ -46,7 +46,7 @@
        <double>0.010000000000000</double>
       </property>
       <property name="value">
-       <double>0.500000000000000</double>
+       <double>0.450000000000000</double>
      </property>
     </widget>
    </item>
@@ -155,6 +155,7 @@ class CoverFlowMixin(object):
             self.cb_splitter.action_toggle.triggered.connect(self.toggle_cover_browser)
             if CoverFlow is not None:
                 self.cover_flow.stop.connect(self.hide_cover_browser)
+                self.cover_flow.setVisible(False)
         else:
             self.cb_splitter.insertWidget(self.cb_splitter.side_index, self.cover_flow)
             if CoverFlow is not None:
@@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{
     def set_device_connected(self, is_connected):
         self.device_connected = is_connected
         self.db.refresh_ondevice()
-        self.refresh()
+        self.refresh() # does a resort()
         self.research()
-        if is_connected and self.sorted_on[0] == 'ondevice':
-            self.resort()
 
     def set_book_on_device_func(self, func):
         self.book_on_device = func
@@ -264,19 +262,15 @@ class BooksModel(QAbstractTableModel): # {{{
         self.sorting_done.emit(self.db.index)
 
     def refresh(self, reset=True):
-        try:
-            col = self.column_map.index(self.sorted_on[0])
-        except:
-            col = 0
         self.db.refresh(field=None)
-        self.sort(col, self.sorted_on[1], reset=reset)
+        self.resort(reset=reset)
 
     def resort(self, reset=True):
-        try:
-            col = self.column_map.index(self.sorted_on[0])
-        except ValueError:
-            col = 0
-        self.sort(col, self.sorted_on[1], reset=reset)
+        if not self.db:
+            return
+        self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']])
+        if reset:
+            self.reset()
 
     def research(self, reset=True):
         self.search(self.last_search, reset=reset)
@@ -1030,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{
         if reset:
             self.reset()
 
+    def resort(self, reset=True):
+        if self.sorted_on:
+            self.sort(self.column_map.index(self.sorted_on[0]),
+                      self.sorted_on[1], reset=reset)
+
     def columnCount(self, parent):
         if parent and parent.isValid():
             return 0
@@ -512,7 +512,8 @@ class TagsModel(QAbstractItemModel): # {{{
                         _('The saved search name %s is already used.')%val).exec_()
                 return False
             saved_searches().rename(unicode(item.data(role).toString()), val)
-            self.tags_view.search_item_renamed.emit()
+            item.tag.name = val
+            self.tags_view.search_item_renamed.emit() # Does a refresh
         else:
             if key == 'series':
                 self.db.rename_series(item.tag.id, val)
@@ -526,8 +527,8 @@ class TagsModel(QAbstractItemModel): # {{{
                 self.db.rename_custom_item(item.tag.id, val,
                                     label=self.db.field_metadata[key]['label'])
             self.tags_view.tag_item_renamed.emit()
             item.tag.name = val
             self.refresh() # Should work, because no categories can have disappeared
         if path:
             idx = self.index_for_path(path)
             if idx.isValid():
@@ -669,7 +670,7 @@ class TagBrowserMixin(object): # {{{
         self.tags_view.saved_search_edit.connect(self.do_saved_search_edit)
         self.tags_view.author_sort_edit.connect(self.do_author_sort_edit)
         self.tags_view.tag_item_renamed.connect(self.do_tag_item_renamed)
-        self.tags_view.search_item_renamed.connect(self.saved_search.clear_to_help)
+        self.tags_view.search_item_renamed.connect(self.saved_searches_changed)
         self.edit_categories.clicked.connect(lambda x:
                 self.do_user_categories_edit())

@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re, itertools, functools
+import re, itertools
 from itertools import repeat
 from datetime import timedelta
 from threading import Thread, RLock
@@ -112,7 +112,7 @@ class ResultCache(SearchQueryParser):
     '''
     def __init__(self, FIELD_MAP, field_metadata):
         self.FIELD_MAP = FIELD_MAP
-        self._map = self._map_filtered = self._data = []
+        self._map = self._data = self._map_filtered = []
         self.first_sort = True
         self.search_restriction = ''
         self.field_metadata = field_metadata
@@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser):
         for x in self.iterall():
             yield x[idx]

+    # Search functions {{{
+
     def universal_set(self):
         return set([i[0] for i in self._data if i is not None])

@@ -462,12 +464,43 @@ class ResultCache(SearchQueryParser):
                 continue
         return matches

+    def search(self, query, return_matches=False):
+        ans = self.search_getting_ids(query, self.search_restriction)
+        if return_matches:
+            return ans
+        self._map_filtered = ans
+
+    def search_getting_ids(self, query, search_restriction):
+        q = ''
+        if not query or not query.strip():
+            q = search_restriction
+        else:
+            q = query
+            if search_restriction:
+                q = u'%s (%s)' % (search_restriction, query)
+        if not q:
+            return list(self._map)
+        matches = self.parse(q)
+        tmap = list(itertools.repeat(False, len(self._data)))
+        for x in matches:
+            tmap[x] = True
+        return [x for x in self._map if tmap[x]]
+
+    def set_search_restriction(self, s):
+        self.search_restriction = s
+
+    # }}}

     def remove(self, id):
         self._data[id] = None
-        if id in self._map:
+        try:
             self._map.remove(id)
-        if id in self._map_filtered:
+        except ValueError:
+            pass
+        try:
             self._map_filtered.remove(id)
+        except ValueError:
+            pass

     def set(self, row, col, val, row_is_id=False):
         id = row if row_is_id else self._map_filtered[row]
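Note how the search restriction is composed with the user query purely at the query-syntax level: the effective query becomes restriction (query), so the same parser enforces both. A standalone sketch of that composition rule (a hypothetical helper, not part of the commit):

    def compose(query, restriction):
        # Mirrors the rule in search_getting_ids above, in isolation.
        if not query or not query.strip():
            return restriction
        if restriction:
            return u'%s (%s)' % (restriction, query)
        return query

    assert compose(u'tags:fiction', u'rating:>3') == u'rating:>3 (tags:fiction)'
    assert compose(u'', u'rating:>3') == u'rating:>3'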
@@ -522,9 +555,7 @@ class ResultCache(SearchQueryParser):

     def books_deleted(self, ids):
         for id in ids:
-            self._data[id] = None
-            if id in self._map: self._map.remove(id)
-            if id in self._map_filtered: self._map_filtered.remove(id)
+            self.remove(id)

     def count(self):
         return len(self._map)
@@ -549,90 +580,97 @@ class ResultCache(SearchQueryParser):
             self.sort(field, ascending)
         self._map_filtered = list(self._map)
         if self.search_restriction:
-            self.search('', return_matches=False, ignore_search_restriction=False)
+            self.search('', return_matches=False)

-    def seriescmp(self, sidx, siidx, x, y, library_order=None):
-        try:
-            if library_order:
-                ans = cmp(title_sort(self._data[x][sidx].lower()),
-                          title_sort(self._data[y][sidx].lower()))
-            else:
-                ans = cmp(self._data[x][sidx].lower(),
-                          self._data[y][sidx].lower())
-        except AttributeError: # Some entries may be None
-            ans = cmp(self._data[x][sidx], self._data[y][sidx])
-        if ans != 0: return ans
-        return cmp(self._data[x][siidx], self._data[y][siidx])
-
-    def cmp(self, loc, x, y, asstr=True, subsort=False):
-        try:
-            ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \
-                asstr else cmp(self._data[x][loc], self._data[y][loc])
-        except AttributeError: # Some entries may be None
-            ans = cmp(self._data[x][loc], self._data[y][loc])
-        except TypeError: ## raised when a datetime is None
-            x = self._data[x][loc]
-            if x is None:
-                x = UNDEFINED_DATE
-            y = self._data[y][loc]
-            if y is None:
-                y = UNDEFINED_DATE
-            return cmp(x, y)
-        if subsort and ans == 0:
-            return cmp(self._data[x][11].lower(), self._data[y][11].lower())
-        return ans
+    # Sorting functions {{{
+
+    def sanitize_sort_field_name(self, field):
+        field = field.lower().strip()
+        if field not in self.field_metadata.iterkeys():
+            if field in ('author', 'tag', 'comment'):
+                field += 's'
+        if field == 'date': field = 'timestamp'
+        elif field == 'title': field = 'sort'
+        elif field == 'authors': field = 'author_sort'
+        return field

     def sort(self, field, ascending, subsort=False):
-        field = field.lower().strip()
-        if field in ('author', 'tag', 'comment'):
-            field += 's'
-        if field == 'date': field = 'timestamp'
-        elif field == 'title': field = 'sort'
-        elif field == 'authors': field = 'author_sort'
-        as_string = field not in ('size', 'rating', 'timestamp')
-
-        if self.first_sort:
-            subsort = True
-            self.first_sort = False
-        if self.field_metadata[field]['is_custom']:
-            if self.field_metadata[field]['datatype'] == 'series':
-                fcmp = functools.partial(self.seriescmp,
-                    self.field_metadata[field]['rec_index'],
-                    self.field_metadata.cc_series_index_column_for(field),
-                    library_order=tweaks['title_series_sorting'] == 'library_order')
-            else:
-                as_string = self.field_metadata[field]['datatype'] in ('comments', 'text')
-                field = self.field_metadata[field]['colnum']
-                fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
-                                         subsort=subsort, asstr=as_string)
-        elif field == 'series':
-            fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'],
-                                     self.FIELD_MAP['series_index'],
-                                     library_order=tweaks['title_series_sorting'] == 'library_order')
-        else:
-            fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
-                                     subsort=subsort, asstr=as_string)
-        self._map.sort(cmp=fcmp, reverse=not ascending)
-        self._map_filtered = [id for id in self._map if id in self._map_filtered]
-
-    def search(self, query, return_matches=False):
-        ans = self.search_getting_ids(query, self.search_restriction)
-        if return_matches:
-            return ans
-        self._map_filtered = ans
-
-    def search_getting_ids(self, query, search_restriction):
-        q = ''
-        if not query or not query.strip():
-            q = search_restriction
-        else:
-            q = query
-            if search_restriction:
-                q = u'%s (%s)' % (search_restriction, query)
-        if not q:
-            return list(self._map)
-        matches = sorted(self.parse(q))
-        return [id for id in self._map if id in matches]
-
-    def set_search_restriction(self, s):
-        self.search_restriction = s
+        self.multisort([(field, ascending)])
+
+    def multisort(self, fields=[], subsort=False):
+        fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields]
+        keys = self.field_metadata.field_keys()
+        fields = [x for x in fields if x[0] in keys]
+        if subsort and 'sort' not in [x[0] for x in fields]:
+            fields += [('sort', True)]
+        if not fields:
+            fields = [('timestamp', False)]
+
+        keyg = SortKeyGenerator(fields, self.field_metadata, self._data)
+        if len(fields) == 1:
+            self._map.sort(key=keyg, reverse=not fields[0][1])
+        else:
+            self._map.sort(key=keyg)
+
+        tmap = list(itertools.repeat(False, len(self._data)))
+        for x in self._map_filtered:
+            tmap[x] = True
+        self._map_filtered = [x for x in self._map if tmap[x]]
+
+class SortKey(object):
+
+    def __init__(self, orders, values):
+        self.orders, self.values = orders, values
+
+    def __cmp__(self, other):
+        for i, ascending in enumerate(self.orders):
+            ans = cmp(self.values[i], other.values[i])
+            if ans != 0:
+                return ans * ascending
+        return 0
+
+class SortKeyGenerator(object):
+
+    def __init__(self, fields, field_metadata, data):
+        self.field_metadata = field_metadata
+        self.orders = [-1 if x[1] else 1 for x in fields]
+        self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
+        self.library_order = tweaks['title_series_sorting'] == 'library_order'
+        self.data = data
+
+    def __call__(self, record):
+        values = tuple(self.itervals(self.data[record]))
+        if len(values) == 1:
+            return values[0]
+        return SortKey(self.orders, values)
+
+    def itervals(self, record):
+        for name, fm in self.entries:
+            dt = fm['datatype']
+            val = record[fm['rec_index']]
+
+            if dt == 'datetime':
+                if val is None:
+                    val = UNDEFINED_DATE
+
+            elif dt == 'series':
+                if val is None:
+                    val = ('', 1)
+                else:
+                    val = val.lower()
+                    if self.library_order:
+                        val = title_sort(val)
+                    sidx_fm = self.field_metadata[name + '_index']
+                    sidx = record[sidx_fm['rec_index']]
+                    val = (val, sidx)
+
+            elif dt in ('text', 'comments'):
+                if val is None:
+                    val = ''
+                val = val.lower()
+            yield val
+
+    # }}}
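The cmp-style comparators above are replaced by key objects: SortKeyGenerator computes one tuple of normalized values per record, and SortKey.__cmp__ walks that tuple, flipping the comparison per level. A toy illustration of the same idea (made-up data, Python 2 cmp semantics as in the file itself):

    rows = [('zebra', 2), ('apple', 2), ('apple', 1)]

    class Key(object):
        # Toy SortKey: orders[i] is 1 for ascending, -1 for descending.
        def __init__(self, orders, values):
            self.orders, self.values = orders, values
        def __cmp__(self, other):
            for i, asc in enumerate(self.orders):
                ans = cmp(self.values[i], other.values[i])
                if ans != 0:
                    return ans * asc
            return 0

    # First column descending, second ascending:
    rows.sort(key=lambda r: Key([-1, 1], r))
    # -> [('zebra', 2), ('apple', 1), ('apple', 2)]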
@@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         self.search_getting_ids = self.data.search_getting_ids
         self.refresh = functools.partial(self.data.refresh, self)
         self.sort = self.data.sort
+        self.multisort = self.data.multisort
         self.index = self.data.index
         self.refresh_ids = functools.partial(self.data.refresh_ids, self)
         self.row = self.data.row
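With that one-line delegation, multi-level sorts become available on the database object itself; presumably a caller can now write something like:

    # Hypothetical call site: series ascending, newest first within a series.
    db.multisort([('series', True), ('timestamp', False)])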
@@ -69,6 +69,8 @@ class FieldMetadata(dict):
     VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime',
                                   'int', 'float', 'bool', 'series'])

+    # Builtin metadata {{{
+
     _field_metadata = [
             ('authors',   {'table':'authors',
                            'column':'name',
@@ -287,7 +289,8 @@ class FieldMetadata(dict):
                            'search_terms':[],
                            'is_custom':False,
                            'is_category':False}),
         ]
+    # }}}

     # search labels that are not db columns
     search_items = [ 'all',
@@ -332,6 +335,9 @@ class FieldMetadata(dict):
     def keys(self):
         return self._tb_cats.keys()

+    def field_keys(self):
+        return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field']
+
     def iterkeys(self):
         for key in self._tb_cats:
             yield key
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re, os, cStringIO, operator
+import re, os, cStringIO

 import cherrypy
 try:
@@ -16,7 +16,15 @@ except ImportError:

 from calibre import fit_image, guess_type
 from calibre.utils.date import fromtimestamp
-from calibre.ebooks.metadata import title_sort
+from calibre.library.caches import SortKeyGenerator

+class CSSortKeyGenerator(SortKeyGenerator):
+
+    def __init__(self, fields, fm):
+        SortKeyGenerator.__init__(self, fields, fm, None)
+
+    def __call__(self, record):
+        return self.itervals(record).next()
+
 class ContentServer(object):

@@ -47,32 +55,12 @@ class ContentServer(object):


     def sort(self, items, field, order):
-        field = field.lower().strip()
-        if field == 'author':
-            field = 'authors'
-        if field == 'date':
-            field = 'timestamp'
+        field = self.db.data.sanitize_sort_field_name(field)
         if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'):
             raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field)
-        cmpf = cmp if field in ('rating', 'size', 'timestamp') else \
-                lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '')
-        if field == 'series':
-            items.sort(cmp=self.seriescmp, reverse=not order)
-        else:
-            lookup = 'sort' if field == 'title' else field
-            lookup = 'author_sort' if field == 'authors' else field
-            field = self.db.FIELD_MAP[lookup]
-            getter = operator.itemgetter(field)
-            items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order)
-
-    def seriescmp(self, x, y):
-        si = self.db.FIELD_MAP['series']
-        try:
-            ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
-        except AttributeError: # Some entries may be None
-            ans = cmp(x[si], y[si])
-        if ans != 0: return ans
-        return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']])
+        keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata)
+        items.sort(key=keyg, reverse=not order)

     # }}}


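CSSortKeyGenerator lets the content server reuse the library's key machinery on its in-memory record list: for a single sort field, SortKeyGenerator.itervals() yields exactly one normalized value, which the subclass returns directly as the key. A sketch of the reuse (records and field_metadata are assumed to come from the open database):

    keyg = CSSortKeyGenerator([('title', True)], field_metadata)
    records.sort(key=keyg)  # the real code passes reverse=not order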
@@ -54,7 +54,8 @@ def shorten_components_to(length, components):
             r = x[0] if x is components[-1] else ''
         else:
             if x is components[-1]:
-                b, _, e = x.rpartition('.')
+                b, e = os.path.splitext(x)
+                if e == '.': e = ''
                 r = b[:-delta]+e
                 if r.startswith('.'): r = x[0]+r
             else:
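The switch to os.path.splitext fixes names without any dot: rpartition('.') returns an empty base for them, so the old code lost the name entirely, while splitext keeps it. The added guard covers the one case where splitext reports a bare dot:

    import os.path
    'chapter'.rpartition('.')    # -> ('', '', 'chapter'): base is empty
    os.path.splitext('chapter')  # -> ('chapter', ''): base survives
    os.path.splitext('a.')       # -> ('a', '.'): hence the e == '.' guard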
src/calibre/utils/smartypants.py (new executable file, 899 lines)
@@ -0,0 +1,899 @@
#!/usr/bin/python

r"""
==============
smartypants.py
==============

----------------------------
SmartyPants ported to Python
----------------------------

Ported by `Chad Miller`_
Copyright (c) 2004, 2007 Chad Miller

original `SmartyPants`_ by `John Gruber`_
Copyright (c) 2003 John Gruber


Synopsis
========

A smart-quotes plugin for Pyblosxom_.

The original "SmartyPants" is a free web publishing plug-in for Movable Type,
Blosxom, and BBEdit that easily translates plain ASCII punctuation characters
into "smart" typographic punctuation HTML entities.

This software, *smartypants.py*, endeavours to be a functional port of
SmartyPants to Python, for use with Pyblosxom_.


Description
===========

SmartyPants can perform the following transformations:

- Straight quotes ( " and ' ) into "curly" quote HTML entities
- Backticks-style quotes (\`\`like this'') into "curly" quote HTML entities
- Dashes (``--`` and ``---``) into en- and em-dash entities
- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity

This means you can write, edit, and save your posts using plain old
ASCII straight quotes, plain dashes, and plain dots, but your published
posts (and final HTML output) will appear with smart quotes, em-dashes,
and proper ellipses.

SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to
display text where smart quotes and other "smart punctuation" would not be
appropriate, such as source code or example markup.


Backslash Escapes
=================

If you need to use literal straight quotes (or plain hyphens and
periods), SmartyPants accepts the following backslash escape sequences
to force non-smart punctuation. It does so by transforming the escape
sequence into a decimal-encoded HTML entity:

(FIXME: table here.)

.. comment It sucks that there's a disconnect between the visual layout and table markup when special characters are involved.
.. comment ======  =====  =========
.. comment Escape  Value  Character
.. comment ======  =====  =========
.. comment \\\\    &#92;  \\
.. comment \\"     &#34;  "
.. comment \\'     &#39;  '
.. comment \\.     &#46;  .
.. comment \\-     &#45;  \-
.. comment \\`     &#96;  \`
.. comment ======  =====  =========

This is useful, for example, when you want to use straight quotes as
foot and inch marks: 6'2" tall; a 17" iMac.

Options
=======

For Pyblosxom users, the ``smartypants_attributes`` attribute is where you
specify configuration options.

Numeric values are the easiest way to configure SmartyPants' behavior:

"0"
    Suppress all transformations. (Do nothing.)

"1"
    Performs default SmartyPants transformations: quotes (including
    \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash)
    is used to signify an em-dash; there is no support for en-dashes.

"2"
    Same as smarty_pants="1", except that it uses the old-school typewriter
    shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``"
    (dash dash dash) for em-dashes.

"3"
    Same as smarty_pants="2", but inverts the shorthand for dashes:
    "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for
    en-dashes.

"-1"
    Stupefy mode. Reverses the SmartyPants transformation process, turning
    the HTML entities produced by SmartyPants into their ASCII equivalents.
    E.g. "&#8220;" is turned into a simple double-quote ("), "&#8212;" is
    turned into two dashes, etc.


The following single-character attribute values can be combined to toggle
individual transformations from within the smarty_pants attribute. For
example, to educate normal quotes and em-dashes, but not ellipses or
\`\`backticks'' -style quotes:

``py['smartypants_attributes'] = "1"``

"q"
    Educates normal quote characters: (") and (').

"b"
    Educates \`\`backticks'' -style double quotes.

"B"
    Educates \`\`backticks'' -style double quotes and \`single' quotes.

"d"
    Educates em-dashes.

"D"
    Educates em-dashes and en-dashes, using old-school typewriter shorthand:
    (dash dash) for en-dashes, (dash dash dash) for em-dashes.

"i"
    Educates em-dashes and en-dashes, using inverted old-school typewriter
    shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes.

"e"
    Educates ellipses.

"w"
    Translates any instance of ``&quot;`` into a normal double-quote character.
    This should be of no interest to most people, but of particular interest
    to anyone who writes their posts using Dreamweaver, as Dreamweaver
    inexplicably uses this entity to represent a literal double-quote
    character. SmartyPants only educates normal quotes, not entities (because
    ordinarily, entities are used for the explicit purpose of representing the
    specific character they represent). The "w" option must be used in
    conjunction with one (or both) of the other quote options ("q" or "b").
    Thus, if you wish to apply all SmartyPants transformations (quotes, en-
    and em-dashes, and ellipses) and also translate ``&quot;`` entities into
    regular quotes so SmartyPants can educate them, you should pass the
    following to the smarty_pants attribute:

The ``smartypants_forbidden_flavours`` list contains pyblosxom flavours for
which no Smarty Pants rendering will occur.


Caveats
=======

Why You Might Not Want to Use Smart Quotes in Your Weblog
---------------------------------------------------------

For one thing, you might not care.

Most normal, mentally stable individuals do not take notice of proper
typographic punctuation. Many design and typography nerds, however, break
out in a nasty rash when they encounter, say, a restaurant sign that uses
a straight apostrophe to spell "Joe's".

If you're the sort of person who just doesn't care, you might well want to
continue not caring. Using straight quotes -- and sticking to the 7-bit
ASCII character set in general -- is certainly a simpler way to live.

Even if you *do* care about accurate typography, you still might want to
think twice before educating the quote characters in your weblog. One side
effect of publishing curly quote HTML entities is that it makes your
weblog a bit harder for others to quote from using copy-and-paste. What
happens is that when someone copies text from your blog, the copied text
contains the 8-bit curly quote characters (as well as the 8-bit characters
for em-dashes and ellipses, if you use these options). These characters
are not standard across different text encoding methods, which is why they
need to be encoded as HTML entities.

People copying text from your weblog, however, may not notice that you're
using curly quotes, and they'll go ahead and paste the unencoded 8-bit
characters copied from their browser into an email message or their own
weblog. When pasted as raw "smart quotes", these characters are likely to
get mangled beyond recognition.

That said, my own opinion is that any decent text editor or email client
makes it easy to stupefy smart quote characters into their 7-bit
equivalents, and I don't consider it my problem if you're using an
indecent text editor or email client.


Algorithmic Shortcomings
------------------------

One situation in which quotes will get curled the wrong way is when
apostrophes are used at the start of leading contractions. For example:

``'Twas the night before Christmas.``

In the case above, SmartyPants will turn the apostrophe into an opening
single-quote, when in fact it should be a closing one. I don't think
this problem can be solved in the general case -- every word processor
I've tried gets this wrong as well. In such cases, it's best to use the
proper HTML entity for closing single-quotes (``&#8217;``) by hand.


Bugs
====

To file bug reports or feature requests (other than topics listed in the
Caveats section above) please send email to: mailto:smartypantspy@chad.org

If the bug involves quotes being curled the wrong way, please send example
text to illustrate.

To Do list
----------

- Provide a function for use within templates to quote anything at all.


Version History
===============

1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400
    - Fixed bug where blocks of precious unalterable text were instead
      interpreted. Thanks to Le Roux and Dirk van Oosterbosch.

1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400
    - Fix bogus magical quotation when there is no hint that the
      user wants it, e.g., in "21st century". Thanks to Nathan Hamblen.
    - Be smarter about quotes before terminating numbers in an en-dash'ed
      range.

1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500
    - Fix a date-processing bug, as reported by jacob childress.
    - Begin a test-suite for ensuring correct output.
    - Removed import of "string", since I didn't really need it.
      (This was my first ever Python program. Sue me!)

1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400
    - Abort processing if the flavour is in forbidden-list. Default of
      [ "rss" ] (Idea of Wolfgang SCHNERRING.)
    - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING.

1.5_1.2: Mon, 24 May 2004 08:14:54 -0400
    - Some single quotes weren't replaced properly. Diff-tesuji played
      by Benjamin GEIGER.

1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500
    - Support upcoming pyblosxom 0.9 plugin verification feature.

1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500
    - Initial release

Version Information
-------------------

Version numbers will track the SmartyPants_ version numbers, with the addition
of an underscore and the smartypants.py version on the end.

New versions will be available at `http://wiki.chad.org/SmartyPantsPy`_

.. _http://wiki.chad.org/SmartyPantsPy: http://wiki.chad.org/SmartyPantsPy

Authors
=======

`John Gruber`_ did all of the hard work of writing this software in Perl for
`Movable Type`_ and almost all of this useful documentation. `Chad Miller`_
ported it to Python to use with Pyblosxom_.


Additional Credits
==================

Portions of the SmartyPants original work are based on Brad Choate's nifty
MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to
this plug-in. Brad Choate is a fine hacker indeed.

`Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta
testing of the original SmartyPants.

`Rael Dornfest`_ ported SmartyPants to Blosxom.

.. _Brad Choate: http://bradchoate.com/
.. _Jeremy Hedley: http://antipixel.com/
.. _Charles Wiltgen: http://playbacktime.com/
.. _Rael Dornfest: http://raelity.org/


Copyright and License
=====================

SmartyPants_ license::

    Copyright (c) 2003 John Gruber
    (http://daringfireball.net/)
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:

    *   Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.

    *   Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in
        the documentation and/or other materials provided with the
        distribution.

    *   Neither the name "SmartyPants" nor the names of its contributors
        may be used to endorse or promote products derived from this
        software without specific prior written permission.

    This software is provided by the copyright holders and contributors "as
    is" and any express or implied warranties, including, but not limited
    to, the implied warranties of merchantability and fitness for a
    particular purpose are disclaimed. In no event shall the copyright
    owner or contributors be liable for any direct, indirect, incidental,
    special, exemplary, or consequential damages (including, but not
    limited to, procurement of substitute goods or services; loss of use,
    data, or profits; or business interruption) however caused and on any
    theory of liability, whether in contract, strict liability, or tort
    (including negligence or otherwise) arising in any way out of the use
    of this software, even if advised of the possibility of such damage.


smartypants.py license::

    smartypants.py is a derivative work of SmartyPants.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:

    *   Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.

    *   Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in
        the documentation and/or other materials provided with the
        distribution.

    This software is provided by the copyright holders and contributors "as
    is" and any express or implied warranties, including, but not limited
    to, the implied warranties of merchantability and fitness for a
    particular purpose are disclaimed. In no event shall the copyright
    owner or contributors be liable for any direct, indirect, incidental,
    special, exemplary, or consequential damages (including, but not
    limited to, procurement of substitute goods or services; loss of use,
    data, or profits; or business interruption) however caused and on any
    theory of liability, whether in contract, strict liability, or tort
    (including negligence or otherwise) arising in any way out of the use
    of this software, even if advised of the possibility of such damage.



.. _John Gruber: http://daringfireball.net/
.. _Chad Miller: http://web.chad.org/

.. _Pyblosxom: http://roughingit.subtlehints.net/pyblosxom
.. _SmartyPants: http://daringfireball.net/projects/smartypants/
.. _Movable Type: http://www.movabletype.org/

"""

default_smartypants_attr = "1"

import re

tags_to_skip_regex = re.compile(r"<(/)?(pre|code|kbd|script|math)[^>]*>", re.I)


def verify_installation(request):
    return 1
    # assert the plugin is functional


def cb_story(args):
    global default_smartypants_attr

    try:
        forbidden_flavours = args["entry"]["smartypants_forbidden_flavours"]
    except KeyError:
        forbidden_flavours = [ "rss" ]

    try:
        attributes = args["entry"]["smartypants_attributes"]
    except KeyError:
        attributes = default_smartypants_attr

    if attributes is None:
        attributes = default_smartypants_attr

    entryData = args["entry"].getData()

    try:
        if args["request"]["flavour"] in forbidden_flavours:
            return
    except KeyError:
        if "<" in args["entry"]["body"][0:15]: # sniff the stream
            return # abort if it looks like escaped HTML. FIXME

    # FIXME: make these configurable, perhaps?
    args["entry"]["body"] = smartyPants(entryData, attributes)
    args["entry"]["title"] = smartyPants(args["entry"]["title"], attributes)


### internal functions below here

def smartyPants(text, attr=default_smartypants_attr):
    convert_quot = False  # should we translate &quot; entities into normal quotes?

    # Parse attributes:
    # 0 : do nothing
    # 1 : set all
    # 2 : set all, using old school en- and em- dash shortcuts
    # 3 : set all, using inverted old school en and em- dash shortcuts
    #
    # q : quotes
    # b : backtick quotes (``double'' only)
    # B : backtick quotes (``double'' and `single')
    # d : dashes
    # D : old school dashes
    # i : inverted old school dashes
    # e : ellipses
    # w : convert &quot; entities to " for Dreamweaver users

    skipped_tag_stack = []
    do_dashes = "0"
    do_backticks = "0"
    do_quotes = "0"
    do_ellipses = "0"
    do_stupefy = "0"

    if attr == "0":
        # Do nothing.
        return text
    elif attr == "1":
        do_quotes = "1"
        do_backticks = "1"
        do_dashes = "1"
        do_ellipses = "1"
    elif attr == "2":
        # Do everything, turn all options on, use old school dash shorthand.
        do_quotes = "1"
        do_backticks = "1"
        do_dashes = "2"
        do_ellipses = "1"
    elif attr == "3":
        # Do everything, turn all options on, use inverted old school dash shorthand.
        do_quotes = "1"
        do_backticks = "1"
        do_dashes = "3"
        do_ellipses = "1"
    elif attr == "-1":
        # Special "stupefy" mode.
        do_stupefy = "1"
    else:
        for c in attr:
            if c == "q": do_quotes = "1"
            elif c == "b": do_backticks = "1"
            elif c == "B": do_backticks = "2"
            elif c == "d": do_dashes = "1"
            elif c == "D": do_dashes = "2"
            elif c == "i": do_dashes = "3"
            elif c == "e": do_ellipses = "1"
            elif c == "w": convert_quot = "1"
            else:
                pass
                # ignore unknown option

    tokens = _tokenize(text)
    result = []
    in_pre = False

    prev_token_last_char = ""
    # This is a cheat, used to get some context
    # for one-character tokens that consist of
    # just a quote char. What we do is remember
    # the last character of the previous text
    # token, to use as context to curl single-
    # character quote tokens correctly.

    for cur_token in tokens:
        if cur_token[0] == "tag":
            # Don't mess with quotes inside some tags. This does not handle self <closing/> tags!
            result.append(cur_token[1])
            skip_match = tags_to_skip_regex.match(cur_token[1])
            if skip_match is not None:
                if not skip_match.group(1):
                    skipped_tag_stack.append(skip_match.group(2).lower())
                    in_pre = True
                else:
                    if len(skipped_tag_stack) > 0:
                        if skip_match.group(2).lower() == skipped_tag_stack[-1]:
                            skipped_tag_stack.pop()
                        else:
                            pass
                            # This close doesn't match the open. This isn't XHTML. We should barf here.
                    if len(skipped_tag_stack) == 0:
                        in_pre = False
        else:
            t = cur_token[1]
            last_char = t[-1:]  # Remember last char of this token before processing.
            if not in_pre:
                t = processEscapes(t)

                if convert_quot != "0":
                    t = re.sub('&quot;', '"', t)

                if do_dashes != "0":
                    if do_dashes == "1":
                        t = educateDashes(t)
                    if do_dashes == "2":
                        t = educateDashesOldSchool(t)
                    if do_dashes == "3":
                        t = educateDashesOldSchoolInverted(t)

                if do_ellipses != "0":
                    t = educateEllipses(t)

                # Note: backticks need to be processed before quotes.
                if do_backticks != "0":
                    t = educateBackticks(t)

                if do_backticks == "2":
                    t = educateSingleBackticks(t)

                if do_quotes != "0":
                    if t == "'":
                        # Special case: single-character ' token
                        if re.match("\S", prev_token_last_char):
                            t = "&#8217;"
                        else:
                            t = "&#8216;"
                    elif t == '"':
                        # Special case: single-character " token
                        if re.match("\S", prev_token_last_char):
                            t = "&#8221;"
                        else:
                            t = "&#8220;"

                    else:
                        # Normal case:
                        t = educateQuotes(t)

                if do_stupefy == "1":
                    t = stupefyEntities(t)

            prev_token_last_char = last_char
            result.append(t)

    return "".join(result)

def educateQuotes(str):
    """
    Parameter:  String.

    Returns:    The string, with "educated" curly quote HTML entities.

    Example input:  "Isn't this fun?"
    Example output: &#8220;Isn&#8217;t this fun?&#8221;
    """

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break. Close the quotes by brute force:
    str = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""&#8217;""", str)
    str = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""&#8221;""", str)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    str = re.sub(r""""'(?=\w)""", """&#8220;&#8216;""", str)
    str = re.sub(r"""'"(?=\w)""", """&#8216;&#8220;""", str)

    # Special case for decade abbreviations (the '80s):
    str = re.sub(r"""\b'(?=\d{2}s)""", r"""&#8217;""", str)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    dec_dashes = r"""&#8211;|&#8212;"""

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(r"""
            (
                \s          |   # a whitespace char, or
                &nbsp;      |   # a non-breaking space entity, or
                --          |   # dashes, or
                &[mn]dash;  |   # named dash entities
                %s          |   # or decimal entities
                &\#x201[34];    # or hex
            )
            '                   # the quote
            (?=\w)              # followed by a word character
            """ % (dec_dashes,), re.VERBOSE)
    str = opening_single_quotes_regex.sub(r"""\1&#8216;""", str)

    closing_single_quotes_regex = re.compile(r"""
            (%s)
            '
            (?!\s | s\b | \d)
            """ % (close_class,), re.VERBOSE)
    str = closing_single_quotes_regex.sub(r"""\1&#8217;""", str)

    closing_single_quotes_regex = re.compile(r"""
            (%s)
            '
            (\s | s\b)
            """ % (close_class,), re.VERBOSE)
    str = closing_single_quotes_regex.sub(r"""\1&#8217;\2""", str)

    # Any remaining single quotes should be opening ones:
    str = re.sub(r"""'""", r"""&#8216;""", str)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(r"""
            (
                \s          |   # a whitespace char, or
                &nbsp;      |   # a non-breaking space entity, or
                --          |   # dashes, or
                &[mn]dash;  |   # named dash entities
                %s          |   # or decimal entities
                &\#x201[34];    # or hex
            )
            "                   # the quote
            (?=\w)              # followed by a word character
            """ % (dec_dashes,), re.VERBOSE)
    str = opening_double_quotes_regex.sub(r"""\1&#8220;""", str)

    # Double closing quotes:
    closing_double_quotes_regex = re.compile(r"""
            #(%s)?  # character that indicates the quote should be closing
            "
            (?=\s)
            """ % (close_class,), re.VERBOSE)
    str = closing_double_quotes_regex.sub(r"""&#8221;""", str)

    closing_double_quotes_regex = re.compile(r"""
            (%s)    # character that indicates the quote should be closing
            "
            """ % (close_class,), re.VERBOSE)
    str = closing_double_quotes_regex.sub(r"""\1&#8221;""", str)

    # Any remaining quotes should be opening ones.
    str = re.sub(r'"', r"""&#8220;""", str)

    return str

def educateBackticks(str):
    """
    Parameter:  String.
    Returns:    The string, with ``backticks'' -style double quotes
                translated into HTML curly quote entities.
    Example input:  ``Isn't this fun?''
    Example output: &#8220;Isn't this fun?&#8221;
    """

    str = re.sub(r"""``""", r"""&#8220;""", str)
    str = re.sub(r"""''""", r"""&#8221;""", str)
    return str


def educateSingleBackticks(str):
    """
    Parameter:  String.
    Returns:    The string, with `backticks' -style single quotes
                translated into HTML curly quote entities.

    Example input:  `Isn't this fun?'
    Example output: &#8216;Isn&#8217;t this fun?&#8217;
    """

    str = re.sub(r"""`""", r"""&#8216;""", str)
    str = re.sub(r"""'""", r"""&#8217;""", str)
    return str


def educateDashes(str):
    """
    Parameter:  String.

    Returns:    The string, with each instance of "--" translated to
                an em-dash HTML entity.
    """

    str = re.sub(r"""---""", r"""&#8211;""", str)  # en (yes, backwards)
    str = re.sub(r"""--""", r"""&#8212;""", str)   # em (yes, backwards)
    return str


def educateDashesOldSchool(str):
    """
    Parameter:  String.

    Returns:    The string, with each instance of "--" translated to
                an en-dash HTML entity, and each "---" translated to
                an em-dash HTML entity.
    """

    str = re.sub(r"""---""", r"""&#8212;""", str)  # em (yes, backwards)
    str = re.sub(r"""--""", r"""&#8211;""", str)   # en (yes, backwards)
    return str


def educateDashesOldSchoolInverted(str):
    """
    Parameter:  String.

    Returns:    The string, with each instance of "--" translated to
                an em-dash HTML entity, and each "---" translated to
                an en-dash HTML entity. Two reasons why: First, unlike the
                en- and em-dash syntax supported by
                EducateDashesOldSchool(), it's compatible with existing
                entries written before SmartyPants 1.1, back when "--" was
                only used for em-dashes. Second, em-dashes are more
                common than en-dashes, and so it sort of makes sense that
                the shortcut should be shorter to type. (Thanks to Aaron
                Swartz for the idea.)
    """
    str = re.sub(r"""---""", r"""&#8211;""", str)  # em
    str = re.sub(r"""--""", r"""&#8212;""", str)   # en
    return str



def educateEllipses(str):
    """
    Parameter:  String.
    Returns:    The string, with each instance of "..." translated to
                an ellipsis HTML entity.

    Example input:  Huh...?
    Example output: Huh&#8230;?
    """

    str = re.sub(r"""\.\.\.""", r"""&#8230;""", str)
    str = re.sub(r"""\. \. \.""", r"""&#8230;""", str)
    return str


def stupefyEntities(str):
    """
    Parameter:  String.
    Returns:    The string, with each SmartyPants HTML entity translated to
                its ASCII counterpart.

    Example input:  &#8220;Hello &#8212; world.&#8221;
    Example output: "Hello -- world."
    """

    str = re.sub(r"""&#8211;""", r"""-""", str)   # en-dash
    str = re.sub(r"""&#8212;""", r"""--""", str)  # em-dash

    str = re.sub(r"""&#8216;""", r"""'""", str)   # open single quote
    str = re.sub(r"""&#8217;""", r"""'""", str)   # close single quote

    str = re.sub(r"""&#8220;""", r'''"''', str)   # open double quote
    str = re.sub(r"""&#8221;""", r'''"''', str)   # close double quote

    str = re.sub(r"""&#8230;""", r"""...""", str) # ellipsis

    return str


def processEscapes(str):
    r"""
    Parameter:  String.
    Returns:    The string, after processing the following backslash
                escape sequences. This is useful if you want to force a "dumb"
                quote or other character to appear.

                Escape  Value
                ------  -----
                \\      &#92;
                \"      &#34;
                \'      &#39;
                \.      &#46;
                \-      &#45;
                \`      &#96;
    """
    str = re.sub(r"""\\\\""", r"""&#92;""", str)
    str = re.sub(r'''\\"''', r"""&#34;""", str)
    str = re.sub(r"""\\'""", r"""&#39;""", str)
    str = re.sub(r"""\\\.""", r"""&#46;""", str)
    str = re.sub(r"""\\-""", r"""&#45;""", str)
    str = re.sub(r"""\\`""", r"""&#96;""", str)

    return str

def _tokenize(str):
    """
    Parameter:  String containing HTML markup.
    Returns:    Reference to an array of the tokens comprising the input
                string. Each token is either a tag (possibly with nested,
                tags contained therein, such as <a href="<MTFoo>">, or a
                run of text between tags. Each element of the array is a
                two-element array; the first is either 'tag' or 'text';
                the second is the actual value.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
        <http://www.bradchoate.com/past/mtregex.php>
    """

    tokens = []

    #depth = 6
    #nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth)
    #match = r"""(?: <! ( -- .*? -- \s* )+ > ) |  # comments
    #        (?: <\? .*? \?> ) |                  # directives
    #        %s  # nested tags """ % (nested_tags,)
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

    token_match = tag_soup.search(str)

    previous_end = 0
    while token_match is not None:
        if token_match.group(1):
            tokens.append(['text', token_match.group(1)])

        tokens.append(['tag', token_match.group(2)])

        previous_end = token_match.end()
        token_match = tag_soup.search(str, token_match.end())

    if previous_end < len(str):
        tokens.append(['text', str[previous_end:]])

    return tokens



if __name__ == "__main__":

    import locale

    try:
        locale.setlocale(locale.LC_ALL, '')
    except:
        pass

    from docutils.core import publish_string
    docstring_html = publish_string(__doc__, writer_name='html')

    print docstring_html


    # Unit test output goes out stderr. No worries.
    import unittest
    sp = smartyPants

    class TestSmartypantsAllAttributes(unittest.TestCase):
        # the default attribute is "1", which means "all".

        def test_dates(self):
            self.assertEqual(sp("1440-80's"), "1440-80&#8217;s")
            self.assertEqual(sp("1440-'80s"), "1440-&#8216;80s")
            self.assertEqual(sp("1440---'80s"), "1440&#8211;&#8216;80s")
            self.assertEqual(sp("1960s"), "1960s")  # no effect.
            self.assertEqual(sp("1960's"), "1960&#8217;s")
            self.assertEqual(sp("one two '60s"), "one two &#8216;60s")
            self.assertEqual(sp("'60s"), "&#8216;60s")

        def test_skip_tags(self):
            self.assertEqual(
                sp("""<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>"""),
                """<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>""")
            self.assertEqual(
                sp("""<p>He said "Let's write some code." This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>"""),
                """<p>He said &#8220;Let&#8217;s write some code.&#8221; This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>""")


        def test_ordinal_numbers(self):
            self.assertEqual(sp("21st century"), "21st century")  # no effect.
            self.assertEqual(sp("3rd"), "3rd")  # no effect.

        def test_educated_quotes(self):
            self.assertEqual(sp('''"Isn't this fun?"'''), '''&#8220;Isn&#8217;t this fun?&#8221;''')

    unittest.main()




__author__ = "Chad Miller <smartypantspy@chad.org>"
__version__ = "1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400"
__url__ = "http://wiki.chad.org/SmartyPantsPy"
__description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom"
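A minimal usage sketch for the module as bundled above (the call form comes from the file; the input string is illustrative):

    from calibre.utils.smartypants import smartyPants

    text = 'He said, "It\'s fine -- really..."'
    print smartyPants(text, "1")  # default mode: quotes, em-dashes, ellipses
    # -> curly quotes, an em-dash and an ellipsis as decimal HTML entities,
    #    e.g. &#8220; &#8217; &#8212; &#8230;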
@@ -165,7 +165,9 @@ class Feed(object):
                 if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                     self.articles.append(article)
                 else:
-                    self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+                    t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple())
+                    self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
+                            (title, t, self.title))
             d = item.get('date', '')
             article.formatted_date = d