diff --git a/resources/content_server/gui.js b/resources/content_server/gui.js
index 631fb8b617..d0fb49cc8e 100644
--- a/resources/content_server/gui.js
+++ b/resources/content_server/gui.js
@@ -26,7 +26,7 @@ var current_library_request = null;
////////////////////////////// GET BOOK LIST //////////////////////////////
-var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds
+var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds
function create_table_headers() {
var thead = $('table#book_list thead tr');
diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py
index 66ee4d1471..71bf2c6c37 100644
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@@ -114,3 +114,11 @@ add_new_book_tags_when_importing_books = False
# Set the maximum number of tags to show per book in the content server
max_content_server_tags_shown=5
+
+# Set the maximum number of sort 'levels' that calibre will use to re-sort the
+# library after certain operations such as searches or device insertion. Each
+# sort level adds a performance penalty. If the database is large (thousands of
+# books) the penalty might be noticeable. If you are not concerned about
+# multi-level sorts and you are seeing a slowdown, reduce the value of this tweak.
+maximum_resort_levels = 5
+
diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe
index cda9bf83d2..b7f9cd3c6c 100644
--- a/resources/recipes/infobae.recipe
+++ b/resources/recipes/infobae.recipe
@@ -1,12 +1,8 @@
-#!/usr/bin/env python
-
__license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic '
+__copyright__ = '2008-2010, Darko Miletic '
'''
infobae.com
'''
-import re
-import urllib, urlparse
from calibre.web.feeds.news import BasicNewsRecipe
@@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
- language = 'es'
- lang = 'es-AR'
-
+ language = 'es'
encoding = 'cp1252'
- cover_url = 'http://www.infobae.com/imgs/header/header.gif'
+ masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
remove_javascript = True
- preprocess_regexps = [(re.compile(
- r''), lambda m:'')]
-
-
- html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
- extra_css = '''
- .col-center{font-family:Arial,Helvetica,sans-serif;}
- h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
- .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
- '''
-
- keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-
-
- remove_tags = [
- dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
- dict(name='a', attrs={'name' : 'comentario',}),
- dict(name='iframe'),
- dict(name='img', alt = "Ver galerias de imagenes"),
-
- ]
-
+ remove_empty_feeds = True
+ extra_css = '''
+ body{font-family:Arial,Helvetica,sans-serif;}
+ .popUpTitulo{color:#0D4261; font-size: xx-large}
+ '''
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ , 'linearize_tables' : True
+ }
+
feeds = [
(u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
]
-# def print_version(self, url):
-# main, sep, article_part = url.partition('contenidos/')
-# article_id, rsep, rrest = article_part.partition('-')
-# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
- def get_article_url(self, article):
- ans = article.get('link').encode('utf-8')
- parts = list(urlparse.urlparse(ans))
- parts[2] = urllib.quote(parts[2])
- ans = urlparse.urlunparse(parts)
- return ans.decode('utf-8')
-
-
- def preprocess_html(self, soup):
-
- for tag in soup.head.findAll('strong'):
- tag.extract()
- for tag in soup.findAll('meta'):
- del tag['content']
- tag.extract()
-
- mtag = '\n\n'
- soup.head.insert(0,mtag)
- for item in soup.findAll(style=True):
- del item['style']
-
- return soup
+ def print_version(self, url):
+ article_part = url.rpartition('/')[2]
+ article_id= article_part.partition('-')[0]
+ return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
def postprocess_html(self, soup, first):
-
for tag in soup.findAll(name='strong'):
tag.name = 'b'
-
return soup
diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe
index 13ff42b277..58b782415b 100644
--- a/resources/recipes/nspm.recipe
+++ b/resources/recipes/nspm.recipe
@@ -6,6 +6,7 @@ nspm.rs
import re
from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import NavigableString
class Nspm(BasicNewsRecipe):
title = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
encoding = 'utf-8'
language = 'sr'
delay = 2
+ remove_empty_feeds = True
publication_type = 'magazine'
masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
dict(name=['link','object','embed','script','meta','base','iframe'])
,dict(attrs={'class':'buttonheading'})
]
- remove_tags_after = dict(attrs={'class':'article_separator'})
- remove_attributes = ['width','height']
+ remove_tags_before = dict(attrs={'class':'contentheading'})
+ remove_tags_after = dict(attrs={'class':'article_separator'})
+ remove_attributes = ['width','height']
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe):
def preprocess_html(self, soup):
for item in soup.body.findAll(style=True):
del item['style']
+ for item in soup.body.findAll('h1'):
+ nh = NavigableString(item.a.string)
+ item.a.extract()
+ item.insert(0,nh)
return self.adeify_images(soup)
diff --git a/resources/recipes/xkcd.recipe b/resources/recipes/xkcd.recipe
index 312027004e..ad0d420deb 100644
--- a/resources/recipes/xkcd.recipe
+++ b/resources/recipes/xkcd.recipe
@@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe):
(re.compile(r'()'),
lambda m: '%s%s%s
' % (m.group(1), m.group(3), m.group(2)))
]
-
+
def parse_index(self):
INDEX = 'http://xkcd.com/archive/'
- soup = self.index_to_soup(INDEX)
+ soup = self.index_to_soup(INDEX)
articles = []
for item in soup.findAll('a', title=True):
articles.append({
'date': item['title'],
'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
'url': 'http://xkcd.com' + item['href'],
- 'title': self.tag_to_string(item).encode('UTF-8'),
+ 'title': self.tag_to_string(item),
'description': '',
'content': '',
})
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 4c87236e71..68df832048 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
from calibre.devices.edge.driver import EDGE
-from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS
+from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
@@ -557,6 +557,7 @@ plugins += [
TECLAST_K3,
NEWSMY,
IPAPYRUS,
+ SOVOS,
EDGE,
SNE,
ALEX,
diff --git a/src/calibre/devices/kobo/books.py b/src/calibre/devices/kobo/books.py
index 9da99d75c8..496162d668 100644
--- a/src/calibre/devices/kobo/books.py
+++ b/src/calibre/devices/kobo/books.py
@@ -44,16 +44,17 @@ class Book(MetaInformation):
self.mime = mime
self.size = size # will be set later if None
- try:
- if ContentType == '6':
- self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
- else:
- self.datetime = time.gmtime(os.path.getctime(self.path))
- except:
- self.datetime = time.gmtime()
- if thumbnail_name is not None:
- self.thumbnail = ImageWrapper(thumbnail_name)
+ if ContentType == '6':
+ self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
+ else:
+ try:
+ self.datetime = time.gmtime(os.path.getctime(self.path))
+ except:
+ self.datetime = time.gmtime()
+
+ if thumbnail_name is not None:
+ self.thumbnail = ImageWrapper(thumbnail_name)
self.tags = []
if other:
self.smart_update(other)
diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py
index 5e1c752c76..f24e00143b 100644
--- a/src/calibre/devices/kobo/driver.py
+++ b/src/calibre/devices/kobo/driver.py
@@ -106,11 +106,14 @@ class KOBO(USBMS):
changed = True
bl[idx].device_collections = playlist_map.get(lpath, [])
else:
- book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
+ if ContentType == '6':
+ book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576)
+ else:
+ book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
# print 'Update booklist'
+ book.device_collections = playlist_map.get(book.lpath, [])
if bl.add_book(book, replace_metadata=False):
changed = True
- book.device_collections = playlist_map.get(book.lpath, [])
except: # Probably a path encoding error
import traceback
traceback.print_exc()
@@ -231,21 +234,9 @@ class KOBO(USBMS):
path = self.normalize_path(path)
# print "Delete file normalized path: " + path
extension = os.path.splitext(path)[1]
-
- if extension == '.kobo':
- # Kobo books do not have book files. They do have some images though
- #print "kobo book"
- ContentType = 6
- ContentID = self.contentid_from_path(path, ContentType)
- elif extension == '.pdf' or extension == '.epub':
- # print "ePub or pdf"
- ContentType = 16
- #print "Path: " + path
- ContentID = self.contentid_from_path(path, ContentType)
- # print "ContentID: " + ContentID
- else: # if extension == '.html' or extension == '.txt':
- ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored
- ContentID = self.contentid_from_path(path, ContentType)
+ ContentType = self.get_content_type_from_extension(extension)
+
+ ContentID = self.contentid_from_path(path, ContentType)
ImageID = self.delete_via_sql(ContentID, ContentType)
#print " We would now delete the Images for" + ImageID
@@ -343,6 +334,17 @@ class KOBO(USBMS):
ContentID = ContentID.replace("\\", '/')
return ContentID
+ def get_content_type_from_extension(self, extension):
+ if extension == '.kobo':
+ # Kobo books do not have book files. They do have some images though
+ #print "kobo book"
+ ContentType = 6
+ elif extension == '.pdf' or extension == '.epub':
+ # print "ePub or pdf"
+ ContentType = 16
+ else: # if extension == '.html' or extension == '.txt':
+ ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored
+ return ContentType
def path_from_contentid(self, ContentID, ContentType, oncard):
path = ContentID
diff --git a/src/calibre/devices/teclast/driver.py b/src/calibre/devices/teclast/driver.py
index 0c60a367cf..2055ff9306 100644
--- a/src/calibre/devices/teclast/driver.py
+++ b/src/calibre/devices/teclast/driver.py
@@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3):
VENDOR_NAME = 'E_READER'
WINDOWS_MAIN_MEM = ''
+class SOVOS(TECLAST_K3):
+
+ name = 'Sovos device interface'
+ gui_name = 'Sovos'
+ description = _('Communicate with the Sovos reader.')
+
+ FORMATS = ['epub', 'fb2', 'pdf', 'txt']
+
+ VENDOR_NAME = 'RK28XX'
+ WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'
+
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index 67a2d36607..831c16bf6a 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -132,7 +132,11 @@ class CHMReader(CHMFile):
for path in self.Contents():
lpath = os.path.join(output_dir, path)
self._ensure_dir(lpath)
- data = self.GetFile(path)
+ try:
+ data = self.GetFile(path)
+ except:
+ self.log.exception('Failed to extract %s from CHM, ignoring'%path)
+ continue
if lpath.find(';') != -1:
# fix file names with ";" at the end, see _reformat()
lpath = lpath.split(';')[0]
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 7439718cf6..2ef633d0bb 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -122,7 +122,7 @@ def add_pipeline_options(parser, plumber):
'font_size_mapping',
'line_height',
'linearize_tables',
- 'extra_css',
+ 'extra_css', 'smarten_punctuation',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 24b35f804f..16282dd28d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -362,6 +362,14 @@ OptionRecommendation(name='preprocess_html',
)
),
+OptionRecommendation(name='smarten_punctuation',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Convert plain quotes, dashes and ellipsis to their '
+ 'typographically correct equivalents. For details, see '
+ 'http://daringfireball.net/projects/smartypants'
+ )
+ ),
+
OptionRecommendation(name='remove_header',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Use a regular expression to try and remove the header.'
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f7b803974f..4538af96c4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -75,6 +75,8 @@ def line_length(format, raw, percent):
linere = re.compile('(?<=)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
+ elif format == 'spanned_html':
+ linere = re.compile('(?<=)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -166,6 +168,17 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*()*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*()*\s*u', re.UNICODE), lambda match: u'ù'),
(re.compile(u'`\s*()*\s*U', re.UNICODE), lambda match: u'Ù'),
+ # ` with letter before
+ (re.compile(u'a\s*()*\s*`', re.UNICODE), lambda match: u'à'),
+ (re.compile(u'A\s*()*\s*`', re.UNICODE), lambda match: u'À'),
+ (re.compile(u'e\s*()*\s*`', re.UNICODE), lambda match: u'è'),
+ (re.compile(u'E\s*()*\s*`', re.UNICODE), lambda match: u'È'),
+ (re.compile(u'i\s*()*\s*`', re.UNICODE), lambda match: u'ì'),
+ (re.compile(u'I\s*()*\s*`', re.UNICODE), lambda match: u'Ì'),
+ (re.compile(u'o\s*()*\s*`', re.UNICODE), lambda match: u'ò'),
+ (re.compile(u'O\s*()*\s*`', re.UNICODE), lambda match: u'Ò'),
+ (re.compile(u'u\s*()*\s*`', re.UNICODE), lambda match: u'ù'),
+ (re.compile(u'U\s*()*\s*`', re.UNICODE), lambda match: u'Ù'),
# ´
(re.compile(u'´\s*()*\s*a', re.UNICODE), lambda match: u'á'),
@@ -208,35 +221,34 @@ class HTMLPreProcessor(object):
(re.compile(u'˛\s*()*\s*A', re.UNICODE), lambda match: u'Ą'),
(re.compile(u'˛\s*()*\s*e', re.UNICODE), lambda match: u'ę'),
(re.compile(u'˛\s*()*\s*E', re.UNICODE), lambda match: u'Ę'),
-
+
# ˙
(re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'),
(re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
+
+ # If pdf printed from a browser then the header/footer has a reliable pattern
+ (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
+
+ # Center separator lines
+ (re.compile(u'
\s*(?P([*#•]+\s*)+)\s*
'), lambda match: '\n
' + match.group(1) + '
'),
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
- (re.compile(r'', re.IGNORECASE), lambda match: '
'),
- # Replace
with
- (re.compile(r'\s*', re.IGNORECASE), lambda match: ''),
-
- # Remove hyphenation
- (re.compile(r'-\n\r?'), lambda match: ''),
+ (re.compile(r'', re.IGNORECASE), lambda match: '
'),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?((i|b)>((i|b)>)?)?)?(br|p)[^>]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'
\s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*([ibu]>){0,2})\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*
)?', re.IGNORECASE), chap_head),
+ # Cover the case where every letter in a chapter title is separated by a space
+ (re.compile(r'
\s*(?P([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*(
))?'), chap_head),
# Have paragraphs show better
(re.compile(r''), lambda match : ''),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Connect paragraphs split by -
- (re.compile(u'(?<=[^\s][-–])[\s]*(
)*[\s]*()*\s*(?=[^\s])'), lambda match: ''),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
(re.compile(r'(?=\w)'), lambda match: ' '),
@@ -317,12 +329,29 @@ class HTMLPreProcessor(object):
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
+ # unwrap hyphenation - moved here so it's executed after header/footer removal
+ if is_pdftohtml:
+ # unwrap visible dashes and hyphens - don't delete them, as they are often hyphens
+ # for compound words, formatting, etc
+ end_rules.append((re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''))
+ # unwrap/delete soft hyphens
+ end_rules.append((re.compile(u'[](\s*
)+\s*(?=[[a-z\d])'), lambda match: ''))
+ # unwrap/delete soft hyphens with formatting
+ end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s*
)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
+ # reduce false positives and move after header/footer removal
+ if getattr(self.extra_opts, 'preprocess_html', None):
+ if is_pdftohtml:
+ end_rules.append((re.compile(r'
\s*(?P(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*([ibu]>){0,2})\s*\s*(?P
(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*)?'), chap_head),)
+
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
+ # print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -372,5 +401,14 @@ class HTMLPreProcessor(object):
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
+ if getattr(self.extra_opts, 'smarten_punctuation', False):
+ html = self.smarten_punctuation(html)
+
return html
+ def smarten_punctuation(self, html):
+ from calibre.utils.smartypants import smartyPants
+ from calibre.ebooks.chardet import substitute_entites
+ html = smartyPants(html)
+ return substitute_entites(html)
+
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
new file mode 100644
index 0000000000..5301f70a16
--- /dev/null
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+
+class PreProcessor(object):
+
+ def __init__(self, log=None):
+ self.log = default_log if log is None else log
+ self.html_preprocess_sections = 0
+ self.found_indents = 0
+
+ def chapter_head(self, match):
+ chap = match.group('chap')
+ title = match.group('title')
+ if not title:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+ return ''+chap+'
\n'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ return ''+chap+'
\n'+title+'
\n'
+
+ def chapter_break(self, match):
+ chap = match.group('section')
+ styles = match.group('styles')
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+ return '<'+styles+' style="page-break-before:always">'+chap
+
+ def insert_indent(self, match):
+ pstyle = match.group('formatting')
+ span = match.group('span')
+ self.found_indents = self.found_indents + 1
+ if pstyle:
+ if not span:
+ return ''
+ else:
+ return '
'+span
+ else:
+ if not span:
+ return '
'
+ else:
+ return '
'+span
+
+ def no_markup(self, raw, percent):
+ '''
+ Counts marked-up line endings and raw line feeds in raw, the text to
+ inspect. percent is a fraction (clamped to 0..1) of the line feeds; returns
+ True when the marked-up endings number fewer than that, otherwise None.
+ '''
+ htm_end_ere = re.compile('
', re.DOTALL)
+ line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+ htm_end = htm_end_ere.findall(raw)
+ line_end = line_end_ere.findall(raw)
+ tot_htm_ends = len(htm_end)
+ tot_ln_fds = len(line_end)
+ self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
+
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
+
+ min_lns = tot_ln_fds * percent
+ self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
+ if min_lns > tot_htm_ends:
+ return True
+
+ def __call__(self, html):
+ self.log("********* Preprocessing HTML *********")
+ # Replace series of non-breaking spaces with text-indent
+ txtindent = re.compile(ur'[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+ html = txtindent.sub(self.insert_indent, html)
+ if self.found_indents > 1:
+ self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+ # remove remaining non-breaking spaces
+ html = re.sub(ur'\u00a0', ' ', html)
+ # Get rid of empty tags to simplify other processing
+ html = re.sub(ur'\s*\s*', ' ', html)
+ # Get rid of empty span tags
+ html = re.sub(r"\s*]*>\s*", " ", html)
+
+ # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL)
+ blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
+ blanklines = blankreg.findall(html)
+ lines = linereg.findall(html)
+ if len(lines) > 1:
+ self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+ if float(len(blanklines)) / float(len(lines)) > 0.40:
+ self.log("deleting blank lines")
+ html = blankreg.sub('', html)
+ # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+ html = re.sub(r"\s*
", "
\n", html)
+ html = re.sub(r"\s*\s*", "\n
", html)
+
+ # some lit files don't have any <p> tags or equivalent (generally just plain text between
+ # <pre> tags), check and mark up line endings if required before proceeding
+ if self.no_markup(html, 0.1):
+ self.log("not enough paragraph markers, adding now")
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n', html)
+
+ # detect chapters/sections to match xpath or splitting logic
+ heading = re.compile(']*>', re.IGNORECASE)
+ self.html_preprocess_sections = len(heading.findall(html))
+ self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
+ #
+ # Start with most typical chapter headings, get more aggressive until one works
+ if self.html_preprocess_sections < 10:
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}s*(]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*()?s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ html = chapdetect.sub(self.chapter_head, html)
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
+
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
+
+ # Unwrap lines
+ #
+ self.log("Unwrapping Lines")
+ # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+ # span are used for hard line breaks, p for new paragraphs. Determine which is used so
+ # that lines can be un-wrapped across page boundaries
+ paras_reg = re.compile(']*>', re.IGNORECASE)
+ spans_reg = re.compile(']*>', re.IGNORECASE)
+ paras = len(paras_reg.findall(html))
+ spans = len(spans_reg.findall(html))
+ if spans > 1:
+ if float(paras) / float(spans) < 0.75:
+ format = 'spanned_html'
+ else:
+ format = 'html'
+ else:
+ format = 'html'
+
+ # Calculate Length
+ length = line_length(format, html, 0.4)
+ self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+ #
+ # Unwrap and/or delete soft-hyphens, hyphens
+ html = re.sub(u'\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+ html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+ # Unwrap lines using punctuation where at least the median line length precedes the break
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ html = unwrap.sub(' ', html)
+
+ # If there are still fewer than 10 sections after unwrapping, mark split points on lines with no punctuation
+ if self.html_preprocess_sections < 10:
+ self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
+ #self.log(html)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
+ html = chapdetect3.sub(self.chapter_break, html)
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+''+'
', html)
+
+ return html
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index d57bfddd3e..084d48e54b 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
return (None, raw)
def preprocess_html(self, html):
- if not hasattr(self, 'log'):
- from calibre.utils.logging import default_log
- self.log = default_log
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
- html = chapdetect.sub(''+'\g'+'
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- #
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
", "
\n", html)
- length = line_length('html', html, 0.4)
- self.log.debug("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- html = unwrap.sub(' ', html)
- return html
+ preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ return preprocessor(html)
+
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 9bf20fb1d4..65f5c607a2 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -6,10 +6,9 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import re
-
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
+
class LITInput(InputFormatPlugin):
@@ -55,18 +54,6 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
- html = chapdetect.sub(''+'\g'+'
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- #
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
", "
\n", html)
- length = line_length('html', html, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- html = unwrap.sub(' ', html)
- return html
+ preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ return preprocessor(html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 487e70c04f..b8dc7a9560 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -3,6 +3,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
+import re
from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin):
@@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
include_meta_content_type=False))
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path
+
+ def preprocess_html(self, html):
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+''+'
', html)
+ return html
+
diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py
index 030c271362..9a5ff36d55 100644
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@@ -138,6 +138,7 @@ class CSSFlattener(object):
float(self.context.margin_left))
bs.append('margin-right : %fpt'%\
float(self.context.margin_right))
+ bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
if self.context.change_justification != 'original':
bs.append('text-align: '+ self.context.change_justification)
body.set('style', '; '.join(bs))
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
self.options = options
setattr(self.options, 'new_pdf_engine', False)
setattr(self.options, 'no_images', False)
- setattr(self.options, 'unwrap_factor', 0.5)
+ setattr(self.options, 'unwrap_factor', 0.45)
def extract_content(self, output_dir):
self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 64a089281e..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
- OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
+ OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.5, this is the median line length.')),
+ 'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 166695ff5c..b0fc15197a 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -207,6 +207,7 @@ class PML_HTMLizer(object):
while html != old:
old = html
html = self.cleanup_html_remove_redundant(html)
+ html = re.sub(r'(?imu)^\s*', '', html)
return html
def cleanup_html_remove_redundant(self, html):
@@ -216,7 +217,7 @@ class PML_HTMLizer(object):
html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
else:
html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
- html = re.sub(r'\s*
', '', html)
+ html = re.sub(r'(?imu)\s*
', '', html)
return html
def start_line(self):
@@ -556,7 +557,7 @@ class PML_HTMLizer(object):
text = t
else:
self.toc.add_item(os.path.basename(self.file_name), id, value)
- text = '%s' % (id, t)
+ text = '%s' % (t, id)
elif c == 'm':
empty = False
src = self.code_value(line)
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index adda8794ca..000c603c1c 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,7 @@ import os, glob, re, textwrap
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class InlineClass(etree.XSLTExtension):
@@ -229,16 +229,8 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)\s*\s*
', re.IGNORECASE)
- res = chapdetect.sub(''+'\g'+'
\n', res)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', res, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*(
)?\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*(]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- res = unwrap.sub(' ', res)
+ preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)
stream.seek(0)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index a12e8a0761..dac1e34df7 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):
def preserve_spaces(txt):
txt = txt.replace(' ', ' ')
- txt = txt.replace('\t', ' ')
+ txt = txt.replace('\t', ' ')
return txt
def opf_writer(path, opf_name, manifest, spine, mi):
diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py
index f0232d9859..878ba77a43 100644
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction):
dest_id, src_books, src_ids = self.books_to_merge(rows)
if safe_merge:
if not confirm(''+_(
- 'All book formats and metadata from the selected books '
- 'will be added to the first selected book.
'
+ 'Book formats and metadata from the selected books '
+ 'will be added to the first selected book. '
+ 'ISBN will not be merged.
'
'The second and subsequently selected books will not '
'be deleted or changed.
'
'Please confirm you want to proceed.')
@@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction):
self.merge_metadata(dest_id, src_ids)
else:
if not confirm('
'+_(
- 'All book formats and metadata from the selected books will be merged '
- 'into the first selected book.
'
+ 'Book formats and metadata from the selected books will be merged '
+ 'into the first selected book. '
+ 'ISBN will not be merged.
'
'After merger the second and '
'subsequently selected books will be deleted.
'
'All book formats of the first selected book will be kept '
diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py
index b0403bf1dd..ec3f0b944d 100644
--- a/src/calibre/gui2/convert/look_and_feel.py
+++ b/src/calibre/gui2/convert/look_and_feel.py
@@ -22,7 +22,7 @@ class LookAndFeelWidget(Widget, Ui_Form):
Widget.__init__(self, parent,
['change_justification', 'extra_css', 'base_font_size',
'font_size_mapping', 'line_height',
- 'linearize_tables',
+ 'linearize_tables', 'smarten_punctuation',
'disable_font_rescaling', 'insert_blank_line',
'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding',
'asciiize', 'keep_ligatures']
diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui
index de48e7caf9..c683300854 100644
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@@ -178,7 +178,7 @@
- -
+
-
Extra &CSS
@@ -214,6 +214,13 @@
+ -
+
+
+ Smarten &punctuation
+
+
+
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
0.010000000000000
- 0.500000000000000
+ 0.450000000000000
diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py
index 88bbae6c41..cb951b09be 100644
--- a/src/calibre/gui2/cover_flow.py
+++ b/src/calibre/gui2/cover_flow.py
@@ -155,6 +155,7 @@ class CoverFlowMixin(object):
self.cb_splitter.action_toggle.triggered.connect(self.toggle_cover_browser)
if CoverFlow is not None:
self.cover_flow.stop.connect(self.hide_cover_browser)
+ self.cover_flow.setVisible(False)
else:
self.cb_splitter.insertWidget(self.cb_splitter.side_index, self.cover_flow)
if CoverFlow is not None:
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index bb47508531..c746a5aa56 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{
def set_device_connected(self, is_connected):
self.device_connected = is_connected
self.db.refresh_ondevice()
- self.refresh()
+ self.refresh() # does a resort()
self.research()
- if is_connected and self.sorted_on[0] == 'ondevice':
- self.resort()
def set_book_on_device_func(self, func):
self.book_on_device = func
@@ -264,19 +262,15 @@ class BooksModel(QAbstractTableModel): # {{{
self.sorting_done.emit(self.db.index)
def refresh(self, reset=True):
- try:
- col = self.column_map.index(self.sorted_on[0])
- except:
- col = 0
self.db.refresh(field=None)
- self.sort(col, self.sorted_on[1], reset=reset)
+ self.resort(reset=reset)
def resort(self, reset=True):
- try:
- col = self.column_map.index(self.sorted_on[0])
- except ValueError:
- col = 0
- self.sort(col, self.sorted_on[1], reset=reset)
+ if not self.db:
+ return
+ self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']])
+ if reset:
+ self.reset()
def research(self, reset=True):
self.search(self.last_search, reset=reset)
@@ -1030,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{
if reset:
self.reset()
+ def resort(self, reset=True):
+ if self.sorted_on:
+ self.sort(self.column_map.index(self.sorted_on[0]),
+ self.sorted_on[1], reset=reset)
+
def columnCount(self, parent):
if parent and parent.isValid():
return 0
diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py
index a64eb2eb9a..519d533ff6 100644
--- a/src/calibre/gui2/tag_view.py
+++ b/src/calibre/gui2/tag_view.py
@@ -512,7 +512,8 @@ class TagsModel(QAbstractItemModel): # {{{
_('The saved search name %s is already used.')%val).exec_()
return False
saved_searches().rename(unicode(item.data(role).toString()), val)
- self.tags_view.search_item_renamed.emit()
+ item.tag.name = val
+ self.tags_view.search_item_renamed.emit() # Does a refresh
else:
if key == 'series':
self.db.rename_series(item.tag.id, val)
@@ -526,8 +527,8 @@ class TagsModel(QAbstractItemModel): # {{{
self.db.rename_custom_item(item.tag.id, val,
label=self.db.field_metadata[key]['label'])
self.tags_view.tag_item_renamed.emit()
- item.tag.name = val
- self.refresh() # Should work, because no categories can have disappeared
+ item.tag.name = val
+ self.refresh() # Should work, because no categories can have disappeared
if path:
idx = self.index_for_path(path)
if idx.isValid():
@@ -669,7 +670,7 @@ class TagBrowserMixin(object): # {{{
self.tags_view.saved_search_edit.connect(self.do_saved_search_edit)
self.tags_view.author_sort_edit.connect(self.do_author_sort_edit)
self.tags_view.tag_item_renamed.connect(self.do_tag_item_renamed)
- self.tags_view.search_item_renamed.connect(self.saved_search.clear_to_help)
+ self.tags_view.search_item_renamed.connect(self.saved_searches_changed)
self.edit_categories.clicked.connect(lambda x:
self.do_user_categories_edit())
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index b9c1211c7f..4f795ab733 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import re, itertools, functools
+import re, itertools
from itertools import repeat
from datetime import timedelta
from threading import Thread, RLock
@@ -112,7 +112,7 @@ class ResultCache(SearchQueryParser):
'''
def __init__(self, FIELD_MAP, field_metadata):
self.FIELD_MAP = FIELD_MAP
- self._map = self._map_filtered = self._data = []
+ self._map = self._data = self._map_filtered = []
self.first_sort = True
self.search_restriction = ''
self.field_metadata = field_metadata
@@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser):
for x in self.iterall():
yield x[idx]
+ # Search functions {{{
+
def universal_set(self):
return set([i[0] for i in self._data if i is not None])
@@ -462,12 +464,43 @@ class ResultCache(SearchQueryParser):
continue
return matches
+ def search(self, query, return_matches=False):
+ ans = self.search_getting_ids(query, self.search_restriction)
+ if return_matches:
+ return ans
+ self._map_filtered = ans
+
+ def search_getting_ids(self, query, search_restriction):
+ q = ''
+ if not query or not query.strip():
+ q = search_restriction
+ else:
+ q = query
+ if search_restriction:
+ q = u'%s (%s)' % (search_restriction, query)
+ if not q:
+ return list(self._map)
+ matches = self.parse(q)
+ tmap = list(itertools.repeat(False, len(self._data)))
+ for x in matches:
+ tmap[x] = True
+ return [x for x in self._map if tmap[x]]
+
+ def set_search_restriction(self, s):
+ self.search_restriction = s
+
+ # }}}
+
def remove(self, id):
self._data[id] = None
- if id in self._map:
+ try:
self._map.remove(id)
- if id in self._map_filtered:
+ except ValueError:
+ pass
+ try:
self._map_filtered.remove(id)
+ except ValueError:
+ pass
def set(self, row, col, val, row_is_id=False):
id = row if row_is_id else self._map_filtered[row]
@@ -522,9 +555,7 @@ class ResultCache(SearchQueryParser):
def books_deleted(self, ids):
for id in ids:
- self._data[id] = None
- if id in self._map: self._map.remove(id)
- if id in self._map_filtered: self._map_filtered.remove(id)
+ self.remove(id)
def count(self):
return len(self._map)
@@ -549,90 +580,97 @@ class ResultCache(SearchQueryParser):
self.sort(field, ascending)
self._map_filtered = list(self._map)
if self.search_restriction:
- self.search('', return_matches=False, ignore_search_restriction=False)
+ self.search('', return_matches=False)
- def seriescmp(self, sidx, siidx, x, y, library_order=None):
- try:
- if library_order:
- ans = cmp(title_sort(self._data[x][sidx].lower()),
- title_sort(self._data[y][sidx].lower()))
- else:
- ans = cmp(self._data[x][sidx].lower(),
- self._data[y][sidx].lower())
- except AttributeError: # Some entries may be None
- ans = cmp(self._data[x][sidx], self._data[y][sidx])
- if ans != 0: return ans
- return cmp(self._data[x][siidx], self._data[y][siidx])
+ # Sorting functions {{{
- def cmp(self, loc, x, y, asstr=True, subsort=False):
- try:
- ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \
- asstr else cmp(self._data[x][loc], self._data[y][loc])
- except AttributeError: # Some entries may be None
- ans = cmp(self._data[x][loc], self._data[y][loc])
- except TypeError: ## raised when a datetime is None
- x = self._data[x][loc]
- if x is None:
- x = UNDEFINED_DATE
- y = self._data[y][loc]
- if y is None:
- y = UNDEFINED_DATE
- return cmp(x, y)
- if subsort and ans == 0:
- return cmp(self._data[x][11].lower(), self._data[y][11].lower())
- return ans
+ def sanitize_sort_field_name(self, field):
+ field = field.lower().strip()
+ if field not in self.field_metadata.iterkeys():
+ if field in ('author', 'tag', 'comment'):
+ field += 's'
+ if field == 'date': field = 'timestamp'
+ elif field == 'title': field = 'sort'
+ elif field == 'authors': field = 'author_sort'
+ return field
def sort(self, field, ascending, subsort=False):
- field = field.lower().strip()
- if field in ('author', 'tag', 'comment'):
- field += 's'
- if field == 'date': field = 'timestamp'
- elif field == 'title': field = 'sort'
- elif field == 'authors': field = 'author_sort'
- as_string = field not in ('size', 'rating', 'timestamp')
+ self.multisort([(field, ascending)])
- if self.first_sort:
- subsort = True
- self.first_sort = False
- if self.field_metadata[field]['is_custom']:
- if self.field_metadata[field]['datatype'] == 'series':
- fcmp = functools.partial(self.seriescmp,
- self.field_metadata[field]['rec_index'],
- self.field_metadata.cc_series_index_column_for(field),
- library_order=tweaks['title_series_sorting'] == 'library_order')
- else:
- as_string = self.field_metadata[field]['datatype'] in ('comments', 'text')
- field = self.field_metadata[field]['colnum']
- fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
- subsort=subsort, asstr=as_string)
- elif field == 'series':
- fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'],
- self.FIELD_MAP['series_index'],
- library_order=tweaks['title_series_sorting'] == 'library_order')
+ def multisort(self, fields=[], subsort=False):
+ fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields]
+ keys = self.field_metadata.field_keys()
+ fields = [x for x in fields if x[0] in keys]
+ if subsort and 'sort' not in [x[0] for x in fields]:
+ fields += [('sort', True)]
+ if not fields:
+ fields = [('timestamp', False)]
+
+ keyg = SortKeyGenerator(fields, self.field_metadata, self._data)
+ if len(fields) == 1:
+ self._map.sort(key=keyg, reverse=not fields[0][1])
else:
- fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
- subsort=subsort, asstr=as_string)
- self._map.sort(cmp=fcmp, reverse=not ascending)
- self._map_filtered = [id for id in self._map if id in self._map_filtered]
+ self._map.sort(key=keyg)
- def search(self, query, return_matches=False):
- ans = self.search_getting_ids(query, self.search_restriction)
- if return_matches:
- return ans
- self._map_filtered = ans
+ tmap = list(itertools.repeat(False, len(self._data)))
+ for x in self._map_filtered:
+ tmap[x] = True
+ self._map_filtered = [x for x in self._map if tmap[x]]
+
+
+class SortKey(object):
+
+ def __init__(self, orders, values):
+ self.orders, self.values = orders, values
+
+ def __cmp__(self, other):
+ for i, ascending in enumerate(self.orders):
+ ans = cmp(self.values[i], other.values[i])
+ if ans != 0:
+ return ans * ascending
+ return 0
+
+class SortKeyGenerator(object):
+
+ def __init__(self, fields, field_metadata, data):
+ self.field_metadata = field_metadata
+ self.orders = [-1 if x[1] else 1 for x in fields]
+ self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
+ self.library_order = tweaks['title_series_sorting'] == 'library_order'
+ self.data = data
+
+ def __call__(self, record):
+ values = tuple(self.itervals(self.data[record]))
+ if len(values) == 1:
+ return values[0]
+ return SortKey(self.orders, values)
+
+ def itervals(self, record):
+ for name, fm in self.entries:
+ dt = fm['datatype']
+ val = record[fm['rec_index']]
+
+ if dt == 'datetime':
+ if val is None:
+ val = UNDEFINED_DATE
+
+ elif dt == 'series':
+ if val is None:
+ val = ('', 1)
+ else:
+ val = val.lower()
+ if self.library_order:
+ val = title_sort(val)
+ sidx_fm = self.field_metadata[name + '_index']
+ sidx = record[sidx_fm['rec_index']]
+ val = (val, sidx)
+
+ elif dt in ('text', 'comments'):
+ if val is None:
+ val = ''
+ val = val.lower()
+ yield val
+
+ # }}}
- def search_getting_ids(self, query, search_restriction):
- q = ''
- if not query or not query.strip():
- q = search_restriction
- else:
- q = query
- if search_restriction:
- q = u'%s (%s)' % (search_restriction, query)
- if not q:
- return list(self._map)
- matches = sorted(self.parse(q))
- return [id for id in self._map if id in matches]
- def set_search_restriction(self, s):
- self.search_restriction = s
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 4106f8c965..8a5ab75c3c 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.search_getting_ids = self.data.search_getting_ids
self.refresh = functools.partial(self.data.refresh, self)
self.sort = self.data.sort
+ self.multisort = self.data.multisort
self.index = self.data.index
self.refresh_ids = functools.partial(self.data.refresh_ids, self)
self.row = self.data.row
diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py
index 66cdee51f0..276a6ba971 100644
--- a/src/calibre/library/field_metadata.py
+++ b/src/calibre/library/field_metadata.py
@@ -69,6 +69,8 @@ class FieldMetadata(dict):
VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime',
'int', 'float', 'bool', 'series'])
+ # Builtin metadata {{{
+
_field_metadata = [
('authors', {'table':'authors',
'column':'name',
@@ -287,7 +289,8 @@ class FieldMetadata(dict):
'search_terms':[],
'is_custom':False,
'is_category':False}),
- ]
+ ]
+ # }}}
# search labels that are not db columns
search_items = [ 'all',
@@ -332,6 +335,9 @@ class FieldMetadata(dict):
def keys(self):
return self._tb_cats.keys()
+ def field_keys(self):
+ return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field']
+
def iterkeys(self):
for key in self._tb_cats:
yield key
diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index 6784abd8f4..ecb467b4c2 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import re, os, cStringIO, operator
+import re, os, cStringIO
import cherrypy
try:
@@ -16,7 +16,15 @@ except ImportError:
from calibre import fit_image, guess_type
from calibre.utils.date import fromtimestamp
-from calibre.ebooks.metadata import title_sort
+from calibre.library.caches import SortKeyGenerator
+
+class CSSortKeyGenerator(SortKeyGenerator):
+
+ def __init__(self, fields, fm):
+ SortKeyGenerator.__init__(self, fields, fm, None)
+
+ def __call__(self, record):
+ return self.itervals(record).next()
class ContentServer(object):
@@ -47,32 +55,12 @@ class ContentServer(object):
def sort(self, items, field, order):
- field = field.lower().strip()
- if field == 'author':
- field = 'authors'
- if field == 'date':
- field = 'timestamp'
+ field = self.db.data.sanitize_sort_field_name(field)
if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'):
raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field)
- cmpf = cmp if field in ('rating', 'size', 'timestamp') else \
- lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '')
- if field == 'series':
- items.sort(cmp=self.seriescmp, reverse=not order)
- else:
- lookup = 'sort' if field == 'title' else field
- lookup = 'author_sort' if field == 'authors' else field
- field = self.db.FIELD_MAP[lookup]
- getter = operator.itemgetter(field)
- items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order)
+ keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata)
+ items.sort(key=keyg, reverse=not order)
- def seriescmp(self, x, y):
- si = self.db.FIELD_MAP['series']
- try:
- ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
- except AttributeError: # Some entries may be None
- ans = cmp(x[si], y[si])
- if ans != 0: return ans
- return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']])
# }}}
diff --git a/src/calibre/utils/filenames.py b/src/calibre/utils/filenames.py
index 01eb9f30a0..47ccbe73c2 100644
--- a/src/calibre/utils/filenames.py
+++ b/src/calibre/utils/filenames.py
@@ -54,7 +54,8 @@ def shorten_components_to(length, components):
r = x[0] if x is components[-1] else ''
else:
if x is components[-1]:
- b, _, e = x.rpartition('.')
+ b, e = os.path.splitext(x)
+ if e == '.': e = ''
r = b[:-delta]+e
if r.startswith('.'): r = x[0]+r
else:
diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py
new file mode 100755
index 0000000000..44aac4de8c
--- /dev/null
+++ b/src/calibre/utils/smartypants.py
@@ -0,0 +1,899 @@
+#!/usr/bin/python
+
+r"""
+==============
+smartypants.py
+==============
+
+----------------------------
+SmartyPants ported to Python
+----------------------------
+
+Ported by `Chad Miller`_
+Copyright (c) 2004, 2007 Chad Miller
+
+original `SmartyPants`_ by `John Gruber`_
+Copyright (c) 2003 John Gruber
+
+
+Synopsis
+========
+
+A smart-quotes plugin for Pyblosxom_.
+
+The original "SmartyPants" is a free web publishing plug-in for Movable Type,
+Blosxom, and BBEdit that easily translates plain ASCII punctuation characters
+into "smart" typographic punctuation HTML entities.
+
+This software, *smartypants.py*, endeavours to be a functional port of
+SmartyPants to Python, for use with Pyblosxom_.
+
+
+Description
+===========
+
+SmartyPants can perform the following transformations:
+
+- Straight quotes ( " and ' ) into "curly" quote HTML entities
+- Backticks-style quotes (\`\`like this'') into "curly" quote HTML entities
+- Dashes (``--`` and ``---``) into en- and em-dash entities
+- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
+
+This means you can write, edit, and save your posts using plain old
+ASCII straight quotes, plain dashes, and plain dots, but your published
+posts (and final HTML output) will appear with smart quotes, em-dashes,
+and proper ellipses.
+
+SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
+``