diff --git a/resources/content_server/gui.js b/resources/content_server/gui.js index 631fb8b617..d0fb49cc8e 100644 --- a/resources/content_server/gui.js +++ b/resources/content_server/gui.js @@ -26,7 +26,7 @@ var current_library_request = null; ////////////////////////////// GET BOOK LIST ////////////////////////////// -var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds +var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds function create_table_headers() { var thead = $('table#book_list thead tr'); diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 66ee4d1471..71bf2c6c37 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -114,3 +114,11 @@ add_new_book_tags_when_importing_books = False # Set the maximum number of tags to show per book in the content server max_content_server_tags_shown=5 + +# Set the maximum number of sort 'levels' that calibre will use to resort the +# library after certain operations such as searches or device insertion. Each +# sort level adds a performance penalty. If the database is large (thousands of +# books) the penalty might be noticeable. If you are not concerned about multi- +# level sorts, and if you are seeing a slowdown, reduce the value of this tweak. +maximum_resort_levels = 5 + diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe index cda9bf83d2..b7f9cd3c6c 100644 --- a/resources/recipes/infobae.recipe +++ b/resources/recipes/infobae.recipe @@ -1,12 +1,8 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' infobae.com ''' -import re -import urllib, urlparse from calibre.web.feeds.news import BasicNewsRecipe @@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - language = 'es' - lang = 'es-AR' - + language = 'es' encoding = 'cp1252' - cover_url = 'http://www.infobae.com/imgs/header/header.gif' + masthead_url = 'http://www.infobae.com/imgs/header/header.gif' remove_javascript = True - preprocess_regexps = [(re.compile( - r''), lambda m:'')] - - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' - - extra_css = ''' - .col-center{font-family:Arial,Helvetica,sans-serif;} - h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;} - .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;} - ''' - - keep_only_tags = [dict(name='div', attrs={'class':['content']})] - - - remove_tags = [ - dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}), - dict(name='a', attrs={'name' : 'comentario',}), - dict(name='iframe'), - dict(name='img', alt = "Ver galerias de imagenes"), - - ] - + remove_empty_feeds = True + extra_css = ''' + body{font-family:Arial,Helvetica,sans-serif;} + .popUpTitulo{color:#0D4261; font-size: xx-large} + ''' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + feeds = [ (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' ) @@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe): ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' ) ] -# def print_version(self, url): -# main, sep, article_part = url.partition('contenidos/') -# article_id, rsep, rrest = article_part.partition('-') -# 
return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id - - def get_article_url(self, article): - ans = article.get('link').encode('utf-8') - parts = list(urlparse.urlparse(ans)) - parts[2] = urllib.quote(parts[2]) - ans = urlparse.urlunparse(parts) - return ans.decode('utf-8') - - - def preprocess_html(self, soup): - - for tag in soup.head.findAll('strong'): - tag.extract() - for tag in soup.findAll('meta'): - del tag['content'] - tag.extract() - - mtag = '\n\n' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - - return soup + def print_version(self, url): + article_part = url.rpartition('/')[2] + article_id= article_part.partition('-')[0] + return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id def postprocess_html(self, soup, first): - for tag in soup.findAll(name='strong'): tag.name = 'b' - return soup diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe index 13ff42b277..58b782415b 100644 --- a/resources/recipes/nspm.recipe +++ b/resources/recipes/nspm.recipe @@ -6,6 +6,7 @@ nspm.rs import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import NavigableString class Nspm(BasicNewsRecipe): title = 'Nova srpska politicka misao' @@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe): encoding = 'utf-8' language = 'sr' delay = 2 + remove_empty_feeds = True publication_type = 'magazine' masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg' extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe): dict(name=['link','object','embed','script','meta','base','iframe']) ,dict(attrs={'class':'buttonheading'}) ] - remove_tags_after = dict(attrs={'class':'article_separator'}) - remove_attributes = ['width','height'] + remove_tags_before = dict(attrs={'class':'contentheading'}) + remove_tags_after = dict(attrs={'class':'article_separator'}) + remove_attributes = ['width','height'] def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe): def preprocess_html(self, soup): for item in soup.body.findAll(style=True): del item['style'] + for item in soup.body.findAll('h1'): + nh = NavigableString(item.a.string) + item.a.extract() + item.insert(0,nh) return self.adeify_images(soup) diff --git a/resources/recipes/xkcd.recipe b/resources/recipes/xkcd.recipe index 312027004e..ad0d420deb 100644 --- a/resources/recipes/xkcd.recipe +++ b/resources/recipes/xkcd.recipe @@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe): (re.compile(r'()'), lambda m: '%s%s

%s

' % (m.group(1), m.group(3), m.group(2))) ] - + def parse_index(self): INDEX = 'http://xkcd.com/archive/' - soup = self.index_to_soup(INDEX) + soup = self.index_to_soup(INDEX) articles = [] for item in soup.findAll('a', title=True): articles.append({ 'date': item['title'], 'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1, 'url': 'http://xkcd.com' + item['href'], - 'title': self.tag_to_string(item).encode('UTF-8'), + 'title': self.tag_to_string(item), 'description': '', 'content': '', }) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 4c87236e71..68df832048 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY from calibre.devices.binatone.driver import README from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK from calibre.devices.edge.driver import EDGE -from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS +from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS from calibre.devices.sne.driver import SNE from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG @@ -557,6 +557,7 @@ plugins += [ TECLAST_K3, NEWSMY, IPAPYRUS, + SOVOS, EDGE, SNE, ALEX, diff --git a/src/calibre/devices/kobo/books.py b/src/calibre/devices/kobo/books.py index 9da99d75c8..496162d668 100644 --- a/src/calibre/devices/kobo/books.py +++ b/src/calibre/devices/kobo/books.py @@ -44,16 +44,17 @@ class Book(MetaInformation): self.mime = mime self.size = size # will be set later if None - try: - if ContentType == '6': - self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") - else: - self.datetime = time.gmtime(os.path.getctime(self.path)) - except: - self.datetime = time.gmtime() - if thumbnail_name is not None: - self.thumbnail = ImageWrapper(thumbnail_name) + if ContentType == '6': + self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") + else: + try: + self.datetime = time.gmtime(os.path.getctime(self.path)) + except: + self.datetime = time.gmtime() + + if thumbnail_name is not None: + self.thumbnail = ImageWrapper(thumbnail_name) self.tags = [] if other: self.smart_update(other) diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index 5e1c752c76..f24e00143b 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -106,11 +106,14 @@ class KOBO(USBMS): changed = True bl[idx].device_collections = playlist_map.get(lpath, []) else: - book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID) + if ContentType == '6': + book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576) + else: + book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID) # print 'Update booklist' + book.device_collections = playlist_map.get(book.lpath, []) if bl.add_book(book, replace_metadata=False): changed = True - book.device_collections = playlist_map.get(book.lpath, []) except: # Probably a path encoding error import traceback traceback.print_exc() @@ -231,21 +234,9 @@ class KOBO(USBMS): path = self.normalize_path(path) # print "Delete file normalized path: " + path extension = os.path.splitext(path)[1] - - if extension == '.kobo': - # Kobo books do not have book files. 
They do have some images though - #print "kobo book" - ContentType = 6 - ContentID = self.contentid_from_path(path, ContentType) - elif extension == '.pdf' or extension == '.epub': - # print "ePub or pdf" - ContentType = 16 - #print "Path: " + path - ContentID = self.contentid_from_path(path, ContentType) - # print "ContentID: " + ContentID - else: # if extension == '.html' or extension == '.txt': - ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored - ContentID = self.contentid_from_path(path, ContentType) + ContentType = self.get_content_type_from_extension(extension) + + ContentID = self.contentid_from_path(path, ContentType) ImageID = self.delete_via_sql(ContentID, ContentType) #print " We would now delete the Images for" + ImageID @@ -343,6 +334,17 @@ class KOBO(USBMS): ContentID = ContentID.replace("\\", '/') return ContentID + def get_content_type_from_extension(self, extension): + if extension == '.kobo': + # Kobo books do not have book files. They do have some images though + #print "kobo book" + ContentType = 6 + elif extension == '.pdf' or extension == '.epub': + # print "ePub or pdf" + ContentType = 16 + else: # if extension == '.html' or extension == '.txt': + ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored + return ContentType def path_from_contentid(self, ContentID, ContentType, oncard): path = ContentID diff --git a/src/calibre/devices/teclast/driver.py b/src/calibre/devices/teclast/driver.py index 0c60a367cf..2055ff9306 100644 --- a/src/calibre/devices/teclast/driver.py +++ b/src/calibre/devices/teclast/driver.py @@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3): VENDOR_NAME = 'E_READER' WINDOWS_MAIN_MEM = '' +class SOVOS(TECLAST_K3): + + name = 'Sovos device interface' + gui_name = 'Sovos' + description = _('Communicate with the Sovos reader.') + + FORMATS = ['epub', 'fb2', 'pdf', 'txt'] + + VENDOR_NAME = 'RK28XX' + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC' + diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 67a2d36607..831c16bf6a 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -132,7 +132,11 @@ class CHMReader(CHMFile): for path in self.Contents(): lpath = os.path.join(output_dir, path) self._ensure_dir(lpath) - data = self.GetFile(path) + try: + data = self.GetFile(path) + except: + self.log.exception('Failed to extract %s from CHM, ignoring'%path) + continue if lpath.find(';') != -1: # fix file names with ";" at the end, see _reformat() lpath = lpath.split(';')[0] diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 7439718cf6..2ef633d0bb 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -122,7 +122,7 @@ def add_pipeline_options(parser, plumber): 'font_size_mapping', 'line_height', 'linearize_tables', - 'extra_css', + 'extra_css', 'smarten_punctuation', 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'change_justification', 'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size', diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 24b35f804f..16282dd28d 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -362,6 +362,14 @@ OptionRecommendation(name='preprocess_html', ) ), +OptionRecommendation(name='smarten_punctuation', + recommended_value=False, level=OptionRecommendation.LOW, + 
help=_('Convert plain quotes, dashes and ellipsis to their ' + 'typographically correct equivalents. For details, see ' + 'http://daringfireball.net/projects/smartypants' + ) + ), + OptionRecommendation(name='remove_header', recommended_value=False, level=OptionRecommendation.LOW, help=_('Use a regular expression to try and remove the header.' diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f7b803974f..4538af96c4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -75,6 +75,8 @@ def line_length(format, raw, percent): linere = re.compile('(?<=)', re.DOTALL) elif format == 'pdf': linere = re.compile('(?<=
<br>).*?(?=<br>
)', re.DOTALL) + elif format == 'spanned_html': + linere = re.compile('(?<=)', re.DOTALL) lines = linere.findall(raw) lengths = [] @@ -166,6 +168,17 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*()*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*()*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*()*\s*U', re.UNICODE), lambda match: u'Ù'), + # ` with letter before + (re.compile(u'a\s*()*\s*`', re.UNICODE), lambda match: u'à'), + (re.compile(u'A\s*()*\s*`', re.UNICODE), lambda match: u'À'), + (re.compile(u'e\s*()*\s*`', re.UNICODE), lambda match: u'è'), + (re.compile(u'E\s*()*\s*`', re.UNICODE), lambda match: u'È'), + (re.compile(u'i\s*()*\s*`', re.UNICODE), lambda match: u'ì'), + (re.compile(u'I\s*()*\s*`', re.UNICODE), lambda match: u'Ì'), + (re.compile(u'o\s*()*\s*`', re.UNICODE), lambda match: u'ò'), + (re.compile(u'O\s*()*\s*`', re.UNICODE), lambda match: u'Ò'), + (re.compile(u'u\s*()*\s*`', re.UNICODE), lambda match: u'ù'), + (re.compile(u'U\s*()*\s*`', re.UNICODE), lambda match: u'Ù'), # ´ (re.compile(u'´\s*()*\s*a', re.UNICODE), lambda match: u'á'), @@ -208,35 +221,34 @@ class HTMLPreProcessor(object): (re.compile(u'˛\s*()*\s*A', re.UNICODE), lambda match: u'Ą'), (re.compile(u'˛\s*()*\s*e', re.UNICODE), lambda match: u'ę'), (re.compile(u'˛\s*()*\s*E', re.UNICODE), lambda match: u'Ę'), - + # ˙ (re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'), - + + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=)\s*file:////?[A-Z].*
<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), + + # Center separator lines + (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>
'), # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
<hr> tags - (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'), - # Replace <br><br> with <p> - (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), - - # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), + (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>
'), # Remove gray background (re.compile(r']+>'), lambda match : ''), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(()?)?)]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), + # Cover the case where every letter in a chapter title is separated by a space + (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Connect paragraphs split by - - (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), @@ -317,12 +329,29 @@ class HTMLPreProcessor(object): print 'Failed to parse remove_footer regexp' traceback.print_exc() + # unwrap hyphenation - moved here so it's executed after header/footer removal + if is_pdftohtml: + # unwrap visible dashes and hyphens - don't delete they are often hyphens for + # for compound words, formatting, etc + end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens + end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens with formatting + end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) + + # Make the more aggressive chapter marking regex optional with the preprocess option to + # reduce false positives and move after header/footer removal + if getattr(self.extra_opts, 'preprocess_html', None): + if is_pdftohtml: + end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),) + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: + # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -372,5 +401,14 @@ class HTMLPreProcessor(object): if self.plugin_preprocess: html = self.input_plugin_preprocess(html) + if getattr(self.extra_opts, 'smarten_punctuation', False): + html = self.smarten_punctuation(html) + return html + def 
smarten_punctuation(self, html): + from calibre.utils.smartypants import smartyPants + from calibre.ebooks.chardet import substitute_entites + html = smartyPants(html) + return substitute_entites(html) + diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py new file mode 100644 index 0000000000..5301f70a16 --- /dev/null +++ b/src/calibre/ebooks/conversion/utils.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +import re +from calibre.ebooks.conversion.preprocess import line_length +from calibre.utils.logging import default_log + +class PreProcessor(object): + + def __init__(self, log=None): + self.log = default_log if log is None else log + self.html_preprocess_sections = 0 + self.found_indents = 0 + + def chapter_head(self, match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '<h2>'+chap+'</h2>\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + + def chapter_break(self, match): + chap = match.group('section') + styles = match.group('styles') + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap)) + return '<'+styles+' style="page-break-before:always">'+chap + + def insert_indent(self, match): + pstyle = match.group('formatting') + span = match.group('span') + self.found_indents = self.found_indents + 1 + if pstyle: + if not span: + return '<p '+pstyle+' style="text-indent:3%">' + else: + return '<p '+pstyle+' style="text-indent:3%">'+span + else: + if not span: + return '<p style="text-indent:3%">' + else: + return '<p style="text-indent:3%">'+span + + def no_markup(self, raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. 
+ ''' + htm_end_ere = re.compile('</p>', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup") + if min_lns > tot_htm_ends: + return True + + def __call__(self, html): + self.log("********* Preprocessing HTML *********") + # Replace series of non-breaking spaces with text-indent + txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) + html = txtindent.sub(self.insert_indent, html) + if self.found_indents > 1: + self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles") + # remove remaining non-breaking spaces + html = re.sub(ur'\u00a0', ' ', html) + # Get rid of empty <o:p> tags to simplify other processing + html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) + blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*</p>", "</p>\n", html) + html = re.sub(r"\s*<p>\s*", "\n<p>", html) + + # some lit files don't have any <p> tags or equivalent (generally just plain text between + # <pre> tags), check and mark up line endings if required before proceeding + if self.no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) + + # detect chapters/sections to match xpath or splitting logic + heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) + self.html_preprocess_sections = len(heading.findall(html)) + self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") + # + # Start with most typical chapter headings, get more aggressive until one works + if self.html_preprocess_sections < 10: + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(self.chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") + chapdetect2 = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + + # Unwrap lines + # + self.log("Unwrapping Lines") + # Some OCR sourced files have line breaks in the html using a combination of span & p tags + # span are used for hard line breaks, p for new paragraphs. Determine which is used so + # that lines can be un-wrapped across page boundaries + paras_reg = re.compile('<p[^>]*>', re.IGNORECASE) + spans_reg = re.compile('<span[^>]*>', re.IGNORECASE) + paras = len(paras_reg.findall(html)) + spans = len(spans_reg.findall(html)) + if spans > 1: + if float(paras) / float(spans) < 0.75: + format = 'spanned_html' + else: + format = 'html' + else: + format = 'html' + + # Calculate Length + length = line_length(format, html, 0.4) + self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") + # + # Unwrap and/or delete soft-hyphens, hyphens + html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) + html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + + # Unwrap lines using punctation if the median length of all lines is less than 200 + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + html = unwrap.sub(' ', html) + + # If still no sections after unwrapping mark split points on lines with no punctuation + if self.html_preprocess_sections < 10: + self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) + #self.log(html) + chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) + html = chapdetect3.sub(self.chapter_break, html) + # search for places where a first or second level heading is immediately followed by another + # top level heading. 
demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + + return html diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index d57bfddd3e..084d48e54b 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows from calibre import unicode_path from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class Link(object): ''' @@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin): return (None, raw) def preprocess_html(self, html): - if not hasattr(self, 'log'): - from calibre.utils.logging import default_log - self.log = default_log - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 - # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) - length = line_length('html', html, 0.4) - self.log.debug("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) - return html + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + return preprocessor(html) + diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 9bf20fb1d4..65f5c607a2 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -6,10 +6,9 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re - from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor + class LITInput(InputFormatPlugin): @@ -55,18 +54,6 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 - # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) - length = line_length('html', html, 
0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) - return html + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + return preprocessor(html) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 487e70c04f..b8dc7a9560 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import re from calibre.customize.conversion import InputFormatPlugin class MOBIInput(InputFormatPlugin): @@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin): include_meta_content_type=False)) accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path + + def preprocess_html(self, html): + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + return html + diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 030c271362..9a5ff36d55 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -138,6 +138,7 @@ class CSSFlattener(object): float(self.context.margin_left)) bs.append('margin-right : %fpt'%\ float(self.context.margin_right)) + bs.extend(['padding-left: 0pt', 'padding-right: 0pt']) if self.context.change_justification != 'original': bs.append('text-align: '+ self.context.change_justification) body.set('style', '; '.join(bs)) diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index 3ae9f8ccca..c151551866 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -21,7 +21,7 @@ class Reader(FormatReader): self.options = options setattr(self.options, 'new_pdf_engine', False) setattr(self.options, 'no_images', False) - setattr(self.options, 'unwrap_factor', 0.5) + setattr(self.options, 'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 64a089281e..14b3552b04 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin): options = set([ OptionRecommendation(name='no_images', recommended_value=False, help=_('Do not extract images from the document')), - OptionRecommendation(name='unwrap_factor', recommended_value=0.5, + OptionRecommendation(name='unwrap_factor', recommended_value=0.45, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. 
The ' - 'default is 0.5, this is the median line length.')), + 'default is 0.45, just below the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, help=_('Use the new PDF conversion engine.')) ]) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 166695ff5c..b0fc15197a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -207,6 +207,7 @@ class PML_HTMLizer(object): while html != old: old = html html = self.cleanup_html_remove_redundant(html) + html = re.sub(r'(?imu)^\s*', '', html) return html def cleanup_html_remove_redundant(self, html): @@ -216,7 +217,7 @@ class PML_HTMLizer(object): html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html) else: html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html) - html = re.sub(r'<p>\s*</p>', '', html) + html = re.sub(r'(?imu)<p>\s*</p>', '', html) return html def start_line(self): @@ -556,7 +557,7 @@ class PML_HTMLizer(object): text = t else: self.toc.add_item(os.path.basename(self.file_name), id, value) - text = '<span id="%s"></span>%s' % (id, t) + text = '%s<span id="%s"></span>' % (t, id) elif c == 'm': empty = False src = self.code_value(line) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index adda8794ca..000c603c1c 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -7,7 +7,7 @@ import os, glob, re, textwrap from lxml import etree from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class InlineClass(etree.XSLTExtension): @@ -229,16 +229,8 @@ class RTFInput(InputFormatPlugin): res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] if self.options.preprocess_html: - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE) - res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res) - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', res, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - res = unwrap.sub(' ', res) + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + res = preprocessor(res) f.write(res) self.write_inline_css(inline_class) stream.seek(0) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index a12e8a0761..dac1e34df7 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt): def preserve_spaces(txt): txt = txt.replace(' ', ' ') - txt = txt.replace('\t', ' ') + txt = txt.replace('\t', '    ') return txt def opf_writer(path, opf_name, manifest, spine, mi): diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py index f0232d9859..878ba77a43 100644 --- a/src/calibre/gui2/actions/edit_metadata.py +++ 
b/src/calibre/gui2/actions/edit_metadata.py @@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction): dest_id, src_books, src_ids = self.books_to_merge(rows) if safe_merge: if not confirm('<p>'+_( - 'All book formats and metadata from the selected books ' - 'will be added to the <b>first selected book.</b><br><br> ' + 'Book formats and metadata from the selected books ' + 'will be added to the <b>first selected book.</b> ' + 'ISBN will <i>not</i> be merged.<br><br> ' 'The second and subsequently selected books will not ' 'be deleted or changed.<br><br>' 'Please confirm you want to proceed.') @@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction): self.merge_metadata(dest_id, src_ids) else: if not confirm('<p>'+_( - 'All book formats and metadata from the selected books will be merged ' - 'into the <b>first selected book</b>.<br><br>' + 'Book formats and metadata from the selected books will be merged ' + 'into the <b>first selected book</b>. ' + 'ISBN will <i>not</i> be merged.<br><br>' 'After merger the second and ' 'subsequently selected books will be <b>deleted</b>. <br><br>' 'All book formats of the first selected book will be kept ' diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py index b0403bf1dd..ec3f0b944d 100644 --- a/src/calibre/gui2/convert/look_and_feel.py +++ b/src/calibre/gui2/convert/look_and_feel.py @@ -22,7 +22,7 @@ class LookAndFeelWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['change_justification', 'extra_css', 'base_font_size', 'font_size_mapping', 'line_height', - 'linearize_tables', + 'linearize_tables', 'smarten_punctuation', 'disable_font_rescaling', 'insert_blank_line', 'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding', 'asciiize', 'keep_ligatures'] diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index de48e7caf9..c683300854 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -178,7 +178,7 @@ </property> </widget> </item> - <item row="9" column="0" colspan="4"> + <item row="10" column="0" colspan="4"> <widget class="QGroupBox" name="groupBox"> <property name="title"> <string>Extra &CSS</string> @@ -214,6 +214,13 @@ </property> </widget> </item> + <item row="9" column="0"> + <widget class="QCheckBox" name="opt_smarten_punctuation"> + <property name="text"> + <string>Smarten &punctuation</string> + </property> + </widget> + </item> </layout> </widget> <resources> diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index 626c68ea63..b2ee421922 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -46,7 +46,7 @@ <double>0.010000000000000</double> </property> <property name="value"> - <double>0.500000000000000</double> + <double>0.450000000000000</double> </property> </widget> </item> diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py index 88bbae6c41..cb951b09be 100644 --- a/src/calibre/gui2/cover_flow.py +++ b/src/calibre/gui2/cover_flow.py @@ -155,6 +155,7 @@ class CoverFlowMixin(object): self.cb_splitter.action_toggle.triggered.connect(self.toggle_cover_browser) if CoverFlow is not None: self.cover_flow.stop.connect(self.hide_cover_browser) + self.cover_flow.setVisible(False) else: self.cb_splitter.insertWidget(self.cb_splitter.side_index, self.cover_flow) if CoverFlow is not None: diff --git a/src/calibre/gui2/library/models.py 
b/src/calibre/gui2/library/models.py index bb47508531..c746a5aa56 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{ def set_device_connected(self, is_connected): self.device_connected = is_connected self.db.refresh_ondevice() - self.refresh() + self.refresh() # does a resort() self.research() - if is_connected and self.sorted_on[0] == 'ondevice': - self.resort() def set_book_on_device_func(self, func): self.book_on_device = func @@ -264,19 +262,15 @@ class BooksModel(QAbstractTableModel): # {{{ self.sorting_done.emit(self.db.index) def refresh(self, reset=True): - try: - col = self.column_map.index(self.sorted_on[0]) - except: - col = 0 self.db.refresh(field=None) - self.sort(col, self.sorted_on[1], reset=reset) + self.resort(reset=reset) def resort(self, reset=True): - try: - col = self.column_map.index(self.sorted_on[0]) - except ValueError: - col = 0 - self.sort(col, self.sorted_on[1], reset=reset) + if not self.db: + return + self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']]) + if reset: + self.reset() def research(self, reset=True): self.search(self.last_search, reset=reset) @@ -1030,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{ if reset: self.reset() + def resort(self, reset=True): + if self.sorted_on: + self.sort(self.column_map.index(self.sorted_on[0]), + self.sorted_on[1], reset=reset) + def columnCount(self, parent): if parent and parent.isValid(): return 0 diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py index a64eb2eb9a..519d533ff6 100644 --- a/src/calibre/gui2/tag_view.py +++ b/src/calibre/gui2/tag_view.py @@ -512,7 +512,8 @@ class TagsModel(QAbstractItemModel): # {{{ _('The saved search name %s is already used.')%val).exec_() return False saved_searches().rename(unicode(item.data(role).toString()), val) - self.tags_view.search_item_renamed.emit() + item.tag.name = val + self.tags_view.search_item_renamed.emit() # Does a refresh else: if key == 'series': self.db.rename_series(item.tag.id, val) @@ -526,8 +527,8 @@ class TagsModel(QAbstractItemModel): # {{{ self.db.rename_custom_item(item.tag.id, val, label=self.db.field_metadata[key]['label']) self.tags_view.tag_item_renamed.emit() - item.tag.name = val - self.refresh() # Should work, because no categories can have disappeared + item.tag.name = val + self.refresh() # Should work, because no categories can have disappeared if path: idx = self.index_for_path(path) if idx.isValid(): @@ -669,7 +670,7 @@ class TagBrowserMixin(object): # {{{ self.tags_view.saved_search_edit.connect(self.do_saved_search_edit) self.tags_view.author_sort_edit.connect(self.do_author_sort_edit) self.tags_view.tag_item_renamed.connect(self.do_tag_item_renamed) - self.tags_view.search_item_renamed.connect(self.saved_search.clear_to_help) + self.tags_view.search_item_renamed.connect(self.saved_searches_changed) self.edit_categories.clicked.connect(lambda x: self.do_user_categories_edit()) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index b9c1211c7f..4f795ab733 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re, itertools, functools +import re, itertools from itertools import repeat from datetime import timedelta from threading import Thread, RLock @@ -112,7 +112,7 @@ class 
ResultCache(SearchQueryParser): ''' def __init__(self, FIELD_MAP, field_metadata): self.FIELD_MAP = FIELD_MAP - self._map = self._map_filtered = self._data = [] + self._map = self._data = self._map_filtered = [] self.first_sort = True self.search_restriction = '' self.field_metadata = field_metadata @@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser): for x in self.iterall(): yield x[idx] + # Search functions {{{ + def universal_set(self): return set([i[0] for i in self._data if i is not None]) @@ -462,12 +464,43 @@ class ResultCache(SearchQueryParser): continue return matches + def search(self, query, return_matches=False): + ans = self.search_getting_ids(query, self.search_restriction) + if return_matches: + return ans + self._map_filtered = ans + + def search_getting_ids(self, query, search_restriction): + q = '' + if not query or not query.strip(): + q = search_restriction + else: + q = query + if search_restriction: + q = u'%s (%s)' % (search_restriction, query) + if not q: + return list(self._map) + matches = self.parse(q) + tmap = list(itertools.repeat(False, len(self._data))) + for x in matches: + tmap[x] = True + return [x for x in self._map if tmap[x]] + + def set_search_restriction(self, s): + self.search_restriction = s + + # }}} + def remove(self, id): self._data[id] = None - if id in self._map: + try: self._map.remove(id) - if id in self._map_filtered: + except ValueError: + pass + try: self._map_filtered.remove(id) + except ValueError: + pass def set(self, row, col, val, row_is_id=False): id = row if row_is_id else self._map_filtered[row] @@ -522,9 +555,7 @@ class ResultCache(SearchQueryParser): def books_deleted(self, ids): for id in ids: - self._data[id] = None - if id in self._map: self._map.remove(id) - if id in self._map_filtered: self._map_filtered.remove(id) + self.remove(id) def count(self): return len(self._map) @@ -549,90 +580,97 @@ class ResultCache(SearchQueryParser): self.sort(field, ascending) self._map_filtered = list(self._map) if self.search_restriction: - self.search('', return_matches=False, ignore_search_restriction=False) + self.search('', return_matches=False) - def seriescmp(self, sidx, siidx, x, y, library_order=None): - try: - if library_order: - ans = cmp(title_sort(self._data[x][sidx].lower()), - title_sort(self._data[y][sidx].lower())) - else: - ans = cmp(self._data[x][sidx].lower(), - self._data[y][sidx].lower()) - except AttributeError: # Some entries may be None - ans = cmp(self._data[x][sidx], self._data[y][sidx]) - if ans != 0: return ans - return cmp(self._data[x][siidx], self._data[y][siidx]) + # Sorting functions {{{ - def cmp(self, loc, x, y, asstr=True, subsort=False): - try: - ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \ - asstr else cmp(self._data[x][loc], self._data[y][loc]) - except AttributeError: # Some entries may be None - ans = cmp(self._data[x][loc], self._data[y][loc]) - except TypeError: ## raised when a datetime is None - x = self._data[x][loc] - if x is None: - x = UNDEFINED_DATE - y = self._data[y][loc] - if y is None: - y = UNDEFINED_DATE - return cmp(x, y) - if subsort and ans == 0: - return cmp(self._data[x][11].lower(), self._data[y][11].lower()) - return ans + def sanitize_sort_field_name(self, field): + field = field.lower().strip() + if field not in self.field_metadata.iterkeys(): + if field in ('author', 'tag', 'comment'): + field += 's' + if field == 'date': field = 'timestamp' + elif field == 'title': field = 'sort' + elif field == 'authors': field = 'author_sort' + return field def 
sort(self, field, ascending, subsort=False): - field = field.lower().strip() - if field in ('author', 'tag', 'comment'): - field += 's' - if field == 'date': field = 'timestamp' - elif field == 'title': field = 'sort' - elif field == 'authors': field = 'author_sort' - as_string = field not in ('size', 'rating', 'timestamp') + self.multisort([(field, ascending)]) - if self.first_sort: - subsort = True - self.first_sort = False - if self.field_metadata[field]['is_custom']: - if self.field_metadata[field]['datatype'] == 'series': - fcmp = functools.partial(self.seriescmp, - self.field_metadata[field]['rec_index'], - self.field_metadata.cc_series_index_column_for(field), - library_order=tweaks['title_series_sorting'] == 'library_order') - else: - as_string = self.field_metadata[field]['datatype'] in ('comments', 'text') - field = self.field_metadata[field]['colnum'] - fcmp = functools.partial(self.cmp, self.FIELD_MAP[field], - subsort=subsort, asstr=as_string) - elif field == 'series': - fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'], - self.FIELD_MAP['series_index'], - library_order=tweaks['title_series_sorting'] == 'library_order') + def multisort(self, fields=[], subsort=False): + fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields] + keys = self.field_metadata.field_keys() + fields = [x for x in fields if x[0] in keys] + if subsort and 'sort' not in [x[0] for x in fields]: + fields += [('sort', True)] + if not fields: + fields = [('timestamp', False)] + + keyg = SortKeyGenerator(fields, self.field_metadata, self._data) + if len(fields) == 1: + self._map.sort(key=keyg, reverse=not fields[0][1]) else: - fcmp = functools.partial(self.cmp, self.FIELD_MAP[field], - subsort=subsort, asstr=as_string) - self._map.sort(cmp=fcmp, reverse=not ascending) - self._map_filtered = [id for id in self._map if id in self._map_filtered] + self._map.sort(key=keyg) - def search(self, query, return_matches=False): - ans = self.search_getting_ids(query, self.search_restriction) - if return_matches: - return ans - self._map_filtered = ans + tmap = list(itertools.repeat(False, len(self._data))) + for x in self._map_filtered: + tmap[x] = True + self._map_filtered = [x for x in self._map if tmap[x]] + + +class SortKey(object): + + def __init__(self, orders, values): + self.orders, self.values = orders, values + + def __cmp__(self, other): + for i, ascending in enumerate(self.orders): + ans = cmp(self.values[i], other.values[i]) + if ans != 0: + return ans * ascending + return 0 + +class SortKeyGenerator(object): + + def __init__(self, fields, field_metadata, data): + self.field_metadata = field_metadata + self.orders = [-1 if x[1] else 1 for x in fields] + self.entries = [(x[0], field_metadata[x[0]]) for x in fields] + self.library_order = tweaks['title_series_sorting'] == 'library_order' + self.data = data + + def __call__(self, record): + values = tuple(self.itervals(self.data[record])) + if len(values) == 1: + return values[0] + return SortKey(self.orders, values) + + def itervals(self, record): + for name, fm in self.entries: + dt = fm['datatype'] + val = record[fm['rec_index']] + + if dt == 'datetime': + if val is None: + val = UNDEFINED_DATE + + elif dt == 'series': + if val is None: + val = ('', 1) + else: + val = val.lower() + if self.library_order: + val = title_sort(val) + sidx_fm = self.field_metadata[name + '_index'] + sidx = record[sidx_fm['rec_index']] + val = (val, sidx) + + elif dt in ('text', 'comments'): + if val is None: + val = '' + val = val.lower() + yield 
val + + # }}} - def search_getting_ids(self, query, search_restriction): - q = '' - if not query or not query.strip(): - q = search_restriction - else: - q = query - if search_restriction: - q = u'%s (%s)' % (search_restriction, query) - if not q: - return list(self._map) - matches = sorted(self.parse(q)) - return [id for id in self._map if id in matches] - def set_search_restriction(self, s): - self.search_restriction = s diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 4106f8c965..8a5ab75c3c 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): self.search_getting_ids = self.data.search_getting_ids self.refresh = functools.partial(self.data.refresh, self) self.sort = self.data.sort + self.multisort = self.data.multisort self.index = self.data.index self.refresh_ids = functools.partial(self.data.refresh_ids, self) self.row = self.data.row diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py index 66cdee51f0..276a6ba971 100644 --- a/src/calibre/library/field_metadata.py +++ b/src/calibre/library/field_metadata.py @@ -69,6 +69,8 @@ class FieldMetadata(dict): VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime', 'int', 'float', 'bool', 'series']) + # Builtin metadata {{{ + _field_metadata = [ ('authors', {'table':'authors', 'column':'name', @@ -287,7 +289,8 @@ class FieldMetadata(dict): 'search_terms':[], 'is_custom':False, 'is_category':False}), - ] + ] + # }}} # search labels that are not db columns search_items = [ 'all', @@ -332,6 +335,9 @@ class FieldMetadata(dict): def keys(self): return self._tb_cats.keys() + def field_keys(self): + return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field'] + def iterkeys(self): for key in self._tb_cats: yield key diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 6784abd8f4..ecb467b4c2 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re, os, cStringIO, operator +import re, os, cStringIO import cherrypy try: @@ -16,7 +16,15 @@ except ImportError: from calibre import fit_image, guess_type from calibre.utils.date import fromtimestamp -from calibre.ebooks.metadata import title_sort +from calibre.library.caches import SortKeyGenerator + +class CSSortKeyGenerator(SortKeyGenerator): + + def __init__(self, fields, fm): + SortKeyGenerator.__init__(self, fields, fm, None) + + def __call__(self, record): + return self.itervals(record).next() class ContentServer(object): @@ -47,32 +55,12 @@ class ContentServer(object): def sort(self, items, field, order): - field = field.lower().strip() - if field == 'author': - field = 'authors' - if field == 'date': - field = 'timestamp' + field = self.db.data.sanitize_sort_field_name(field) if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'): raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field) - cmpf = cmp if field in ('rating', 'size', 'timestamp') else \ - lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '') - if field == 'series': - items.sort(cmp=self.seriescmp, reverse=not order) - else: - lookup = 'sort' if field == 'title' else field - lookup = 'author_sort' if field == 'authors' else field 
- field = self.db.FIELD_MAP[lookup] - getter = operator.itemgetter(field) - items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order) + keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata) + items.sort(key=keyg, reverse=not order) - def seriescmp(self, x, y): - si = self.db.FIELD_MAP['series'] - try: - ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower())) - except AttributeError: # Some entries may be None - ans = cmp(x[si], y[si]) - if ans != 0: return ans - return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']]) # }}} diff --git a/src/calibre/utils/filenames.py b/src/calibre/utils/filenames.py index 01eb9f30a0..47ccbe73c2 100644 --- a/src/calibre/utils/filenames.py +++ b/src/calibre/utils/filenames.py @@ -54,7 +54,8 @@ def shorten_components_to(length, components): r = x[0] if x is components[-1] else '' else: if x is components[-1]: - b, _, e = x.rpartition('.') + b, e = os.path.splitext(x) + if e == '.': e = '' r = b[:-delta]+e if r.startswith('.'): r = x[0]+r else: diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py new file mode 100755 index 0000000000..44aac4de8c --- /dev/null +++ b/src/calibre/utils/smartypants.py @@ -0,0 +1,899 @@ +#!/usr/bin/python + +r""" +============== +smartypants.py +============== + +---------------------------- +SmartyPants ported to Python +---------------------------- + +Ported by `Chad Miller`_ +Copyright (c) 2004, 2007 Chad Miller + +original `SmartyPants`_ by `John Gruber`_ +Copyright (c) 2003 John Gruber + + +Synopsis +======== + +A smart-quotes plugin for Pyblosxom_. + +The priginal "SmartyPants" is a free web publishing plug-in for Movable Type, +Blosxom, and BBEdit that easily translates plain ASCII punctuation characters +into "smart" typographic punctuation HTML entities. + +This software, *smartypants.py*, endeavours to be a functional port of +SmartyPants to Python, for use with Pyblosxom_. + + +Description +=========== + +SmartyPants can perform the following transformations: + +- Straight quotes ( " and ' ) into "curly" quote HTML entities +- Backticks-style quotes (\`\`like this'') into "curly" quote HTML entities +- Dashes (``--`` and ``---``) into en- and em-dash entities +- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity + +This means you can write, edit, and save your posts using plain old +ASCII straight quotes, plain dashes, and plain dots, but your published +posts (and final HTML output) will appear with smart quotes, em-dashes, +and proper ellipses. + +SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, +``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to +display text where smart quotes and other "smart punctuation" would not be +appropriate, such as source code or example markup. + + +Backslash Escapes +================= + +If you need to use literal straight quotes (or plain hyphens and +periods), SmartyPants accepts the following backslash escape sequences +to force non-smart punctuation. It does so by transforming the escape +sequence into a decimal-encoded HTML entity: + +(FIXME: table here.) + +.. comment It sucks that there's a disconnect between the visual layout and table markup when special characters are involved. +.. comment ====== ===== ========= +.. comment Escape Value Character +.. comment ====== ===== ========= +.. comment \\\\\\\\ \ \\\\ +.. comment \\\\" " " +.. comment \\\\' ' ' +.. comment \\\\. . . +.. comment \\\\- - \- +.. 
comment \\\\` ` \` +.. comment ====== ===== ========= + +This is useful, for example, when you want to use straight quotes as +foot and inch marks: 6'2" tall; a 17" iMac. + +Options +======= + +For Pyblosxom users, the ``smartypants_attributes`` attribute is where you +specify configuration options. + +Numeric values are the easiest way to configure SmartyPants' behavior: + +"0" + Suppress all transformations. (Do nothing.) +"1" + Performs default SmartyPants transformations: quotes (including + \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash) + is used to signify an em-dash; there is no support for en-dashes. + +"2" + Same as smarty_pants="1", except that it uses the old-school typewriter + shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``" + (dash dash dash) + for em-dashes. + +"3" + Same as smarty_pants="2", but inverts the shorthand for dashes: + "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for + en-dashes. + +"-1" + Stupefy mode. Reverses the SmartyPants transformation process, turning + the HTML entities produced by SmartyPants into their ASCII equivalents. + E.g. "“" is turned into a simple double-quote ("), "—" is + turned into two dashes, etc. + + +The following single-character attribute values can be combined to toggle +individual transformations from within the smarty_pants attribute. For +example, to educate normal quotes and em-dashes, but not ellipses or +\`\`backticks'' -style quotes: + +``py['smartypants_attributes'] = "1"`` + +"q" + Educates normal quote characters: (") and ('). + +"b" + Educates \`\`backticks'' -style double quotes. + +"B" + Educates \`\`backticks'' -style double quotes and \`single' quotes. + +"d" + Educates em-dashes. + +"D" + Educates em-dashes and en-dashes, using old-school typewriter shorthand: + (dash dash) for en-dashes, (dash dash dash) for em-dashes. + +"i" + Educates em-dashes and en-dashes, using inverted old-school typewriter + shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes. + +"e" + Educates ellipses. + +"w" + Translates any instance of ``"`` into a normal double-quote character. + This should be of no interest to most people, but of particular interest + to anyone who writes their posts using Dreamweaver, as Dreamweaver + inexplicably uses this entity to represent a literal double-quote + character. SmartyPants only educates normal quotes, not entities (because + ordinarily, entities are used for the explicit purpose of representing the + specific character they represent). The "w" option must be used in + conjunction with one (or both) of the other quote options ("q" or "b"). + Thus, if you wish to apply all SmartyPants transformations (quotes, en- + and em-dashes, and ellipses) and also translate ``"`` entities into + regular quotes so SmartyPants can educate them, you should pass the + following to the smarty_pants attribute: + +The ``smartypants_forbidden_flavours`` list contains pyblosxom flavours for +which no Smarty Pants rendering will occur. + + +Caveats +======= + +Why You Might Not Want to Use Smart Quotes in Your Weblog +--------------------------------------------------------- + +For one thing, you might not care. + +Most normal, mentally stable individuals do not take notice of proper +typographic punctuation. Many design and typography nerds, however, break +out in a nasty rash when they encounter, say, a restaurant sign that uses +a straight apostrophe to spell "Joe's". 
+ +If you're the sort of person who just doesn't care, you might well want to +continue not caring. Using straight quotes -- and sticking to the 7-bit +ASCII character set in general -- is certainly a simpler way to live. + +Even if you I *do* care about accurate typography, you still might want to +think twice before educating the quote characters in your weblog. One side +effect of publishing curly quote HTML entities is that it makes your +weblog a bit harder for others to quote from using copy-and-paste. What +happens is that when someone copies text from your blog, the copied text +contains the 8-bit curly quote characters (as well as the 8-bit characters +for em-dashes and ellipses, if you use these options). These characters +are not standard across different text encoding methods, which is why they +need to be encoded as HTML entities. + +People copying text from your weblog, however, may not notice that you're +using curly quotes, and they'll go ahead and paste the unencoded 8-bit +characters copied from their browser into an email message or their own +weblog. When pasted as raw "smart quotes", these characters are likely to +get mangled beyond recognition. + +That said, my own opinion is that any decent text editor or email client +makes it easy to stupefy smart quote characters into their 7-bit +equivalents, and I don't consider it my problem if you're using an +indecent text editor or email client. + + +Algorithmic Shortcomings +------------------------ + +One situation in which quotes will get curled the wrong way is when +apostrophes are used at the start of leading contractions. For example: + +``'Twas the night before Christmas.`` + +In the case above, SmartyPants will turn the apostrophe into an opening +single-quote, when in fact it should be a closing one. I don't think +this problem can be solved in the general case -- every word processor +I've tried gets this wrong as well. In such cases, it's best to use the +proper HTML entity for closing single-quotes (``’``) by hand. + + +Bugs +==== + +To file bug reports or feature requests (other than topics listed in the +Caveats section above) please send email to: mailto:smartypantspy@chad.org + +If the bug involves quotes being curled the wrong way, please send example +text to illustrate. + +To Do list +---------- + +- Provide a function for use within templates to quote anything at all. + + +Version History +=============== + +1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400 + - Fixed bug where blocks of precious unalterable text was instead + interpreted. Thanks to Le Roux and Dirk van Oosterbosch. + +1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400 + - Fix bogus magical quotation when there is no hint that the + user wants it, e.g., in "21st century". Thanks to Nathan Hamblen. + - Be smarter about quotes before terminating numbers in an en-dash'ed + range. + +1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500 + - Fix a date-processing bug, as reported by jacob childress. + - Begin a test-suite for ensuring correct output. + - Removed import of "string", since I didn't really need it. + (This was my first every Python program. Sue me!) + +1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 + - Abort processing if the flavour is in forbidden-list. Default of + [ "rss" ] (Idea of Wolfgang SCHNERRING.) + - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. + +1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 + - Some single quotes weren't replaced properly. Diff-tesuji played + by Benjamin GEIGER. 
+ +1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 + - Support upcoming pyblosxom 0.9 plugin verification feature. + +1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 + - Initial release + +Version Information +------------------- + +Version numbers will track the SmartyPants_ version numbers, with the addition +of an underscore and the smartypants.py version on the end. + +New versions will be available at `http://wiki.chad.org/SmartyPantsPy`_ + +.. _http://wiki.chad.org/SmartyPantsPy: http://wiki.chad.org/SmartyPantsPy + +Authors +======= + +`John Gruber`_ did all of the hard work of writing this software in Perl for +`Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ +ported it to Python to use with Pyblosxom_. + + +Additional Credits +================== + +Portions of the SmartyPants original work are based on Brad Choate's nifty +MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to +this plug-in. Brad Choate is a fine hacker indeed. + +`Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta +testing of the original SmartyPants. + +`Rael Dornfest`_ ported SmartyPants to Blosxom. + +.. _Brad Choate: http://bradchoate.com/ +.. _Jeremy Hedley: http://antipixel.com/ +.. _Charles Wiltgen: http://playbacktime.com/ +.. _Rael Dornfest: http://raelity.org/ + + +Copyright and License +===================== + +SmartyPants_ license:: + + Copyright (c) 2003 John Gruber + (http://daringfireball.net/) + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name "SmartyPants" nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + This software is provided by the copyright holders and contributors "as + is" and any express or implied warranties, including, but not limited + to, the implied warranties of merchantability and fitness for a + particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. + + +smartypants.py license:: + + smartypants.py is a derivative work of SmartyPants. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ + This software is provided by the copyright holders and contributors "as + is" and any express or implied warranties, including, but not limited + to, the implied warranties of merchantability and fitness for a + particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. + + + +.. _John Gruber: http://daringfireball.net/ +.. _Chad Miller: http://web.chad.org/ + +.. _Pyblosxom: http://roughingit.subtlehints.net/pyblosxom +.. _SmartyPants: http://daringfireball.net/projects/smartypants/ +.. _Movable Type: http://www.movabletype.org/ + +""" + +default_smartypants_attr = "1" + +import re + +tags_to_skip_regex = re.compile(r"<(/)?(pre|code|kbd|script|math)[^>]*>", re.I) + + +def verify_installation(request): + return 1 + # assert the plugin is functional + + +def cb_story(args): + global default_smartypants_attr + + try: + forbidden_flavours = args["entry"]["smartypants_forbidden_flavours"] + except KeyError: + forbidden_flavours = [ "rss" ] + + try: + attributes = args["entry"]["smartypants_attributes"] + except KeyError: + attributes = default_smartypants_attr + + if attributes is None: + attributes = default_smartypants_attr + + entryData = args["entry"].getData() + + try: + if args["request"]["flavour"] in forbidden_flavours: + return + except KeyError: + if "<" in args["entry"]["body"][0:15]: # sniff the stream + return # abort if it looks like escaped HTML. FIXME + + # FIXME: make these configurable, perhaps? + args["entry"]["body"] = smartyPants(entryData, attributes) + args["entry"]["title"] = smartyPants(args["entry"]["title"], attributes) + + +### interal functions below here + +def smartyPants(text, attr=default_smartypants_attr): + convert_quot = False # should we translate " entities into normal quotes? + + # Parse attributes: + # 0 : do nothing + # 1 : set all + # 2 : set all, using old school en- and em- dash shortcuts + # 3 : set all, using inverted old school en and em- dash shortcuts + # + # q : quotes + # b : backtick quotes (``double'' only) + # B : backtick quotes (``double'' and `single') + # d : dashes + # D : old school dashes + # i : inverted old school dashes + # e : ellipses + # w : convert " entities to " for Dreamweaver users + + skipped_tag_stack = [] + do_dashes = "0" + do_backticks = "0" + do_quotes = "0" + do_ellipses = "0" + do_stupefy = "0" + + if attr == "0": + # Do nothing. + return text + elif attr == "1": + do_quotes = "1" + do_backticks = "1" + do_dashes = "1" + do_ellipses = "1" + elif attr == "2": + # Do everything, turn all options on, use old school dash shorthand. + do_quotes = "1" + do_backticks = "1" + do_dashes = "2" + do_ellipses = "1" + elif attr == "3": + # Do everything, turn all options on, use inverted old school dash shorthand. + do_quotes = "1" + do_backticks = "1" + do_dashes = "3" + do_ellipses = "1" + elif attr == "-1": + # Special "stupefy" mode. 
+ do_stupefy = "1" + else: + for c in attr: + if c == "q": do_quotes = "1" + elif c == "b": do_backticks = "1" + elif c == "B": do_backticks = "2" + elif c == "d": do_dashes = "1" + elif c == "D": do_dashes = "2" + elif c == "i": do_dashes = "3" + elif c == "e": do_ellipses = "1" + elif c == "w": convert_quot = "1" + else: + pass + # ignore unknown option + + tokens = _tokenize(text) + result = [] + in_pre = False + + prev_token_last_char = "" + # This is a cheat, used to get some context + # for one-character tokens that consist of + # just a quote char. What we do is remember + # the last character of the previous text + # token, to use as context to curl single- + # character quote tokens correctly. + + for cur_token in tokens: + if cur_token[0] == "tag": + # Don't mess with quotes inside some tags. This does not handle self <closing/> tags! + result.append(cur_token[1]) + skip_match = tags_to_skip_regex.match(cur_token[1]) + if skip_match is not None: + if not skip_match.group(1): + skipped_tag_stack.append(skip_match.group(2).lower()) + in_pre = True + else: + if len(skipped_tag_stack) > 0: + if skip_match.group(2).lower() == skipped_tag_stack[-1]: + skipped_tag_stack.pop() + else: + pass + # This close doesn't match the open. This isn't XHTML. We should barf here. + if len(skipped_tag_stack) == 0: + in_pre = False + else: + t = cur_token[1] + last_char = t[-1:] # Remember last char of this token before processing. + if not in_pre: + t = processEscapes(t) + + if convert_quot != "0": + t = re.sub('"', '"', t) + + if do_dashes != "0": + if do_dashes == "1": + t = educateDashes(t) + if do_dashes == "2": + t = educateDashesOldSchool(t) + if do_dashes == "3": + t = educateDashesOldSchoolInverted(t) + + if do_ellipses != "0": + t = educateEllipses(t) + + # Note: backticks need to be processed before quotes. + if do_backticks != "0": + t = educateBackticks(t) + + if do_backticks == "2": + t = educateSingleBackticks(t) + + if do_quotes != "0": + if t == "'": + # Special case: single-character ' token + if re.match("\S", prev_token_last_char): + t = "’" + else: + t = "‘" + elif t == '"': + # Special case: single-character " token + if re.match("\S", prev_token_last_char): + t = "”" + else: + t = "“" + + else: + # Normal case: + t = educateQuotes(t) + + if do_stupefy == "1": + t = stupefyEntities(t) + + prev_token_last_char = last_char + result.append(t) + + return "".join(result) + + +def educateQuotes(str): + """ + Parameter: String. + + Returns: The string, with "educated" curly quote HTML entities. + + Example input: "Isn't this fun?" + Example output: “Isn’t this fun?” + """ + + punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]""" + + # Special case if the very first character is a quote + # followed by punctuation at a non-word-break. 
Close the quotes by brute force: + str = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""’""", str) + str = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""”""", str) + + # Special case for double sets of quotes, e.g.: + # <p>He said, "'Quoted' words in a larger quote."</p> + str = re.sub(r""""'(?=\w)""", """“‘""", str) + str = re.sub(r"""'"(?=\w)""", """‘“""", str) + + # Special case for decade abbreviations (the '80s): + str = re.sub(r"""\b'(?=\d{2}s)""", r"""’""", str) + + close_class = r"""[^\ \t\r\n\[\{\(\-]""" + dec_dashes = r"""–|—""" + + # Get most opening single quotes: + opening_single_quotes_regex = re.compile(r""" + ( + \s | # a whitespace char, or +   | # a non-breaking space entity, or + -- | # dashes, or + &[mn]dash; | # named dash entities + %s | # or decimal entities + &\#x201[34]; # or hex + ) + ' # the quote + (?=\w) # followed by a word character + """ % (dec_dashes,), re.VERBOSE) + str = opening_single_quotes_regex.sub(r"""\1‘""", str) + + closing_single_quotes_regex = re.compile(r""" + (%s) + ' + (?!\s | s\b | \d) + """ % (close_class,), re.VERBOSE) + str = closing_single_quotes_regex.sub(r"""\1’""", str) + + closing_single_quotes_regex = re.compile(r""" + (%s) + ' + (\s | s\b) + """ % (close_class,), re.VERBOSE) + str = closing_single_quotes_regex.sub(r"""\1’\2""", str) + + # Any remaining single quotes should be opening ones: + str = re.sub(r"""'""", r"""‘""", str) + + # Get most opening double quotes: + opening_double_quotes_regex = re.compile(r""" + ( + \s | # a whitespace char, or +   | # a non-breaking space entity, or + -- | # dashes, or + &[mn]dash; | # named dash entities + %s | # or decimal entities + &\#x201[34]; # or hex + ) + " # the quote + (?=\w) # followed by a word character + """ % (dec_dashes,), re.VERBOSE) + str = opening_double_quotes_regex.sub(r"""\1“""", str) + + # Double closing quotes: + closing_double_quotes_regex = re.compile(r""" + #(%s)? # character that indicates the quote should be closing + " + (?=\s) + """ % (close_class,), re.VERBOSE) + str = closing_double_quotes_regex.sub(r"""”""", str) + + closing_double_quotes_regex = re.compile(r""" + (%s) # character that indicates the quote should be closing + " + """ % (close_class,), re.VERBOSE) + str = closing_double_quotes_regex.sub(r"""\1”""", str) + + # Any remaining quotes should be opening ones. + str = re.sub(r'"', r"""“""", str) + + return str + + +def educateBackticks(str): + """ + Parameter: String. + Returns: The string, with ``backticks'' -style double quotes + translated into HTML curly quote entities. + Example input: ``Isn't this fun?'' + Example output: “Isn't this fun?” + """ + + str = re.sub(r"""``""", r"""“""", str) + str = re.sub(r"""''""", r"""”""", str) + return str + + +def educateSingleBackticks(str): + """ + Parameter: String. + Returns: The string, with `backticks' -style single quotes + translated into HTML curly quote entities. + + Example input: `Isn't this fun?' + Example output: ‘Isn’t this fun?’ + """ + + str = re.sub(r"""`""", r"""‘""", str) + str = re.sub(r"""'""", r"""’""", str) + return str + + +def educateDashes(str): + """ + Parameter: String. + + Returns: The string, with each instance of "--" translated to + an em-dash HTML entity. + """ + + str = re.sub(r"""---""", r"""–""", str) # en (yes, backwards) + str = re.sub(r"""--""", r"""—""", str) # em (yes, backwards) + return str + + +def educateDashesOldSchool(str): + """ + Parameter: String. 
+
+    Returns: The string, with each instance of "--" translated to
+             an en-dash HTML entity, and each "---" translated to
+             an em-dash HTML entity.
+    """
+
+    str = re.sub(r"""---""", r"""&#8212;""", str)    # em (yes, backwards)
+    str = re.sub(r"""--""", r"""&#8211;""", str)    # en (yes, backwards)
+    return str
+
+
+def educateDashesOldSchoolInverted(str):
+    """
+    Parameter: String.
+
+    Returns: The string, with each instance of "--" translated to
+             an em-dash HTML entity, and each "---" translated to
+             an en-dash HTML entity. Two reasons why: First, unlike the
+             en- and em-dash syntax supported by
+             EducateDashesOldSchool(), it's compatible with existing
+             entries written before SmartyPants 1.1, back when "--" was
+             only used for em-dashes. Second, em-dashes are more
+             common than en-dashes, and so it sort of makes sense that
+             the shortcut should be shorter to type. (Thanks to Aaron
+             Swartz for the idea.)
+    """
+    str = re.sub(r"""---""", r"""&#8211;""", str)    # em
+    str = re.sub(r"""--""", r"""&#8212;""", str)    # en
+    return str
+
+
+
+def educateEllipses(str):
+    """
+    Parameter: String.
+    Returns: The string, with each instance of "..." translated to
+             an ellipsis HTML entity.
+
+    Example input:  Huh...?
+    Example output: Huh&#8230;?
+    """
+
+    str = re.sub(r"""\.\.\.""", r"""&#8230;""", str)
+    str = re.sub(r"""\. \. \.""", r"""&#8230;""", str)
+    return str
+
+
+def stupefyEntities(str):
+    """
+    Parameter: String.
+    Returns: The string, with each SmartyPants HTML entity translated to
+             its ASCII counterpart.
+
+    Example input:  &#8220;Hello &#8212; world.&#8221;
+    Example output: "Hello -- world."
+    """
+
+    str = re.sub(r"""&#8211;""", r"""-""", str)      # en-dash
+    str = re.sub(r"""&#8212;""", r"""--""", str)     # em-dash
+
+    str = re.sub(r"""&#8216;""", r"""'""", str)      # open single quote
+    str = re.sub(r"""&#8217;""", r"""'""", str)      # close single quote
+
+    str = re.sub(r"""&#8220;""", r'''"''', str)      # open double quote
+    str = re.sub(r"""&#8221;""", r'''"''', str)      # close double quote
+
+    str = re.sub(r"""&#8230;""", r"""...""", str)# ellipsis
+
+    return str
+
+
+def processEscapes(str):
+    r"""
+    Parameter: String.
+    Returns: The string, with after processing the following backslash
+             escape sequences. This is useful if you want to force a "dumb"
+             quote or other character to appear.
+
+             Escape  Value
+             ------  -----
+             \\      &#92;
+             \"      &#34;
+             \'      &#39;
+             \.      &#46;
+             \-      &#45;
+             \`      &#96;
+    """
+    str = re.sub(r"""\\\\""", r"""&#92;""", str)
+    str = re.sub(r'''\\"''', r"""&#34;""", str)
+    str = re.sub(r"""\\'""", r"""&#39;""", str)
+    str = re.sub(r"""\\\.""", r"""&#46;""", str)
+    str = re.sub(r"""\\-""", r"""&#45;""", str)
+    str = re.sub(r"""\\`""", r"""&#96;""", str)
+
+    return str
+
+
+def _tokenize(str):
+    """
+    Parameter: String containing HTML markup.
+    Returns:   Reference to an array of the tokens comprising the input
+               string. Each token is either a tag (possibly with nested,
+               tags contained therein, such as <a href="<MTFoo>">, or a
+               run of text between tags. Each element of the array is a
+               two-element array; the first is either 'tag' or 'text';
+               the second is the actual value.
+
+    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
+        <http://www.bradchoate.com/past/mtregex.php>
+    """
+
+    tokens = []
+
+    #depth = 6
+    #nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth)
+    #match = r"""(?: <! ( -- .*? -- \s* )+ > ) |  # comments
+    #         (?: <\? .*?
\?> ) | # directives + # %s # nested tags """ % (nested_tags,) + tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""") + + token_match = tag_soup.search(str) + + previous_end = 0 + while token_match is not None: + if token_match.group(1): + tokens.append(['text', token_match.group(1)]) + + tokens.append(['tag', token_match.group(2)]) + + previous_end = token_match.end() + token_match = tag_soup.search(str, token_match.end()) + + if previous_end < len(str): + tokens.append(['text', str[previous_end:]]) + + return tokens + + + +if __name__ == "__main__": + + import locale + + try: + locale.setlocale(locale.LC_ALL, '') + except: + pass + + from docutils.core import publish_string + docstring_html = publish_string(__doc__, writer_name='html') + + print docstring_html + + + # Unit test output goes out stderr. No worries. + import unittest + sp = smartyPants + + class TestSmartypantsAllAttributes(unittest.TestCase): + # the default attribute is "1", which means "all". + + def test_dates(self): + self.assertEqual(sp("1440-80's"), "1440-80’s") + self.assertEqual(sp("1440-'80s"), "1440-‘80s") + self.assertEqual(sp("1440---'80s"), "1440–‘80s") + self.assertEqual(sp("1960s"), "1960s") # no effect. + self.assertEqual(sp("1960's"), "1960’s") + self.assertEqual(sp("one two '60s"), "one two ‘60s") + self.assertEqual(sp("'60s"), "‘60s") + + def test_skip_tags(self): + self.assertEqual( + sp("""<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>"""), + """<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>""") + self.assertEqual( + sp("""<p>He said "Let's write some code." This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>"""), + """<p>He said “Let’s write some code.” This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>""") + + + def test_ordinal_numbers(self): + self.assertEqual(sp("21st century"), "21st century") # no effect. + self.assertEqual(sp("3rd"), "3rd") # no effect. + + def test_educated_quotes(self): + self.assertEqual(sp('''"Isn't this fun?"'''), '''“Isn’t this fun?”''') + + unittest.main() + + + + +__author__ = "Chad Miller <smartypantspy@chad.org>" +__version__ = "1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400" +__url__ = "http://wiki.chad.org/SmartyPantsPy" +__description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom" diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index a70cf8b664..8aef350498 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -165,7 +165,9 @@ class Feed(object): if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article: self.articles.append(article) else: - self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title)) + t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple()) + self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'% + (title, t, self.title)) d = item.get('date', '') article.formatted_date = d
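
The Options section of the new calibre/utils/smartypants.py module above explains the attribute string that selects which transformations smartyPants() performs. The short sketch below is illustrative only and is not part of the patch; the import path comes from the location at which this diff adds the module, while the sample text and the behaviour noted in the comments are assumptions based on the module's own docstring (the educated punctuation is emitted as decimal HTML entities).

    # Illustrative sketch of the smartyPants() attribute flags (not part of the patch).
    from calibre.utils.smartypants import smartyPants

    text = '"Isn\'t this fun?" -- he said...'

    print smartyPants(text)          # default "1": curly quotes, "--" becomes an em-dash, "..." an ellipsis
    print smartyPants(text, "2")     # old-school dashes: "--" becomes an en-dash, "---" an em-dash
    print smartyPants(text, "qe")    # educate only quotes and ellipses, leave dashes untouched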