merge from branch with kovid's integration of preprocess code

ldolse 2010-09-14 11:03:03 +08:00
commit 0a2e16e466
24 changed files with 282 additions and 247 deletions

View File

@@ -26,7 +26,7 @@ var current_library_request = null;
 
 ////////////////////////////// GET BOOK LIST //////////////////////////////
-var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds
+var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds
 
 function create_table_headers() {
     var thead = $('table#book_list thead tr');

View File

@@ -114,3 +114,11 @@ add_new_book_tags_when_importing_books = False
 # Set the maximum number of tags to show per book in the content server
 max_content_server_tags_shown=5
+
+# Set the maximum number of sort 'levels' that calibre will use to resort the
+# library after certain operations such as searches or device insertion. Each
+# sort level adds a performance penalty. If the database is large (thousands of
+# books) the penalty might be noticeable. If you are not concerned about multi-
+# level sorts, and if you are seeing a slowdown, reduce the value of this tweak.
+maximum_resort_levels = 5
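The new tweak is consumed by the resort logic later in this commit (models.py slices the saved sort history with it). A minimal sketch of how that value is read and applied, mirroring the usage further down; the sample sort history is invented:

    from calibre.utils.config import tweaks

    # replay only the most recent sort levels, as models.py does below
    sort_history = [('authors', True), ('title', True), ('timestamp', False)]
    fields = sort_history[:tweaks['maximum_resort_levels']]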

View File

@@ -1,12 +1,8 @@
-#!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 infobae.com
 '''
-import re
-import urllib, urlparse
 
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
     language = 'es'
-    lang = 'es-AR'
     encoding = 'cp1252'
-    cover_url = 'http://www.infobae.com/imgs/header/header.gif'
+    masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
     remove_javascript = True
-    preprocess_regexps = [(re.compile(
-        r'<meta name="Description" content="[^"]+">'), lambda m:'')]
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-    extra_css = '''
-        .col-center{font-family:Arial,Helvetica,sans-serif;}
-        h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
-        .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
-    '''
-    keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-    remove_tags = [
-        dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
-        dict(name='a', attrs={'name' : 'comentario',}),
-        dict(name='iframe'),
-        dict(name='img', alt = "Ver galerias de imagenes"),
-    ]
+    remove_empty_feeds = True
+    extra_css = '''
+        body{font-family:Arial,Helvetica,sans-serif;}
+        .popUpTitulo{color:#0D4261; font-size: xx-large}
+    '''
+    conversion_options = {
+          'comment'          : description
+        , 'tags'             : category
+        , 'publisher'        : publisher
+        , 'language'         : language
+        , 'linearize_tables' : True
+    }
 
     feeds = [
         (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
         ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
     ]
 
-#    def print_version(self, url):
-#        main, sep, article_part = url.partition('contenidos/')
-#        article_id, rsep, rrest = article_part.partition('-')
-#        return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
-    def get_article_url(self, article):
-        ans = article.get('link').encode('utf-8')
-        parts = list(urlparse.urlparse(ans))
-        parts[2] = urllib.quote(parts[2])
-        ans = urlparse.urlunparse(parts)
-        return ans.decode('utf-8')
-
-    def preprocess_html(self, soup):
-        for tag in soup.head.findAll('strong'):
-            tag.extract()
-        for tag in soup.findAll('meta'):
-            del tag['content']
-            tag.extract()
-        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
-        soup.head.insert(0,mtag)
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def print_version(self, url):
+        article_part = url.rpartition('/')[2]
+        article_id = article_part.partition('-')[0]
+        return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
 
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name='strong'):
             tag.name = 'b'
         return soup

View File

@@ -6,6 +6,7 @@ nspm.rs
 
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import NavigableString
 
 class Nspm(BasicNewsRecipe):
     title = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
     encoding = 'utf-8'
     language = 'sr'
     delay = 2
+    remove_empty_feeds = True
     publication_type = 'magazine'
     masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
     extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
         dict(name=['link','object','embed','script','meta','base','iframe'])
        ,dict(attrs={'class':'buttonheading'})
     ]
+    remove_tags_before = dict(attrs={'class':'contentheading'})
     remove_tags_after = dict(attrs={'class':'article_separator'})
     remove_attributes = ['width','height']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.body.findAll(style=True):
             del item['style']
+        for item in soup.body.findAll('h1'):
+            nh = NavigableString(item.a.string)
+            item.a.extract()
+            item.insert(0,nh)
         return self.adeify_images(soup)

View File

@@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe):
         (re.compile(r'(<img.*title=")([^"]+)(".*>)'),
             lambda m: '%s%s<p>%s</p>' % (m.group(1), m.group(3), m.group(2)))
     ]
 
     def parse_index(self):
         INDEX = 'http://xkcd.com/archive/'
         soup = self.index_to_soup(INDEX)
         articles = []
         for item in soup.findAll('a', title=True):
             articles.append({
                 'date': item['title'],
                 'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
                 'url': 'http://xkcd.com' + item['href'],
-                'title': self.tag_to_string(item).encode('UTF-8'),
+                'title': self.tag_to_string(item),
                 'description': '',
                 'content': '',
             })

View File

@@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY
 from calibre.devices.binatone.driver import README
 from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
 from calibre.devices.edge.driver import EDGE
-from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS
+from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS
 from calibre.devices.sne.driver import SNE
 from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI
 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
@@ -557,6 +557,7 @@ plugins += [
     TECLAST_K3,
     NEWSMY,
     IPAPYRUS,
+    SOVOS,
     EDGE,
     SNE,
     ALEX,

View File

@@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3):
 
     VENDOR_NAME      = 'E_READER'
     WINDOWS_MAIN_MEM = ''
+
+class SOVOS(TECLAST_K3):
+
+    name = 'Sovos device interface'
+    gui_name = 'Sovos'
+    description = _('Communicate with the Sovos reader.')
+
+    FORMATS = ['epub', 'fb2', 'pdf', 'txt']
+
+    VENDOR_NAME      = 'RK28XX'
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'

View File

@@ -132,7 +132,11 @@ class CHMReader(CHMFile):
         for path in self.Contents():
             lpath = os.path.join(output_dir, path)
             self._ensure_dir(lpath)
-            data = self.GetFile(path)
+            try:
+                data = self.GetFile(path)
+            except:
+                self.log.exception('Failed to extract %s from CHM, ignoring'%path)
+                continue
             if lpath.find(';') != -1:
                 # fix file names with ";<junk>" at the end, see _reformat()
                 lpath = lpath.split(';')[0]

View File

@@ -168,6 +168,17 @@ class HTMLPreProcessor(object):
                   (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                   (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                   (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+                  # ` with letter before
+                  (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
+                  (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
+                  (re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
+                  (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
+                  (re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
+                  (re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
+                  (re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
+                  (re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
+                  (re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
+                  (re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
                   # ´
                   (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
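These additions mirror the existing backtick rules above, but handle OCR output where the grave accent lands after the vowel instead of before it, possibly with <br> tags in between. A standalone sketch of the same substitution, using invented sample text:

    # -*- coding: utf-8 -*-
    import re

    # same pattern shape as the diff: letter, optional <br> tags, then the backtick
    rules = [(re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
             (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È')]
    raw = u'E` vero. Anche Pele` lo sa.'
    for pattern, repl in rules:
        raw = pattern.sub(repl, raw)
    print(raw)  # È vero. Anche Pelè lo sa.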

View File

@@ -10,24 +10,23 @@ from calibre.ebooks.conversion.preprocess import line_length
 from calibre.utils.logging import default_log
 
 class PreProcessor(object):
-    html_preprocess_sections = 0
-    found_indents = 0
 
-    def __init__(self, args):
-        self.args = args
-        self.log = default_log
+    def __init__(self, log=None):
+        self.log = default_log if log is None else log
+        self.html_preprocess_sections = 0
+        self.found_indents = 0
 
     def chapter_head(self, match):
         chap = match.group('chap')
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
             self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
             self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
 
     def chapter_break(self, match):
         chap = match.group('section')
@@ -35,7 +34,7 @@ class PreProcessor(object):
         self.html_preprocess_sections = self.html_preprocess_sections + 1
         self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
 
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -50,11 +49,11 @@ class PreProcessor(object):
             return '<p style="text-indent:3%">'
         else:
             return '<p style="text-indent:3%">'+span
 
     def no_markup(self, raw, percent):
         '''
         Detects total marked up line endings in the file. raw is the text to
         inspect. Percent is the minimum percent of line endings which should
         be marked up to return true.
         '''
         htm_end_ere = re.compile('</p>', re.DOTALL)
@@ -68,13 +67,13 @@ class PreProcessor(object):
         if percent > 1:
             percent = 1
         if percent < 0:
             percent = 0
 
         min_lns = tot_ln_fds * percent
         self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True
 
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
         # Replace series of non-breaking spaces with text-indent
@@ -88,7 +87,7 @@ class PreProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span tags
         html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
 
         # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
@@ -102,19 +101,19 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
 
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
             add_markup = re.compile('(?<!>)(\n)')
             html = add_markup.sub('</p>\n<p>', html)
 
         # detect chapters/sections to match xpath or splitting logic
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
             chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
@@ -122,18 +121,18 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
             chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
             chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         # Unwrap lines
         #
         self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs. Determine which is used so
         # that lines can be un-wrapped across page boundaries
         paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
         spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
@@ -146,7 +145,7 @@ class PreProcessor(object):
             format = 'html'
         else:
             format = 'html'
 
         # Calculate Length
         length = line_length(format, html, 0.4)
         self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
@@ -154,8 +153,8 @@ class PreProcessor(object):
         # Unwrap and/or delete soft-hyphens, hyphens
         html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
         html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
 
         # Unwrap lines using punctation if the median length of all lines is less than 200
         unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
         html = unwrap.sub(' ', html)
@@ -164,11 +163,11 @@ class PreProcessor(object):
             self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
             #self.log(html)
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
         return html
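With the counters moved into __init__ and the logger injectable, each conversion now builds its own PreProcessor and simply calls it, which is exactly what the input plugins below switch to. The new call pattern, shown with a made-up HTML snippet:

    from calibre.ebooks.conversion.utils import PreProcessor

    preprocessor = PreProcessor()   # log=None falls back to calibre's default_log
    html = preprocessor(u'<h2>Chapter One</h2>\n<p>Some wrapped text.</p>')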

View File

@@ -491,6 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
     def preprocess_html(self, html):
-        preprocessor = PreProcessor(html)
-        html = preprocessor(html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)

View File

@@ -54,7 +54,6 @@ class LITInput(InputFormatPlugin):
 
     def preprocess_html(self, html):
-        preprocessor = PreProcessor(html)
-        html = preprocessor(html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)

View File

@@ -138,6 +138,7 @@ class CSSFlattener(object):
                       float(self.context.margin_left))
             bs.append('margin-right : %fpt'%\
                       float(self.context.margin_right))
+            bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
             if self.context.change_justification != 'original':
                 bs.append('text-align: '+ self.context.change_justification)
             body.set('style', '; '.join(bs))

View File

@@ -207,6 +207,7 @@ class PML_HTMLizer(object):
         while html != old:
             old = html
             html = self.cleanup_html_remove_redundant(html)
+        html = re.sub(r'(?imu)^\s*', '', html)
         return html
 
     def cleanup_html_remove_redundant(self, html):
@@ -216,7 +217,7 @@ class PML_HTMLizer(object):
                 html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
             else:
                 html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
-        html = re.sub(r'<p>\s*</p>', '', html)
+        html = re.sub(r'(?imu)<p>\s*</p>', '', html)
         return html
 
     def start_line(self):
@@ -556,7 +557,7 @@ class PML_HTMLizer(object):
                     text = t
                 else:
                     self.toc.add_item(os.path.basename(self.file_name), id, value)
-                    text = '<span id="%s"></span>%s' % (id, t)
+                    text = '%s<span id="%s"></span>' % (t, id)
             elif c == 'm':
                 empty = False
                 src = self.code_value(line)
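The (?imu) prefix switches on IGNORECASE, MULTILINE and UNICODE inside the pattern itself, so ^ anchors at every line start and <P> matches as well as <p>. A standalone illustration with throwaway input:

    import re

    html = '  <p>one</p>\n\t<P>  </P>\n  <p>two</p>'
    html = re.sub(r'(?imu)^\s*', '', html)        # strip leading whitespace on every line
    html = re.sub(r'(?imu)<p>\s*</p>', '', html)  # drop empty paragraphs, any case
    print(repr(html))                             # '<p>one</p>\n\n<p>two</p>'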

View File

@@ -7,7 +7,6 @@ import os, glob, re, textwrap
 
 from lxml import etree
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
 from calibre.ebooks.conversion.utils import PreProcessor
 
 class InlineClass(etree.XSLTExtension):
@@ -230,7 +229,7 @@ class RTFInput(InputFormatPlugin):
         res = transform.tostring(result)
         res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
         if self.options.preprocess_html:
-            preprocessor = PreProcessor(res)
+            preprocessor = PreProcessor(log=getattr(self, 'log', None))
             res = preprocessor(res)
         f.write(res)
         self.write_inline_css(inline_class)

View File

@@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):
 
 def preserve_spaces(txt):
     txt = txt.replace(' ', '&nbsp;')
-    txt = txt.replace('\t', '&#09;')
+    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
     return txt
 
 def opf_writer(path, opf_name, manifest, spine, mi):

View File

@@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction):
             dest_id, src_books, src_ids = self.books_to_merge(rows)
             if safe_merge:
                 if not confirm('<p>'+_(
-                    'All book formats and metadata from the selected books '
-                    'will be added to the <b>first selected book.</b><br><br> '
+                    'Book formats and metadata from the selected books '
+                    'will be added to the <b>first selected book.</b> '
+                    'ISBN will <i>not</i> be merged.<br><br> '
                     'The second and subsequently selected books will not '
                     'be deleted or changed.<br><br>'
                     'Please confirm you want to proceed.')
@@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction):
                 self.merge_metadata(dest_id, src_ids)
             else:
                 if not confirm('<p>'+_(
-                    'All book formats and metadata from the selected books will be merged '
-                    'into the <b>first selected book</b>.<br><br>'
+                    'Book formats and metadata from the selected books will be merged '
+                    'into the <b>first selected book</b>. '
+                    'ISBN will <i>not</i> be merged.<br><br>'
                     'After merger the second and '
                     'subsequently selected books will be <b>deleted</b>. <br><br>'
                     'All book formats of the first selected book will be kept '

View File

@@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{
     def set_device_connected(self, is_connected):
         self.device_connected = is_connected
         self.db.refresh_ondevice()
-        self.refresh()
+        self.refresh() # does a resort()
         self.research()
-        if is_connected and self.sorted_on[0] == 'ondevice':
-            self.resort()
 
     def set_book_on_device_func(self, func):
         self.book_on_device = func
@@ -264,19 +262,15 @@ class BooksModel(QAbstractTableModel): # {{{
         self.sorting_done.emit(self.db.index)
 
     def refresh(self, reset=True):
-        try:
-            col = self.column_map.index(self.sorted_on[0])
-        except:
-            col = 0
         self.db.refresh(field=None)
-        self.sort(col, self.sorted_on[1], reset=reset)
+        self.resort(reset=reset)
 
     def resort(self, reset=True):
-        try:
-            col = self.column_map.index(self.sorted_on[0])
-        except ValueError:
-            col = 0
-        self.sort(col, self.sorted_on[1], reset=reset)
+        if not self.db:
+            return
+        self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']])
+        if reset:
+            self.reset()
 
     def research(self, reset=True):
         self.search(self.last_search, reset=reset)
@@ -1030,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{
             if reset:
                 self.reset()
 
+    def resort(self, reset=True):
+        if self.sorted_on:
+            self.sort(self.column_map.index(self.sorted_on[0]),
+                      self.sorted_on[1], reset=reset)
+
     def columnCount(self, parent):
         if parent and parent.isValid():
             return 0

View File

@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re, itertools, functools
+import re, itertools
 from itertools import repeat
 from datetime import timedelta
 from threading import Thread, RLock
@@ -112,7 +112,7 @@ class ResultCache(SearchQueryParser):
     '''
     def __init__(self, FIELD_MAP, field_metadata):
         self.FIELD_MAP = FIELD_MAP
-        self._map = self._map_filtered = self._data = []
+        self._map = self._data = self._map_filtered = []
         self.first_sort = True
         self.search_restriction = ''
         self.field_metadata = field_metadata
@@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser):
         for x in self.iterall():
             yield x[idx]
 
+    # Search functions {{{
+
     def universal_set(self):
         return set([i[0] for i in self._data if i is not None])
@@ -462,12 +464,43 @@ class ResultCache(SearchQueryParser):
                     continue
         return matches
 
+    def search(self, query, return_matches=False):
+        ans = self.search_getting_ids(query, self.search_restriction)
+        if return_matches:
+            return ans
+        self._map_filtered = ans
+
+    def search_getting_ids(self, query, search_restriction):
+        q = ''
+        if not query or not query.strip():
+            q = search_restriction
+        else:
+            q = query
+            if search_restriction:
+                q = u'%s (%s)' % (search_restriction, query)
+        if not q:
+            return list(self._map)
+        matches = self.parse(q)
+        tmap = list(itertools.repeat(False, len(self._data)))
+        for x in matches:
+            tmap[x] = True
+        return [x for x in self._map if tmap[x]]
+
+    def set_search_restriction(self, s):
+        self.search_restriction = s
+
+    # }}}
+
     def remove(self, id):
         self._data[id] = None
-        if id in self._map:
+        try:
             self._map.remove(id)
-        if id in self._map_filtered:
+        except ValueError:
+            pass
+        try:
             self._map_filtered.remove(id)
+        except ValueError:
+            pass
 
     def set(self, row, col, val, row_is_id=False):
         id = row if row_is_id else self._map_filtered[row]
@@ -522,9 +555,7 @@ class ResultCache(SearchQueryParser):
     def books_deleted(self, ids):
         for id in ids:
-            self._data[id] = None
-            if id in self._map: self._map.remove(id)
-            if id in self._map_filtered: self._map_filtered.remove(id)
+            self.remove(id)
 
     def count(self):
         return len(self._map)
@@ -549,90 +580,97 @@ class ResultCache(SearchQueryParser):
             self.sort(field, ascending)
         self._map_filtered = list(self._map)
         if self.search_restriction:
-            self.search('', return_matches=False, ignore_search_restriction=False)
+            self.search('', return_matches=False)
 
-    def seriescmp(self, sidx, siidx, x, y, library_order=None):
-        try:
-            if library_order:
-                ans = cmp(title_sort(self._data[x][sidx].lower()),
-                          title_sort(self._data[y][sidx].lower()))
-            else:
-                ans = cmp(self._data[x][sidx].lower(),
-                          self._data[y][sidx].lower())
-        except AttributeError: # Some entries may be None
-            ans = cmp(self._data[x][sidx], self._data[y][sidx])
-        if ans != 0: return ans
-        return cmp(self._data[x][siidx], self._data[y][siidx])
-
-    def cmp(self, loc, x, y, asstr=True, subsort=False):
-        try:
-            ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \
-                asstr else cmp(self._data[x][loc], self._data[y][loc])
-        except AttributeError: # Some entries may be None
-            ans = cmp(self._data[x][loc], self._data[y][loc])
-        except TypeError: ## raised when a datetime is None
-            x = self._data[x][loc]
-            if x is None:
-                x = UNDEFINED_DATE
-            y = self._data[y][loc]
-            if y is None:
-                y = UNDEFINED_DATE
-            return cmp(x, y)
-        if subsort and ans == 0:
-            return cmp(self._data[x][11].lower(), self._data[y][11].lower())
-        return ans
+    # Sorting functions {{{
+
+    def sanitize_sort_field_name(self, field):
+        field = field.lower().strip()
+        if field not in self.field_metadata.iterkeys():
+            if field in ('author', 'tag', 'comment'):
+                field += 's'
+        if field == 'date': field = 'timestamp'
+        elif field == 'title': field = 'sort'
+        elif field == 'authors': field = 'author_sort'
+        return field
 
     def sort(self, field, ascending, subsort=False):
-        field = field.lower().strip()
-        if field in ('author', 'tag', 'comment'):
-            field += 's'
-        if field == 'date': field = 'timestamp'
-        elif field == 'title': field = 'sort'
-        elif field == 'authors': field = 'author_sort'
-        as_string = field not in ('size', 'rating', 'timestamp')
-        if self.first_sort:
-            subsort = True
-            self.first_sort = False
-        if self.field_metadata[field]['is_custom']:
-            if self.field_metadata[field]['datatype'] == 'series':
-                fcmp = functools.partial(self.seriescmp,
-                        self.field_metadata[field]['rec_index'],
-                        self.field_metadata.cc_series_index_column_for(field),
-                        library_order=tweaks['title_series_sorting'] == 'library_order')
-            else:
-                as_string = self.field_metadata[field]['datatype'] in ('comments', 'text')
-                field = self.field_metadata[field]['colnum']
-                fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
-                                         subsort=subsort, asstr=as_string)
-        elif field == 'series':
-            fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'],
-                                     self.FIELD_MAP['series_index'],
-                                     library_order=tweaks['title_series_sorting'] == 'library_order')
-        else:
-            fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
-                                     subsort=subsort, asstr=as_string)
-        self._map.sort(cmp=fcmp, reverse=not ascending)
-        self._map_filtered = [id for id in self._map if id in self._map_filtered]
-
-    def search(self, query, return_matches=False):
-        ans = self.search_getting_ids(query, self.search_restriction)
-        if return_matches:
-            return ans
-        self._map_filtered = ans
+        self.multisort([(field, ascending)])
+
+    def multisort(self, fields=[], subsort=False):
+        fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields]
+        keys = self.field_metadata.field_keys()
+        fields = [x for x in fields if x[0] in keys]
+        if subsort and 'sort' not in [x[0] for x in fields]:
+            fields += [('sort', True)]
+        if not fields:
+            fields = [('timestamp', False)]
+
+        keyg = SortKeyGenerator(fields, self.field_metadata, self._data)
+        if len(fields) == 1:
+            self._map.sort(key=keyg, reverse=not fields[0][1])
+        else:
+            self._map.sort(key=keyg)
+
+        tmap = list(itertools.repeat(False, len(self._data)))
+        for x in self._map_filtered:
+            tmap[x] = True
+        self._map_filtered = [x for x in self._map if tmap[x]]
+
+class SortKey(object):
+
+    def __init__(self, orders, values):
+        self.orders, self.values = orders, values
+
+    def __cmp__(self, other):
+        for i, ascending in enumerate(self.orders):
+            ans = cmp(self.values[i], other.values[i])
+            if ans != 0:
+                return ans * ascending
+        return 0
+
+class SortKeyGenerator(object):
+
+    def __init__(self, fields, field_metadata, data):
+        self.field_metadata = field_metadata
+        self.orders = [-1 if x[1] else 1 for x in fields]
+        self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
+        self.library_order = tweaks['title_series_sorting'] == 'library_order'
+        self.data = data
+
+    def __call__(self, record):
+        values = tuple(self.itervals(self.data[record]))
+        if len(values) == 1:
+            return values[0]
+        return SortKey(self.orders, values)
+
+    def itervals(self, record):
+        for name, fm in self.entries:
+            dt = fm['datatype']
+            val = record[fm['rec_index']]
+
+            if dt == 'datetime':
+                if val is None:
+                    val = UNDEFINED_DATE
+
+            elif dt == 'series':
+                if val is None:
+                    val = ('', 1)
+                else:
+                    val = val.lower()
+                    if self.library_order:
+                        val = title_sort(val)
+                    sidx_fm = self.field_metadata[name + '_index']
+                    sidx = record[sidx_fm['rec_index']]
+                    val = (val, sidx)
+
+            elif dt in ('text', 'comments'):
+                if val is None:
+                    val = ''
+                val = val.lower()
+            yield val
+
+    # }}}
 
-    def search_getting_ids(self, query, search_restriction):
-        q = ''
-        if not query or not query.strip():
-            q = search_restriction
-        else:
-            q = query
-            if search_restriction:
-                q = u'%s (%s)' % (search_restriction, query)
-        if not q:
-            return list(self._map)
-        matches = sorted(self.parse(q))
-        return [id for id in self._map if id in matches]
-
-    def set_search_restriction(self, s):
-        self.search_restriction = s
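The old code sorted with cmp callbacks, paying a Python-level comparison for every pair of rows; the rewrite computes one key per row and lets list.sort do the comparing, with SortKey handling mixed ascending/descending levels. A self-contained sketch of that pattern (Python 2, matching the diff; the records and direction convention here are invented for illustration):

    class SortKey(object):

        def __init__(self, orders, values):
            # orders: 1 keeps a level's natural order, -1 reverses it
            self.orders, self.values = orders, values

        def __cmp__(self, other):
            for i, order in enumerate(self.orders):
                ans = cmp(self.values[i], other.values[i])
                if ans != 0:
                    return ans * order
            return 0

    records = [(2, 'beta'), (1, 'alpha'), (1, 'gamma')]
    orders = [1, -1]   # first field ascending, second field descending
    records.sort(key=lambda rec: SortKey(orders, rec))
    print(records)     # [(1, 'gamma'), (1, 'alpha'), (2, 'beta')]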

View File

@@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         self.search_getting_ids = self.data.search_getting_ids
         self.refresh = functools.partial(self.data.refresh, self)
         self.sort = self.data.sort
+        self.multisort = self.data.multisort
         self.index = self.data.index
         self.refresh_ids = functools.partial(self.data.refresh_ids, self)
         self.row = self.data.row

View File

@@ -69,6 +69,8 @@ class FieldMetadata(dict):
     VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime',
                                   'int', 'float', 'bool', 'series'])
 
+    # Builtin metadata {{{
+
     _field_metadata = [
         ('authors', {'table':'authors',
                      'column':'name',
@@ -287,7 +289,8 @@ class FieldMetadata(dict):
                              'search_terms':[],
                              'is_custom':False,
                              'is_category':False}),
         ]
+    # }}}
 
     # search labels that are not db columns
     search_items = [ 'all',
@@ -332,6 +335,9 @@ class FieldMetadata(dict):
     def keys(self):
         return self._tb_cats.keys()
 
+    def field_keys(self):
+        return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field']
+
     def iterkeys(self):
         for key in self._tb_cats:
             yield key

View File

@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re, os, cStringIO, operator
+import re, os, cStringIO
 
 import cherrypy
 try:
@@ -16,7 +16,15 @@ except ImportError:
 
 from calibre import fit_image, guess_type
 from calibre.utils.date import fromtimestamp
-from calibre.ebooks.metadata import title_sort
+from calibre.library.caches import SortKeyGenerator
+
+class CSSortKeyGenerator(SortKeyGenerator):
+
+    def __init__(self, fields, fm):
+        SortKeyGenerator.__init__(self, fields, fm, None)
+
+    def __call__(self, record):
+        return self.itervals(record).next()
 
 class ContentServer(object):
 
@@ -47,32 +55,12 @@ class ContentServer(object):
 
     def sort(self, items, field, order):
-        field = field.lower().strip()
-        if field == 'author':
-            field = 'authors'
-        if field == 'date':
-            field = 'timestamp'
+        field = self.db.data.sanitize_sort_field_name(field)
         if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'):
             raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field)
-        cmpf = cmp if field in ('rating', 'size', 'timestamp') else \
-                lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '')
-        if field == 'series':
-            items.sort(cmp=self.seriescmp, reverse=not order)
-        else:
-            lookup = 'sort' if field == 'title' else field
-            lookup = 'author_sort' if field == 'authors' else field
-            field = self.db.FIELD_MAP[lookup]
-            getter = operator.itemgetter(field)
-            items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order)
-
-    def seriescmp(self, x, y):
-        si = self.db.FIELD_MAP['series']
-        try:
-            ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
-        except AttributeError: # Some entries may be None
-            ans = cmp(x[si], y[si])
-        if ans != 0: return ans
-        return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']])
+        keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata)
+        items.sort(key=keyg, reverse=not order)
 
     # }}}

View File

@@ -54,10 +54,8 @@ def shorten_components_to(length, components):
             r = x[0] if x is components[-1] else ''
         else:
             if x is components[-1]:
-                b, _, e = x.rpartition('.')
-                if not b and e:
-                    b = e
-                    e = ''
+                b, e = os.path.splitext(x)
+                if e == '.': e = ''
                 r = b[:-delta]+e
                 if r.startswith('.'): r = x[0]+r
             else:
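os.path.splitext already treats a leading dot as part of the file name, which is the case the removed rpartition fixup was emulating; the one leftover wrinkle is a trailing bare dot, covered by the new e == '.' check. For reference:

    import os.path

    print(os.path.splitext('book.epub'))   # ('book', '.epub')
    print(os.path.splitext('.hidden'))     # ('.hidden', '') : dotfile kept whole
    print(os.path.splitext('name.'))       # ('name', '.')   : the e == '.' case
    print('.hidden'.rpartition('.'))       # ('', '.', 'hidden') : needed the old fixup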

View File

@@ -165,7 +165,9 @@ class Feed(object):
             if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                 self.articles.append(article)
             else:
-                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+                t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple())
+                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
+                        (title, t, self.title))
         d = item.get('date', '')
         article.formatted_date = d