Merge from branch with Kovid's integration of preprocess code

ldolse 2010-09-14 11:03:03 +08:00
commit 0a2e16e466
24 changed files with 282 additions and 247 deletions

View File

@ -26,7 +26,7 @@ var current_library_request = null;
////////////////////////////// GET BOOK LIST //////////////////////////////
var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds
var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds
function create_table_headers() {
var thead = $('table#book_list thead tr');

View File

@ -114,3 +114,11 @@ add_new_book_tags_when_importing_books = False
# Set the maximum number of tags to show per book in the content server
max_content_server_tags_shown=5
# Set the maximum number of sort 'levels' that calibre will use to resort the
# library after certain operations such as searches or device insertion. Each
# sort level adds a performance penalty. If the database is large (thousands of
# books) the penalty might be noticeable. If you are not concerned about multi-
# level sorts, and if you are seeing a slowdown, reduce the value of this tweak.
maximum_resort_levels = 5
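
Editor's illustration of what this tweak caps, as a standalone sketch (the sort history values are invented):

# A saved sort history, most recent sort first (hypothetical values)
sort_history = [('authors', True), ('series', True), ('title', True),
                ('timestamp', False), ('rating', True), ('tags', True)]
maximum_resort_levels = 5
# Only the five most recent sort levels are re-applied after a search
print sort_history[:maximum_resort_levels]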

View File

@ -1,12 +1,8 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
infobae.com
'''
import re
import urllib, urlparse
from calibre.web.feeds.news import BasicNewsRecipe
@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
lang = 'es-AR'
language = 'es'
encoding = 'cp1252'
cover_url = 'http://www.infobae.com/imgs/header/header.gif'
masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
remove_javascript = True
preprocess_regexps = [(re.compile(
r'<meta name="Description" content="[^"]+">'), lambda m:'')]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
extra_css = '''
.col-center{font-family:Arial,Helvetica,sans-serif;}
h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
.fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
'''
keep_only_tags = [dict(name='div', attrs={'class':['content']})]
remove_tags = [
dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
dict(name='a', attrs={'name' : 'comentario',}),
dict(name='iframe'),
dict(name='img', alt = "Ver galerias de imagenes"),
]
remove_empty_feeds = True
extra_css = '''
body{font-family:Arial,Helvetica,sans-serif;}
.popUpTitulo{color:#0D4261; font-size: xx-large}
'''
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
feeds = [
(u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
]
# def print_version(self, url):
# main, sep, article_part = url.partition('contenidos/')
# article_id, rsep, rrest = article_part.partition('-')
# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
def get_article_url(self, article):
ans = article.get('link').encode('utf-8')
parts = list(urlparse.urlparse(ans))
parts[2] = urllib.quote(parts[2])
ans = urlparse.urlunparse(parts)
return ans.decode('utf-8')
def preprocess_html(self, soup):
for tag in soup.head.findAll('strong'):
tag.extract()
for tag in soup.findAll('meta'):
del tag['content']
tag.extract()
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
def print_version(self, url):
article_part = url.rpartition('/')[2]
article_id= article_part.partition('-')[0]
return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
def postprocess_html(self, soup, first):
for tag in soup.findAll(name='strong'):
tag.name = 'b'
return soup
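
For reference, a Python 2 sketch of the two URL helpers above (the article URL is invented):

import urllib, urlparse

url = u'http://www.infobae.com/notas/537007-Una nota.html'
parts = list(urlparse.urlparse(url.encode('utf-8')))
parts[2] = urllib.quote(parts[2])   # percent-encode only the path
print urlparse.urlunparse(parts).decode('utf-8')
# -> http://www.infobae.com/notas/537007-Una%20nota.html
print 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + \
    url.rpartition('/')[2].partition('-')[0]   # Idx=537007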

View File

@ -6,6 +6,7 @@ nspm.rs
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
class Nspm(BasicNewsRecipe):
title = 'Nova srpska politicka misao'
@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
encoding = 'utf-8'
language = 'sr'
delay = 2
remove_empty_feeds = True
publication_type = 'magazine'
masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
dict(name=['link','object','embed','script','meta','base','iframe'])
,dict(attrs={'class':'buttonheading'})
]
remove_tags_before = dict(attrs={'class':'contentheading'})
remove_tags_after = dict(attrs={'class':'article_separator'})
remove_attributes = ['width','height']
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe):
def preprocess_html(self, soup):
for item in soup.body.findAll(style=True):
del item['style']
for item in soup.body.findAll('h1'):
nh = NavigableString(item.a.string)
item.a.extract()
item.insert(0,nh)
return self.adeify_images(soup)
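
A usage sketch of the new h1 cleanup, using calibre's bundled BeautifulSoup (runnable only with calibre installed; the markup is invented):

from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString

soup = BeautifulSoup('<body><h1><a href="/x">Naslov</a></h1></body>')
for item in soup.body.findAll('h1'):
    nh = NavigableString(item.a.string)   # keep only the link text
    item.a.extract()                      # drop the <a> wrapper
    item.insert(0, nh)
print soup   # -> <body><h1>Naslov</h1></body>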

View File

@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe):
(re.compile(r'(<img.*title=")([^"]+)(".*>)'),
lambda m: '%s%s<p>%s</p>' % (m.group(1), m.group(3), m.group(2)))
]
def parse_index(self):
INDEX = 'http://xkcd.com/archive/'
soup = self.index_to_soup(INDEX)
articles = []
for item in soup.findAll('a', title=True):
articles.append({
'date': item['title'],
'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
'url': 'http://xkcd.com' + item['href'],
'title': self.tag_to_string(item).encode('UTF-8'),
'title': self.tag_to_string(item),
'description': '',
'content': '',
})

View File

@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
from calibre.devices.edge.driver import EDGE
from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS
from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
@ -557,6 +557,7 @@ plugins += [
TECLAST_K3,
NEWSMY,
IPAPYRUS,
SOVOS,
EDGE,
SNE,
ALEX,

View File

@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3):
VENDOR_NAME = 'E_READER'
WINDOWS_MAIN_MEM = ''
class SOVOS(TECLAST_K3):
name = 'Sovos device interface'
gui_name = 'Sovos'
description = _('Communicate with the Sovos reader.')
FORMATS = ['epub', 'fb2', 'pdf', 'txt']
VENDOR_NAME = 'RK28XX'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'

View File

@ -132,7 +132,11 @@ class CHMReader(CHMFile):
for path in self.Contents():
lpath = os.path.join(output_dir, path)
self._ensure_dir(lpath)
data = self.GetFile(path)
try:
data = self.GetFile(path)
except:
self.log.exception('Failed to extract %s from CHM, ignoring'%path)
continue
if lpath.find(';') != -1:
# fix file names with ";<junk>" at the end, see _reformat()
lpath = lpath.split(';')[0]

View File

@ -168,6 +168,17 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
# ` with letter before
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
(re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
(re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
(re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
(re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
(re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
# ´
(re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
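
These substitutions repair OCR output where a grave accent was split from its letter, possibly across a line break; a standalone sketch of one pattern (the sample string is invented):

# -*- coding: utf-8 -*-
import re

pat = re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE)
print pat.sub(lambda match: u'à', u'La citta<br/>\n` era vuota')
# -> La città era vuota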

View File

@ -10,24 +10,23 @@ from calibre.ebooks.conversion.preprocess import line_length
from calibre.utils.logging import default_log
class PreProcessor(object):
html_preprocess_sections = 0
found_indents = 0
def __init__(self, args):
self.args = args
self.log = default_log
def __init__(self, log=None):
self.log = default_log if log is None else log
self.html_preprocess_sections = 0
self.found_indents = 0
def chapter_head(self, match):
chap = match.group('chap')
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
return '<h2>'+chap+'</h2>\n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
def chapter_break(self, match):
chap = match.group('section')
@ -35,7 +34,7 @@ class PreProcessor(object):
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
return '<'+styles+' style="page-break-before:always">'+chap
def insert_indent(self, match):
pstyle = match.group('formatting')
span = match.group('span')
@ -50,11 +49,11 @@ class PreProcessor(object):
return '<p style="text-indent:3%">'
else:
return '<p style="text-indent:3%">'+span
def no_markup(self, raw, percent):
'''
Detects total marked up line endings in the file. raw is the text to
inspect. Percent is the minimum percent of line endings which should
be marked up to return true.
'''
htm_end_ere = re.compile('</p>', re.DOTALL)
@ -68,13 +67,13 @@ class PreProcessor(object):
if percent > 1:
percent = 1
if percent < 0:
percent = 0
min_lns = tot_ln_fds * percent
self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends:
return True
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
# Replace series of non-breaking spaces with text-indent
@ -88,7 +87,7 @@ class PreProcessor(object):
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
# Get rid of empty span tags
html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
@ -102,19 +101,19 @@ class PreProcessor(object):
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</p>", "</p>\n", html)
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
# detect chapters/sections to match xpath or splitting logic
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
@ -122,18 +121,18 @@ class PreProcessor(object):
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
# Unwrap lines
#
self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
@ -146,7 +145,7 @@ class PreProcessor(object):
format = 'html'
else:
format = 'html'
# Calculate Length
length = line_length(format, html, 0.4)
self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
@ -154,8 +153,8 @@ class PreProcessor(object):
# Unwrap and/or delete soft-hyphens, hyphens
html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
# Unwrap lines using punctuation if the median length of all lines is less than 200
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
@ -164,11 +163,11 @@ class PreProcessor(object):
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
#self.log(html)
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
return html
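
A minimal usage sketch of the new log-aware constructor (runnable only inside a calibre environment; the input HTML is invented):

from calibre.ebooks.conversion.utils import PreProcessor

raw = u'<pre>Chapter One\nIt was a dark and stormy night.\n</pre>'
preprocessor = PreProcessor(log=None)  # log=None falls back to default_log
html = preprocessor(raw)               # detects headings, unwraps lines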

View File

@ -491,6 +491,6 @@ class HTMLInput(InputFormatPlugin):
return (None, raw)
def preprocess_html(self, html):
preprocessor = PreProcessor(html)
html = preprocessor(html)
return html
preprocessor = PreProcessor(log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -54,7 +54,6 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
preprocessor = PreProcessor(html)
html = preprocessor(html)
return html
preprocessor = PreProcessor(log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -138,6 +138,7 @@ class CSSFlattener(object):
float(self.context.margin_left))
bs.append('margin-right : %fpt'%\
float(self.context.margin_right))
bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
if self.context.change_justification != 'original':
bs.append('text-align: '+ self.context.change_justification)
body.set('style', '; '.join(bs))

View File

@ -207,6 +207,7 @@ class PML_HTMLizer(object):
while html != old:
old = html
html = self.cleanup_html_remove_redundant(html)
html = re.sub(r'(?imu)^\s*', '', html)
return html
def cleanup_html_remove_redundant(self, html):
@ -216,7 +217,7 @@ class PML_HTMLizer(object):
html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
else:
html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
html = re.sub(r'<p>\s*</p>', '', html)
html = re.sub(r'(?imu)<p>\s*</p>', '', html)
return html
def start_line(self):
@ -556,7 +557,7 @@ class PML_HTMLizer(object):
text = t
else:
self.toc.add_item(os.path.basename(self.file_name), id, value)
text = '<span id="%s"></span>%s' % (id, t)
text = '%s<span id="%s"></span>' % (t, id)
elif c == 'm':
empty = False
src = self.code_value(line)
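
The added (?imu) flags make the empty-paragraph cleanup above case-insensitive, so variants such as <P> are removed too; a quick standalone check:

import re

print re.sub(r'(?imu)<p>\s*</p>', '', 'before<P>   </P>after')
# -> beforeafter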

View File

@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
from calibre.ebooks.conversion.utils import PreProcessor
class InlineClass(etree.XSLTExtension):
@ -230,7 +229,7 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
preprocessor = PreProcessor(res)
preprocessor = PreProcessor(log=getattr(self, 'log', None))
res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)

View File

@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):
def preserve_spaces(txt):
txt = txt.replace(' ', '&nbsp;')
txt = txt.replace('\t', '&#09;')
txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
return txt
def opf_writer(path, opf_name, manifest, spine, mi):
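
With this change a tab expands to four non-breaking spaces instead of a raw &#09; entity; preserve_spaces is small enough to check standalone:

def preserve_spaces(txt):
    txt = txt.replace(' ', '&nbsp;')
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt

print preserve_spaces(' a\tb')   # -> &nbsp;a&nbsp;&nbsp;&nbsp;&nbsp;b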

View File

@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction):
dest_id, src_books, src_ids = self.books_to_merge(rows)
if safe_merge:
if not confirm('<p>'+_(
'All book formats and metadata from the selected books '
'will be added to the <b>first selected book.</b><br><br> '
'Book formats and metadata from the selected books '
'will be added to the <b>first selected book.</b> '
'ISBN will <i>not</i> be merged.<br><br> '
'The second and subsequently selected books will not '
'be deleted or changed.<br><br>'
'Please confirm you want to proceed.')
@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction):
self.merge_metadata(dest_id, src_ids)
else:
if not confirm('<p>'+_(
'All book formats and metadata from the selected books will be merged '
'into the <b>first selected book</b>.<br><br>'
'Book formats and metadata from the selected books will be merged '
'into the <b>first selected book</b>. '
'ISBN will <i>not</i> be merged.<br><br>'
'After merger the second and '
'subsequently selected books will be <b>deleted</b>. <br><br>'
'All book formats of the first selected book will be kept '

View File

@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{
def set_device_connected(self, is_connected):
self.device_connected = is_connected
self.db.refresh_ondevice()
self.refresh()
self.refresh() # does a resort()
self.research()
if is_connected and self.sorted_on[0] == 'ondevice':
self.resort()
def set_book_on_device_func(self, func):
self.book_on_device = func
@ -264,19 +262,15 @@ class BooksModel(QAbstractTableModel): # {{{
self.sorting_done.emit(self.db.index)
def refresh(self, reset=True):
try:
col = self.column_map.index(self.sorted_on[0])
except:
col = 0
self.db.refresh(field=None)
self.sort(col, self.sorted_on[1], reset=reset)
self.resort(reset=reset)
def resort(self, reset=True):
try:
col = self.column_map.index(self.sorted_on[0])
except ValueError:
col = 0
self.sort(col, self.sorted_on[1], reset=reset)
if not self.db:
return
self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']])
if reset:
self.reset()
def research(self, reset=True):
self.search(self.last_search, reset=reset)
@ -1030,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{
if reset:
self.reset()
def resort(self, reset=True):
if self.sorted_on:
self.sort(self.column_map.index(self.sorted_on[0]),
self.sorted_on[1], reset=reset)
def columnCount(self, parent):
if parent and parent.isValid():
return 0

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, itertools, functools
import re, itertools
from itertools import repeat
from datetime import timedelta
from threading import Thread, RLock
@ -112,7 +112,7 @@ class ResultCache(SearchQueryParser):
'''
def __init__(self, FIELD_MAP, field_metadata):
self.FIELD_MAP = FIELD_MAP
self._map = self._map_filtered = self._data = []
self._map = self._data = self._map_filtered = []
self.first_sort = True
self.search_restriction = ''
self.field_metadata = field_metadata
@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser):
for x in self.iterall():
yield x[idx]
# Search functions {{{
def universal_set(self):
return set([i[0] for i in self._data if i is not None])
@ -462,12 +464,43 @@ class ResultCache(SearchQueryParser):
continue
return matches
def search(self, query, return_matches=False):
ans = self.search_getting_ids(query, self.search_restriction)
if return_matches:
return ans
self._map_filtered = ans
def search_getting_ids(self, query, search_restriction):
q = ''
if not query or not query.strip():
q = search_restriction
else:
q = query
if search_restriction:
q = u'%s (%s)' % (search_restriction, query)
if not q:
return list(self._map)
matches = self.parse(q)
tmap = list(itertools.repeat(False, len(self._data)))
for x in matches:
tmap[x] = True
return [x for x in self._map if tmap[x]]
def set_search_restriction(self, s):
self.search_restriction = s
# }}}
def remove(self, id):
self._data[id] = None
if id in self._map:
self._map.remove(id)
if id in self._map_filtered:
self._map_filtered.remove(id)
try:
self._map.remove(id)
except ValueError:
pass
try:
self._map_filtered.remove(id)
except ValueError:
pass
def set(self, row, col, val, row_is_id=False):
id = row if row_is_id else self._map_filtered[row]
@ -522,9 +555,7 @@ class ResultCache(SearchQueryParser):
def books_deleted(self, ids):
for id in ids:
self._data[id] = None
if id in self._map: self._map.remove(id)
if id in self._map_filtered: self._map_filtered.remove(id)
self.remove(id)
def count(self):
return len(self._map)
@ -549,90 +580,97 @@ class ResultCache(SearchQueryParser):
self.sort(field, ascending)
self._map_filtered = list(self._map)
if self.search_restriction:
self.search('', return_matches=False, ignore_search_restriction=False)
self.search('', return_matches=False)
def seriescmp(self, sidx, siidx, x, y, library_order=None):
try:
if library_order:
ans = cmp(title_sort(self._data[x][sidx].lower()),
title_sort(self._data[y][sidx].lower()))
else:
ans = cmp(self._data[x][sidx].lower(),
self._data[y][sidx].lower())
except AttributeError: # Some entries may be None
ans = cmp(self._data[x][sidx], self._data[y][sidx])
if ans != 0: return ans
return cmp(self._data[x][siidx], self._data[y][siidx])
# Sorting functions {{{
def cmp(self, loc, x, y, asstr=True, subsort=False):
try:
ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \
asstr else cmp(self._data[x][loc], self._data[y][loc])
except AttributeError: # Some entries may be None
ans = cmp(self._data[x][loc], self._data[y][loc])
except TypeError: ## raised when a datetime is None
x = self._data[x][loc]
if x is None:
x = UNDEFINED_DATE
y = self._data[y][loc]
if y is None:
y = UNDEFINED_DATE
return cmp(x, y)
if subsort and ans == 0:
return cmp(self._data[x][11].lower(), self._data[y][11].lower())
return ans
def sanitize_sort_field_name(self, field):
field = field.lower().strip()
if field not in self.field_metadata.iterkeys():
if field in ('author', 'tag', 'comment'):
field += 's'
if field == 'date': field = 'timestamp'
elif field == 'title': field = 'sort'
elif field == 'authors': field = 'author_sort'
return field
def sort(self, field, ascending, subsort=False):
field = field.lower().strip()
if field in ('author', 'tag', 'comment'):
field += 's'
if field == 'date': field = 'timestamp'
elif field == 'title': field = 'sort'
elif field == 'authors': field = 'author_sort'
as_string = field not in ('size', 'rating', 'timestamp')
self.multisort([(field, ascending)])
if self.first_sort:
subsort = True
self.first_sort = False
if self.field_metadata[field]['is_custom']:
if self.field_metadata[field]['datatype'] == 'series':
fcmp = functools.partial(self.seriescmp,
self.field_metadata[field]['rec_index'],
self.field_metadata.cc_series_index_column_for(field),
library_order=tweaks['title_series_sorting'] == 'library_order')
else:
as_string = self.field_metadata[field]['datatype'] in ('comments', 'text')
field = self.field_metadata[field]['colnum']
fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
subsort=subsort, asstr=as_string)
elif field == 'series':
fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'],
self.FIELD_MAP['series_index'],
library_order=tweaks['title_series_sorting'] == 'library_order')
def multisort(self, fields=[], subsort=False):
fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields]
keys = self.field_metadata.field_keys()
fields = [x for x in fields if x[0] in keys]
if subsort and 'sort' not in [x[0] for x in fields]:
fields += [('sort', True)]
if not fields:
fields = [('timestamp', False)]
keyg = SortKeyGenerator(fields, self.field_metadata, self._data)
if len(fields) == 1:
self._map.sort(key=keyg, reverse=not fields[0][1])
else:
fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
subsort=subsort, asstr=as_string)
self._map.sort(cmp=fcmp, reverse=not ascending)
self._map_filtered = [id for id in self._map if id in self._map_filtered]
self._map.sort(key=keyg)
def search(self, query, return_matches=False):
ans = self.search_getting_ids(query, self.search_restriction)
if return_matches:
return ans
self._map_filtered = ans
tmap = list(itertools.repeat(False, len(self._data)))
for x in self._map_filtered:
tmap[x] = True
self._map_filtered = [x for x in self._map if tmap[x]]
class SortKey(object):
def __init__(self, orders, values):
self.orders, self.values = orders, values
def __cmp__(self, other):
for i, ascending in enumerate(self.orders):
ans = cmp(self.values[i], other.values[i])
if ans != 0:
return ans * ascending
return 0
class SortKeyGenerator(object):
def __init__(self, fields, field_metadata, data):
self.field_metadata = field_metadata
self.orders = [-1 if x[1] else 1 for x in fields]
self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
self.library_order = tweaks['title_series_sorting'] == 'library_order'
self.data = data
def __call__(self, record):
values = tuple(self.itervals(self.data[record]))
if len(values) == 1:
return values[0]
return SortKey(self.orders, values)
def itervals(self, record):
for name, fm in self.entries:
dt = fm['datatype']
val = record[fm['rec_index']]
if dt == 'datetime':
if val is None:
val = UNDEFINED_DATE
elif dt == 'series':
if val is None:
val = ('', 1)
else:
val = val.lower()
if self.library_order:
val = title_sort(val)
sidx_fm = self.field_metadata[name + '_index']
sidx = record[sidx_fm['rec_index']]
val = (val, sidx)
elif dt in ('text', 'comments'):
if val is None:
val = ''
val = val.lower()
yield val
# }}}
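
# Editor's illustration, not part of this commit: the SortKey idea in
# isolation. One key object per row lets list.sort() honour several
# fields with mixed directions (here +1 means ascending, -1 descending).
class _SortKey(object):
    def __init__(self, orders, values):
        self.orders, self.values = orders, values
    def __cmp__(self, other):
        for i, order in enumerate(self.orders):
            ans = cmp(self.values[i], other.values[i])
            if ans != 0:
                return ans * order
        return 0
# rows = [(u'king', 2), (u'adams', 1), (u'king', 1)]
# rows.sort(key=lambda r: _SortKey([1, -1], r))  # author asc, index desc
# -> [(u'adams', 1), (u'king', 2), (u'king', 1)]
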
def search_getting_ids(self, query, search_restriction):
q = ''
if not query or not query.strip():
q = search_restriction
else:
q = query
if search_restriction:
q = u'%s (%s)' % (search_restriction, query)
if not q:
return list(self._map)
matches = sorted(self.parse(q))
return [id for id in self._map if id in matches]
def set_search_restriction(self, s):
self.search_restriction = s
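
How a search restriction composes with the user query in search_getting_ids above (values invented):

search_restriction, query = u'tag:News', u'author:Adams'
q = query
if search_restriction:
    q = u'%s (%s)' % (search_restriction, query)
print q   # -> tag:News (author:Adams)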

View File

@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.search_getting_ids = self.data.search_getting_ids
self.refresh = functools.partial(self.data.refresh, self)
self.sort = self.data.sort
self.multisort = self.data.multisort
self.index = self.data.index
self.refresh_ids = functools.partial(self.data.refresh_ids, self)
self.row = self.data.row

View File

@ -69,6 +69,8 @@ class FieldMetadata(dict):
VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime',
'int', 'float', 'bool', 'series'])
# Builtin metadata {{{
_field_metadata = [
('authors', {'table':'authors',
'column':'name',
@ -287,7 +289,8 @@ class FieldMetadata(dict):
'search_terms':[],
'is_custom':False,
'is_category':False}),
]
]
# }}}
# search labels that are not db columns
search_items = [ 'all',
@ -332,6 +335,9 @@ class FieldMetadata(dict):
def keys(self):
return self._tb_cats.keys()
def field_keys(self):
return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field']
def iterkeys(self):
for key in self._tb_cats:
yield key

View File

@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, os, cStringIO, operator
import re, os, cStringIO
import cherrypy
try:
@ -16,7 +16,15 @@ except ImportError:
from calibre import fit_image, guess_type
from calibre.utils.date import fromtimestamp
from calibre.ebooks.metadata import title_sort
from calibre.library.caches import SortKeyGenerator
class CSSortKeyGenerator(SortKeyGenerator):
def __init__(self, fields, fm):
SortKeyGenerator.__init__(self, fields, fm, None)
def __call__(self, record):
return self.itervals(record).next()
class ContentServer(object):
@ -47,32 +55,12 @@ class ContentServer(object):
def sort(self, items, field, order):
field = field.lower().strip()
if field == 'author':
field = 'authors'
if field == 'date':
field = 'timestamp'
field = self.db.data.sanitize_sort_field_name(field)
if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'):
raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field)
cmpf = cmp if field in ('rating', 'size', 'timestamp') else \
lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '')
if field == 'series':
items.sort(cmp=self.seriescmp, reverse=not order)
else:
lookup = 'sort' if field == 'title' else field
lookup = 'author_sort' if field == 'authors' else field
field = self.db.FIELD_MAP[lookup]
getter = operator.itemgetter(field)
items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order)
keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata)
items.sort(key=keyg, reverse=not order)
def seriescmp(self, x, y):
si = self.db.FIELD_MAP['series']
try:
ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
except AttributeError: # Some entries may be None
ans = cmp(x[si], y[si])
if ans != 0: return ans
return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']])
# }}}

View File

@ -54,10 +54,8 @@ def shorten_components_to(length, components):
r = x[0] if x is components[-1] else ''
else:
if x is components[-1]:
b, _, e = x.rpartition('.')
if not b and e:
b = e
e = ''
b, e = os.path.splitext(x)
if e == '.': e = ''
r = b[:-delta]+e
if r.startswith('.'): r = x[0]+r
else:
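
The new splitext-based truncation keeps the real extension intact; a standalone check of the core step (the delta value is invented):

import os

b, e = os.path.splitext('A very long book title.epub')
if e == '.':
    e = ''
delta = 5   # number of characters that must be shaved off
print b[:-delta] + e   # -> A very long book .epub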

View File

@ -165,7 +165,9 @@ class Feed(object):
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
self.articles.append(article)
else:
self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple())
self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
(title, t, self.title))
d = item.get('date', '')
article.formatted_date = d
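
The rewritten debug message formats the article's local time explicitly; a rough standard-library equivalent (title and feed names invented):

import time

t = time.strftime('%a, %d %b, %Y %H:%M', time.localtime())
print 'Skipping article %s (%s) from feed %s as it is too old.' % (
    'Some title', t, 'Some feed')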