mirror of https://github.com/kovidgoyal/calibre.git
merge from branch with kovid's integration of preprocess code
commit 0a2e16e466
@@ -26,7 +26,7 @@ var current_library_request = null;
 
 ////////////////////////////// GET BOOK LIST //////////////////////////////
 
-var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds
+var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds
 
 function create_table_headers() {
     var thead = $('table#book_list thead tr');
@@ -114,3 +114,11 @@ add_new_book_tags_when_importing_books = False
 # Set the maximum number of tags to show per book in the content server
 max_content_server_tags_shown=5
 
+
+# Set the maximum number of sort 'levels' that calibre will use to resort the
+# library after certain operations such as searches or device insertion. Each
+# sort level adds a performance penalty. If the database is large (thousands of
+# books) the penalty might be noticeable. If you are not concerned about multi-
+# level sorts, and if you are seeing a slowdown, reduce the value of this tweak.
+maximum_resort_levels = 5
+
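Note on the new tweak: it caps how many levels of the GUI's sort history are replayed by multisort() after a refresh (see the library model changes later in this commit). A minimal sketch, assuming a working calibre environment where calibre.utils.config is importable:

    from calibre.utils.config import tweaks

    # A user who sorted by title, then series, then authors has this history:
    sort_history = [('authors', True), ('series', True), ('title', True)]
    # Only the first maximum_resort_levels entries are reapplied on resort:
    levels = sort_history[:tweaks['maximum_resort_levels']]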
@@ -1,12 +1,8 @@
-#!/usr/bin/env python
-
 __license__   = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 infobae.com
 '''
-import re
-import urllib, urlparse
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
@@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets        = True
     use_embedded_content  = False
     language              = 'es'
-    lang                  = 'es-AR'
-
     encoding              = 'cp1252'
-    cover_url             = 'http://www.infobae.com/imgs/header/header.gif'
+    masthead_url          = 'http://www.infobae.com/imgs/header/header.gif'
     remove_javascript     = True
-    preprocess_regexps = [(re.compile(
-        r'<meta name="Description" content="[^"]+">'), lambda m:'')]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
-    extra_css = '''
-        .col-center{font-family:Arial,Helvetica,sans-serif;}
-        h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
-        .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
-        '''
-
-    keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
-        dict(name='a', attrs={'name' : 'comentario',}),
-        dict(name='iframe'),
-        dict(name='img', alt = "Ver galerias de imagenes"),
-        ]
-
+    remove_empty_feeds    = True
+    extra_css = '''
+        body{font-family:Arial,Helvetica,sans-serif;}
+        .popUpTitulo{color:#0D4261; font-size: xx-large}
+        '''
+
+    conversion_options = {
+          'comment'          : description
+        , 'tags'             : category
+        , 'publisher'        : publisher
+        , 'language'         : language
+        , 'linearize_tables' : True
+    }
+
     feeds = [
              (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
             ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
             ]
 
-# def print_version(self, url):
-#     main, sep, article_part = url.partition('contenidos/')
-#     article_id, rsep, rrest = article_part.partition('-')
-#     return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
-    def get_article_url(self, article):
-        ans = article.get('link').encode('utf-8')
-        parts = list(urlparse.urlparse(ans))
-        parts[2] = urllib.quote(parts[2])
-        ans = urlparse.urlunparse(parts)
-        return ans.decode('utf-8')
-
-    def preprocess_html(self, soup):
-        for tag in soup.head.findAll('strong'):
-            tag.extract()
-        for tag in soup.findAll('meta'):
-            del tag['content']
-            tag.extract()
-
-        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
-        soup.head.insert(0,mtag)
-        for item in soup.findAll(style=True):
-            del item['style']
-
-        return soup
+    def print_version(self, url):
+        article_part = url.rpartition('/')[2]
+        article_id= article_part.partition('-')[0]
+        return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
 
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name='strong'):
             tag.name = 'b'
         return soup
 
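For reference, the rewritten print_version() maps an article URL onto the print view by pulling the numeric id off the last path segment. A quick sketch (the sample URL is invented for illustration):

    url = 'http://www.infobae.com/notas/530791-Un-titulo-cualquiera'
    article_part = url.rpartition('/')[2]        # '530791-Un-titulo-cualquiera'
    article_id = article_part.partition('-')[0]  # '530791'
    print 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
    # http://www.infobae.com/notas/nota_imprimir.php?Idx=530791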
@@ -6,6 +6,7 @@ nspm.rs
 
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import NavigableString
 
 class Nspm(BasicNewsRecipe):
     title                 = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
     encoding              = 'utf-8'
     language              = 'sr'
     delay                 = 2
+    remove_empty_feeds    = True
     publication_type      = 'magazine'
     masthead_url          = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
     extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
                        dict(name=['link','object','embed','script','meta','base','iframe'])
                       ,dict(attrs={'class':'buttonheading'})
                       ]
-    remove_tags_after  = dict(attrs={'class':'article_separator'})
-    remove_attributes  = ['width','height']
+    remove_tags_before = dict(attrs={'class':'contentheading'})
+    remove_tags_after  = dict(attrs={'class':'article_separator'})
+    remove_attributes  = ['width','height']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.body.findAll(style=True):
             del item['style']
+        for item in soup.body.findAll('h1'):
+            nh = NavigableString(item.a.string)
+            item.a.extract()
+            item.insert(0,nh)
         return self.adeify_images(soup)
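The new preprocess_html() loop unwraps linked headings so only their text remains. The same idea as a standalone sketch, using calibre's bundled BeautifulSoup (the markup is invented for illustration):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString

    soup = BeautifulSoup('<html><body><h1><a href="/clanak">Naslov</a></h1></body></html>')
    for item in soup.body.findAll('h1'):
        nh = NavigableString(item.a.string)  # keep the heading text
        item.a.extract()                     # drop the <a> wrapper
        item.insert(0, nh)
    print soup.body  # <body><h1>Naslov</h1></body>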
@@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe):
         (re.compile(r'(<img.*title=")([^"]+)(".*>)'),
             lambda m: '%s%s<p>%s</p>' % (m.group(1), m.group(3), m.group(2)))
     ]
 
     def parse_index(self):
         INDEX = 'http://xkcd.com/archive/'
 
         soup = self.index_to_soup(INDEX)
         articles = []
         for item in soup.findAll('a', title=True):
             articles.append({
                 'date': item['title'],
                 'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
                 'url': 'http://xkcd.com' + item['href'],
-                'title': self.tag_to_string(item).encode('UTF-8'),
+                'title': self.tag_to_string(item),
                 'description': '',
                 'content': '',
             })
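Dropping the .encode('UTF-8') call leaves the title as a unicode object rather than a byte string, presumably because downstream code expects unicode and would otherwise re-encode non-ASCII titles. Illustration:

    title = u'Éntrainement'
    print type(title.encode('UTF-8'))  # <type 'str'>  (bytes)
    print type(title)                  # <type 'unicode'>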
@@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY
 from calibre.devices.binatone.driver import README
 from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
 from calibre.devices.edge.driver import EDGE
-from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS
+from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS
 from calibre.devices.sne.driver import SNE
 from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI
 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
@@ -557,6 +557,7 @@ plugins += [
     TECLAST_K3,
     NEWSMY,
     IPAPYRUS,
+    SOVOS,
     EDGE,
     SNE,
     ALEX,
@@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3):
     VENDOR_NAME      = 'E_READER'
     WINDOWS_MAIN_MEM = ''
 
+class SOVOS(TECLAST_K3):
+
+    name = 'Sovos device interface'
+    gui_name = 'Sovos'
+    description = _('Communicate with the Sovos reader.')
+
+    FORMATS = ['epub', 'fb2', 'pdf', 'txt']
+
+    VENDOR_NAME      = 'RK28XX'
+    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'
+
@@ -132,7 +132,11 @@ class CHMReader(CHMFile):
         for path in self.Contents():
             lpath = os.path.join(output_dir, path)
             self._ensure_dir(lpath)
-            data = self.GetFile(path)
+            try:
+                data = self.GetFile(path)
+            except:
+                self.log.exception('Failed to extract %s from CHM, ignoring'%path)
+                continue
             if lpath.find(';') != -1:
                 # fix file names with ";<junk>" at the end, see _reformat()
                 lpath = lpath.split(';')[0]
@@ -168,6 +168,17 @@ class HTMLPreProcessor(object):
                   (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                   (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                   (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+                  # ` with letter before
+                  (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
+                  (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
+                  (re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'),
+                  (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'),
+                  (re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'),
+                  (re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'),
+                  (re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
+                  (re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
+                  (re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'),
+                  (re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'),
 
                   # ´
                   (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
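These new patterns cover OCR output where the grave accent lands after its letter (possibly separated by <br> tags) instead of before it. A quick check of one pattern on an invented sample:

    import re

    pat = re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE)
    print pat.sub(u'è', u'caffe<br/>`')  # -> caffè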
@@ -10,24 +10,23 @@ from calibre.ebooks.conversion.preprocess import line_length
 from calibre.utils.logging import default_log
 
 class PreProcessor(object):
-    html_preprocess_sections = 0
-    found_indents = 0
 
-    def __init__(self, args):
-        self.args = args
-        self.log = default_log
+    def __init__(self, log=None):
+        self.log = default_log if log is None else log
+        self.html_preprocess_sections = 0
+        self.found_indents = 0
 
     def chapter_head(self, match):
         chap = match.group('chap')
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
             self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
             self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
 
     def chapter_break(self, match):
         chap = match.group('section')
@@ -35,7 +34,7 @@ class PreProcessor(object):
         self.html_preprocess_sections = self.html_preprocess_sections + 1
         self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
 
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -50,11 +49,11 @@ class PreProcessor(object):
             return '<p style="text-indent:3%">'
         else:
             return '<p style="text-indent:3%">'+span
 
     def no_markup(self, raw, percent):
         '''
         Detects total marked up line endings in the file. raw is the text to
         inspect. Percent is the minimum percent of line endings which should
         be marked up to return true.
         '''
         htm_end_ere = re.compile('</p>', re.DOTALL)
@@ -68,13 +67,13 @@ class PreProcessor(object):
         if percent > 1:
             percent = 1
         if percent < 0:
             percent = 0
 
         min_lns = tot_ln_fds * percent
         self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True
 
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
         # Replace series of non-breaking spaces with text-indent
@@ -88,7 +87,7 @@ class PreProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span tags
         html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
 
         # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
@@ -102,19 +101,19 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
 
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
             add_markup = re.compile('(?<!>)(\n)')
             html = add_markup.sub('</p>\n<p>', html)
 
         # detect chapters/sections to match xpath or splitting logic
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
             chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
@@ -122,18 +121,18 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
             chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
             chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         # Unwrap lines
         #
         self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs. Determine which is used so
         # that lines can be un-wrapped across page boundaries
         paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
         spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
@@ -146,7 +145,7 @@ class PreProcessor(object):
             format = 'html'
         else:
             format = 'html'
 
         # Calculate Length
         length = line_length(format, html, 0.4)
         self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
@@ -154,8 +153,8 @@ class PreProcessor(object):
         # Unwrap and/or delete soft-hyphens, hyphens
         html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*­', '', html)
         html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
 
         # Unwrap lines using punctation if the median length of all lines is less than 200
         unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
         html = unwrap.sub(' ', html)
 
@@ -164,11 +163,11 @@ class PreProcessor(object):
             self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
             #self.log(html)
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
 
         return html
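With the constructor now taking a log instead of the unused plugin args, input plugins can pass their own logger, and the section/indent counters are initialised per instance in __init__ rather than shared as class attributes. A minimal usage sketch under those assumptions:

    from calibre.ebooks.conversion.utils import PreProcessor

    preprocessor = PreProcessor()  # log=None falls back to default_log
    html = preprocessor(u'<p>Chapter One</p>\n<p>It was a dark and stormy night...</p>')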
@@ -491,6 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
     def preprocess_html(self, html):
-        preprocessor = PreProcessor(html)
-        html = preprocessor(html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)
@@ -54,7 +54,6 @@ class LITInput(InputFormatPlugin):
 
 
     def preprocess_html(self, html):
-        preprocessor = PreProcessor(html)
-        html = preprocessor(html)
-        return html
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)
 
@@ -138,6 +138,7 @@ class CSSFlattener(object):
                 float(self.context.margin_left))
             bs.append('margin-right : %fpt'%\
                 float(self.context.margin_right))
+            bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
             if self.context.change_justification != 'original':
                 bs.append('text-align: '+ self.context.change_justification)
             body.set('style', '; '.join(bs))
@@ -207,6 +207,7 @@ class PML_HTMLizer(object):
         while html != old:
             old = html
             html = self.cleanup_html_remove_redundant(html)
+        html = re.sub(r'(?imu)^\s*', '', html)
         return html
 
     def cleanup_html_remove_redundant(self, html):
@@ -216,7 +217,7 @@ class PML_HTMLizer(object):
                 html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
             else:
                 html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
-        html = re.sub(r'<p>\s*</p>', '', html)
+        html = re.sub(r'(?imu)<p>\s*</p>', '', html)
         return html
 
     def start_line(self):
@@ -556,7 +557,7 @@ class PML_HTMLizer(object):
                     text = t
                 else:
                     self.toc.add_item(os.path.basename(self.file_name), id, value)
-                    text = '<span id="%s"></span>%s' % (id, t)
+                    text = '%s<span id="%s"></span>' % (t, id)
             elif c == 'm':
                 empty = False
                 src = self.code_value(line)
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
 from calibre.ebooks.conversion.utils import PreProcessor
 
 class InlineClass(etree.XSLTExtension):
@@ -230,7 +229,7 @@ class RTFInput(InputFormatPlugin):
         res = transform.tostring(result)
         res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
         if self.options.preprocess_html:
-            preprocessor = PreProcessor(res)
+            preprocessor = PreProcessor(log=getattr(self, 'log', None))
             res = preprocessor(res)
         f.write(res)
         self.write_inline_css(inline_class)
@@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt):
 
 def preserve_spaces(txt):
     txt = txt.replace(' ', '&nbsp;')
-    txt = txt.replace('\t', '&#09;')
+    txt = txt.replace('\t', '&emsp;')
     return txt
 
 def opf_writer(path, opf_name, manifest, spine, mi):
@@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction):
         dest_id, src_books, src_ids = self.books_to_merge(rows)
         if safe_merge:
             if not confirm('<p>'+_(
-                'All book formats and metadata from the selected books '
-                'will be added to the <b>first selected book.</b><br><br> '
+                'Book formats and metadata from the selected books '
+                'will be added to the <b>first selected book.</b> '
+                'ISBN will <i>not</i> be merged.<br><br> '
                 'The second and subsequently selected books will not '
                 'be deleted or changed.<br><br>'
                 'Please confirm you want to proceed.')
@@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction):
             self.merge_metadata(dest_id, src_ids)
         else:
             if not confirm('<p>'+_(
-                'All book formats and metadata from the selected books will be merged '
-                'into the <b>first selected book</b>.<br><br>'
+                'Book formats and metadata from the selected books will be merged '
+                'into the <b>first selected book</b>. '
+                'ISBN will <i>not</i> be merged.<br><br>'
                 'After merger the second and '
                 'subsequently selected books will be <b>deleted</b>. <br><br>'
                 'All book formats of the first selected book will be kept '
@@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{
     def set_device_connected(self, is_connected):
         self.device_connected = is_connected
         self.db.refresh_ondevice()
-        self.refresh()
+        self.refresh() # does a resort()
         self.research()
-        if is_connected and self.sorted_on[0] == 'ondevice':
-            self.resort()
 
     def set_book_on_device_func(self, func):
         self.book_on_device = func
@@ -264,19 +262,15 @@ class BooksModel(QAbstractTableModel): # {{{
         self.sorting_done.emit(self.db.index)
 
     def refresh(self, reset=True):
-        try:
-            col = self.column_map.index(self.sorted_on[0])
-        except:
-            col = 0
         self.db.refresh(field=None)
-        self.sort(col, self.sorted_on[1], reset=reset)
+        self.resort(reset=reset)
 
     def resort(self, reset=True):
-        try:
-            col = self.column_map.index(self.sorted_on[0])
-        except ValueError:
-            col = 0
-        self.sort(col, self.sorted_on[1], reset=reset)
+        if not self.db:
+            return
+        self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']])
+        if reset:
+            self.reset()
 
     def research(self, reset=True):
         self.search(self.last_search, reset=reset)
@@ -1030,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{
         if reset:
             self.reset()
 
+    def resort(self, reset=True):
+        if self.sorted_on:
+            self.sort(self.column_map.index(self.sorted_on[0]),
+                      self.sorted_on[1], reset=reset)
+
     def columnCount(self, parent):
         if parent and parent.isValid():
             return 0
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re, itertools, functools
+import re, itertools
 from itertools import repeat
 from datetime import timedelta
 from threading import Thread, RLock
@@ -112,7 +112,7 @@ class ResultCache(SearchQueryParser):
     '''
     def __init__(self, FIELD_MAP, field_metadata):
         self.FIELD_MAP = FIELD_MAP
-        self._map = self._map_filtered = self._data = []
+        self._map = self._data = self._map_filtered = []
         self.first_sort = True
         self.search_restriction = ''
         self.field_metadata = field_metadata
@@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser):
         for x in self.iterall():
             yield x[idx]
 
+    # Search functions {{{
+
     def universal_set(self):
         return set([i[0] for i in self._data if i is not None])
 
@@ -462,12 +464,43 @@ class ResultCache(SearchQueryParser):
                     continue
         return matches
 
+    def search(self, query, return_matches=False):
+        ans = self.search_getting_ids(query, self.search_restriction)
+        if return_matches:
+            return ans
+        self._map_filtered = ans
+
+    def search_getting_ids(self, query, search_restriction):
+        q = ''
+        if not query or not query.strip():
+            q = search_restriction
+        else:
+            q = query
+            if search_restriction:
+                q = u'%s (%s)' % (search_restriction, query)
+        if not q:
+            return list(self._map)
+        matches = self.parse(q)
+        tmap = list(itertools.repeat(False, len(self._data)))
+        for x in matches:
+            tmap[x] = True
+        return [x for x in self._map if tmap[x]]
+
+    def set_search_restriction(self, s):
+        self.search_restriction = s
+
+    # }}}
+
     def remove(self, id):
         self._data[id] = None
-        if id in self._map:
+        try:
             self._map.remove(id)
-        if id in self._map_filtered:
+        except ValueError:
+            pass
+        try:
             self._map_filtered.remove(id)
+        except ValueError:
+            pass
 
     def set(self, row, col, val, row_is_id=False):
         id = row if row_is_id else self._map_filtered[row]
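search_getting_ids() composes the search restriction with the user query before parsing, and the tmap bitmap then filters the sorted id map without a quadratic membership scan. A sketch of just the query composition (values invented):

    def effective_query(query, search_restriction):
        # mirrors the logic above, for illustration only
        if not query or not query.strip():
            return search_restriction
        if search_restriction:
            return u'%s (%s)' % (search_restriction, query)
        return query

    assert effective_query(u'dickens', u'tag:fiction') == u'tag:fiction (dickens)'
    assert effective_query(u'', u'tag:fiction') == u'tag:fiction'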
@@ -522,9 +555,7 @@ class ResultCache(SearchQueryParser):
 
     def books_deleted(self, ids):
         for id in ids:
-            self._data[id] = None
-            if id in self._map: self._map.remove(id)
-            if id in self._map_filtered: self._map_filtered.remove(id)
+            self.remove(id)
 
     def count(self):
         return len(self._map)
@@ -549,90 +580,97 @@ class ResultCache(SearchQueryParser):
             self.sort(field, ascending)
         self._map_filtered = list(self._map)
         if self.search_restriction:
-            self.search('', return_matches=False, ignore_search_restriction=False)
+            self.search('', return_matches=False)
 
-    def seriescmp(self, sidx, siidx, x, y, library_order=None):
-        try:
-            if library_order:
-                ans = cmp(title_sort(self._data[x][sidx].lower()),
-                          title_sort(self._data[y][sidx].lower()))
-            else:
-                ans = cmp(self._data[x][sidx].lower(),
-                          self._data[y][sidx].lower())
-        except AttributeError: # Some entries may be None
-            ans = cmp(self._data[x][sidx], self._data[y][sidx])
-        if ans != 0: return ans
-        return cmp(self._data[x][siidx], self._data[y][siidx])
-
-    def cmp(self, loc, x, y, asstr=True, subsort=False):
-        try:
-            ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \
-                asstr else cmp(self._data[x][loc], self._data[y][loc])
-        except AttributeError: # Some entries may be None
-            ans = cmp(self._data[x][loc], self._data[y][loc])
-        except TypeError: ## raised when a datetime is None
-            x = self._data[x][loc]
-            if x is None:
-                x = UNDEFINED_DATE
-            y = self._data[y][loc]
-            if y is None:
-                y = UNDEFINED_DATE
-            return cmp(x, y)
-        if subsort and ans == 0:
-            return cmp(self._data[x][11].lower(), self._data[y][11].lower())
-        return ans
+    # Sorting functions {{{
+
+    def sanitize_sort_field_name(self, field):
+        field = field.lower().strip()
+        if field not in self.field_metadata.iterkeys():
+            if field in ('author', 'tag', 'comment'):
+                field += 's'
+        if field == 'date': field = 'timestamp'
+        elif field == 'title': field = 'sort'
+        elif field == 'authors': field = 'author_sort'
+        return field
 
     def sort(self, field, ascending, subsort=False):
-        field = field.lower().strip()
-        if field in ('author', 'tag', 'comment'):
-            field += 's'
-        if field == 'date': field = 'timestamp'
-        elif field == 'title': field = 'sort'
-        elif field == 'authors': field = 'author_sort'
-        as_string = field not in ('size', 'rating', 'timestamp')
-
-        if self.first_sort:
-            subsort = True
-            self.first_sort = False
-        if self.field_metadata[field]['is_custom']:
-            if self.field_metadata[field]['datatype'] == 'series':
-                fcmp = functools.partial(self.seriescmp,
-                    self.field_metadata[field]['rec_index'],
-                    self.field_metadata.cc_series_index_column_for(field),
-                    library_order=tweaks['title_series_sorting'] == 'library_order')
-            else:
-                as_string = self.field_metadata[field]['datatype'] in ('comments', 'text')
-                field = self.field_metadata[field]['colnum']
-                fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
-                                         subsort=subsort, asstr=as_string)
-        elif field == 'series':
-            fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'],
-                                     self.FIELD_MAP['series_index'],
-                                     library_order=tweaks['title_series_sorting'] == 'library_order')
-        else:
-            fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
-                                     subsort=subsort, asstr=as_string)
-        self._map.sort(cmp=fcmp, reverse=not ascending)
-        self._map_filtered = [id for id in self._map if id in self._map_filtered]
+        self.multisort([(field, ascending)])
 
-    def search(self, query, return_matches=False):
-        ans = self.search_getting_ids(query, self.search_restriction)
-        if return_matches:
-            return ans
-        self._map_filtered = ans
-
-    def search_getting_ids(self, query, search_restriction):
-        q = ''
-        if not query or not query.strip():
-            q = search_restriction
-        else:
-            q = query
-            if search_restriction:
-                q = u'%s (%s)' % (search_restriction, query)
-        if not q:
-            return list(self._map)
-        matches = sorted(self.parse(q))
-        return [id for id in self._map if id in matches]
-
-    def set_search_restriction(self, s):
-        self.search_restriction = s
+    def multisort(self, fields=[], subsort=False):
+        fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields]
+        keys = self.field_metadata.field_keys()
+        fields = [x for x in fields if x[0] in keys]
+        if subsort and 'sort' not in [x[0] for x in fields]:
+            fields += [('sort', True)]
+        if not fields:
+            fields = [('timestamp', False)]
+
+        keyg = SortKeyGenerator(fields, self.field_metadata, self._data)
+        if len(fields) == 1:
+            self._map.sort(key=keyg, reverse=not fields[0][1])
+        else:
+            self._map.sort(key=keyg)
+
+        tmap = list(itertools.repeat(False, len(self._data)))
+        for x in self._map_filtered:
+            tmap[x] = True
+        self._map_filtered = [x for x in self._map if tmap[x]]
+
+class SortKey(object):
+
+    def __init__(self, orders, values):
+        self.orders, self.values = orders, values
+
+    def __cmp__(self, other):
+        for i, ascending in enumerate(self.orders):
+            ans = cmp(self.values[i], other.values[i])
+            if ans != 0:
+                return ans * ascending
+        return 0
+
+class SortKeyGenerator(object):
+
+    def __init__(self, fields, field_metadata, data):
+        self.field_metadata = field_metadata
+        self.orders = [-1 if x[1] else 1 for x in fields]
+        self.entries = [(x[0], field_metadata[x[0]]) for x in fields]
+        self.library_order = tweaks['title_series_sorting'] == 'library_order'
+        self.data = data
+
+    def __call__(self, record):
+        values = tuple(self.itervals(self.data[record]))
+        if len(values) == 1:
+            return values[0]
+        return SortKey(self.orders, values)
+
+    def itervals(self, record):
+        for name, fm in self.entries:
+            dt = fm['datatype']
+            val = record[fm['rec_index']]
+
+            if dt == 'datetime':
+                if val is None:
+                    val = UNDEFINED_DATE
+
+            elif dt == 'series':
+                if val is None:
+                    val = ('', 1)
+                else:
+                    val = val.lower()
+                    if self.library_order:
+                        val = title_sort(val)
+                    sidx_fm = self.field_metadata[name + '_index']
+                    sidx = record[sidx_fm['rec_index']]
+                    val = (val, sidx)
+
+            elif dt in ('text', 'comments'):
+                if val is None:
+                    val = ''
+                val = val.lower()
+            yield val
+
+    # }}}
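Taken together: sort() is now a one-level multisort(), SortKeyGenerator turns each record into a tuple of per-field sort values, and SortKey compares those tuples level by level, multiplying each level's comparison by +1/-1 to honour its direction. A hedged usage sketch (db is assumed to be an open LibraryDatabase2; multisort is exposed on it later in this commit):

    # Sort the in-memory map by series, breaking ties by title sort:
    db.multisort([('series', True), ('title', True)])
    sorted_ids = list(db.data._map)  # book ids now in multisort order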
@@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         self.search_getting_ids = self.data.search_getting_ids
         self.refresh = functools.partial(self.data.refresh, self)
         self.sort = self.data.sort
+        self.multisort = self.data.multisort
         self.index = self.data.index
         self.refresh_ids = functools.partial(self.data.refresh_ids, self)
         self.row = self.data.row
@@ -69,6 +69,8 @@ class FieldMetadata(dict):
     VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime',
                                   'int', 'float', 'bool', 'series'])
 
+    # Builtin metadata {{{
+
     _field_metadata = [
             ('authors',   {'table':'authors',
                            'column':'name',
@@ -287,7 +289,8 @@ class FieldMetadata(dict):
                            'search_terms':[],
                            'is_custom':False,
                            'is_category':False}),
             ]
+    # }}}
 
     # search labels that are not db columns
     search_items = [    'all',
@@ -332,6 +335,9 @@ class FieldMetadata(dict):
     def keys(self):
         return self._tb_cats.keys()
 
+    def field_keys(self):
+        return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field']
+
     def iterkeys(self):
         for key in self._tb_cats:
             yield key
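field_keys() gives multisort() a whitelist: only entries whose kind is 'field' (real columns, not search-only labels) are accepted as sort fields. Sketch, assuming an open database handle:

    keys = db.field_metadata.field_keys()
    # e.g. 'authors', 'series', 'timestamp' are in; search-only labels like 'all' are not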
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re, os, cStringIO, operator
+import re, os, cStringIO
 
 import cherrypy
 try:
@@ -16,7 +16,15 @@ except ImportError:
 
 from calibre import fit_image, guess_type
 from calibre.utils.date import fromtimestamp
-from calibre.ebooks.metadata import title_sort
+from calibre.library.caches import SortKeyGenerator
+
+class CSSortKeyGenerator(SortKeyGenerator):
+
+    def __init__(self, fields, fm):
+        SortKeyGenerator.__init__(self, fields, fm, None)
+
+    def __call__(self, record):
+        return self.itervals(record).next()
 
 class ContentServer(object):
 
@@ -47,32 +55,12 @@ class ContentServer(object):
 
 
     def sort(self, items, field, order):
-        field = field.lower().strip()
-        if field == 'author':
-            field = 'authors'
-        if field == 'date':
-            field = 'timestamp'
+        field = self.db.data.sanitize_sort_field_name(field)
         if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'):
             raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field)
-        cmpf = cmp if field in ('rating', 'size', 'timestamp') else \
-                lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '')
-        if field == 'series':
-            items.sort(cmp=self.seriescmp, reverse=not order)
-        else:
-            lookup = 'sort' if field == 'title' else field
-            lookup = 'author_sort' if field == 'authors' else field
-            field = self.db.FIELD_MAP[lookup]
-            getter = operator.itemgetter(field)
-            items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order)
-
-    def seriescmp(self, x, y):
-        si = self.db.FIELD_MAP['series']
-        try:
-            ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
-        except AttributeError: # Some entries may be None
-            ans = cmp(x[si], y[si])
-        if ans != 0: return ans
-        return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']])
+        keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata)
+        items.sort(key=keyg, reverse=not order)
+
     # }}}
 
 
@@ -54,10 +54,8 @@ def shorten_components_to(length, components):
             r = x[0] if x is components[-1] else ''
         else:
             if x is components[-1]:
-                b, _, e = x.rpartition('.')
-                if not b and e:
-                    b = e
-                    e = ''
+                b, e = os.path.splitext(x)
+                if e == '.': e = ''
                 r = b[:-delta]+e
                 if r.startswith('.'): r = x[0]+r
             else:
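The switch to os.path.splitext() drops the manual special-casing that rpartition('.') needed for names without a dot; only the bare trailing-dot case remains. A quick comparison on invented names:

    import os.path

    for name in ['story.epub', 'README', '.hidden']:
        print os.path.splitext(name)
    # ('story', '.epub')   base/extension split as before
    # ('README', '')       no dot: splitext already returns an empty extension
    # ('.hidden', '')      leading-dot names keep their dot in the base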
@@ -165,7 +165,9 @@ class Feed(object):
                 if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                     self.articles.append(article)
                 else:
-                    self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+                    t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple())
+                    self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
+                            (title, t, self.title))
             d = item.get('date', '')
             article.formatted_date = d
 