1):
+ weeknum -= 1
+
+ title = u'.tyzden ' + str(weeknum) + '/' + str(year)
+
+ base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
+ base_url = base_url_path + '.html'
+
+ oldest_article = 20
+ max_articles_per_feed = 100
+ remove_javascript = True
+
+ use_embedded_content = False
+ no_stylesheets = True
+
+ keep_only_tags = []
+ keep_only_tags.append(dict(name = 'h1'))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_area top_nofoto'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_block'}))
+
+ remove_tags_after = [dict(name = 'div', attrs = {'class': 'text_block'})]
+
+ def find_sections(self):
+ soup = self.index_to_soup(self.base_url)
+ # find cover pic
+ imgdiv = soup.find('div', attrs = {'class': 'foto'})
+ if imgdiv is not None:
+ img = imgdiv.find('img')
+ if img is not None:
+ self.cover_url = 'http://www.tyzden.sk/' + img['src']
+ # end find cover pic
+
+ for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
+ yield (self.tag_to_string(s), s)
+
+ def find_articles(self, soup):
+ for art in soup.findAllNext('a'):
+ if (not art['href'].startswith('casopis/')):
+ break;
+
+ url = art['href']
+ title = self.tag_to_string(art)
+ yield {
+ 'title': title, 'url':self.base_url_path + '/' + url, 'description':title,
+ 'date' : strftime('%a, %d %b'),
+ }
+
+ def parse_index(self):
+ feeds = []
+ for title, soup in self.find_sections():
+ feeds.append((title, list(self.find_articles(soup))))
+
+ return feeds
diff --git a/setup/build_environment.py b/setup/build_environment.py
index 10ab1b0735..f0adaf9584 100644
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@@ -117,7 +117,6 @@ if iswindows:
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
@@ -131,7 +130,6 @@ elif isosx:
fc_lib = '/sw/lib'
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/sw/lib')
poppler_libs = ['poppler']
@@ -150,9 +148,6 @@ else:
# Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler')
- popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
- if not popplerqt4_inc_dirs:
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
poppler_error = \
('Poppler not found on your system. Various PDF related',
' functionality will not work. Use the POPPLER_INC_DIR and',
- ' POPPLER_LIB_DIR environment variables.')
-
-popplerqt4_error = None
-if not popplerqt4_inc_dirs or not os.path.exists(
- os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
- popplerqt4_error = \
- ('Poppler Qt4 bindings not found on your system.')
+ ' POPPLER_LIB_DIR environment variables. calibre requires '
+ ' the poppler XPDF headers. If your distro does not '
+ ' include them you will have to re-compile poppler '
+ ' by hand with --enable-xpdf-headers')
magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
'wand')):
magick_error = ('ImageMagick not found on your system. '
'Try setting the environment variables MAGICK_INC '
- 'and MAGICK_LIB to help calibre locate the inclue and libbrary '
+ 'and MAGICK_LIB to help calibre locate the include and library '
'files.')
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index b7e2f0fd2e..5a82882dfa 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -29,7 +29,7 @@ class ANDROID(USBMS):
# Motorola
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
- 0x4286 : [0x216] },
+ 0x4286 : [0x216], 0x42b3 : [0x216] },
# Sony Ericsson
0xfce : { 0xd12e : [0x0100]},
@@ -53,6 +53,9 @@ class ANDROID(USBMS):
# LG
0x1004 : { 0x61cc : [0x100] },
+ # Archos
+ 0x0e79 : { 0x1420 : [0x0216]},
+
}
EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books']
EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to '
@@ -61,18 +64,19 @@ class ANDROID(USBMS):
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)
VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
- 'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE']
+ 'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS']
WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
'__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
- 'SGH-T849', '_MB300']
+ 'SGH-T849', '_MB300', 'A70S']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
- 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD']
+ 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
+ 'A70S']
- OSX_MAIN_MEM = 'HTC Android Phone Media'
+ OSX_MAIN_MEM = 'Android Device Main Memory'
- MAIN_MEMORY_VOLUME_LABEL = 'Android Phone Internal Memory'
+ MAIN_MEMORY_VOLUME_LABEL = 'Android Device Main Memory'
SUPPORTS_SUB_DIRS = True
diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py
index 98a7241a36..874fbe4b10 100644
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@@ -76,12 +76,23 @@ class PRS505(USBMS):
'sending DRMed books in which you cannot change the cover.'
' WARNING: This option should only be used with newer '
'SONY readers: 350, 650, 950 and newer.'),
+ _('Refresh separate covers when using automatic management (newer readers)') +
+ ':::' +
+ _('Set this option to have separate book covers uploaded '
+ 'every time you connect your device. Unset this option if '
+ 'you have so many books on the reader that performance is '
+ 'unacceptable.')
]
EXTRA_CUSTOMIZATION_DEFAULT = [
', '.join(['series', 'tags']),
+ False,
False
]
+ OPT_COLLECTIONS = 0
+ OPT_UPLOAD_COVERS = 1
+ OPT_REFRESH_COVERS = 2
+
plugboard = None
plugboard_func = None
@@ -171,7 +182,7 @@ class PRS505(USBMS):
opts = self.settings()
if opts.extra_customization:
collections = [x.strip() for x in
- opts.extra_customization[0].split(',')]
+ opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
else:
collections = []
debug_print('PRS505: collection fields:', collections)
@@ -183,6 +194,20 @@ class PRS505(USBMS):
c.update(blists, collections, pb)
c.write()
+ if opts.extra_customization[self.OPT_REFRESH_COVERS]:
+ debug_print('PRS505: uploading covers in sync_booklists')
+ for idx,bl in blists.items():
+ prefix = self._card_a_prefix if idx == 1 else \
+ self._card_b_prefix if idx == 2 \
+ else self._main_prefix
+ for book in bl:
+ p = os.path.join(prefix, book.lpath)
+ self._upload_cover(os.path.dirname(p),
+ os.path.splitext(os.path.basename(p))[0],
+ book, p)
+ else:
+ debug_print('PRS505: NOT uploading covers in sync_booklists')
+
USBMS.sync_booklists(self, booklists, end_session=end_session)
debug_print('PRS505: finished sync_booklists')
@@ -199,11 +224,14 @@ class PRS505(USBMS):
def upload_cover(self, path, filename, metadata, filepath):
opts = self.settings()
- if not opts.extra_customization[1]:
+ if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
# Building thumbnails disabled
- debug_print('PRS505: not uploading covers')
+ debug_print('PRS505: not uploading cover')
return
- debug_print('PRS505: uploading covers')
+ debug_print('PRS505: uploading cover')
+ self._upload_cover(path, filename, metadata, filepath)
+
+ def _upload_cover(self, path, filename, metadata, filepath):
if metadata.thumbnail and metadata.thumbnail[-1]:
path = path.replace('/', os.sep)
is_main = path.startswith(self._main_prefix)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ae111355e4..08a46cb8d9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -79,7 +79,7 @@ class DocAnalysis(object):
elif format == 'spanned_html':
linere = re.compile('(?<=
)', re.DOTALL)
elif format == 'txt':
- linere = re.compile('.*?\n', re.DOTALL)
+ linere = re.compile('.*?\n')
self.lines = linere.findall(raw)
def line_length(self, percent):
@@ -177,7 +177,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -199,7 +199,7 @@ class Dehyphenator(object):
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
- if self.format == 'html_cleanup':
+ if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
@@ -225,10 +225,15 @@ class Dehyphenator(object):
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*([iubp]>\s*){1,2}(?P
<(p|div)[^>]*>\s*(]*>\s*
\s*)?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
+ elif format == 'txt':
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P
\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
+ elif format == 'txt_cleanup':
+ intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)')
+
html = intextmatch.sub(self.dehyphenate, html)
return html
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 27dacdf5fb..dac93fa2e2 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -190,16 +190,16 @@ class PreProcessor(object):
line_ending = "\s*(span|p|div)>\s*((p|span|div)>)?"
blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
- txt_line_wrap = u"(\u0020|\u0009)*\n"
-
+ txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
+
unwrap_regex = lookahead+line_ending+blanklines+line_opening
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap
-
+
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content)
return content
-
+
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
@@ -357,6 +357,6 @@ class PreProcessor(object):
html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
# Center separator lines
- html = re.sub(u'\s*(?P([*#•]+\s*)+)\s*
', '' + '\g' + '
', html)
+ html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
return html
diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
index 17f2c6705c..799bdef8e6 100644
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -324,14 +324,16 @@ class Metadata(object):
if metadata is None:
traceback.print_stack()
return
- metadata = copy.deepcopy(metadata)
- if '#value#' not in metadata:
- if metadata['datatype'] == 'text' and metadata['is_multiple']:
- metadata['#value#'] = []
+ m = {}
+ for k in metadata:
+ m[k] = copy.copy(metadata[k])
+ if '#value#' not in m:
+ if m['datatype'] == 'text' and m['is_multiple']:
+ m['#value#'] = []
else:
- metadata['#value#'] = None
+ m['#value#'] = None
_data = object.__getattribute__(self, '_data')
- _data['user_metadata'][field] = metadata
+ _data['user_metadata'][field] = m
def template_to_attribute(self, other, ops):
'''
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 8c7561f68c..5154373eda 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
u'\u00a0
\n'.encode('utf-8'), res)
if self.opts.preprocess_html:
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
- res = preprocessor(res)
+ res = preprocessor(res.decode('utf-8')).encode('utf-8')
f.write(res)
self.write_inline_css(inline_class, border_styles)
stream.seek(0)
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
new file mode 100644
index 0000000000..b9d18fd23a
--- /dev/null
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember '
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre import prepare_string_for_xml
+
+class TXTHeuristicProcessor(object):
+
+ def __init__(self):
+ self.ITALICIZE_WORDS = [
+ 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+ 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+ 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+ 'Mlle.', 'Mons.', 'PS.', 'PPS.',
+ ]
+ self.ITALICIZE_STYLE_PATS = [
+ r'(?msu)_(?P.+?)_',
+ r'(?msu)/(?P[^<>]+?)/',
+ r'(?msu)~~(?P.+?)~~',
+ r'(?msu)\*(?P.+?)\*',
+ r'(?msu)~(?P.+?)~',
+ r'(?msu)_/(?P[^<>]+?)/_',
+ r'(?msu)_\*(?P.+?)\*_',
+ r'(?msu)\*/(?P[^<>]+?)/\*',
+ r'(?msu)_\*/(?P[^<>]+?)/\*_',
+ r'(?msu)/:(?P[^<>]+?):/',
+ r'(?msu)\|:(?P.+?):\|',
+ ]
+
+ def process_paragraph(self, paragraph):
+ for word in self.ITALICIZE_WORDS:
+ paragraph = paragraph.replace(word, '%s' % word)
+ for pat in self.ITALICIZE_STYLE_PATS:
+ paragraph = re.sub(pat, lambda mo: '%s' % mo.group('words'), paragraph)
+ return paragraph
+
+ def convert(self, txt, title='', epub_split_size_kb=0):
+ from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
+ txt = clean_txt(txt)
+ txt = split_txt(txt, epub_split_size_kb)
+
+ processed = []
+ for line in txt.split('\n\n'):
+ processed.append(u'%s
' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
+
+ txt = u'\n'.join(processed)
+ txt = re.sub('[ ]{2,}', ' ', txt)
+ html = HTML_TEMPLATE % (title, txt)
+
+ from calibre.ebooks.conversion.utils import PreProcessor
+ pp = PreProcessor()
+ html = pp.markup_chapters(html, pp.get_word_count(html), False)
+
+ return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 98756c5fa1..aaff8b55c0 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,10 +7,12 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
- preserve_spaces, detect_paragraph_type, detect_formatting_type
+ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
+ convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -22,20 +24,24 @@ class TXTInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto',
- choices=['auto', 'block', 'single', 'print'],
+ choices=['auto', 'block', 'single', 'print', 'unformatted'],
help=_('Paragraph structure.\n'
- 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+ 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
'* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
- 'starts a paragraph.')),
+ 'starts a paragraph.'
+ '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
- choices=['auto', 'none', 'markdown'],
+ choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.'
- '* auto: Try to auto detect the document formatting.\n'
- '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
- '* markdown: Run the input though the markdown pre-processor. '
+ '* auto: Automatically decide which formatting processor to use.\n'
+ '* none: Do not process the document formatting. Everything is a '
+ 'paragraph and no styling is applied.\n'
+ '* heuristic: Process using heuristics to determine formatting such '
+ 'as chapter headings and italic text.\n'
+ '* markdown: Processing using markdown formatting. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
@@ -47,7 +53,7 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log,
accelerators):
log.debug('Reading text from file...')
-
+
txt = stream.read()
# Get the encoding of the document.
if options.input_encoding:
@@ -67,7 +73,14 @@ class TXTInput(InputFormatPlugin):
# followed by the entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
-
+
+ # Normalize line endings
+ txt = normalize_line_endings(txt)
+
+ # Get length for hyphen removal and punctuation unwrap
+ docanalysis = DocAnalysis('txt', txt)
+ length = docanalysis.line_length(.5)
+
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
@@ -86,27 +99,39 @@ class TXTInput(InputFormatPlugin):
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
- log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
-
+ log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+ # Dehyphenate
+ dehyphenator = Dehyphenator()
+ txt = dehyphenator(txt,'txt', length)
+
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
- if options.paragraph_type == 'single' or 'unformatted':
+
+ if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor
- from calibre.ebooks.conversion.preprocess import DocAnalysis
# get length
- docanalysis = DocAnalysis('txt', txt)
- length = docanalysis.line_length(.5)
+
# unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
flow_size = getattr(options, 'flow_size', 0)
- html = convert_basic(txt, epub_split_size_kb=flow_size)
+
+ if options.formatting_type == 'heuristic':
+ html = convert_heuristic(txt, epub_split_size_kb=flow_size)
+ else:
+ html = convert_basic(txt, epub_split_size_kb=flow_size)
+
+ # Dehyphenate in cleanup mode for missed txt and markdown conversion
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'txt_cleanup', length)
+ html = dehyphenator(html,'html_cleanup', length)
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index c6cf1078cd..6a1a106681 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
__license__ = 'GPL v3'
@@ -17,7 +18,7 @@ __docformat__ = 'restructuredtext en'
HTML_TEMPLATE = u'%s\n%s\n'
-def convert_basic(txt, title='', epub_split_size_kb=0):
+def clean_txt(txt):
if isbytestring(txt):
txt = txt.decode('utf-8', 'replace')
# Strip whitespace from the beginning and end of the line. Also replace
@@ -36,6 +37,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
txt = illegal_chars.sub('', txt)
+
+ return txt
+
+def split_txt(txt, epub_split_size_kb=0):
#Takes care if there is no point to split
if epub_split_size_kb > 0:
if isinstance(txt, unicode):
@@ -50,6 +55,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
if isbytestring(txt):
txt = txt.decode('utf-8')
+ return txt
+
+def convert_basic(txt, title='', epub_split_size_kb=0):
+ txt = clean_txt(txt)
+ txt = split_txt(txt, epub_split_size_kb)
+
lines = []
# Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'):
@@ -58,6 +69,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
return HTML_TEMPLATE % (title, u'\n'.join(lines))
+def convert_heuristic(txt, title='', epub_split_size_kb=0):
+ tp = TXTHeuristicProcessor()
+ return tp.convert(txt, title, epub_split_size_kb)
+
def convert_markdown(txt, title='', disable_toc=False):
md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'],
@@ -65,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))
-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
+ return txt
+
+def separate_paragraphs_single_line(txt):
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
return txt
@@ -102,7 +120,7 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached.
- unformatted: most lines have hard line breaks, few/no spaces or indents
+ unformatted: most lines have hard line breaks, few/no blank lines or indents
returns block, single, print, unformatted
'''
@@ -115,15 +133,21 @@ def detect_paragraph_type(txt):
hardbreaks = docanalysis.line_histogram(.55)
if hardbreaks:
- # Check for print
+ # Determine print percentage
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
- if tab_line_count / float(txt_line_count) >= .25:
- return 'print'
-
- # Check for block
+ print_percent = tab_line_count / float(txt_line_count)
+
+ # Determine block percentage
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
- if empty_line_count / float(txt_line_count) >= .25:
- return 'block'
+ block_percent = empty_line_count / float(txt_line_count)
+
+ # Compare the two types - the type with the larger number of instances wins
+ # in cases where only one or the other represents the vast majority of the document neither wins
+ if print_percent >= block_percent:
+ if .15 <= print_percent <= .75:
+ return 'print'
+ elif .15 <= block_percent <= .75:
+ return 'block'
# Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
@@ -153,4 +177,4 @@ def detect_formatting_type(txt):
if txt.count('\\'+c) > 10:
return 'markdown'
- return 'none'
+ return 'heuristic'
diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index a3b4ed7afe..00992a8612 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into plain text
'''
-import os
import re
from lxml import etree
@@ -33,6 +32,15 @@ BLOCK_STYLES = [
'block',
]
+HEADING_TAGS = [
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+]
+
SPACE_TAGS = [
'td',
'br',
@@ -47,6 +55,10 @@ class TXTMLizer(object):
self.log.info('Converting XHTML to TXT...')
self.oeb_book = oeb_book
self.opts = opts
+ self.toc_ids = []
+ self.last_was_heading = False
+
+ self.create_flat_toc(self.oeb_book.toc)
return self.mlize_spine()
@@ -58,8 +70,11 @@ class TXTMLizer(object):
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content)
- output += self.dump_text(etree.fromstring(content), stylizer)
- output = self.cleanup_text(u''.join(output))
+ output += self.dump_text(etree.fromstring(content), stylizer, item)
+ output += '\n\n\n\n\n\n'
+ output = u''.join(output)
+ output = u'\n'.join(l.rstrip() for l in output.splitlines())
+ output = self.cleanup_text(output)
return output
@@ -68,6 +83,8 @@ class TXTMLizer(object):
text = text.replace('\r\n', ' ')
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')
+ # Condense redundant spaces created by replacing newlines with spaces.
+ text = re.sub(r'[ ]{2,}', ' ', text)
return text
@@ -80,6 +97,14 @@ class TXTMLizer(object):
toc.append(u'* %s\n\n' % item.title)
return ''.join(toc)
+ def create_flat_toc(self, nodes):
+ '''
+ Turns a hierarchical list of TOC href's into a flat list.
+ '''
+ for item in nodes:
+ self.toc_ids.append(item.href)
+ self.create_flat_toc(item.nodes)
+
def cleanup_text(self, text):
self.log.debug('\tClean up text...')
# Replace bad characters.
@@ -92,7 +117,7 @@ class TXTMLizer(object):
text = text.replace('\f+', ' ')
# Single line paragraph.
- text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
+ text = re.sub('(?<=.)\n(?=.)', ' ', text)
# Remove multiple spaces.
text = re.sub('[ ]{2,}', ' ', text)
@@ -101,13 +126,19 @@ class TXTMLizer(object):
text = re.sub('\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
- text = re.sub('(?imu)^(?=.)', '\t', text)
+ text = re.sub(r'(?msu)^(?P[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
+ text = re.sub(r'(?msu)(?P[^\n])\n+(?P[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
else:
- text = re.sub('\n{3,}', '\n\n', text)
+ text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
# Replace spaces at the beginning and end of lines
+ # We don't replace tabs because those are only added
+ # when remove paragraph spacing is enabled.
text = re.sub('(?imu)^[ ]+', '', text)
text = re.sub('(?imu)[ ]+$', '', text)
+
+ # Remove empty space and newlines at the beginning of the document.
+ text = re.sub(r'(?u)^[ \n]+', '', text)
if self.opts.max_line_length:
max_length = self.opts.max_line_length
@@ -145,13 +176,11 @@ class TXTMLizer(object):
return text
- def dump_text(self, elem, stylizer, end=''):
+ def dump_text(self, elem, stylizer, page):
'''
@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.
- @end: The last two characters of the text from the previous element.
- This is used to determine if a blank line is needed when starting
- a new block element.
+ @page: OEB page used to determine absolute urls.
'''
if not isinstance(elem.tag, basestring) \
@@ -170,29 +199,45 @@ class TXTMLizer(object):
return ['']
tag = barename(elem.tag)
+ tag_id = elem.attrib.get('id', None)
in_block = False
+ in_heading = False
+
+ # Are we in a heading?
+ # This can either be a heading tag or a TOC item.
+ if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
+ in_heading = True
+ if not self.last_was_heading:
+ text.append('\n\n\n\n\n\n')
# Are we in a paragraph block?
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
+ if self.opts.remove_paragraph_spacing and not in_heading:
+ text.append(u'\t')
in_block = True
- if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
- text.append(u'\n\n')
if tag in SPACE_TAGS:
text.append(u' ')
+
+ # Scene breaks.
+ if tag == 'hr':
+ text.append('\n\n* * *\n\n')
# Process tags that contain text.
if hasattr(elem, 'text') and elem.text:
text.append(elem.text)
+ # Recurse down into tags within the tag we are in.
for item in elem:
- en = u''
- if len(text) >= 2:
- en = text[-1][-2:]
- text += self.dump_text(item, stylizer, en)
+ text += self.dump_text(item, stylizer, page)
if in_block:
text.append(u'\n\n')
+ if in_heading:
+ text.append(u'\n')
+ self.last_was_heading = True
+ else:
+ self.last_was_heading = False
if hasattr(elem, 'tail') and elem.tail:
text.append(elem.tail)
diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index 6d289a3e5c..944ce03305 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -637,7 +637,7 @@ class DeviceMixin(object): # {{{
self.device_manager.mount_device(kls=FOLDER_DEVICE, kind='folder', path=dir)
def connect_to_bambook(self):
- self.device_manager.mount_device(kls=BAMBOOKWifi, kind='bambook',
+ self.device_manager.mount_device(kls=BAMBOOKWifi, kind='bambook',
path=BAMBOOK.settings().extra_customization)
def connect_to_itunes(self):
@@ -1266,8 +1266,8 @@ class DeviceMixin(object): # {{{
# Force a reset if the caches are not initialized
if reset or not hasattr(self, 'db_book_title_cache'):
# Build a cache (map) of the library, so the search isn't On**2
- self.db_book_title_cache = {}
- self.db_book_uuid_cache = {}
+ db_book_title_cache = {}
+ db_book_uuid_cache = {}
# It might be possible to get here without having initialized the
# library view. In this case, simply give up
try:
@@ -1278,8 +1278,8 @@ class DeviceMixin(object): # {{{
for id in db.data.iterallids():
mi = db.get_metadata(id, index_is_id=True)
title = clean_string(mi.title)
- if title not in self.db_book_title_cache:
- self.db_book_title_cache[title] = \
+ if title not in db_book_title_cache:
+ db_book_title_cache[title] = \
{'authors':{}, 'author_sort':{}, 'db_ids':{}}
# If there are multiple books in the library with the same title
# and author, then remember the last one. That is OK, because as
@@ -1287,12 +1287,14 @@ class DeviceMixin(object): # {{{
# as another.
if mi.authors:
authors = clean_string(authors_to_string(mi.authors))
- self.db_book_title_cache[title]['authors'][authors] = mi
+ db_book_title_cache[title]['authors'][authors] = mi
if mi.author_sort:
aus = clean_string(mi.author_sort)
- self.db_book_title_cache[title]['author_sort'][aus] = mi
- self.db_book_title_cache[title]['db_ids'][mi.application_id] = mi
- self.db_book_uuid_cache[mi.uuid] = mi
+ db_book_title_cache[title]['author_sort'][aus] = mi
+ db_book_title_cache[title]['db_ids'][mi.application_id] = mi
+ db_book_uuid_cache[mi.uuid] = mi
+ self.db_book_title_cache = db_book_title_cache
+ self.db_book_uuid_cache = db_book_uuid_cache
# Now iterate through all the books on the device, setting the
# in_library field. If the UUID matches a book in the library, then
diff --git a/src/calibre/gui2/preferences/plugboard.py b/src/calibre/gui2/preferences/plugboard.py
index 296387106c..e1dc6b03bd 100644
--- a/src/calibre/gui2/preferences/plugboard.py
+++ b/src/calibre/gui2/preferences/plugboard.py
@@ -5,11 +5,11 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-from PyQt4 import QtGui
-from PyQt4.Qt import Qt
+from PyQt4.Qt import Qt, QLineEdit, QComboBox, SIGNAL, QListWidgetItem
from calibre.gui2 import error_dialog
from calibre.gui2.device import device_name_for_plugboards
+from calibre.gui2.dialogs.template_dialog import TemplateDialog
from calibre.gui2.preferences import ConfigWidgetBase, test_widget
from calibre.gui2.preferences.plugboard_ui import Ui_Form
from calibre.customize.ui import metadata_writers, device_plugins
@@ -17,6 +17,27 @@ from calibre.library.save_to_disk import plugboard_any_format_value, \
plugboard_any_device_value, plugboard_save_to_disk_value
from calibre.utils.formatter import validation_formatter
+
+class LineEditWithTextBox(QLineEdit):
+
+ '''
+ Extend the context menu of a QLineEdit to include more actions.
+ '''
+
+ def contextMenuEvent(self, event):
+ menu = self.createStandardContextMenu()
+ menu.addSeparator()
+
+ action_open_editor = menu.addAction(_('Open Editor'))
+
+ self.connect(action_open_editor, SIGNAL('triggered()'), self.open_editor)
+ menu.exec_(event.globalPos())
+
+ def open_editor(self):
+ t = TemplateDialog(self, self.text())
+ if t.exec_():
+ self.setText(t.textbox.toPlainText())
+
class ConfigWidget(ConfigWidgetBase, Ui_Form):
def genesis(self, gui):
@@ -72,10 +93,10 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
self.source_widgets = []
self.dest_widgets = []
for i in range(0, len(self.dest_fields)-1):
- w = QtGui.QLineEdit(self)
+ w = LineEditWithTextBox(self)
self.source_widgets.append(w)
self.fields_layout.addWidget(w, 5+i, 0, 1, 1)
- w = QtGui.QComboBox(self)
+ w = QComboBox(self)
self.dest_widgets.append(w)
self.fields_layout.addWidget(w, 5+i, 1, 1, 1)
@@ -297,7 +318,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
for op in self.current_plugboards[f][d]:
ops.append('([' + op[0] + '] -> ' + op[1] + ')')
txt = '%s:%s = %s\n'%(f, d, ', '.join(ops))
- item = QtGui.QListWidgetItem(txt)
+ item = QListWidgetItem(txt)
item.setData(Qt.UserRole, (f, d))
self.existing_plugboards.addItem(item)
self.refilling = False
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 980c9f1fa9..0763318912 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -181,7 +181,7 @@ class ResultCache(SearchQueryParser): # {{{
self.search_restriction = ''
self.field_metadata = field_metadata
self.all_search_locations = field_metadata.get_search_terms()
- SearchQueryParser.__init__(self, self.all_search_locations)
+ SearchQueryParser.__init__(self, self.all_search_locations, optimize=True)
self.build_date_relop_dict()
self.build_numeric_relop_dict()
@@ -264,7 +264,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, relop_le]
}
- def get_dates_matches(self, location, query):
+ def get_dates_matches(self, location, query, candidates):
matches = set([])
if len(query) < 2:
return matches
@@ -274,13 +274,15 @@ class ResultCache(SearchQueryParser): # {{{
loc = self.field_metadata[location]['rec_index']
if query == 'false':
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None: continue
if item[loc] is None or item[loc] <= UNDEFINED_DATE:
matches.add(item[0])
return matches
if query == 'true':
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None: continue
if item[loc] is not None and item[loc] > UNDEFINED_DATE:
matches.add(item[0])
@@ -319,7 +321,8 @@ class ResultCache(SearchQueryParser): # {{{
field_count = query.count('-') + 1
else:
field_count = query.count('/') + 1
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None or item[loc] is None: continue
if relop(item[loc], qd, field_count):
matches.add(item[0])
@@ -335,7 +338,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, lambda r, q: r <= q]
}
- def get_numeric_matches(self, location, query, val_func = None):
+ def get_numeric_matches(self, location, query, candidates, val_func = None):
matches = set([])
if len(query) == 0:
return matches
@@ -381,7 +384,8 @@ class ResultCache(SearchQueryParser): # {{{
except:
return matches
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None:
continue
v = val_func(item)
@@ -393,8 +397,13 @@ class ResultCache(SearchQueryParser): # {{{
matches.add(item[0])
return matches
- def get_matches(self, location, query, allow_recursion=True):
+ def get_matches(self, location, query, allow_recursion=True, candidates=None):
matches = set([])
+ if candidates is None:
+ candidates = self.universal_set()
+ if len(candidates) == 0:
+ return matches
+
if query and query.strip():
# get metadata key associated with the search term. Eliminates
# dealing with plurals and other aliases
@@ -476,7 +485,8 @@ class ResultCache(SearchQueryParser): # {{{
else:
q = query
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None: continue
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak
diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py
index ba218c3ecc..f94081f046 100644
--- a/src/calibre/library/custom_columns.py
+++ b/src/calibre/library/custom_columns.py
@@ -151,6 +151,8 @@ class CustomColumns(object):
return v
def adapt_number(x, d):
+ if x is None:
+ return None
if isinstance(x, (str, unicode, bytes)):
if x.lower() == 'none':
return None
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 611aa1cc89..138560020e 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -256,7 +256,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'pubdate',
'flags',
'uuid',
- 'has_cover'
+ 'has_cover',
+ ('au_map', 'authors', 'author', 'aum_sortconcat(link.id, authors.name, authors.sort)')
]
lines = []
for col in columns:
@@ -273,9 +274,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.FIELD_MAP = {'id':0, 'title':1, 'authors':2, 'timestamp':3,
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8,
- 'publisher':9, 'series_index':10,
- 'sort':11, 'author_sort':12, 'formats':13, 'isbn':14, 'path':15,
- 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20}
+ 'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12,
+ 'formats':13, 'isbn':14, 'path':15, 'lccn':16, 'pubdate':17,
+ 'flags':18, 'uuid':19, 'cover':20, 'au_map':21}
for k,v in self.FIELD_MAP.iteritems():
self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
@@ -687,9 +688,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
Convenience method to return metadata as a :class:`Metadata` object.
Note that the list of formats is not verified.
'''
+ row = self.data._data[idx] if index_is_id else self.data[idx]
+ fm = self.FIELD_MAP
+
self.gm_count += 1
- mi = self.data.get(idx, self.FIELD_MAP['all_metadata'],
- row_is_id = index_is_id)
+ mi = row[self.FIELD_MAP['all_metadata']]
if mi is not None:
if get_cover:
# Always get the cover, because the value can be wrong if the
@@ -699,49 +702,46 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.gm_missed += 1
mi = Metadata(None)
- self.data.set(idx, self.FIELD_MAP['all_metadata'], mi,
- row_is_id = index_is_id)
+ self.data.set(idx, fm['all_metadata'], mi, row_is_id = index_is_id)
- aut_list = self.authors_with_sort_strings(idx, index_is_id=index_is_id)
+ aut_list = row[fm['au_map']]
+ aut_list = [p.split(':::') for p in aut_list.split(':#:')]
aum = []
aus = {}
for (author, author_sort) in aut_list:
aum.append(author)
- aus[author] = author_sort
- mi.title = self.title(idx, index_is_id=index_is_id)
+ aus[author] = author_sort.replace('|', ',')
+ mi.title = row[fm['title']]
mi.authors = aum
- mi.author_sort = self.author_sort(idx, index_is_id=index_is_id)
+ mi.author_sort = row[fm['author_sort']]
mi.author_sort_map = aus
- mi.comments = self.comments(idx, index_is_id=index_is_id)
- mi.publisher = self.publisher(idx, index_is_id=index_is_id)
- mi.timestamp = self.timestamp(idx, index_is_id=index_is_id)
- mi.pubdate = self.pubdate(idx, index_is_id=index_is_id)
- mi.uuid = self.uuid(idx, index_is_id=index_is_id)
- mi.title_sort = self.title_sort(idx, index_is_id=index_is_id)
- mi.formats = self.formats(idx, index_is_id=index_is_id,
- verify_formats=False)
- if hasattr(mi.formats, 'split'):
- mi.formats = mi.formats.split(',')
- else:
- mi.formats = None
- tags = self.tags(idx, index_is_id=index_is_id)
+ mi.comments = row[fm['comments']]
+ mi.publisher = row[fm['publisher']]
+ mi.timestamp = row[fm['timestamp']]
+ mi.pubdate = row[fm['pubdate']]
+ mi.uuid = row[fm['uuid']]
+ mi.title_sort = row[fm['sort']]
+ formats = row[fm['formats']]
+ if not formats:
+ formats = None
+ mi.formats = formats
+ tags = row[fm['tags']]
if tags:
mi.tags = [i.strip() for i in tags.split(',')]
- mi.series = self.series(idx, index_is_id=index_is_id)
+ mi.series = row[fm['series']]
if mi.series:
- mi.series_index = self.series_index(idx, index_is_id=index_is_id)
- mi.rating = self.rating(idx, index_is_id=index_is_id)
- mi.isbn = self.isbn(idx, index_is_id=index_is_id)
+ mi.series_index = row[fm['series_index']]
+ mi.rating = row[fm['rating']]
+ mi.isbn = row[fm['isbn']]
id = idx if index_is_id else self.id(idx)
mi.application_id = id
mi.id = id
- for key,meta in self.field_metadata.iteritems():
- if meta['is_custom']:
- mi.set_user_metadata(key, meta)
- mi.set(key, val=self.get_custom(idx, label=meta['label'],
- index_is_id=index_is_id),
- extra=self.get_custom_extra(idx, label=meta['label'],
- index_is_id=index_is_id))
+ for key, meta in self.field_metadata.custom_iteritems():
+ mi.set_user_metadata(key, meta)
+ mi.set(key, val=self.get_custom(idx, label=meta['label'],
+ index_is_id=index_is_id),
+ extra=self.get_custom_extra(idx, label=meta['label'],
+ index_is_id=index_is_id))
if get_cover:
mi.cover = self.cover(id, index_is_id=True, as_path=True)
return mi
@@ -877,18 +877,17 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def formats(self, index, index_is_id=False, verify_formats=True):
''' Return available formats as a comma separated list or None if there are no available formats '''
- id = index if index_is_id else self.id(index)
- try:
- formats = self.conn.get('SELECT format FROM data WHERE book=?', (id,))
- formats = map(lambda x:x[0], formats)
- except:
+ id_ = index if index_is_id else self.id(index)
+ formats = self.data.get(id_, self.FIELD_MAP['formats'], row_is_id=True)
+ if not formats:
return None
if not verify_formats:
- return ','.join(formats)
+ return formats
+ formats = formats.split(',')
ans = []
- for format in formats:
- if self.format_abspath(id, format, index_is_id=True) is not None:
- ans.append(format)
+ for fmt in formats:
+ if self.format_abspath(id_, fmt, index_is_id=True) is not None:
+ ans.append(fmt)
if not ans:
return None
return ','.join(ans)
@@ -1607,6 +1606,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
','.join([a.replace(',', '|') for a in authors]),
row_is_id=True)
self.data.set(id, self.FIELD_MAP['author_sort'], ss, row_is_id=True)
+ aum = self.authors_with_sort_strings(id, index_is_id=True)
+ self.data.set(id, self.FIELD_MAP['au_map'],
+ ':#:'.join([':::'.join((au.replace(',', '|'), aus)) for (au, aus) in aum]),
+ row_is_id=True)
def set_authors(self, id, authors, notify=True, commit=True):
'''
diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py
index 1be6604d5d..676eb13d2b 100644
--- a/src/calibre/library/field_metadata.py
+++ b/src/calibre/library/field_metadata.py
@@ -180,6 +180,15 @@ class FieldMetadata(dict):
'search_terms':['author_sort'],
'is_custom':False,
'is_category':False}),
+ ('au_map', {'table':None,
+ 'column':None,
+ 'datatype':'text',
+ 'is_multiple':',',
+ 'kind':'field',
+ 'name':None,
+ 'search_terms':[],
+ 'is_custom':False,
+ 'is_category':False}),
('comments', {'table':None,
'column':None,
'datatype':'text',
@@ -400,6 +409,12 @@ class FieldMetadata(dict):
for key in self._tb_cats:
yield (key, self._tb_cats[key])
+ def custom_iteritems(self):
+ for key in self._tb_cats:
+ fm = self._tb_cats[key]
+ if fm['is_custom']:
+ yield (key, self._tb_cats[key])
+
def items(self):
return list(self.iteritems())
diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py
index 37799c4cbc..3e4687be95 100644
--- a/src/calibre/library/server/browse.py
+++ b/src/calibre/library/server/browse.py
@@ -756,7 +756,7 @@ class BrowseServer(object):
sort = self.browse_sort_book_list(items, list_sort)
ids = [x[0] for x in items]
html = render_book_list(ids, self.opts.url_prefix,
- suffix=_('in search')+': '+query)
+ suffix=_('in search')+': '+xml(query))
return self.browse_template(sort, category=False, initial_search=query).format(
title=_('Matching books'),
script='booklist();', main=html)
diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py
index 0458ada27b..83f19b8711 100644
--- a/src/calibre/library/sqlite.py
+++ b/src/calibre/library/sqlite.py
@@ -87,6 +87,24 @@ class SortedConcatenate(object):
class SafeSortedConcatenate(SortedConcatenate):
sep = '|'
+class AumSortedConcatenate(object):
+ '''String concatenation aggregator for the author sort map'''
+ def __init__(self):
+ self.ans = {}
+
+ def step(self, ndx, author, sort):
+ if author is not None:
+ self.ans[ndx] = author + ':::' + sort
+
+ def finalize(self):
+ keys = self.ans.keys()
+ l = len(keys)
+ if l == 0:
+ return 'Unknown:::Unknown'
+ if l == 1:
+ return self.ans[keys[0]]
+ return ':#:'.join([self.ans[v] for v in sorted(keys)])
+
class Connection(sqlite.Connection):
def get(self, *args, **kw):
@@ -155,6 +173,7 @@ class DBThread(Thread):
c_ext_loaded = load_c_extensions(self.conn)
self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
self.conn.create_aggregate('concat', 1, Concatenate)
+ self.conn.create_aggregate('aum_sortconcat', 3, AumSortedConcatenate)
if not c_ext_loaded:
self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
self.conn.create_aggregate('sort_concat', 2, SafeSortedConcatenate)
diff --git a/src/calibre/translations/calibre.pot b/src/calibre/translations/calibre.pot
index fdf44d7b08..6fd44f7c54 100644
--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot
@@ -5,8 +5,8 @@
msgid ""
msgstr ""
"Project-Id-Version: calibre 0.7.38\n"
-"POT-Creation-Date: 2011-01-07 13:12+MST\n"
-"PO-Revision-Date: 2011-01-07 13:12+MST\n"
+"POT-Creation-Date: 2011-01-08 18:40+MST\n"
+"PO-Revision-Date: 2011-01-08 18:40+MST\n"
"Last-Translator: Automatically generated\n"
"Language-Team: LANGUAGE\n"
"MIME-Version: 1.0\n"
@@ -2905,28 +2905,29 @@ msgstr ""
msgid " (Preface)"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:26
+#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:27
msgid ""
"Paragraph structure.\n"
-"choices are ['auto', 'block', 'single', 'print', 'markdown']\n"
+"choices are ['auto', 'block', 'single', 'print', 'unformatted']\n"
"* auto: Try to auto detect paragraph type.\n"
"* block: Treat a blank line as a paragraph break.\n"
"* single: Assume every line is a paragraph.\n"
-"* print: Assume every line starting with 2+ spaces or a tab starts a paragraph."
+"* print: Assume every line starting with 2+ spaces or a tab starts a paragraph.* unformatted: Most lines have hard line breaks, few/no spaces or indents."
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:35
+#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:37
msgid ""
-"Formatting used within the document.* auto: Try to auto detect the document formatting.\n"
-"* none: Do not modify the paragraph formatting. Everything is a paragraph.\n"
-"* markdown: Run the input though the markdown pre-processor. To learn more about markdown see"
+"Formatting used within the document.* auto: Automatically decide which formatting processor to use.\n"
+"* none: Do not process the document formatting. Everything is a paragraph and no styling is applied.\n"
+"* heuristic: Process using heuristics to determine formatting such as chapter headings and italic text.\n"
+"* markdown: Processing using markdown formatting. To learn more about markdown see"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:41
+#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:46
msgid "Normally extra spaces are condensed into a single space. With this option all spaces will be displayed."
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:44
+#: /home/kovid/work/calibre/src/calibre/ebooks/txt/input.py:49
msgid "Do not insert a Table of Contents into the output text."
msgstr ""
@@ -7225,7 +7226,7 @@ msgstr ""
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/password_ui.py:65
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler_ui.py:219
#: /home/kovid/work/calibre/src/calibre/gui2/preferences/server_ui.py:130
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:169
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:172
msgid "&Show password"
msgstr ""
@@ -10621,48 +10622,56 @@ msgstr ""
msgid "Mail successfully sent"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:136
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:95
+msgid "OK to proceed?"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:96
+msgid "This will display your email password on the screen. Is it OK to proceed?"
+msgstr ""
+
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:139
msgid "If you are setting up a new hotmail account, you must log in to it once before you will be able to send mails."
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:147
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:150
msgid "Setup sending email using"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:149
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:152
msgid "If you don't have an account, you can sign up for a free {name} email account at http://{url}. {extra}"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:156
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:159
msgid "Your %s &email address:"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:157
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:160
msgid "Your %s &username:"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:158
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:161
msgid "Your %s &password:"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:176
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:179
msgid "If you plan to use email to send books to your Kindle, remember to add the your %s email address to the allowed email addresses in your Amazon.com Kindle management page."
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:183
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:186
msgid "Setup"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:198
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:205
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:201
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:208
msgid "Bad configuration"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:199
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:202
msgid "You must set the From email address"
msgstr ""
-#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:206
+#: /home/kovid/work/calibre/src/calibre/gui2/wizard/send_email.py:209
msgid "You must set the username and password for the mail server."
msgstr ""
diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 2e4f843c3d..40760bf91b 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -98,9 +98,10 @@ class _Parser(object):
m = 'Formatter: ' + message + _(' near ')
if self.lex_pos > 0:
m = '{0} {1}'.format(m, self.prog[self.lex_pos-1][1])
- m = '{0} {1}'.format(m, self.prog[self.lex_pos][1])
- if self.lex_pos < len(self.prog):
+ elif self.lex_pos < len(self.prog):
m = '{0} {1}'.format(m, self.prog[self.lex_pos+1][1])
+ else:
+ m = '{0} {1}'.format(m, _('end of program'))
raise ValueError(m)
def token(self):
diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py
index db7c7bde5f..447ff8cd14 100644
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@@ -118,8 +118,9 @@ class SearchQueryParser(object):
failed.append(test[0])
return failed
- def __init__(self, locations, test=False):
+ def __init__(self, locations, test=False, optimize=False):
self._tests_failed = False
+ self.optimize = optimize
# Define a token
standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
locations)
@@ -182,38 +183,52 @@ class SearchQueryParser(object):
# empty the list of searches used for recursion testing
self.recurse_level = 0
self.searches_seen = set([])
- return self._parse(query)
+ candidates = self.universal_set()
+ return self._parse(query, candidates)
# this parse is used internally because it doesn't clear the
# recursive search test list. However, we permit seeing the
# same search a few times because the search might appear within
# another search.
- def _parse(self, query):
+ def _parse(self, query, candidates=None):
self.recurse_level += 1
res = self._parser.parseString(query)[0]
- t = self.evaluate(res)
+ if candidates is None:
+ candidates = self.universal_set()
+ t = self.evaluate(res, candidates)
self.recurse_level -= 1
return t
def method(self, group_name):
return getattr(self, 'evaluate_'+group_name)
- def evaluate(self, parse_result):
- return self.method(parse_result.getName())(parse_result)
+ def evaluate(self, parse_result, candidates):
+ return self.method(parse_result.getName())(parse_result, candidates)
- def evaluate_and(self, argument):
- return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+ def evaluate_and(self, argument, candidates):
+ # RHS checks only those items matched by LHS
+ # returns result of RHS check: RHmatches(LHmatches(c))
+ # return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+ l = self.evaluate(argument[0], candidates)
+ return l.intersection(self.evaluate(argument[1], l))
- def evaluate_or(self, argument):
- return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+ def evaluate_or(self, argument, candidates):
+ # RHS checks only those elements not matched by LHS
+ # returns LHS union RHS: LHmatches(c) + RHmatches(c-LHmatches(c))
+ # return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+ l = self.evaluate(argument[0], candidates)
+ return l.union(self.evaluate(argument[1], candidates.difference(l)))
- def evaluate_not(self, argument):
- return self.universal_set().difference(self.evaluate(argument[0]))
+ def evaluate_not(self, argument, candidates):
+ # unary op checks only candidates. Result: list of items matching
+ # returns: c - matches(c)
+ # return self.universal_set().difference(self.evaluate(argument[0]))
+ return candidates.difference(self.evaluate(argument[0], candidates))
- def evaluate_parenthesis(self, argument):
- return self.evaluate(argument[0])
+ def evaluate_parenthesis(self, argument, candidates):
+ return self.evaluate(argument[0], candidates)
- def evaluate_token(self, argument):
+ def evaluate_token(self, argument, candidates):
location = argument[0]
query = argument[1]
if location.lower() == 'search':
@@ -224,17 +239,27 @@ class SearchQueryParser(object):
raise ParseException(query, len(query), 'undefined saved search', self)
if self.recurse_level > 5:
self.searches_seen.add(query)
- return self._parse(saved_searches().lookup(query))
+ return self._parse(saved_searches().lookup(query), candidates)
except: # convert all exceptions (e.g., missing key) to a parse error
raise ParseException(query, len(query), 'undefined saved search', self)
- return self.get_matches(location, query)
+ return self._get_matches(location, query, candidates)
- def get_matches(self, location, query):
+ def _get_matches(self, location, query, candidates):
+ if self.optimize:
+ return self.get_matches(location, query, candidates=candidates)
+ else:
+ return self.get_matches(location, query)
+
+ def get_matches(self, location, query, candidates=None):
'''
Should return the set of matches for :param:'location` and :param:`query`.
+ The search must be performed over all entries if :param:`candidates` is
+ None, otherwise only over the items in candidates.
+
:param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
:param:`query` is a string literal.
+ :param:`candidates` is None or a subset of the set returned by :meth:`universal_set`.
'''
return set([])
@@ -561,7 +586,7 @@ class Tester(SearchQueryParser):
def universal_set(self):
return self._universal_set
- def get_matches(self, location, query):
+ def get_matches(self, location, query, candidates=None):
location = location.lower()
if location in self.fields.keys():
getter = operator.itemgetter(self.fields[location])
@@ -573,8 +598,13 @@ class Tester(SearchQueryParser):
if not query:
return set([])
query = query.lower()
- return set(key for key, val in self.texts.items() \
- if query and query in getattr(getter(val), 'lower', lambda : '')())
+ if candidates:
+ return set(key for key, val in self.texts.items() \
+ if key in candidates and query and query
+ in getattr(getter(val), 'lower', lambda : '')())
+ else:
+ return set(key for key, val in self.texts.items() \
+ if query and query in getattr(getter(val), 'lower', lambda : '')())
@@ -592,6 +622,7 @@ class Tester(SearchQueryParser):
def main(args=sys.argv):
+ print 'testing unoptimized'
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
@@ -601,6 +632,16 @@ def main(args=sys.argv):
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
return 1
+ print '\n\ntesting optimized'
+ tester = Tester(['authors', 'author', 'series', 'formats', 'format',
+ 'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
+ 'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
+ 'all', 'search'], test=True, optimize=True)
+ failed = tester.run_tests()
+ if tester._tests_failed or failed:
+ print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
+ return 1
+
return 0
if __name__ == '__main__':