diff --git a/resources/recipes/cicero.recipe b/resources/recipes/cicero.recipe
new file mode 100644
index 0000000000..2df6b68000
--- /dev/null
+++ b/resources/recipes/cicero.recipe
@@ -0,0 +1,35 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Cicero(BasicNewsRecipe):
+    '''
+    Recipe for Cicero, a German magazine ("Magazin fuer politische Kultur").
+
+    Articles are collected from the site's RSS feeds and downloaded through
+    the printer-friendly page (see print_version).
+    '''
+    timefmt = ' [%Y-%m-%d]'
+    title = u'Cicero'
+    __author__ = 'mad@sharktooth.de'
+    description = u'Magazin f\xfcr politische Kultur'
+    oldest_article = 7
+    language = 'de'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    publisher = 'Ringier Publishing'
+    category = 'news, politics, Germany'
+    encoding = 'iso-8859-1'
+    publication_type = 'magazine'
+    masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
+
+    # Only the two aggregate feeds are enabled. The commented-out entries are
+    # the individual sections; presumably they are already covered by the
+    # aggregate feeds (TODO confirm) - uncomment them to fetch by section.
+    feeds = [
+        (u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
+        #(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
+        #(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
+        #(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
+        #(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
+        #(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
+        #(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
+        #(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
+        #(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
+        #(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
+        (u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
+        #(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
+        #(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
+    ]
+
+    def print_version(self, url):
+        '''
+        Return the printer-friendly URL for the article at ``url``.
+
+        Everything after the last '?' in ``url`` is appended to the
+        page_print.php endpoint. A URL without a query string is returned
+        unchanged, because there is nothing to hand to the print page.
+        '''
+        _base, sep, query = url.rpartition('?')
+        if not sep:
+            return url
+        return 'http://www.cicero.de/page_print.php?' + query
diff --git a/resources/recipes/el_correo.recipe b/resources/recipes/el_correo.recipe
new file mode 100644
index 0000000000..9190560b02
--- /dev/null
+++ b/resources/recipes/el_correo.recipe
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# Recipe metadata (module-level dunder attributes).
+__license__ = 'GPL v3'
+__copyright__ = '08 January 2011, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Daily newspaper from Biscay'
+__version__ = 'v0.08'
+__date__ = '08, January 2011'
+'''
+[url]http://www.elcorreo.com/[/url]
+'''
+
+import time
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class heraldo(BasicNewsRecipe):
+ __author__ = 'desUBIKado'
+ description = 'Daily newspaper from Biscay'
+ title = u'El Correo'
+ publisher = 'Vocento'
+ category = 'News, politics, culture, economy, general interest'
+ oldest_article = 2
+ delay = 1
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ language = 'es'
+ timefmt = '[%a, %d %b, %Y]'
+ encoding = 'iso-8859-1'
+ remove_empty_feeds = True
+ remove_javascript = False
+
+    # (section title, RSS feed URL) pairs; all feeds come from the
+    # /vizcaya/ section of elcorreo.com.
+    feeds = [
+        (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
+        (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
+        (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
+        (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
+        (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
+        (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
+        (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
+        (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
+        (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
+        (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
+        (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
+    ]
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
+ dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
+ ]
+
+ remove_tags = [
+ dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
+ dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
+ dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
+ dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
+ dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
+ dict(name='div', attrs={'id':['articulopina']}),
+ dict(name='br', attrs={'class':'clear'}),
+ dict(name='form', attrs={'name':'frm_conversor2'})
+ ]
+
+ remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
+ remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
+
+    def get_cover_url(self):
+        '''
+        Return the URL to use as the cover of today's issue.
+
+        The URL of today's front-page PDF is built from the local date in
+        ddmmyyyy form and opened with the recipe browser. If opening it
+        fails for any reason, a message is logged and the URL of the
+        newspaper logo is returned instead.
+        '''
+        st = time.localtime()
+        # Alternative cover source: http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg
+        # Example of the URL built here: http://info.elcorreo.com/pdf/06012011-viz.pdf
+        cover = 'http://info.elcorreo.com/pdf/' + time.strftime('%d%m%Y', st) + '-viz.pdf'
+
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except Exception:
+            # Best effort: any failure to fetch the PDF falls back to the
+            # logo, but KeyboardInterrupt/SystemExit are no longer swallowed.
+            self.log("\nPortada no disponible")
+            cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
+        return cover
+
+ extra_css = '''
+ h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
+ h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
+ h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
+ h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
+ img{margin-bottom: 0.4em}
+ '''
+
+
+
+ preprocess_regexps = [
+
+ # To present the image of the embedded video
+ (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '
'),
+ (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '
', re.DOTALL|re.IGNORECASE), lambda match: '
'),
+
+# To put a blank line between the intro of the embedded videos and the previous text
+ (re.compile(r'
\n
\n', re.DOTALL|re.IGNORECASE), lambda match: ''),
+
+ ]
+
diff --git a/resources/recipes/heraldo.recipe b/resources/recipes/heraldo.recipe
index c5669e116b..f3236ec4a9 100644
--- a/resources/recipes/heraldo.recipe
+++ b/resources/recipes/heraldo.recipe
@@ -3,29 +3,31 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
-__version__ = 'v0.03'
-__date__ = '11, December 2010'
+__version__ = 'v0.04'
+__date__ = '6, January 2011'
'''
[url]http://www.heraldo.es/[/url]
'''
import time
+import re
from calibre.web.feeds.news import BasicNewsRecipe
class heraldo(BasicNewsRecipe):
- __author__ = 'desUBIKado'
- description = 'Daily newspaper from Aragon'
+ __author__ = 'desUBIKado'
+ description = 'Daily newspaper from Aragon'
title = u'Heraldo de Aragon'
publisher = 'OJD Nielsen'
category = 'News, politics, culture, economy, general interest'
language = 'es'
timefmt = '[%a, %d %b, %Y]'
- oldest_article = 1
+ oldest_article = 2
+ delay = 1
max_articles_per_feed = 100
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
- recursion = 10
+
feeds = [
(u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
@@ -37,29 +39,39 @@ class heraldo(BasicNewsRecipe):
remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
- dict(name='form', attrs={'class':'form'})]
+ dict(name='form', attrs={'class':'form'}),
+ dict(name='ul', attrs={'id':['cont-tags','pag-1']})]
remove_tags_before = dict(name='div' , attrs={'id':'dts'})
remove_tags_after = dict(name='div' , attrs={'id':'com'})
def get_cover_url(self):
- cover = None
- st = time.localtime()
- year = str(st.tm_year)
- month = "%.2d" % st.tm_mon
- day = "%.2d" % st.tm_mday
+ cover = None
+ st = time.localtime()
+ year = str(st.tm_year)
+ month = "%.2d" % st.tm_mon
+ day = "%.2d" % st.tm_mday
#[url]http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf[/url]
- cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
- br = BasicNewsRecipe.get_browser()
- try:
- br.open(cover)
- except:
- self.log("\nPortada no disponible")
- cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
- return cover
-
+ cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf'
+ br = BasicNewsRecipe.get_browser()
+ try:
+ br.open(cover)
+ except:
+ self.log("\nPortada no disponible")
+ cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
+ return cover
extra_css = '''
- h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
- '''
+ .con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
+ .con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
+ .con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
+ .ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
+ img{margin-bottom: 0.4em}
+ '''
+
+ preprocess_regexps = [
+
+# To separate the comments with a blank line
+ (re.compile(r')', re.DOTALL)
elif format == 'txt':
- linere = re.compile('.*?\n', re.DOTALL)
+ linere = re.compile('.*?\n')
self.lines = linere.findall(raw)
def line_length(self, percent):
@@ -177,7 +177,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -199,7 +199,7 @@ class Dehyphenator(object):
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
- if self.format == 'html_cleanup':
+ if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
@@ -225,10 +225,15 @@ class Dehyphenator(object):
intextmatch = re.compile(u'(?<=.{%i})(?P
[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*([iubp]>\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*
\s*)?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
+ elif format == 'txt':
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P
\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
+ elif format == 'txt_cleanup':
+ intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)')
+
html = intextmatch.sub(self.dehyphenate, html)
return html
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 27dacdf5fb..52d1bcc619 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -190,7 +190,7 @@ class PreProcessor(object):
line_ending = "\s*(span|p|div)>\s*((p|span|div)>)?"
blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
- txt_line_wrap = u"(\u0020|\u0009)*\n"
+ txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
unwrap_regex = lookahead+line_ending+blanklines+line_opening
if format == 'txt':
@@ -357,6 +357,6 @@ class PreProcessor(object):
html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
# Center separator lines
- html = re.sub(u'\s*(?P([*#•]+\s*)+)\s*
', '' + '\g' + '
', html)
+ html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
return html
diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
index 17f2c6705c..799bdef8e6 100644
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -324,14 +324,16 @@ class Metadata(object):
if metadata is None:
traceback.print_stack()
return
- metadata = copy.deepcopy(metadata)
- if '#value#' not in metadata:
- if metadata['datatype'] == 'text' and metadata['is_multiple']:
- metadata['#value#'] = []
+ m = {}
+ for k in metadata:
+ m[k] = copy.copy(metadata[k])
+ if '#value#' not in m:
+ if m['datatype'] == 'text' and m['is_multiple']:
+ m['#value#'] = []
else:
- metadata['#value#'] = None
+ m['#value#'] = None
_data = object.__getattribute__(self, '_data')
- _data['user_metadata'][field] = metadata
+ _data['user_metadata'][field] = m
def template_to_attribute(self, other, ops):
'''
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index e782cd0cd9..3957391494 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,11 +7,12 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
- convert_heuristic
+ convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -23,7 +24,7 @@ class TXTInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto',
- choices=['auto', 'block', 'single', 'print'],
+ choices=['auto', 'block', 'single', 'print', 'unformatted'],
help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
'* auto: Try to auto detect paragraph type.\n'
@@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.'
- '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
+ '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.'
@@ -72,6 +73,13 @@ class TXTInput(InputFormatPlugin):
# followed by the entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
+
+ # Normalize line endings
+ txt = normalize_line_endings(txt)
+
+ # Get length for hyphen removal and punctuation unwrap
+ docanalysis = DocAnalysis('txt', txt)
+ length = docanalysis.line_length(.5)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
@@ -91,10 +99,15 @@ class TXTInput(InputFormatPlugin):
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
- log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
-
+ log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+ # Dehyphenate
+ dehyphenator = Dehyphenator()
+ txt = dehyphenator(txt,'txt', length)
+
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
+
if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
@@ -102,10 +115,8 @@ class TXTInput(InputFormatPlugin):
if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor
- from calibre.ebooks.conversion.preprocess import DocAnalysis
# get length
- docanalysis = DocAnalysis('txt', txt)
- length = docanalysis.line_length(.5)
+
# unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@@ -116,7 +127,11 @@ class TXTInput(InputFormatPlugin):
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
else:
html = convert_basic(txt, epub_split_size_kb=flow_size)
-
+
+ # Dehyphenate in cleanup mode for missed txt and markdown conversion
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'txt_cleanup', length)
+ html = dehyphenator(html,'html_cleanup', length)
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 9dc29e45dd..6a1a106681 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))
-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
+ return txt
+
+def separate_paragraphs_single_line(txt):
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
return txt
@@ -117,7 +120,7 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached.
- unformatted: most lines have hard line breaks, few/no spaces or indents
+ unformatted: most lines have hard line breaks, few/no blank lines or indents
returns block, single, print, unformatted
'''
@@ -130,15 +133,21 @@ def detect_paragraph_type(txt):
hardbreaks = docanalysis.line_histogram(.55)
if hardbreaks:
- # Check for print
+ # Determine print percentage
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
- if tab_line_count / float(txt_line_count) >= .15:
- return 'print'
-
- # Check for block
+ print_percent = tab_line_count / float(txt_line_count)
+
+ # Determine block percentage
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
- if empty_line_count / float(txt_line_count) >= .15:
- return 'block'
+ block_percent = empty_line_count / float(txt_line_count)
+
+ # Compare the two types - the type with the larger number of instances wins
+ # in cases where only one or the other represents the vast majority of the document neither wins
+ if print_percent >= block_percent:
+ if .15 <= print_percent <= .75:
+ return 'print'
+ elif .15 <= block_percent <= .75:
+ return 'block'
# Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index 6d289a3e5c..944ce03305 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -637,7 +637,7 @@ class DeviceMixin(object): # {{{
self.device_manager.mount_device(kls=FOLDER_DEVICE, kind='folder', path=dir)
def connect_to_bambook(self):
- self.device_manager.mount_device(kls=BAMBOOKWifi, kind='bambook',
+ self.device_manager.mount_device(kls=BAMBOOKWifi, kind='bambook',
path=BAMBOOK.settings().extra_customization)
def connect_to_itunes(self):
@@ -1266,8 +1266,8 @@ class DeviceMixin(object): # {{{
# Force a reset if the caches are not initialized
if reset or not hasattr(self, 'db_book_title_cache'):
# Build a cache (map) of the library, so the search isn't On**2
- self.db_book_title_cache = {}
- self.db_book_uuid_cache = {}
+ db_book_title_cache = {}
+ db_book_uuid_cache = {}
# It might be possible to get here without having initialized the
# library view. In this case, simply give up
try:
@@ -1278,8 +1278,8 @@ class DeviceMixin(object): # {{{
for id in db.data.iterallids():
mi = db.get_metadata(id, index_is_id=True)
title = clean_string(mi.title)
- if title not in self.db_book_title_cache:
- self.db_book_title_cache[title] = \
+ if title not in db_book_title_cache:
+ db_book_title_cache[title] = \
{'authors':{}, 'author_sort':{}, 'db_ids':{}}
# If there are multiple books in the library with the same title
# and author, then remember the last one. That is OK, because as
@@ -1287,12 +1287,14 @@ class DeviceMixin(object): # {{{
# as another.
if mi.authors:
authors = clean_string(authors_to_string(mi.authors))
- self.db_book_title_cache[title]['authors'][authors] = mi
+ db_book_title_cache[title]['authors'][authors] = mi
if mi.author_sort:
aus = clean_string(mi.author_sort)
- self.db_book_title_cache[title]['author_sort'][aus] = mi
- self.db_book_title_cache[title]['db_ids'][mi.application_id] = mi
- self.db_book_uuid_cache[mi.uuid] = mi
+ db_book_title_cache[title]['author_sort'][aus] = mi
+ db_book_title_cache[title]['db_ids'][mi.application_id] = mi
+ db_book_uuid_cache[mi.uuid] = mi
+ self.db_book_title_cache = db_book_title_cache
+ self.db_book_uuid_cache = db_book_uuid_cache
# Now iterate through all the books on the device, setting the
# in_library field. If the UUID matches a book in the library, then
diff --git a/src/calibre/gui2/preferences/plugboard.py b/src/calibre/gui2/preferences/plugboard.py
index 296387106c..e1dc6b03bd 100644
--- a/src/calibre/gui2/preferences/plugboard.py
+++ b/src/calibre/gui2/preferences/plugboard.py
@@ -5,11 +5,11 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-from PyQt4 import QtGui
-from PyQt4.Qt import Qt
+from PyQt4.Qt import Qt, QLineEdit, QComboBox, SIGNAL, QListWidgetItem
from calibre.gui2 import error_dialog
from calibre.gui2.device import device_name_for_plugboards
+from calibre.gui2.dialogs.template_dialog import TemplateDialog
from calibre.gui2.preferences import ConfigWidgetBase, test_widget
from calibre.gui2.preferences.plugboard_ui import Ui_Form
from calibre.customize.ui import metadata_writers, device_plugins
@@ -17,6 +17,27 @@ from calibre.library.save_to_disk import plugboard_any_format_value, \
plugboard_any_device_value, plugboard_save_to_disk_value
from calibre.utils.formatter import validation_formatter
+
+class LineEditWithTextBox(QLineEdit):
+    '''
+    A QLineEdit whose context menu gains an extra 'Open Editor' entry
+    in addition to the standard actions.
+    '''
+
+    def contextMenuEvent(self, event):
+        '''
+        Show the standard context menu followed by a separator and an
+        'Open Editor' action wired to open_editor().
+        '''
+        context_menu = self.createStandardContextMenu()
+        context_menu.addSeparator()
+        editor_action = context_menu.addAction(_('Open Editor'))
+        self.connect(editor_action, SIGNAL('triggered()'), self.open_editor)
+        context_menu.exec_(event.globalPos())
+
+    def open_editor(self):
+        '''
+        Open a TemplateDialog seeded with the current text; if the dialog's
+        exec_() returns a true value, replace the line edit's text with the
+        contents of the dialog's textbox.
+        '''
+        dialog = TemplateDialog(self, self.text())
+        if not dialog.exec_():
+            return
+        self.setText(dialog.textbox.toPlainText())
+
class ConfigWidget(ConfigWidgetBase, Ui_Form):
def genesis(self, gui):
@@ -72,10 +93,10 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
self.source_widgets = []
self.dest_widgets = []
for i in range(0, len(self.dest_fields)-1):
- w = QtGui.QLineEdit(self)
+ w = LineEditWithTextBox(self)
self.source_widgets.append(w)
self.fields_layout.addWidget(w, 5+i, 0, 1, 1)
- w = QtGui.QComboBox(self)
+ w = QComboBox(self)
self.dest_widgets.append(w)
self.fields_layout.addWidget(w, 5+i, 1, 1, 1)
@@ -297,7 +318,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
for op in self.current_plugboards[f][d]:
ops.append('([' + op[0] + '] -> ' + op[1] + ')')
txt = '%s:%s = %s\n'%(f, d, ', '.join(ops))
- item = QtGui.QListWidgetItem(txt)
+ item = QListWidgetItem(txt)
item.setData(Qt.UserRole, (f, d))
self.existing_plugboards.addItem(item)
self.refilling = False
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 7caeeabda8..0763318912 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -486,7 +486,7 @@ class ResultCache(SearchQueryParser): # {{{
q = query
for id_ in candidates:
- item = self._data[id]
+ item = self._data[id_]
if item is None: continue
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak
diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py
index ba218c3ecc..f94081f046 100644
--- a/src/calibre/library/custom_columns.py
+++ b/src/calibre/library/custom_columns.py
@@ -151,6 +151,8 @@ class CustomColumns(object):
return v
def adapt_number(x, d):
+ if x is None:
+ return None
if isinstance(x, (str, unicode, bytes)):
if x.lower() == 'none':
return None
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 611aa1cc89..138560020e 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -256,7 +256,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'pubdate',
'flags',
'uuid',
- 'has_cover'
+ 'has_cover',
+ ('au_map', 'authors', 'author', 'aum_sortconcat(link.id, authors.name, authors.sort)')
]
lines = []
for col in columns:
@@ -273,9 +274,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.FIELD_MAP = {'id':0, 'title':1, 'authors':2, 'timestamp':3,
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8,
- 'publisher':9, 'series_index':10,
- 'sort':11, 'author_sort':12, 'formats':13, 'isbn':14, 'path':15,
- 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20}
+ 'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12,
+ 'formats':13, 'isbn':14, 'path':15, 'lccn':16, 'pubdate':17,
+ 'flags':18, 'uuid':19, 'cover':20, 'au_map':21}
for k,v in self.FIELD_MAP.iteritems():
self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
@@ -687,9 +688,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
Convenience method to return metadata as a :class:`Metadata` object.
Note that the list of formats is not verified.
'''
+ row = self.data._data[idx] if index_is_id else self.data[idx]
+ fm = self.FIELD_MAP
+
self.gm_count += 1
- mi = self.data.get(idx, self.FIELD_MAP['all_metadata'],
- row_is_id = index_is_id)
+ mi = row[self.FIELD_MAP['all_metadata']]
if mi is not None:
if get_cover:
# Always get the cover, because the value can be wrong if the
@@ -699,49 +702,46 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.gm_missed += 1
mi = Metadata(None)
- self.data.set(idx, self.FIELD_MAP['all_metadata'], mi,
- row_is_id = index_is_id)
+ self.data.set(idx, fm['all_metadata'], mi, row_is_id = index_is_id)
- aut_list = self.authors_with_sort_strings(idx, index_is_id=index_is_id)
+ aut_list = row[fm['au_map']]
+ aut_list = [p.split(':::') for p in aut_list.split(':#:')]
aum = []
aus = {}
for (author, author_sort) in aut_list:
aum.append(author)
- aus[author] = author_sort
- mi.title = self.title(idx, index_is_id=index_is_id)
+ aus[author] = author_sort.replace('|', ',')
+ mi.title = row[fm['title']]
mi.authors = aum
- mi.author_sort = self.author_sort(idx, index_is_id=index_is_id)
+ mi.author_sort = row[fm['author_sort']]
mi.author_sort_map = aus
- mi.comments = self.comments(idx, index_is_id=index_is_id)
- mi.publisher = self.publisher(idx, index_is_id=index_is_id)
- mi.timestamp = self.timestamp(idx, index_is_id=index_is_id)
- mi.pubdate = self.pubdate(idx, index_is_id=index_is_id)
- mi.uuid = self.uuid(idx, index_is_id=index_is_id)
- mi.title_sort = self.title_sort(idx, index_is_id=index_is_id)
- mi.formats = self.formats(idx, index_is_id=index_is_id,
- verify_formats=False)
- if hasattr(mi.formats, 'split'):
- mi.formats = mi.formats.split(',')
- else:
- mi.formats = None
- tags = self.tags(idx, index_is_id=index_is_id)
+ mi.comments = row[fm['comments']]
+ mi.publisher = row[fm['publisher']]
+ mi.timestamp = row[fm['timestamp']]
+ mi.pubdate = row[fm['pubdate']]
+ mi.uuid = row[fm['uuid']]
+ mi.title_sort = row[fm['sort']]
+ formats = row[fm['formats']]
+ if not formats:
+ formats = None
+ mi.formats = formats
+ tags = row[fm['tags']]
if tags:
mi.tags = [i.strip() for i in tags.split(',')]
- mi.series = self.series(idx, index_is_id=index_is_id)
+ mi.series = row[fm['series']]
if mi.series:
- mi.series_index = self.series_index(idx, index_is_id=index_is_id)
- mi.rating = self.rating(idx, index_is_id=index_is_id)
- mi.isbn = self.isbn(idx, index_is_id=index_is_id)
+ mi.series_index = row[fm['series_index']]
+ mi.rating = row[fm['rating']]
+ mi.isbn = row[fm['isbn']]
id = idx if index_is_id else self.id(idx)
mi.application_id = id
mi.id = id
- for key,meta in self.field_metadata.iteritems():
- if meta['is_custom']:
- mi.set_user_metadata(key, meta)
- mi.set(key, val=self.get_custom(idx, label=meta['label'],
- index_is_id=index_is_id),
- extra=self.get_custom_extra(idx, label=meta['label'],
- index_is_id=index_is_id))
+ for key, meta in self.field_metadata.custom_iteritems():
+ mi.set_user_metadata(key, meta)
+ mi.set(key, val=self.get_custom(idx, label=meta['label'],
+ index_is_id=index_is_id),
+ extra=self.get_custom_extra(idx, label=meta['label'],
+ index_is_id=index_is_id))
if get_cover:
mi.cover = self.cover(id, index_is_id=True, as_path=True)
return mi
@@ -877,18 +877,17 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def formats(self, index, index_is_id=False, verify_formats=True):
''' Return available formats as a comma separated list or None if there are no available formats '''
- id = index if index_is_id else self.id(index)
- try:
- formats = self.conn.get('SELECT format FROM data WHERE book=?', (id,))
- formats = map(lambda x:x[0], formats)
- except:
+ id_ = index if index_is_id else self.id(index)
+ formats = self.data.get(id_, self.FIELD_MAP['formats'], row_is_id=True)
+ if not formats:
return None
if not verify_formats:
- return ','.join(formats)
+ return formats
+ formats = formats.split(',')
ans = []
- for format in formats:
- if self.format_abspath(id, format, index_is_id=True) is not None:
- ans.append(format)
+ for fmt in formats:
+ if self.format_abspath(id_, fmt, index_is_id=True) is not None:
+ ans.append(fmt)
if not ans:
return None
return ','.join(ans)
@@ -1607,6 +1606,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
','.join([a.replace(',', '|') for a in authors]),
row_is_id=True)
self.data.set(id, self.FIELD_MAP['author_sort'], ss, row_is_id=True)
+ aum = self.authors_with_sort_strings(id, index_is_id=True)
+ self.data.set(id, self.FIELD_MAP['au_map'],
+ ':#:'.join([':::'.join((au.replace(',', '|'), aus)) for (au, aus) in aum]),
+ row_is_id=True)
def set_authors(self, id, authors, notify=True, commit=True):
'''
diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py
index 1be6604d5d..676eb13d2b 100644
--- a/src/calibre/library/field_metadata.py
+++ b/src/calibre/library/field_metadata.py
@@ -180,6 +180,15 @@ class FieldMetadata(dict):
'search_terms':['author_sort'],
'is_custom':False,
'is_category':False}),
+ ('au_map', {'table':None,
+ 'column':None,
+ 'datatype':'text',
+ 'is_multiple':',',
+ 'kind':'field',
+ 'name':None,
+ 'search_terms':[],
+ 'is_custom':False,
+ 'is_category':False}),
('comments', {'table':None,
'column':None,
'datatype':'text',
@@ -400,6 +409,12 @@ class FieldMetadata(dict):
for key in self._tb_cats:
yield (key, self._tb_cats[key])
+    def custom_iteritems(self):
+        '''
+        Generate (key, field metadata) pairs for only those fields whose
+        metadata has a true 'is_custom' entry.
+        '''
+        for key in self._tb_cats:
+            if not self._tb_cats[key]['is_custom']:
+                continue
+            yield (key, self._tb_cats[key])
+
def items(self):
return list(self.iteritems())
diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py
index 37799c4cbc..3e4687be95 100644
--- a/src/calibre/library/server/browse.py
+++ b/src/calibre/library/server/browse.py
@@ -756,7 +756,7 @@ class BrowseServer(object):
sort = self.browse_sort_book_list(items, list_sort)
ids = [x[0] for x in items]
html = render_book_list(ids, self.opts.url_prefix,
- suffix=_('in search')+': '+query)
+ suffix=_('in search')+': '+xml(query))
return self.browse_template(sort, category=False, initial_search=query).format(
title=_('Matching books'),
script='booklist();', main=html)
diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py
index 0458ada27b..75856dd0f6 100644
--- a/src/calibre/library/sqlite.py
+++ b/src/calibre/library/sqlite.py
@@ -87,6 +87,23 @@ class SortedConcatenate(object):
class SafeSortedConcatenate(SortedConcatenate):
sep = '|'
+class AumSortedConcatenate(object):
+    '''
+    Aggregator that builds the author sort map string.
+
+    Each row contributes 'author:::sort', stored under its index; the
+    final value joins the entries with ':#:' in ascending index order.
+    '''
+
+    def __init__(self):
+        # index -> 'author:::sort'
+        self.ans = {}
+
+    def step(self, ndx, author, sort):
+        '''Record one row; rows whose author is None are ignored.'''
+        if author is None:
+            return
+        self.ans[ndx] = ':::'.join((author, sort))
+
+    def finalize(self):
+        '''Return the joined string, or None if no row was recorded.'''
+        if not self.ans:
+            return None
+        return ':#:'.join(self.ans[ndx] for ndx in sorted(self.ans))
+
class Connection(sqlite.Connection):
def get(self, *args, **kw):
@@ -155,6 +172,7 @@ class DBThread(Thread):
c_ext_loaded = load_c_extensions(self.conn)
self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
self.conn.create_aggregate('concat', 1, Concatenate)
+ self.conn.create_aggregate('aum_sortconcat', 3, AumSortedConcatenate)
if not c_ext_loaded:
self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
self.conn.create_aggregate('sort_concat', 2, SafeSortedConcatenate)
diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 2e4f843c3d..40760bf91b 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -98,9 +98,10 @@ class _Parser(object):
m = 'Formatter: ' + message + _(' near ')
if self.lex_pos > 0:
m = '{0} {1}'.format(m, self.prog[self.lex_pos-1][1])
- m = '{0} {1}'.format(m, self.prog[self.lex_pos][1])
- if self.lex_pos < len(self.prog):
+ elif self.lex_pos < len(self.prog):
m = '{0} {1}'.format(m, self.prog[self.lex_pos+1][1])
+ else:
+ m = '{0} {1}'.format(m, _('end of program'))
raise ValueError(m)
def token(self):