Merge from trunk

This commit is contained in:
Charles Haley 2011-01-09 09:18:56 +00:00
commit 17db6d5c97
28 changed files with 654 additions and 411 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

View File

@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg' cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
timefmt ='[%a, %d %b, %Y]' timefmt ='[%a, %d %b, %Y]'
encoding ='utf8' encoding ='utf8'
language ='es_ES' language ='es'
remove_javascript =True remove_javascript =True
remove_tags_after =dict(id='Texto') remove_tags_after =dict(id='Texto')
remove_tags_before =dict(id='Texto') remove_tags_before =dict(id='Texto')

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'
'''
publico.es
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
    # Recipe for the Spanish daily publico.es: downloads the section RSS
    # feeds listed in ``feeds`` and keeps only the main article container.
    title = u'Publico.es'
    __author__ = 'Gerardo Diez'
    # Declared on the class as well as at module level: other recipes in
    # this code base read ``description`` as a class attribute, so the
    # module-level assignment alone is not visible to the recipe.
    description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
    publisher = u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
    category = 'news, politics, finances, world, spain, science, catalunya'
    oldest_article = 1
    max_articles_per_feed = 100
    simultaneous_downloads = 10
    cover_url = u'http://imagenes.publico.es/css/img/logo_publico.gif'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'utf8'
    language = 'es'
    remove_javascript = True
    no_stylesheets = True
    # A list of tag specifications (same format as ``remove_tags``), not a
    # bare dict: only the element with id="main" is kept.
    keep_only_tags = [dict(id='main')]
    # Comment sections, toolbars and banner containers stripped from the
    # kept article body.
    remove_tags = [
        dict(name='div', attrs={'class': ['Noticias_642x50', 'contInfo ancho']}),
        dict(name='ul', attrs={'class': ['navComentarios', 'comentarios']}),
        dict(name='div', attrs={'id': ['commentsContext', 'toolbar', 'comentarios']}),
        dict(name='h5', attrs={'id': 'comentarios'}),
    ]
    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
        (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
        (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
        (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
        (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
        (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
        (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
        (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
        (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes'),
    ]

View File

@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
encoding = 'cp1252' encoding = 'cp1252'
use_embedded_content = False use_embedded_content = False
language = 'es_ES' language = 'es'
remove_empty_feeds = True remove_empty_feeds = True
publication_type = 'newspaper' publication_type = 'newspaper'
masthead_url = 'http://www.elpais.com/im/tit_logo.gif' masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
@ -57,14 +57,14 @@ class ElPais_RSS(BasicNewsRecipe):
,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' ) ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062') ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062')
,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063') ,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063')
,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' ) ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' ) ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' ) ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' ) ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052') ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052')
,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053') ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053')
,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051') ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051')
,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060') ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060')
,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' ) ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' ) ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068') ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068')

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
''' '''
exiledonline.com exiledonline.com
''' '''
@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
encoding = 'utf8' encoding = 'utf8'
remove_javascript = True remove_javascript = True
language = 'en' language = 'en'
publication_type = 'newsblog'
cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif' masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
extra_css = """
html2lrf_options = [ body{font-family: Arial,Helvetica,sans-serif}
'--comment' , description #topslug{font-size: xx-large; font-weight: bold; color: red}
, '--base-font-size', '10' """
, '--category' , category
, '--publisher' , publisher conversion_options = {
] 'comment' : description
, 'tags' : category
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' , 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':'main'})] keep_only_tags = [dict(name='div', attrs={'id':'main'})]
@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n' for alink in soup.findAll('a'):
soup.head.insert(0,mtag) if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup return soup
def get_article_url(self, article): def get_article_url(self, article):
raw = article.get('link', None) raw = article.get('link', None)
final = raw + 'all/1/' final = raw + 'all/1/'
return final return final

View File

@ -29,7 +29,7 @@ class ANDROID(USBMS):
# Motorola # Motorola
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100], 0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216], 0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
0x4286 : [0x216] }, 0x4286 : [0x216], 0x42b3 : [0x216] },
# Sony Ericsson # Sony Ericsson
0xfce : { 0xd12e : [0x0100]}, 0xfce : { 0xd12e : [0x0100]},

View File

@ -977,6 +977,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html, opts) opts.preprocess_html, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor, oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print, input_encoding=encoding) pretty_print=opts.pretty_print, input_encoding=encoding)
if not populate: if not populate:

View File

@ -78,6 +78,8 @@ class DocAnalysis(object):
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL) linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html': elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
linere = re.compile('.*?\n', re.DOTALL)
self.lines = linere.findall(raw) self.lines = linere.findall(raw)
def line_length(self, percent): def line_length(self, percent):
@ -561,8 +563,8 @@ class HTMLPreProcessor(object):
html = html.replace(start, '<!--') html = html.replace(start, '<!--')
html = html.replace(stop, '-->') html = html.replace(stop, '-->')
# convert ellipsis to entities to prevent wrapping # convert ellipsis to entities to prevent wrapping
html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html) html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
# convert double dashes to em-dash # convert double dashes to em-dash
html = re.sub('\s--\s', u'\u2014', html) html = re.sub(r'\s--\s', u'\u2014', html)
return substitute_entites(html) return substitute_entites(html)

View File

@ -154,7 +154,7 @@ class PreProcessor(object):
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
chapter_types = [ chapter_types = [
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
@ -184,7 +184,22 @@ class PreProcessor(object):
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters") self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
return html return html
def punctuation_unwrap(self, length, content, format):
    '''
    Join hard-wrapped lines whose ending suggests the sentence continues.

    A line break is removed (replaced by a single space) when it is
    preceded by at least ``length`` characters on the same line and the
    last of them is a lowercase letter, one of ``,:)``, a capital ``I`` or
    ``A``, or a semicolon that does not terminate a four-letter entity
    such as ``&nbsp;``.

    :param length: minimum number of characters that must precede the
        line-ending character for the break to count as a wrap.
    :param content: the text or HTML markup to process.
    :param format: ``'txt'`` to unwrap plain newlines; any other value
        unwraps closing/opening ``span``/``p``/``div`` tag pairs.
    :return: ``content`` with the matching line breaks replaced by spaces.
    '''
    # Characters that may end a wrapped (unfinished) line. Written as a
    # unicode literal so the accented letters are matched as characters.
    line_end_chars = u"a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)IA"
    # This is a look-behind assertion: the match starts *after* the
    # line-ending character. (?<!&\w{4}); is a semicolon that is not part
    # of an entity.
    lookbehind = u"(?<=.{%s}([%s]|(?<!&\\w{4});))" % (length, line_end_chars)
    line_ending = r"\s*</(span|p|div)>\s*(</(p|span|div)>)?"
    blanklines = r"\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
    line_opening = r"<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
    txt_line_wrap = u"(\u0020|\u0009)*\n"

    if format == 'txt':
        unwrap_regex = lookbehind + txt_line_wrap
    else:
        unwrap_regex = lookbehind + line_ending + blanklines + line_opening

    unwrap = re.compile(unwrap_regex, re.UNICODE)
    return unwrap.sub(' ', content)
def __call__(self, html): def __call__(self, html):
self.log("********* Preprocessing HTML *********") self.log("********* Preprocessing HTML *********")
@ -194,7 +209,7 @@ class PreProcessor(object):
totalwords = 0 totalwords = 0
totalwords = self.get_word_count(html) totalwords = self.get_word_count(html)
if totalwords < 20: if totalwords < 50:
self.log("not enough text, not preprocessing") self.log("not enough text, not preprocessing")
return html return html
@ -312,8 +327,7 @@ class PreProcessor(object):
self.log("Done dehyphenating") self.log("Done dehyphenating")
# Unwrap lines using punctuation and line length # Unwrap lines using punctuation and line length
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = self.punctuation_unwrap(length, html, 'html')
html = unwrap.sub(' ', html)
#check any remaining hyphens, but only unwrap if there is a match #check any remaining hyphens, but only unwrap if there is a match
dehyphenator = Dehyphenator() dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length) html = dehyphenator(html,'html_cleanup', length)

View File

@ -119,7 +119,7 @@ class HTMLFile(object):
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096])) self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
if not self.is_binary: if not self.is_binary:
if encoding is None: if not encoding:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
self.encoding = encoding self.encoding = encoding
else: else:

View File

@ -139,7 +139,7 @@ class BookHeader(object):
65001: 'utf-8', 65001: 'utf-8',
}[self.codepage] }[self.codepage]
except (IndexError, KeyError): except (IndexError, KeyError):
self.codec = 'cp1252' if user_encoding is None else user_encoding self.codec = 'cp1252' if not user_encoding else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage, log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec)) self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \ if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \

View File

@ -1892,7 +1892,7 @@ class OEBBook(object):
return fix_data(data.decode(bom_enc)) return fix_data(data.decode(bom_enc))
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
if self.input_encoding is not None: if self.input_encoding:
try: try:
return fix_data(data.decode(self.input_encoding, 'replace')) return fix_data(data.decode(self.input_encoding, 'replace'))
except UnicodeDecodeError: except UnicodeDecodeError:

View File

@ -65,9 +65,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
txt_plugin = plugin_for_input_format('txt') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options: for opt in txt_plugin.options:
if not hasattr(self.options, option.option.name): if not hasattr(self.options, opt.option.name):
setattr(self.options, option.name, option.recommended_value) setattr(self.options, opt.option.name, opt.recommended_value)
stream.seek(0) stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@ -31,9 +31,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
pdf_plugin = plugin_for_input_format('pdf') pdf_plugin = plugin_for_input_format('pdf')
for option in pdf_plugin.options: for opt in pdf_plugin.options:
if not hasattr(self.options, option.option.name): if not hasattr(self.options, opt.option.name):
setattr(self.options, option.name, option.recommended_value) setattr(self.options, opt.option.name, opt.recommended_value)
pdf.seek(0) pdf.seek(0)
return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {}) return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})

View File

@ -83,9 +83,9 @@ class Reader(FormatReader):
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
txt_plugin = plugin_for_input_format('txt') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options: for opt in txt_plugin.options:
if not hasattr(self.options, option.option.name): if not hasattr(self.options, opt.option.name):
setattr(self.options, option.name, option.recommended_value) setattr(self.options, opt.option.name, opt.recommended_value)
stream.seek(0) stream.seek(0)
return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@ -26,9 +26,9 @@ class TCRInput(InputFormatPlugin):
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
txt_plugin = plugin_for_input_format('txt') txt_plugin = plugin_for_input_format('txt')
for option in txt_plugin.options: for opt in txt_plugin.options:
if not hasattr(options, option.option.name): if not hasattr(self.options, opt.option.name):
setattr(options, option.name, option.recommended_value) setattr(self.options, opt.option.name, opt.recommended_value)
stream.seek(0) stream.seek(0)
return txt_plugin.convert(stream, options, return txt_plugin.convert(stream, options,

View File

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from calibre import prepare_string_for_xml
class TXTHeuristicProcessor(object):
    '''
    Convert plain text to HTML while guessing inline formatting.

    Common abbreviations and text wrapped in plain-text emphasis markers
    (``_word_``, ``*word*``, ``/word/`` ...) are wrapped in ``<i>`` tags.
    '''

    def __init__(self):
        # Abbreviations and foreign phrases that are set in italics.
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        # Plain-text emphasis conventions; the emphasised text is captured
        # in the named group ``words``. Applied in this order.
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
        ]

    def _italicize_words(self, paragraph):
        '''
        Wrap every stand-alone occurrence of an ITALICIZE_WORDS entry in <i>.
        '''
        # Longest entries first so that 'PPS.' wins over 'PS.' and
        # 'i.e.' over 'ie.' when both could start at the same position.
        words = sorted((w for w in self.ITALICIZE_WORDS if w), key=len, reverse=True)
        if not words:
            return paragraph
        alternatives = '|'.join(re.escape(w) for w in words)
        # The look-arounds require a non-word character (or the string
        # edge) on both sides, so 'ie.' no longer matches inside 'movie.'
        # nor 'eg.' inside 'leg.'. A single pass also guarantees that an
        # already wrapped word is never wrapped again.
        pattern = re.compile(r'(?u)(?<!\w)(?:%s)(?!\w)' % alternatives)
        return pattern.sub(lambda mo: '<i>%s</i>' % mo.group(0), paragraph)

    def process_paragraph(self, paragraph):
        '''
        Return ``paragraph`` with heuristic italic markup applied.

        :param paragraph: XML-escaped text of a single paragraph.
        :return: the paragraph with ``<i>...</i>`` inserted around known
            abbreviations and around text enclosed in emphasis markers.
        '''
        paragraph = self._italicize_words(paragraph)

        for pat in self.ITALICIZE_STYLE_PATS:
            # A callable replacement keeps backslashes in the captured
            # text from being interpreted as group references.
            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)

        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        '''
        Convert ``txt`` to an HTML document using heuristic formatting.

        :param txt: the plain text to convert.
        :param title: value substituted into the HTML template's title.
        :param epub_split_size_kb: passed through to ``split_txt``.
        :return: the generated HTML as a string.
        '''
        # Imported here rather than at module level because
        # calibre.ebooks.txt.processor itself imports this module.
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        processed = []
        # A blank line separates paragraphs; newlines inside a paragraph
        # are folded into spaces. Escaping happens before the italic
        # markup is inserted so the inserted tags are not escaped.
        for line in txt.split('\n\n'):
            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))

        txt = u'\n'.join(processed)
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)

        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        # NOTE(review): the meaning of the trailing False argument is
        # defined by PreProcessor.markup_chapters - confirm there.
        html = pp.markup_chapters(html, pp.get_word_count(html), False)

        return html

View File

@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type preserve_spaces, detect_paragraph_type, detect_formatting_type, \
convert_heuristic
from calibre import _ent_pat, xml_entity_to_unicode from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -24,18 +25,22 @@ class TXTInput(InputFormatPlugin):
OptionRecommendation(name='paragraph_type', recommended_value='auto', OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print'], choices=['auto', 'block', 'single', 'print'],
help=_('Paragraph structure.\n' help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
'* auto: Try to auto detect paragraph type.\n' '* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n' '* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n' '* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab ' '* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.')), 'starts a paragraph.'
'* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
OptionRecommendation(name='formatting_type', recommended_value='auto', OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'markdown'], choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.' help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n' '* auto: Automatically decide which formatting processor to use.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* none: Do not process the document formatting. Everything is a '
'* markdown: Run the input though the markdown pre-processor. ' 'paragraph and no styling is applied.\n'
'* heuristic: Process using heuristics to determine formatting such '
'as chapter headings and italic text.\n'
'* markdown: Processing using markdown formatting. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False, OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. ' help=_('Normally extra spaces are condensed into a single space. '
@ -90,13 +95,28 @@ class TXTInput(InputFormatPlugin):
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
# single and print are transformed to block for processing. # single and print are transformed to block for processing.
if options.paragraph_type == 'single': if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print': elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt) txt = separate_paragraphs_print_formatted(txt)
if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
# get length
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
# unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
flow_size = getattr(options, 'flow_size', 0) flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size)
if options.formatting_type == 'heuristic':
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
else:
html = convert_basic(txt, epub_split_size_kb=flow_size)
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')

View File

@ -9,6 +9,8 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -16,7 +18,7 @@ __docformat__ = 'restructuredtext en'
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>' HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def convert_basic(txt, title='', epub_split_size_kb=0): def clean_txt(txt):
if isbytestring(txt): if isbytestring(txt):
txt = txt.decode('utf-8', 'replace') txt = txt.decode('utf-8', 'replace')
# Strip whitespace from the beginning and end of the line. Also replace # Strip whitespace from the beginning and end of the line. Also replace
@ -35,6 +37,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
illegal_chars = re.compile(u'|'.join(map(unichr, chars))) illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
txt = illegal_chars.sub('', txt) txt = illegal_chars.sub('', txt)
return txt
def split_txt(txt, epub_split_size_kb=0):
#Takes care if there is no point to split #Takes care if there is no point to split
if epub_split_size_kb > 0: if epub_split_size_kb > 0:
if isinstance(txt, unicode): if isinstance(txt, unicode):
@ -49,6 +55,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
if isbytestring(txt): if isbytestring(txt):
txt = txt.decode('utf-8') txt = txt.decode('utf-8')
return txt
def convert_basic(txt, title='', epub_split_size_kb=0):
txt = clean_txt(txt)
txt = split_txt(txt, epub_split_size_kb)
lines = [] lines = []
# Split into paragraphs based on having a blank line between text. # Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'): for line in txt.split('\n\n'):
@ -57,6 +69,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
return HTML_TEMPLATE % (title, u'\n'.join(lines)) return HTML_TEMPLATE % (title, u'\n'.join(lines))
def convert_heuristic(txt, title='', epub_split_size_kb=0):
    '''
    Convert plain text to HTML with a fresh TXTHeuristicProcessor.

    All three arguments are handed unchanged to
    TXTHeuristicProcessor.convert and its result is returned.
    '''
    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
def convert_markdown(txt, title='', disable_toc=False): def convert_markdown(txt, title='', disable_toc=False):
md = markdown.Markdown( md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'], extensions=['footnotes', 'tables', 'toc'],
@ -101,27 +117,36 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph. single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached. and ends when a new paragraph is reached.
markdown: Markdown formatting is in the document. unformatted: most lines have hard line breaks, few/no spaces or indents
returns block, single, print, markdown returns block, single, print, unformatted
''' '''
txt = txt.replace('\r\n', '\n') txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n') txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
# Check for print # Check for hard line breaks - true if 55% of the doc breaks in the same region
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) docanalysis = DocAnalysis('txt', txt)
if tab_line_count / float(txt_line_count) >= .25: hardbreaks = docanalysis.line_histogram(.55)
return 'print'
# Check for block if hardbreaks:
empty_line_count = len(re.findall('(?mu)^\s*$', txt)) # Check for print
if empty_line_count / float(txt_line_count) >= .25: tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
return 'block' if tab_line_count / float(txt_line_count) >= .15:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .15:
return 'block'
# Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
# Nothing else matched to assume single. # return single if hardbreaks is false
return 'single' return 'single'
def detect_formatting_type(txt): def detect_formatting_type(txt):
# Check for markdown # Check for markdown
# Headings # Headings
@ -143,4 +168,4 @@ def detect_formatting_type(txt):
if txt.count('\\'+c) > 10: if txt.count('\\'+c) > 10:
return 'markdown' return 'markdown'
return 'none' return 'heuristic'

View File

@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into plain text Transform OEB content into plain text
''' '''
import os
import re import re
from lxml import etree from lxml import etree
@ -33,6 +32,15 @@ BLOCK_STYLES = [
'block', 'block',
] ]
# Names of the six HTML heading elements, h1 through h6.
HEADING_TAGS = 'h1 h2 h3 h4 h5 h6'.split()
SPACE_TAGS = [ SPACE_TAGS = [
'td', 'td',
'br', 'br',
@ -47,6 +55,10 @@ class TXTMLizer(object):
self.log.info('Converting XHTML to TXT...') self.log.info('Converting XHTML to TXT...')
self.oeb_book = oeb_book self.oeb_book = oeb_book
self.opts = opts self.opts = opts
self.toc_ids = []
self.last_was_heading = False
self.create_flat_toc(self.oeb_book.toc)
return self.mlize_spine() return self.mlize_spine()
@ -58,8 +70,11 @@ class TXTMLizer(object):
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content) content = self.remove_newlines(content)
output += self.dump_text(etree.fromstring(content), stylizer) output += self.dump_text(etree.fromstring(content), stylizer, item)
output = self.cleanup_text(u''.join(output)) output += '\n\n\n\n\n\n'
output = u''.join(output)
output = u'\n'.join(l.rstrip() for l in output.splitlines())
output = self.cleanup_text(output)
return output return output
@ -68,6 +83,8 @@ class TXTMLizer(object):
text = text.replace('\r\n', ' ') text = text.replace('\r\n', ' ')
text = text.replace('\n', ' ') text = text.replace('\n', ' ')
text = text.replace('\r', ' ') text = text.replace('\r', ' ')
# Condense redundant spaces created by replacing newlines with spaces.
text = re.sub(r'[ ]{2,}', ' ', text)
return text return text
@ -80,6 +97,14 @@ class TXTMLizer(object):
toc.append(u'* %s\n\n' % item.title) toc.append(u'* %s\n\n' % item.title)
return ''.join(toc) return ''.join(toc)
def create_flat_toc(self, nodes):
    '''
    Flatten a TOC tree into self.toc_ids.

    Each entry's href is appended to self.toc_ids before its children
    (entry.nodes) are visited, so the resulting list is in depth-first,
    parent-before-child order.

    @nodes: An iterable of TOC entries, each exposing href and nodes.
    '''
    collected = self.toc_ids
    for entry in nodes:
        collected.append(entry.href)
        # Children are handled by the same method so nesting of any depth
        # ends up in the one flat list.
        self.create_flat_toc(entry.nodes)
def cleanup_text(self, text): def cleanup_text(self, text):
self.log.debug('\tClean up text...') self.log.debug('\tClean up text...')
# Replace bad characters. # Replace bad characters.
@ -92,7 +117,7 @@ class TXTMLizer(object):
text = text.replace('\f+', ' ') text = text.replace('\f+', ' ')
# Single line paragraph. # Single line paragraph.
text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text) text = re.sub('(?<=.)\n(?=.)', ' ', text)
# Remove multiple spaces. # Remove multiple spaces.
text = re.sub('[ ]{2,}', ' ', text) text = re.sub('[ ]{2,}', ' ', text)
@ -101,13 +126,19 @@ class TXTMLizer(object):
text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub('\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing: if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text) text = re.sub('\n{2,}', '\n', text)
text = re.sub('(?imu)^(?=.)', '\t', text) text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
else: else:
text = re.sub('\n{3,}', '\n\n', text) text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
# Replace spaces at the beginning and end of lines # Replace spaces at the beginning and end of lines
# We don't replace tabs because those are only added
# when remove paragraph spacing is enabled.
text = re.sub('(?imu)^[ ]+', '', text) text = re.sub('(?imu)^[ ]+', '', text)
text = re.sub('(?imu)[ ]+$', '', text) text = re.sub('(?imu)[ ]+$', '', text)
# Remove empty space and newlines at the beginning of the document.
text = re.sub(r'(?u)^[ \n]+', '', text)
if self.opts.max_line_length: if self.opts.max_line_length:
max_length = self.opts.max_line_length max_length = self.opts.max_line_length
@ -145,13 +176,11 @@ class TXTMLizer(object):
return text return text
def dump_text(self, elem, stylizer, end=''): def dump_text(self, elem, stylizer, page):
''' '''
@elem: The element in the etree that we are working on. @elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element. @stylizer: The style information attached to the element.
@end: The last two characters of the text from the previous element. @page: OEB page used to determine absolute urls.
This is used to determine if a blank line is needed when starting
a new block element.
''' '''
if not isinstance(elem.tag, basestring) \ if not isinstance(elem.tag, basestring) \
@ -170,13 +199,22 @@ class TXTMLizer(object):
return [''] return ['']
tag = barename(elem.tag) tag = barename(elem.tag)
tag_id = elem.attrib.get('id', None)
in_block = False in_block = False
in_heading = False
# Are we in a heading?
# This can either be a heading tag or a TOC item.
if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
in_heading = True
if not self.last_was_heading:
text.append('\n\n\n\n\n\n')
# Are we in a paragraph block? # Are we in a paragraph block?
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
if self.opts.remove_paragraph_spacing and not in_heading:
text.append(u'\t')
in_block = True in_block = True
if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
text.append(u'\n\n')
if tag in SPACE_TAGS: if tag in SPACE_TAGS:
text.append(u' ') text.append(u' ')
@ -185,14 +223,17 @@ class TXTMLizer(object):
if hasattr(elem, 'text') and elem.text: if hasattr(elem, 'text') and elem.text:
text.append(elem.text) text.append(elem.text)
# Recurse down into tags within the tag we are in.
for item in elem: for item in elem:
en = u'' text += self.dump_text(item, stylizer, page)
if len(text) >= 2:
en = text[-1][-2:]
text += self.dump_text(item, stylizer, en)
if in_block: if in_block:
text.append(u'\n\n') text.append(u'\n\n')
if in_heading:
text.append(u'\n')
self.last_was_heading = True
else:
self.last_was_heading = False
if hasattr(elem, 'tail') and elem.tail: if hasattr(elem, 'tail') and elem.tail:
text.append(elem.tail) text.append(elem.tail)

View File

@ -146,6 +146,8 @@ class Widget(QWidget):
codecs.lookup(ans) codecs.lookup(ans)
except: except:
ans = '' ans = ''
if not ans:
ans = None
return ans return ans
elif isinstance(g, QComboBox): elif isinstance(g, QComboBox):
return unicode(g.currentText()) return unicode(g.currentText())

View File

@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \
from calibre.gui2.wizard.send_email_ui import Ui_Form from calibre.gui2.wizard.send_email_ui import Ui_Form
from calibre.utils.smtp import config as smtp_prefs from calibre.utils.smtp import config as smtp_prefs
from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog
from calibre.gui2 import error_dialog from calibre.gui2 import error_dialog, question_dialog
class TestEmail(QDialog, TE_Dialog): class TestEmail(QDialog, TE_Dialog):
@ -92,7 +92,10 @@ class SendEmail(QWidget, Ui_Form):
pa = self.preferred_to_address() pa = self.preferred_to_address()
to_set = pa is not None to_set = pa is not None
if self.set_email_settings(to_set): if self.set_email_settings(to_set):
TestEmail(pa, self).exec_() if question_dialog(self, _('OK to proceed?'),
_('This will display your email password on the screen'
'. Is it OK to proceed?'), show_copy_button=False):
TestEmail(pa, self).exec_()
def test_email_settings(self, to): def test_email_settings(self, to):
opts = smtp_prefs().parse() opts = smtp_prefs().parse()

View File

@ -274,13 +274,15 @@ class ResultCache(SearchQueryParser): # {{{
loc = self.field_metadata[location]['rec_index'] loc = self.field_metadata[location]['rec_index']
if query == 'false': if query == 'false':
for item in [self._data[id] for id in candidates]: for id_ in candidates:
item = self._data[id_]
if item is None: continue if item is None: continue
if item[loc] is None or item[loc] <= UNDEFINED_DATE: if item[loc] is None or item[loc] <= UNDEFINED_DATE:
matches.add(item[0]) matches.add(item[0])
return matches return matches
if query == 'true': if query == 'true':
for item in [self._data[id] for id in candidates]: for id_ in candidates:
item = self._data[id_]
if item is None: continue if item is None: continue
if item[loc] is not None and item[loc] > UNDEFINED_DATE: if item[loc] is not None and item[loc] > UNDEFINED_DATE:
matches.add(item[0]) matches.add(item[0])
@ -319,7 +321,8 @@ class ResultCache(SearchQueryParser): # {{{
field_count = query.count('-') + 1 field_count = query.count('-') + 1
else: else:
field_count = query.count('/') + 1 field_count = query.count('/') + 1
for item in [self._data[id] for id in candidates]: for id_ in candidates:
item = self._data[id_]
if item is None or item[loc] is None: continue if item is None or item[loc] is None: continue
if relop(item[loc], qd, field_count): if relop(item[loc], qd, field_count):
matches.add(item[0]) matches.add(item[0])
@ -381,7 +384,8 @@ class ResultCache(SearchQueryParser): # {{{
except: except:
return matches return matches
for item in [self._data[id] for id in candidates]: for id_ in candidates:
item = self._data[id_]
if item is None: if item is None:
continue continue
v = val_func(item) v = val_func(item)
@ -481,7 +485,8 @@ class ResultCache(SearchQueryParser): # {{{
else: else:
q = query q = query
for item in [self._data[id] for id in candidates]: for id_ in candidates:
item = self._data[id_]
if item is None: continue if item is None: continue
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak

View File

@ -2861,25 +2861,17 @@ class EPUB_MOBI(CatalogPlugin):
self.updateProgressMicroStep("Thumbnail %d of %d" % \ self.updateProgressMicroStep("Thumbnail %d of %d" % \
(i,len(self.booksByTitle)), (i,len(self.booksByTitle)),
i/float(len(self.booksByTitle))) i/float(len(self.booksByTitle)))
# Check to see if source file exists
if 'cover' in title and os.path.isfile(title['cover']):
# Add the thumb spec to thumbs[]
thumbs.append("thumbnail_%d.jpg" % int(title['id']))
# Check to see if thumbnail exists thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id'])) thumb_generated = True
thumb_file = 'thumbnail_%d.jpg' % int(title['id']) try:
if os.path.isfile(thumb_fp): self.generateThumbnail(title, image_dir, thumb_file)
# Check to see if cover is newer than thumbnail thumbs.append("thumbnail_%d.jpg" % int(title['id']))
# os.path.getmtime() = modified time except:
# os.path.ctime() = creation time thumb_generated = False
cover_timestamp = os.path.getmtime(title['cover'])
thumb_timestamp = os.path.getmtime(thumb_fp)
if thumb_timestamp < cover_timestamp: if not thumb_generated:
self.generateThumbnail(title, image_dir, thumb_file)
else:
self.generateThumbnail(title, image_dir, thumb_file)
else:
# Use default cover # Use default cover
if False and self.verbose: if False and self.verbose:
self.opts.log.warn(" using default cover for '%s'" % \ self.opts.log.warn(" using default cover for '%s'" % \

View File

@ -722,10 +722,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
mi.uuid = row[fm['uuid']] mi.uuid = row[fm['uuid']]
mi.title_sort = row[fm['sort']] mi.title_sort = row[fm['sort']]
formats = row[fm['formats']] formats = row[fm['formats']]
if hasattr(formats, 'split'): if not formats:
mi.formats = formats.split(',') formats = None
else: mi.formats = formats
mi.formats = None
tags = row[fm['tags']] tags = row[fm['tags']]
if tags: if tags:
mi.tags = [i.strip() for i in tags.split(',')] mi.tags = [i.strip() for i in tags.split(',')]
@ -737,7 +736,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
id = idx if index_is_id else self.id(idx) id = idx if index_is_id else self.id(idx)
mi.application_id = id mi.application_id = id
mi.id = id mi.id = id
for key,meta in self.field_metadata.custom_iteritems(): for key, meta in self.field_metadata.custom_iteritems():
mi.set_user_metadata(key, meta) mi.set_user_metadata(key, meta)
mi.set(key, val=self.get_custom(idx, label=meta['label'], mi.set(key, val=self.get_custom(idx, label=meta['label'],
index_is_id=index_is_id), index_is_id=index_is_id),
@ -878,16 +877,17 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def formats(self, index, index_is_id=False, verify_formats=True): def formats(self, index, index_is_id=False, verify_formats=True):
''' Return available formats as a comma separated list or None if there are no available formats ''' ''' Return available formats as a comma separated list or None if there are no available formats '''
id = index if index_is_id else self.id(index) id_ = index if index_is_id else self.id(index)
formats = self.data.get(id, self.FIELD_MAP['formats'], row_is_id = True) formats = self.data.get(id_, self.FIELD_MAP['formats'], row_is_id=True)
if not formats: if not formats:
return None return None
if not verify_formats: if not verify_formats:
return ','.join(formats) return formats
formats = formats.split(',')
ans = [] ans = []
for format in formats: for fmt in formats:
if self.format_abspath(id, format, index_is_id=True) is not None: if self.format_abspath(id_, fmt, index_is_id=True) is not None:
ans.append(format) ans.append(fmt)
if not ans: if not ans:
return None return None
return ','.join(ans) return ','.join(ans)

View File

@ -533,17 +533,23 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag
Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap
paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length
at which a line should be unwrapped. Valid values are a decimal at which a line should be unwrapped. Valid values are a decimal
between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more
text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input. text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
Also, they often have headers and footers as part of the document that will become included with the text. Also, they often have headers and footers as part of the document that will become included with the text.
Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
removed from the text it can throw off the paragraph unwrapping. removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read
:ref:`regexptutorial`.
Some limitations of PDF input is complex, multi-column, and image based documents are not supported. Some limitations of PDF input are:
Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to
represent double ll or doubfle ff or fi,etc. Conversion of these may or may not work depending on jusy how they are * Complex, multi-column, and image based documents are not supported.
represented internally in the PDF. * Extraction of vector images and tables from within the document is also not supported.
* Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
* Some PDFs store their images upside down with a rotation instruction. |app| currently doesn't support that instruction, so the images will be upside down in the output as well.
To reiterate: **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
output ranging anywhere from decent to unusable, depending on the input PDF.
Comic Book Collections Comic Book Collections
~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~

File diff suppressed because it is too large Load Diff

View File

@ -190,9 +190,11 @@ class SearchQueryParser(object):
# recursive search test list. However, we permit seeing the # recursive search test list. However, we permit seeing the
# same search a few times because the search might appear within # same search a few times because the search might appear within
# another search. # another search.
def _parse(self, query, candidates): def _parse(self, query, candidates=None):
self.recurse_level += 1 self.recurse_level += 1
res = self._parser.parseString(query)[0] res = self._parser.parseString(query)[0]
if candidates is None:
candidates = self.universal_set()
t = self.evaluate(res, candidates) t = self.evaluate(res, candidates)
self.recurse_level -= 1 self.recurse_level -= 1
return t return t
@ -248,15 +250,16 @@ class SearchQueryParser(object):
else: else:
return self.get_matches(location, query) return self.get_matches(location, query)
def get_matches(self, location, query): def get_matches(self, location, query, candidates=None):
''' '''
Should return the set of matches for :param:'location` and :param:`query`. Should return the set of matches for :param:'location` and :param:`query`.
If you set the optimized parameter in __init__, this method must accept
a named parameter 'candidates' The search must be performed over all entries if :param:`candidates` is
None; otherwise only over the items in candidates.
:param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`. :param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
:param:`query` is a string literal. :param:`query` is a string literal.
:param: optional named parameter candidates, a set of items to check. :param: None or a subset of the set returned by :meth:`universal_set`.
''' '''
return set([]) return set([])