Merge from trunk

2025-07-09 03:04:10 -04:00 · 2011-01-09 09:18:56 +00:00 · 2011-01-09 09:18:56 +00:00 · 17db6d5c97
commit 17db6d5c97
parent 6a745b68c1 e4e2b2f467
28 changed files with 654 additions and 411 deletions
--- a/resources/images/news/exiled.png
+++ b/resources/images/news/exiled.png
--- a/resources/recipes/deia.recipe
+++ b/resources/recipes/deia.recipe
@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
 	cover_url		='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
 	timefmt			='[%a, %d %b, %Y]'
 	encoding		='utf8'
-	language		='es_ES'
+	language		='es'
 	remove_javascript	=True
 	remove_tags_after	=dict(id='Texto')
 	remove_tags_before	=dict(id='Texto')
--- a/resources/recipes/el_publico.recipe
+++ b/resources/recipes/el_publico.recipe
@ -0,0 +1,43 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__author__    = 'Gerardo Diez'
+__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
+description   = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
+__docformat__ = 'restructuredtext en'
+
+'''
+publico.es
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Publico(BasicNewsRecipe):
+    '''
+    Recipe for publico.es, built from the per-section RSS feeds listed in
+    ``feeds``.
+    '''
+    title               =u'Publico.es'
+    __author__      ='Gerardo Diez'
+    publisher       =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
+    category                ='news, politics, finances, world, spain, science, catalunya'
+    oldest_article      =1
+    max_articles_per_feed   =100
+    simultaneous_downloads  =10
+    cover_url       =u'http://imagenes.publico.es/css/img/logo_publico.gif'
+    timefmt         ='[%a, %d %b, %Y]'
+    encoding        ='utf8'
+    language        ='es'
+    remove_javascript   =True
+    no_stylesheets      =True
+    # NOTE(review): a single dict rather than a list of dicts; presumably
+    # BasicNewsRecipe accepts both forms - confirm.
+    keep_only_tags      =dict(id='main')
+    # Elements matched by class or id that are removed from each article;
+    # judging by the names these are mostly comment sections and a toolbar.
+    remove_tags         =[
+                            dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
+                            dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
+                            dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
+                            dict(name='h5', attrs={'id':'comentarios'})
+                            ]
+    # (section title, RSS feed URL) pairs.
+    feeds               =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
+                 (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
+                 (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
+                 (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
+                 (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
+                 (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
+                 (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
+                 (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
+                 (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]
+
+
--- a/resources/recipes/elpais_impreso.recipe
+++ b/resources/recipes/elpais_impreso.recipe
@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
    no_stylesheets        = True
    encoding              = 'cp1252'
    use_embedded_content  = False
-    language              = 'es_ES'
+    language              = 'es'
    remove_empty_feeds    = True
    publication_type      = 'newspaper'
    masthead_url          = 'http://www.elpais.com/im/tit_logo.gif'
--- a/resources/recipes/exiled.recipe
+++ b/resources/recipes/exiled.recipe
@ -1,7 +1,5 @@
-#!/usr/bin/env  python
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 exiledonline.com
 '''
@ -21,17 +19,19 @@ class Exiled(BasicNewsRecipe):
    encoding              = 'utf8'
    remove_javascript     = True
    language              = 'en'
+    publication_type      = 'newsblog'
+    masthead_url          = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
+    extra_css             = """
+                               body{font-family: Arial,Helvetica,sans-serif}
+                               #topslug{font-size: xx-large; font-weight: bold; color: red}                               
+                            """
    
-    cover_url             = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
-
-    html2lrf_options = [
-                          '--comment'       , description
-                        , '--base-font-size', '10'
-                        , '--category'      , category
-                        , '--publisher'     , publisher
-                        ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }

    keep_only_tags = [dict(name='div', attrs={'id':'main'})]

@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
-        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
-        soup.head.insert(0,mtag)
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+               tstr = alink.string
+               alink.replaceWith(tstr)
        return soup

    def get_article_url(self, article):
        raw = article.get('link',  None)
        final = raw + 'all/1/'
        return final
-
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@ -29,7 +29,7 @@ class ANDROID(USBMS):
            # Motorola
            0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
                       0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
-                       0x4286 : [0x216] },
+                       0x4286 : [0x216], 0x42b3 : [0x216] },

            # Sony Ericsson
            0xfce : { 0xd12e : [0x0100]},
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -977,6 +977,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
    from calibre.ebooks.oeb.base import OEBBook
    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
            opts.preprocess_html, opts)
+    if not encoding:
+        encoding = None
    oeb = OEBBook(log, html_preprocessor,
            pretty_print=opts.pretty_print, input_encoding=encoding)
    if not populate:
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -78,6 +78,8 @@ class DocAnalysis(object):
            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n', re.DOTALL)
        self.lines = linere.findall(raw)

    def line_length(self, percent):
@ -561,8 +563,8 @@ class HTMLPreProcessor(object):
        html = html.replace(start, '<!--')
        html = html.replace(stop, '-->')
        # convert ellipsis to entities to prevent wrapping
-        html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+        html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
        # convert double dashes to em-dash
-        html = re.sub('\s--\s', u'\u2014', html)
+        html = re.sub(r'\s--\s', u'\u2014', html)
        return substitute_entites(html)

--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -154,7 +154,7 @@ class PreProcessor(object):
        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"

        chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
@ -184,6 +184,21 @@ class PreProcessor(object):
        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
        return html

+    def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwrap hard-wrapped lines based on line length and punctuation.
+
+        A wrap point is only matched when it is preceded, on the same line,
+        by `length` characters followed by one character from the
+        letter/punctuation class below, or by a semicolon that does not
+        close a four-letter entity. Each matched wrap is replaced with a
+        single space.
+
+        @length: Number of characters required before the final character.
+        @content: The text or markup to unwrap.
+        @format: 'txt' matches trailing spaces/tabs and a newline; any other
+                 value matches closing span/p/div tags, up to three empty
+                 blocks and the following opening tags.
+        '''
+        # Every piece is a unicode literal. The character class contains
+        # non-ASCII letters and a \u00DF escape; as a byte string the escape
+        # is not interpreted and the letters get decoded with whatever the
+        # interpreter's default encoding happens to be once the pieces are
+        # joined with the unicode ones.
+        # (?<!\&\w{4});) is a semicolon not part of an entity
+        lookbehind = u"(?<=.{" + str(length) + u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))"
+        line_ending = u"\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        blanklines = u"\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
+        line_opening = u"<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        txt_line_wrap = u"(\u0020|\u0009)*\n"
+
+        if format == 'txt':
+            unwrap_regex = lookbehind + txt_line_wrap
+        else:
+            unwrap_regex = lookbehind + line_ending + blanklines + line_opening
+
+        unwrap = re.compile(unwrap_regex, re.UNICODE)
+        return unwrap.sub(' ', content)
       

    def __call__(self, html):
@ -194,7 +209,7 @@ class PreProcessor(object):
        totalwords = 0
        totalwords = self.get_word_count(html)

-        if totalwords < 20:
+        if totalwords < 50:
            self.log("not enough text, not preprocessing")
            return html

@ -312,8 +327,7 @@ class PreProcessor(object):
            self.log("Done dehyphenating")
            # Unwrap lines using punctation and line length
            #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-            html = unwrap.sub(' ', html)
+            html = self.punctuation_unwrap(length, html, 'html')
            #check any remaining hyphens, but only unwrap if there is a match
            dehyphenator = Dehyphenator()
            html = dehyphenator(html,'html_cleanup', length)
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -119,7 +119,7 @@ class HTMLFile(object):

        self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
        if not self.is_binary:
-            if encoding is None:
+            if not encoding:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -139,7 +139,7 @@ class BookHeader(object):
                    65001: 'utf-8',
                    }[self.codepage]
            except (IndexError, KeyError):
-                self.codec = 'cp1252' if user_encoding is None else user_encoding
+                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
            if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -1892,7 +1892,7 @@ class OEBBook(object):
                return fix_data(data.decode(bom_enc))
            except UnicodeDecodeError:
                pass
-        if self.input_encoding is not None:
+        if self.input_encoding:
            try:
                return fix_data(data.decode(self.input_encoding, 'replace'))
            except UnicodeDecodeError:
--- a/src/calibre/ebooks/pdb/palmdoc/reader.py
+++ b/src/calibre/ebooks/pdb/palmdoc/reader.py
@ -65,9 +65,9 @@ class Reader(FormatReader):
        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
-        for option in txt_plugin.options:
-            if not hasattr(self.options, option.option.name):
-                setattr(self.options, option.name, option.recommended_value)
+        for opt in txt_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@ -31,9 +31,9 @@ class Reader(FormatReader):
        from calibre.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
-        for option in pdf_plugin.options:
-            if not hasattr(self.options, option.option.name):
-                setattr(self.options, option.name, option.recommended_value)
+        for opt in pdf_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(self.options, opt.option.name, opt.recommended_value)

        pdf.seek(0)
        return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})
--- a/src/calibre/ebooks/pdb/ztxt/reader.py
+++ b/src/calibre/ebooks/pdb/ztxt/reader.py
@ -83,9 +83,9 @@ class Reader(FormatReader):
        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
-        for option in txt_plugin.options:
-            if not hasattr(self.options, option.option.name):
-                setattr(self.options, option.name, option.recommended_value)
+        for opt in txt_plugin.options:
+            if not hasattr(self.options, opt.option.name):
+                setattr(self.options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
--- a/src/calibre/ebooks/tcr/input.py
+++ b/src/calibre/ebooks/tcr/input.py
@ -26,9 +26,9 @@ class TCRInput(InputFormatPlugin):
        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
-        for option in txt_plugin.options:
-            if not hasattr(options, option.option.name):
-                setattr(options, option.name, option.recommended_value)
+        # Fill in any TXT input option missing from the `options` object
+        # that is handed to txt_plugin.convert() below.
+        for opt in txt_plugin.options:
+            if not hasattr(options, opt.option.name):
+                setattr(options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, options,
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre import prepare_string_for_xml
+
+class TXTHeuristicProcessor(object):
+    '''
+    Convert plain text to HTML, italicizing known abbreviations and text
+    wrapped in plain-text emphasis markers.
+    '''
+
+    def __init__(self):
+        # Abbreviations and phrases wrapped in <i> wherever they occur.
+        self.ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+        # Emphasis markers; the text captured in the `words` group replaces
+        # the whole match, wrapped in <i>. Applied in list order.
+        self.ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>[^<>]+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+        ]
+
+    def process_paragraph(self, paragraph):
+        '''
+        Return `paragraph` with every ITALICIZE_WORDS entry and every
+        ITALICIZE_STYLE_PATS match wrapped in <i> tags.
+        '''
+        for word in self.ITALICIZE_WORDS:
+            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
+        for pat in self.ITALICIZE_STYLE_PATS:
+            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
+        return paragraph
+
+    def convert(self, txt, title='', epub_split_size_kb=0):
+        '''
+        Build an HTML document from `txt`.
+
+        @txt: The text to convert; passed through clean_txt and split_txt.
+        @title: Value substituted for the title in HTML_TEMPLATE.
+        @epub_split_size_kb: Passed unchanged to split_txt.
+
+        Returns the HTML after PreProcessor.markup_chapters has run on it.
+        '''
+        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
+        txt = clean_txt(txt)
+        txt = split_txt(txt, epub_split_size_kb)
+
+        # Each blank-line separated block becomes one <p>; newlines inside a
+        # block are joined with spaces and the text is XML-escaped before
+        # the italics markup is added.
+        processed = []
+        for line in txt.split('\n\n'):
+            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
+
+        txt = u'\n'.join(processed)
+        # Collapse runs of spaces.
+        txt = re.sub('[ ]{2,}', ' ', txt)
+        html = HTML_TEMPLATE % (title, txt)
+
+        from calibre.ebooks.conversion.utils import PreProcessor
+        pp = PreProcessor()
+        html = pp.markup_chapters(html, pp.get_word_count(html), False)
+
+        return html
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-    preserve_spaces, detect_paragraph_type, detect_formatting_type
+    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
+    convert_heuristic
 from calibre import _ent_pat, xml_entity_to_unicode

 class TXTInput(InputFormatPlugin):
@ -24,18 +25,22 @@ class TXTInput(InputFormatPlugin):
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=['auto', 'block', 'single', 'print'],
            help=_('Paragraph structure.\n'
-                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
                   '* auto: Try to auto detect paragraph type.\n'
                   '* block: Treat a blank line as a paragraph break.\n'
                   '* single: Assume every line is a paragraph.\n'
                   '* print:  Assume every line starting with 2+ spaces or a tab '
-                   'starts a paragraph.')),
+                   'starts a paragraph.\n'
+                   '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
        OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'markdown'],
+            choices=['auto', 'none', 'heuristic', 'markdown'],
            help=_('Formatting used within the document.'
-                   '* auto: Try to auto detect the document formatting.\n'
-                   '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
-                   '* markdown: Run the input though the markdown pre-processor. '
+                   '* auto: Automatically decide which formatting processor to use.\n'
+                   '* none: Do not process the document formatting. Everything is a '
+                   'paragraph and no styling is applied.\n'
+                   '* heuristic: Process using heuristics to determine formatting such '
+                   'as chapter headings and italic text.\n'
+                   '* markdown: Processing using markdown formatting. '
                   'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
@ -90,14 +95,29 @@ class TXTInput(InputFormatPlugin):
            
            # We don't check for block because the processor assumes block.
            # single and print at transformed to block for processing.
-            if options.paragraph_type == 'single':
+            if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
                txt = separate_paragraphs_single_line(txt)
            elif options.paragraph_type == 'print':
                txt = separate_paragraphs_print_formatted(txt)

+            if options.paragraph_type == 'unformatted':
+                from calibre.ebooks.conversion.utils import PreProcessor
+                from calibre.ebooks.conversion.preprocess import DocAnalysis
+                # get length
+                docanalysis = DocAnalysis('txt', txt)
+                length = docanalysis.line_length(.5)
+                # unwrap lines based on punctuation
+                preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
+                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
+
            flow_size = getattr(options, 'flow_size', 0)
+            
+            if options.formatting_type == 'heuristic':
+                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
+            else:
                html = convert_basic(txt, epub_split_size_kb=flow_size)
            
+
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -9,6 +9,8 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
+from calibre.ebooks.conversion.preprocess import DocAnalysis

 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -16,7 +18,7 @@ __docformat__ = 'restructuredtext en'

 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

-def convert_basic(txt, title='', epub_split_size_kb=0):
+def clean_txt(txt):
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the beginning and end of the line. Also replace
@ -35,6 +37,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
    chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
    illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
    txt = illegal_chars.sub('', txt)
+    
+    return txt
+
+def split_txt(txt, epub_split_size_kb=0):
    #Takes care if there is no point to split
    if epub_split_size_kb > 0:
        if isinstance(txt, unicode):
@ -49,6 +55,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
    if isbytestring(txt):
        txt = txt.decode('utf-8')

+    return txt
+
+def convert_basic(txt, title='', epub_split_size_kb=0):
+    txt = clean_txt(txt)
+    txt = split_txt(txt, epub_split_size_kb)
+
    lines = []
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n\n'):
@ -57,6 +69,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):

    return HTML_TEMPLATE % (title, u'\n'.join(lines))

+def convert_heuristic(txt, title='', epub_split_size_kb=0):
+    '''
+    Convert txt to HTML with a fresh TXTHeuristicProcessor and return the
+    result of its convert() call.
+    '''
+    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
+
 def convert_markdown(txt, title='', disable_toc=False):
    md = markdown.Markdown(
          extensions=['footnotes', 'tables', 'toc'],
@ -101,27 +117,36 @@ def detect_paragraph_type(txt):
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
-    markdown: Markdown formatting is in the document.
+    unformatted: most lines have hard line breaks, few/no spaces or indents
    
-    returns block, single, print, markdown
+    returns block, single, print, unformatted
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
    
+    # Check for hard line breaks - true if 55% of the doc breaks in the same region
+    docanalysis = DocAnalysis('txt', txt)
+    hardbreaks = docanalysis.line_histogram(.55)
+    
+    if hardbreaks:
        # Check for print
        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-    if tab_line_count / float(txt_line_count) >= .25:
+        if tab_line_count / float(txt_line_count) >= .15:
            return 'print'
        
        # Check for block
        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-    if empty_line_count / float(txt_line_count) >= .25:
+        if empty_line_count / float(txt_line_count) >= .15:
            return 'block'

-    # Nothing else matched to assume single.
+        # Assume unformatted text with hardbreaks if nothing else matches        
+        return 'unformatted'
+    
+    # return single if hardbreaks is false
    return 'single'

+
 def detect_formatting_type(txt):
    # Check for markdown
    # Headings
@ -143,4 +168,4 @@ def detect_formatting_type(txt):
        if txt.count('\\'+c) > 10:
            return 'markdown'
    
-    return 'none'
+    return 'heuristic'
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
 Transform OEB content into plain text
 '''

-import os
 import re

 from lxml import etree
@ -33,6 +32,15 @@ BLOCK_STYLES = [
    'block',
 ]

+# Tag names that dump_text treats as headings.
+HEADING_TAGS = [
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+]
+
 SPACE_TAGS = [
    'td',
    'br',
@ -47,6 +55,10 @@ class TXTMLizer(object):
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
+        self.toc_ids = []
+        self.last_was_heading = False
+        
+        self.create_flat_toc(self.oeb_book.toc)

        return self.mlize_spine()

@ -58,8 +70,11 @@ class TXTMLizer(object):
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            content = self.remove_newlines(content)
-            output += self.dump_text(etree.fromstring(content), stylizer)
-        output = self.cleanup_text(u''.join(output))
+            output += self.dump_text(etree.fromstring(content), stylizer, item)
+            output += '\n\n\n\n\n\n'
+        output = u''.join(output)
+        output = u'\n'.join(l.rstrip() for l in output.splitlines())
+        output = self.cleanup_text(output)

        return output

@ -68,6 +83,8 @@ class TXTMLizer(object):
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
+        # Condense redundant spaces created by replacing newlines with spaces.
+        text = re.sub(r'[ ]{2,}', ' ', text)

        return text

@ -80,6 +97,14 @@ class TXTMLizer(object):
                toc.append(u'* %s\n\n' % item.title)
        return ''.join(toc)

+    def create_flat_toc(self, nodes):
+        '''
+        Append the href of every TOC entry reachable from nodes to
+        self.toc_ids, each entry before its own children.
+        '''
+        # Explicit stack; children are pushed in reverse so they are
+        # popped in their original order.
+        pending = list(nodes)
+        pending.reverse()
+        while pending:
+            entry = pending.pop()
+            self.toc_ids.append(entry.href)
+            children = list(entry.nodes)
+            children.reverse()
+            pending.extend(children)
+
    def cleanup_text(self, text):
        self.log.debug('\tClean up text...')
        # Replace bad characters.
@ -92,7 +117,7 @@ class TXTMLizer(object):
        text = text.replace('\f+', ' ')

        # Single line paragraph.
-        text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
+        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]{2,}', ' ', text)
@ -101,14 +126,20 @@ class TXTMLizer(object):
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
-            text = re.sub('(?imu)^(?=.)', '\t', text)
+            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
+            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
        else:
-            text = re.sub('\n{3,}', '\n\n', text)
+            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)

        # Replace spaces at the beginning and end of lines
+        # We don't replace tabs because those are only added
+        # when remove paragraph spacing is enabled.
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)
        
+        # Remove empty space and newlines at the beginning of the document.
+        text = re.sub(r'(?u)^[ \n]+', '', text)
+
        if self.opts.max_line_length:
            max_length = self.opts.max_line_length
            if self.opts.max_line_length < 25 and not self.opts.force_max_line_length:
@ -145,13 +176,11 @@ class TXTMLizer(object):

        return text

-    def dump_text(self, elem, stylizer, end=''):
+    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
-        @end: The last two characters of the text from the previous element.
-              This is used to determine if a blank line is needed when starting
-              a new block element.
+        @page: OEB page used to determine absolute urls.
        '''

        if not isinstance(elem.tag, basestring) \
@ -170,13 +199,22 @@ class TXTMLizer(object):
            return ['']

        tag = barename(elem.tag)
+        tag_id = elem.attrib.get('id', None)
        in_block = False
+        in_heading = False
+
+        # Are we in a heading?
+        # This can either be a heading tag or a TOC item.
+        if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
+            in_heading = True
+            if not self.last_was_heading:
+                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
+            if self.opts.remove_paragraph_spacing and not in_heading:
+                text.append(u'\t')
            in_block = True
-            if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
-                text.append(u'\n\n')

        if tag in SPACE_TAGS:
            text.append(u' ')
@ -185,14 +223,17 @@ class TXTMLizer(object):
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

+        # Recurse down into tags within the tag we are in.
        for item in elem:
-            en = u''
-            if len(text) >= 2:
-                en = text[-1][-2:]
-            text += self.dump_text(item, stylizer, en)
+            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append(u'\n\n')
+        if in_heading:
+            text.append(u'\n')
+            self.last_was_heading = True
+        else:
+            self.last_was_heading = False

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)
--- a/src/calibre/gui2/convert/init.py
+++ b/src/calibre/gui2/convert/init.py
@ -146,6 +146,8 @@ class Widget(QWidget):
                codecs.lookup(ans)
            except:
                ans = ''
+            if not ans:
+                ans = None
            return ans
        elif isinstance(g, QComboBox):
            return unicode(g.currentText())
--- a/src/calibre/gui2/wizard/send_email.py
+++ b/src/calibre/gui2/wizard/send_email.py
@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \
 from calibre.gui2.wizard.send_email_ui import Ui_Form
 from calibre.utils.smtp import config as smtp_prefs
 from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog
-from calibre.gui2 import error_dialog
+from calibre.gui2 import error_dialog, question_dialog

 class TestEmail(QDialog, TE_Dialog):

@ -92,6 +92,9 @@ class SendEmail(QWidget, Ui_Form):
        pa = self.preferred_to_address()
        to_set = pa is not None
        if self.set_email_settings(to_set):
+            if question_dialog(self, _('OK to proceed?'),
+                    _('This will display your email password on the screen'
+                    '. Is it OK to proceed?'), show_copy_button=False):
                TestEmail(pa, self).exec_()

    def test_email_settings(self, to):
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@ -274,13 +274,15 @@ class ResultCache(SearchQueryParser): # {{{
        loc = self.field_metadata[location]['rec_index']

        if query == 'false':
-            for item in [self._data[id] for id in candidates]:
+            for id_ in candidates:
+                item = self._data[id_]
                if item is None: continue
                if item[loc] is None or item[loc] <= UNDEFINED_DATE:
                    matches.add(item[0])
            return matches
        if query == 'true':
-            for item in [self._data[id] for id in candidates]:
+            for id_ in candidates:
+                item = self._data[id_]
                if item is None: continue
                if item[loc] is not None and item[loc] > UNDEFINED_DATE:
                    matches.add(item[0])
@ -319,7 +321,8 @@ class ResultCache(SearchQueryParser): # {{{
                field_count = query.count('-') + 1
            else:
                field_count = query.count('/') + 1
-        for item in [self._data[id] for id in candidates]:
+        for id_ in candidates:
+            item = self._data[id_]
            if item is None or item[loc] is None: continue
            if relop(item[loc], qd, field_count):
                matches.add(item[0])
@ -381,7 +384,8 @@ class ResultCache(SearchQueryParser): # {{{
        except:
            return matches

-        for item in [self._data[id] for id in candidates]:
+        for id_ in candidates:
+            item = self._data[id_]
            if item is None:
                continue
            v = val_func(item)
@ -481,7 +485,8 @@ class ResultCache(SearchQueryParser): # {{{
                else:
                    q = query

-                for item in [self._data[id] for id in candidates]:
+                for id_ in candidates:
+                    item = self._data[id_]
                    if item is None: continue

                    if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@ -2861,25 +2861,17 @@ class EPUB_MOBI(CatalogPlugin):
                self.updateProgressMicroStep("Thumbnail %d of %d" % \
                    (i,len(self.booksByTitle)),
                        i/float(len(self.booksByTitle)))
-                # Check to see if source file exists
-                if 'cover' in title and os.path.isfile(title['cover']):
-                    # Add the thumb spec to thumbs[]
-                    thumbs.append("thumbnail_%d.jpg" % int(title['id']))

-                    # Check to see if thumbnail exists
-                    thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id']))
                thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
-                    if os.path.isfile(thumb_fp):
-                        # Check to see if cover is newer than thumbnail
-                        # os.path.getmtime() = modified time
-                        # os.path.ctime() = creation time
-                        cover_timestamp = os.path.getmtime(title['cover'])
-                        thumb_timestamp = os.path.getmtime(thumb_fp)
-                        if thumb_timestamp < cover_timestamp:
+                thumb_generated = True
+                try:
                    self.generateThumbnail(title, image_dir, thumb_file)
-                    else:
-                        self.generateThumbnail(title, image_dir, thumb_file)
-                else:
+                    thumbs.append("thumbnail_%d.jpg" % int(title['id']))
+                except:
+                    thumb_generated = False
+
+
+                if not thumb_generated:
                    # Use default cover
                    if False and self.verbose:
                        self.opts.log.warn(" using default cover for '%s'" % \
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -722,10 +722,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        mi.uuid        = row[fm['uuid']]
        mi.title_sort  = row[fm['sort']]
        formats = row[fm['formats']]
-        if hasattr(formats, 'split'):
-            mi.formats = formats.split(',')
-        else:
-            mi.formats = None
+        if not formats:
+            formats = None
+        mi.formats = formats
        tags = row[fm['tags']]
        if tags:
            mi.tags = [i.strip() for i in tags.split(',')]
@ -878,16 +877,17 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):

    def formats(self, index, index_is_id=False, verify_formats=True):
        ''' Return available formats as a comma separated list or None if there are no available formats '''
-        id = index if index_is_id else self.id(index)
-        formats = self.data.get(id, self.FIELD_MAP['formats'], row_is_id = True)
+        id_ = index if index_is_id else self.id(index)
+        formats = self.data.get(id_, self.FIELD_MAP['formats'], row_is_id=True)
        if not formats:
            return None
        if not verify_formats:
-            return ','.join(formats)
+            return formats
+        formats = formats.split(',')
        ans = []
-        for format in formats:
-            if self.format_abspath(id, format, index_is_id=True) is not None:
-                ans.append(format)
+        for fmt in formats:
+            if self.format_abspath(id_, fmt, index_is_id=True) is not None:
+                ans.append(fmt)
        if not ans:
            return None
        return ','.join(ans)
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@ -533,17 +533,23 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag
 Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap
 paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length
 at which a line should be unwrapped. Valid values are a decimal
-between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more
-text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input.
+between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more
+text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.

 Also, they often have headers and footers as part of the document that will become included with the text.
 Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
-removed from the text it can throw off the paragraph unwrapping.
+removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read 
+:ref:`regexptutorial`.

-Some limitations of PDF input is complex, multi-column, and image based documents are not supported.
-Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to
-represent double ll or doubfle ff or fi,etc. Conversion of these may or may not work depending on jusy how they are 
-represented internally in the PDF.
+Some limitations of PDF input are: 
+    
+    * Complex, multi-column, and image based documents are not supported.
+    * Extraction of vector images and tables from within the document is also not supported.
+    * Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
+    * Some PDFs store their images upside down with a rotation instruction. |app| currently doesn't support that instruction, so the images will be rotated in the output as well.
+
+To reiterate, **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
+output ranging anywhere from decent to unusable, depending on the input PDF.

 Comic Book Collections
 ~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@ -190,9 +190,11 @@ class SearchQueryParser(object):
    # recursive search test list. However, we permit seeing the
    # same search a few times because the search might appear within
    # another search.
-    def _parse(self, query, candidates):
+    def _parse(self, query, candidates=None):
        self.recurse_level += 1
        res = self._parser.parseString(query)[0]
+        if candidates is None:
+            candidates = self.universal_set()
        t = self.evaluate(res, candidates)
        self.recurse_level -= 1
        return t
@ -248,15 +250,16 @@ class SearchQueryParser(object):
        else:
            return self.get_matches(location, query)

-    def get_matches(self, location, query):
+    def get_matches(self, location, query, candidates=None):
        '''
        Should return the set of matches for :param:`location` and :param:`query`.
-        If you set the optimized parameter in __init__, this method must accept
-        a named parameter 'candidates'
+
+        The search must be performed over all entries if :param:`candidates` is
+        None, otherwise only over the items in candidates.

        :param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
        :param:`query` is a string literal.
-        :param: optional named parameter candidates, a set of items to check.
+        :param:`candidates` is None or a subset of the set returned by :meth:`universal_set`.
        '''
        return set([])