From 09ff8524214cc51091f8ec8dca616e2675e40789 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 06:53:24 -0700
Subject: [PATCH 01/14] El Publico by Gerardo Diez. Fixes #405 (New news feed)

---
 resources/recipes/deia.recipe           |  2 +-
 resources/recipes/el_publico.recipe     | 43 +++++++++++++++++++++++++
 resources/recipes/elpais_impreso.recipe |  8 ++---
 3 files changed, 48 insertions(+), 5 deletions(-)
 create mode 100644 resources/recipes/el_publico.recipe

diff --git a/resources/recipes/deia.recipe b/resources/recipes/deia.recipe
index 980d59d3d1..5d39be9a10 100644
--- a/resources/recipes/deia.recipe
+++ b/resources/recipes/deia.recipe
@@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
 	cover_url		='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
 	timefmt			='[%a, %d %b, %Y]'
 	encoding		='utf8'
-	language		='es_ES'
+	language		='es'
 	remove_javascript	=True
 	remove_tags_after	=dict(id='Texto')
 	remove_tags_before	=dict(id='Texto')
diff --git a/resources/recipes/el_publico.recipe b/resources/recipes/el_publico.recipe
new file mode 100644
index 0000000000..d0da739b03
--- /dev/null
+++ b/resources/recipes/el_publico.recipe
@@ -0,0 +1,43 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__author__    = 'Gerardo Diez'
+__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
+description   = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
+__docformat__ = 'restructuredtext en'
+
+'''
+publico.es
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Publico(BasicNewsRecipe):
+    title               =u'Publico.es'
+    __author__      ='Gerardo Diez'
+    publisher       =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
+    category                ='news, politics, finances, world, spain, science, catalunya'
+    oldest_article      =1
+    max_articles_per_feed   =100
+    simultaneous_downloads  =10
+    cover_url       =u'http://imagenes.publico.es/css/img/logo_publico.gif'
+    timefmt         ='[%a, %d %b, %Y]'
+    encoding        ='utf8'
+    language        ='es'
+    remove_javascript   =True
+    no_stylesheets      =True
+    keep_only_tags      =dict(id='main')
+    remove_tags         =[
+                            dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
+                            dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
+                            dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
+                            dict(name='h5', attrs={'id':'comentarios'})
+                            ]
+    feeds               =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
+                 (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
+                 (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
+                 (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
+                 (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
+                 (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
+                 (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
+                 (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
+                 (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]
+
+
diff --git a/resources/recipes/elpais_impreso.recipe b/resources/recipes/elpais_impreso.recipe
index 130013286c..b22a41dcec 100644
--- a/resources/recipes/elpais_impreso.recipe
+++ b/resources/recipes/elpais_impreso.recipe
@@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
     no_stylesheets        = True
     encoding              = 'cp1252'
     use_embedded_content  = False
-    language              = 'es_ES'
+    language              = 'es'
     remove_empty_feeds    = True
     publication_type      = 'newspaper'
     masthead_url          = 'http://www.elpais.com/im/tit_logo.gif'
@@ -57,14 +57,14 @@ class ElPais_RSS(BasicNewsRecipe):
              ,(u'Madrid'               , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
              ,(u'Pais Vasco'           , u'http://www.elpais.com/rss/feed.html?feedId=17062')
              ,(u'Galicia'              , u'http://www.elpais.com/rss/feed.html?feedId=17063')
-             ,(u'Opinion'              , u'http://www.elpais.com/rss/feed.html?feedId=1003' )             
-             ,(u'Sociedad'             , u'http://www.elpais.com/rss/feed.html?feedId=1004' )             
+             ,(u'Opinion'              , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
+             ,(u'Sociedad'             , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
              ,(u'Deportes'             , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
              ,(u'Cultura'              , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
              ,(u'Cine'                 , u'http://www.elpais.com/rss/feed.html?feedId=17052')
              ,(u'Literatura'           , u'http://www.elpais.com/rss/feed.html?feedId=17053')
              ,(u'Musica'               , u'http://www.elpais.com/rss/feed.html?feedId=17051')
-             ,(u'Arte'                 , u'http://www.elpais.com/rss/feed.html?feedId=17060')             
+             ,(u'Arte'                 , u'http://www.elpais.com/rss/feed.html?feedId=17060')
              ,(u'Tecnologia'           , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
              ,(u'Economia'             , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
              ,(u'Ciencia'              , u'http://www.elpais.com/rss/feed.html?feedId=17068')

From 823cdcc4373bc523a0ba584e0eb82febb7d1f231 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 07:27:08 -0700
Subject: [PATCH 02/14] ...

---
 src/calibre/manual/conversion.rst | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 3a7ae16598..a5aad9b450 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -533,17 +533,22 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag
 Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap
 paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length
 at which a line should be unwrapped. Valid values are a decimal
-between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more
-text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input.
+between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more
+text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
 
 Also, they often have headers and footers as part of the document that will become included with the text.
 Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
 removed from the text it can throw off the paragraph unwrapping.
 
-Some limitations of PDF input is complex, multi-column, and image based documents are not supported.
-Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to
-represent double ll or doubfle ff or fi,etc. Conversion of these may or may not work depending on jusy how they are 
-represented internally in the PDF.
+Some limitations of PDF input are: 
+    
+    * Complex, multi-column, and image based documents are not supported.
+    * Extraction of vector images and tables from within the document is also not supported.
+    * Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
+    * Some PDFs store their images upside down with a rotation instruction. |app| currently doesn't support that instruction, so the images will be rotated in the output as well.
+
+To reiterate, **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
+output ranging anywhere from decent to unusable, depending on the input PDF.
 
 Comic Book Collections
 ~~~~~~~~~~~~~~~~~~~~~~~~~

From 8ac2dd0a65776aafcb8132aca5f256c9fcb4acd4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 07:46:55 -0700
Subject: [PATCH 03/14] Email settings: Before displaying the email test dialog
 warn the user that it will expose their email password

---
 src/calibre/gui2/wizard/send_email.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/calibre/gui2/wizard/send_email.py b/src/calibre/gui2/wizard/send_email.py
index b9b65dc940..5785f52276 100644
--- a/src/calibre/gui2/wizard/send_email.py
+++ b/src/calibre/gui2/wizard/send_email.py
@@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \
 from calibre.gui2.wizard.send_email_ui import Ui_Form
 from calibre.utils.smtp import config as smtp_prefs
 from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog
-from calibre.gui2 import error_dialog
+from calibre.gui2 import error_dialog, question_dialog
 
 class TestEmail(QDialog, TE_Dialog):
 
@@ -92,7 +92,10 @@ class SendEmail(QWidget, Ui_Form):
         pa = self.preferred_to_address()
         to_set = pa is not None
         if self.set_email_settings(to_set):
-          TestEmail(pa, self).exec_()
+            if question_dialog(self, _('OK to proceed?'),
+                    _('This will display your email password on the screen'
+                    '. Is it OK to proceed?'), show_copy_button=False):
+                TestEmail(pa, self).exec_()
 
     def test_email_settings(self, to):
         opts = smtp_prefs().parse()

From 4abfeed6accf655c8f61f05bc7027de6b8ecad27 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 08:29:40 -0700
Subject: [PATCH 04/14] ...

---
 src/calibre/manual/conversion.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index a5aad9b450..4b2b169d72 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -538,7 +538,8 @@ text in the unwrapping. Increase to include less. You can adjust this value in t
 
 Also, they often have headers and footers as part of the document that will become included with the text.
 Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
-removed from the text it can throw off the paragraph unwrapping.
+removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read 
+:ref:`regexptutorial`.
 
 Some limitations of PDF input are: 
     

From 8f7d8c1022533ef5fd07f6162b03672cadafcb92 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 10:17:36 -0700
Subject: [PATCH 05/14] Fix #8241 (Updated recipe for Exiled online)

---
 resources/images/news/exiled.png | Bin 0 -> 1352 bytes
 resources/recipes/exiled.recipe  |  37 ++++++++++++++++---------------
 2 files changed, 19 insertions(+), 18 deletions(-)
 create mode 100644 resources/images/news/exiled.png

diff --git a/resources/images/news/exiled.png b/resources/images/news/exiled.png
new file mode 100644
index 0000000000000000000000000000000000000000..c233aaf132d07704afa1841db6ddb886d0a76593
GIT binary patch
literal 1352
zcmeAS@N?(olHy`uVBq!ia0vp^0w65F1|<EHm6d=LTavfC%YQK7jQD;BD8gCb5m^kR
zJ;2!QWVRhhu&lr_9Y}-qGsGNQdzgWNrO?yGF~sBe)KK4S5pR*bSMPo{NK+H$37q(2
zqrhULK&LCN3M{c5+=h#0H^@i3D!<A<!5<L7BK}LbiQ93~g2)aRW<$fC!o(JdRHM7L
zowmj=-@Sad$|}DwQ(f?0^`2WXyH>y3`~05O_bCBhr!<(hh#fNN<Z15c@ZeBt@W@u=
zS^SfiVWnu3KhvAz=|7@55;o7j7X4W3=<l$tzaAYuJ>8|y#?;i*Ri#D1>tiAti^fsE
znLO;1CULxY`}VbRR`%=HuivV!x<22w`rn-M!iFDA61jr27g^-x=H{xNO3Aw~C#krp
zgF&;?=kwp+{PG<|JAeM%xic+kU8hI(J6{1FzIm3;Dyjbd`TFeUJW7XGTwN8q$S1An
z(^=_7)5MNzPL<dp;NrNU`un<AmuaQ{{#+3Fo$2f1mAO`A!Iam#^d4BK`^~Z8JhD;1
zDQ8<-qJ+cJqWAai?rsPQypq8Xq@}8Q+k(yJ{lVsT{zd~6-_uN|o*q2d$f(da9S9iL
zP7UI^FzfTApr~Soe}XK=#+x@cye=y%D=A1)Qt;$)=18dByEOH-*@7@>AYcySQ}tZA
zWfzx)-@<}75(&3$ZCjZHj%#b%*A+OeooyC7+iF4SB!(cblT&RfKYh^*Sf6fG|Cg1e
zF_Vi?rBk4damTxTwXv}dBDH@k%-Jo@?JQ0|&(ycMT>t<7FPU|69zts$&(OX<=ZAU2
z7SVd8kX08soEOCGtNVLORH4h~vcV2zCx*Zy;X&)m%ii7LwD7C0w&TC@`tjrSafOPW
zmwf)DKi$vxKtAl%43%xMdd%BmHDqkLJWkH(P`a$bB*pvg%&%WpS6^Sp=ykMEhS~Gb
zoq$!G4lDn&D!gq8@?`q)@Amfh_ZY8cMFYWSttBb1D|g0ZYd`qCKHlH2h_Ud`eFmO;
zNr|4i+`RpsS_&>fUY<*)q#Ek-HlP2T=iU9DopIv(`@2kyoR);Fe6s7!{8wJ9IHRXM
z-jTQTkjKu{m`ba--eZp*P1kRDEMu{u-u{6FGxKtP=Im^7Jsw^|OWtN{C6;*3$t4vP
z6%rGScxwO0TWr|&ta7UqbHJ*Z7M@;9=i5h$`^^PLUNbwtEYqWk%$K*#6+FX$fZ?`X
z(URS>za|+^D5<h}|M9VV|F(1gry1V4!Qm_>BImr+W_=WE&hJM@y{FGAe*aro`0-=k
zgMnUNOB;H7J3Bi$4}9KqGiMs(hCG7_@%zi(ym-NIsGzE<YF1;;I}^{F87zzf$Gf|a
zDm}0`e0aKkv9_=9imMg{GUv~nIl~ae-uT}5cBJ{C4RP$NuFf{kzmpKDo@B>)an3BS
zr$0VAF?K9!ZT(ppcFBMxqqMZ7q(o%H+Z?m`>{(lV)Fv~UndzL~#Be{|b8Se`&K$9b
zD65(syQMjrT3cILAG{71Q@DQi`R&^?g_m)%G$zhS3!ZejV%yx)Pd^`c^XBjG@^=hM
z9Cr13VvK92Pv2jkZ@2zsHQR^Z)Akj48BX7x{~id`KmASG7@@LB=ltsH+ly@HPAFOG
zzfr&?@XFP#>HHV(uW?q{)1Y8p3M`3KOI#yLQW8s2t&)pUffR$0fuW(U0T5Y)7#dg^
znp&Bf>l&C_85nFmXte}ILvDUbW?ChR1`{g-3oAnlhz6(dLvMi^7(8A5T-G@yGywq1
Cm_*M2

literal 0
HcmV?d00001

diff --git a/resources/recipes/exiled.recipe b/resources/recipes/exiled.recipe
index 72dfc02e8b..6a65e22edc 100644
--- a/resources/recipes/exiled.recipe
+++ b/resources/recipes/exiled.recipe
@@ -1,7 +1,5 @@
-#!/usr/bin/env  python
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 exiledonline.com
 '''
@@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
     use_embedded_content  = False
     encoding              = 'utf8'
     remove_javascript     = True
-    language = 'en'
-
-    cover_url             = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
-
-    html2lrf_options = [
-                          '--comment'       , description
-                        , '--base-font-size', '10'
-                        , '--category'      , category
-                        , '--publisher'     , publisher
-                        ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    language              = 'en'
+    publication_type      = 'newsblog'
+    masthead_url          = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
+    extra_css             = """
+                               body{font-family: Arial,Helvetica,sans-serif}
+                               #topslug{font-size: xx-large; font-weight: bold; color: red}                               
+                            """
+    
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
 
     keep_only_tags = [dict(name='div', attrs={'id':'main'})]
 
@@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
-        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
-        soup.head.insert(0,mtag)
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+               tstr = alink.string
+               alink.replaceWith(tstr)
         return soup
 
     def get_article_url(self, article):
         raw = article.get('link',  None)
         final = raw + 'all/1/'
         return final
-

From 611c0373573a6ad74cc0ba5b4d4b8a5788760651 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 10:52:29 -0700
Subject: [PATCH 06/14] ...

---
 src/calibre/ebooks/conversion/preprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 97aaa653a9..ae111355e4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -563,8 +563,8 @@ class HTMLPreProcessor(object):
         html = html.replace(start, '<!--')
         html = html.replace(stop, '-->')
         # convert ellipsis to entities to prevent wrapping
-        html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+        html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
         # convert double dashes to em-dash
-        html = re.sub('\s--\s', u'\u2014', html)
+        html = re.sub(r'\s--\s', u'\u2014', html)
         return substitute_entites(html)
 

From 843e1f2068cf1707f7f002be7c05c37282e9fa36 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 13:17:32 -0500
Subject: [PATCH 07/14] TXT Input: Basic heuristic processor.

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 88 ++++++++++++++++++++
 src/calibre/ebooks/txt/input.py              | 12 ++-
 src/calibre/ebooks/txt/processor.py          | 23 ++++-
 3 files changed, 116 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py

diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
new file mode 100644
index 0000000000..cbfa33a96a
--- /dev/null
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+import string
+
+from calibre import prepare_string_for_xml
+from calibre.ebooks.unidecode.unidecoder import Unidecoder
+
+class TXTHeuristicProcessor(object):
+
+    def __init__(self):
+        self.ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.', 
+        ]
+        self.ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>.+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>.+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>.+?)/\*',
+            r'(?msu)_\*/(?P<words>.+?)/\*_',
+            r'(?msu)/:(?P<words>.+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+        ]
+
+    def del_maketrans(self, deletechars):
+        return dict([(ord(x), u'') for x in deletechars])
+
+    def is_heading(self, line):
+        if not line:
+            return False
+        if len(line) > 40:
+            return False
+        
+        line = Unidecoder().decode(line)
+
+        # punctuation.
+        if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
+            return False
+        
+        # All upper case.
+        #if line.isupper():
+        #    return True
+        # Roman numerals.
+        #if not line.translate(self.del_maketrans('IVXYCivxyc ')):
+        #    return True
+        
+        return True
+
+    def process_paragraph(self, paragraph):
+        for word in self.ITALICIZE_WORDS:
+            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
+        for pat in self.ITALICIZE_STYLE_PATS:
+            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
+        return paragraph
+
+    def convert(self, txt, title='', epub_split_size_kb=0):
+        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
+        txt = clean_txt(txt)
+        txt = split_txt(txt, epub_split_size_kb)
+        
+        processed = []
+        last_was_heading = False
+        for line in txt.split('\n\n'):
+            if self.is_heading(line):
+                if not last_was_heading:
+                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
+                else:
+                    processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
+                last_was_heading = True
+            else:
+                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
+                last_was_heading = False
+                
+        txt = u'\n'.join(processed)
+        txt = re.sub('[ ]{2,}', ' ', txt)
+
+        return HTML_TEMPLATE % (title, txt)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 47e92a45a9..fd805f8ce8 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-    preserve_spaces, detect_paragraph_type, detect_formatting_type
+    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
+    convert_heuristic
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
                    '* print:  Assume every line starting with 2+ spaces or a tab '
                    'starts a paragraph.')),
         OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'markdown'],
+            choices=['auto', 'none', 'heuristic', 'markdown'],
             help=_('Formatting used within the document.'
                    '* auto: Try to auto detect the document formatting.\n'
                    '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
@@ -96,7 +97,12 @@ class TXTInput(InputFormatPlugin):
                 txt = separate_paragraphs_print_formatted(txt)
 
             flow_size = getattr(options, 'flow_size', 0)
-            html = convert_basic(txt, epub_split_size_kb=flow_size)
+            
+            if options.formatting_type == 'heuristic':
+                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
+            else:
+                html = convert_basic(txt, epub_split_size_kb=flow_size)
+            
 
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index f6d628e7c5..79eee79c29 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@@ -16,7 +17,7 @@ __docformat__ = 'restructuredtext en'
 
 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
 
-def convert_basic(txt, title='', epub_split_size_kb=0):
+def clean_txt(txt):
     if isbytestring(txt):
         txt = txt.decode('utf-8', 'replace')
     # Strip whitespace from the beginning and end of the line. Also replace
@@ -35,6 +36,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
     chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
     illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
     txt = illegal_chars.sub('', txt)
+    
+    return txt
+
+def split_txt(txt, epub_split_size_kb=0):
     #Takes care if there is no point to split
     if epub_split_size_kb > 0:
         if isinstance(txt, unicode):
@@ -49,6 +54,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
     if isbytestring(txt):
         txt = txt.decode('utf-8')
 
+    return txt
+
+def convert_basic(txt, title='', epub_split_size_kb=0):
+    txt = clean_txt(txt)
+    txt = split_txt(txt, epub_split_size_kb)
+
     lines = []
     # Split into paragraphs based on having a blank line between text.
     for line in txt.split('\n\n'):
@@ -57,6 +68,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
 
     return HTML_TEMPLATE % (title, u'\n'.join(lines))
 
+def convert_heuristic(txt, title='', epub_split_size_kb=0):
+    tp = TXTHeuristicProcessor()
+    return tp.convert(txt, title, epub_split_size_kb)
+
 def convert_markdown(txt, title='', disable_toc=False):
     md = markdown.Markdown(
           extensions=['footnotes', 'tables', 'toc'],
@@ -111,12 +126,12 @@ def detect_paragraph_type(txt):
     
     # Check for print
     tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-    if tab_line_count / float(txt_line_count) >= .25:
+    if tab_line_count / float(txt_line_count) >= .15:
         return 'print'
     
     # Check for block
     empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-    if empty_line_count / float(txt_line_count) >= .25:
+    if empty_line_count / float(txt_line_count) >= .15:
         return 'block'
     
     # Nothing else matched to assume single.
@@ -143,4 +158,4 @@ def detect_formatting_type(txt):
         if txt.count('\\'+c) > 10:
             return 'markdown'
     
-    return 'none'
+    return 'heuristic'

From c5a679a437c7ab52bb0320c83eef4535c151feb5 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Sat, 8 Jan 2011 11:42:31 -0700
Subject: [PATCH 08/14] GwR patch for bogus cover data

---
 src/calibre/library/catalog.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 0a5d5284e2..1af9c3aa58 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -21,7 +21,7 @@ from calibre.utils.config import config_dir
 from calibre.utils.date import format_date, isoformat, now as nowf
 from calibre.utils.logging import default_log as log
 from calibre.utils.zipfile import ZipFile, ZipInfo
-from calibre.utils.magick.draw import thumbnail
+from calibre.utils.magick.draw import identify_data, thumbnail
 
 FIELDS = ['all', 'author_sort', 'authors', 'comments',
           'cover', 'formats', 'id', 'isbn', 'ondevice', 'pubdate', 'publisher', 'rating',
@@ -2861,11 +2861,19 @@ class EPUB_MOBI(CatalogPlugin):
                 self.updateProgressMicroStep("Thumbnail %d of %d" % \
                     (i,len(self.booksByTitle)),
                         i/float(len(self.booksByTitle)))
-                # Check to see if source file exists
-                if 'cover' in title and os.path.isfile(title['cover']):
+
+                # Confirm existence, integrity of cover image
+                valid_cover = True
+                try:
+                    _w, _h, _fmt = identify_data(open(title['cover'], 'rb').read())
+                except:
+                    valid_cover = False
+
+                if valid_cover:
                     # Add the thumb spec to thumbs[]
                     thumbs.append("thumbnail_%d.jpg" % int(title['id']))
-
+                    self.generateThumbnail(title, image_dir, thumb_file)
+                    '''
                     # Check to see if thumbnail exists
                     thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id']))
                     thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
@@ -2879,6 +2887,7 @@ class EPUB_MOBI(CatalogPlugin):
                            self.generateThumbnail(title, image_dir, thumb_file)
                     else:
                         self.generateThumbnail(title, image_dir, thumb_file)
+                    '''
                 else:
                     # Use default cover
                     if False and self.verbose:

From 8a44bf07edf1b3282a65edd044421b963d4dd794 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Sat, 8 Jan 2011 11:48:41 -0700
Subject: [PATCH 09/14] GwR patch for bogus cover data

---
 src/calibre/library/catalog.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 1af9c3aa58..df1341fc38 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -2862,7 +2862,7 @@ class EPUB_MOBI(CatalogPlugin):
                     (i,len(self.booksByTitle)),
                         i/float(len(self.booksByTitle)))
 
-                # Confirm existence, integrity of cover image
+                thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
                 valid_cover = True
                 try:
                     _w, _h, _fmt = identify_data(open(title['cover'], 'rb').read())

From f593b2163154bcd61e21b0e06f8cf0e29514af86 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 13:53:32 -0500
Subject: [PATCH 10/14] TXT Input: Tweak Heuristic italicizing.

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index cbfa33a96a..b0bbd49961 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -21,15 +21,15 @@ class TXTHeuristicProcessor(object):
         ]
         self.ITALICIZE_STYLE_PATS = [
             r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>.+?)/',
+            r'(?msu)/(?P<words>[^<>]+?)/',
             r'(?msu)~~(?P<words>.+?)~~',
             r'(?msu)\*(?P<words>.+?)\*',
             r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>.+?)/_',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
             r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>.+?)/\*',
-            r'(?msu)_\*/(?P<words>.+?)/\*_',
-            r'(?msu)/:(?P<words>.+?):/',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
             r'(?msu)\|:(?P<words>.+?):\|',
         ]
 
@@ -84,5 +84,6 @@ class TXTHeuristicProcessor(object):
                 
         txt = u'\n'.join(processed)
         txt = re.sub('[ ]{2,}', ' ', txt)
+        print txt
 
         return HTML_TEMPLATE % (title, txt)

From c8f18ff02e32f56220f83872f4def00cca58e73d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 15:49:10 -0500
Subject: [PATCH 11/14] TXT Input: Heuristic processor, use PreProcessor to
 mark chapter headings.

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 43 ++++----------------
 src/calibre/ebooks/txt/processor.py          |  3 --
 2 files changed, 7 insertions(+), 39 deletions(-)

diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index b0bbd49961..c4489badc5 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object):
             r'(?msu)\|:(?P<words>.+?):\|',
         ]
 
-    def del_maketrans(self, deletechars):
-        return dict([(ord(x), u'') for x in deletechars])
-
-    def is_heading(self, line):
-        if not line:
-            return False
-        if len(line) > 40:
-            return False
-        
-        line = Unidecoder().decode(line)
-
-        # punctuation.
-        if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
-            return False
-        
-        # All upper case.
-        #if line.isupper():
-        #    return True
-        # Roman numerals.
-        #if not line.translate(self.del_maketrans('IVXYCivxyc ')):
-        #    return True
-        
-        return True
-
     def process_paragraph(self, paragraph):
         for word in self.ITALICIZE_WORDS:
             paragraph = paragraph.replace(word, '<i>%s</i>' % word)
@@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object):
         txt = split_txt(txt, epub_split_size_kb)
         
         processed = []
-        last_was_heading = False
         for line in txt.split('\n\n'):
-            if self.is_heading(line):
-                if not last_was_heading:
-                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                else:
-                    processed.append(u'<h2>%s</h2>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                last_was_heading = True
-            else:
-                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-                last_was_heading = False
+            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
                 
         txt = u'\n'.join(processed)
         txt = re.sub('[ ]{2,}', ' ', txt)
-        print txt
+        html = HTML_TEMPLATE % (title, txt)
+        
+        from calibre.ebooks.conversion.utils import PreProcessor
+        pp = PreProcessor()
+        html = pp.markup_chapters(html, pp.get_word_count(html), False)
 
-        return HTML_TEMPLATE % (title, txt)
+        return html
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 1e67caccc6..9dc29e45dd 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,11 +9,8 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
-<<<<<<< TREE
 from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
-=======
 from calibre.ebooks.conversion.preprocess import DocAnalysis
->>>>>>> MERGE-SOURCE
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'

From bd14205637cbf71fe4aad655de50f4f0fea98a60 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 15:53:51 -0500
Subject: [PATCH 12/14] TXT Input: Heuristic processor, remove unused
 string import and trailing whitespace.

---
 src/calibre/ebooks/txt/heuristicprocessor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index c4489badc5..c4c6a56123 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -5,7 +5,6 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
 import re
-import string
 
 from calibre import prepare_string_for_xml
 from calibre.ebooks.unidecode.unidecoder import Unidecoder
@@ -48,7 +47,7 @@ class TXTHeuristicProcessor(object):
         processed = []
         for line in txt.split('\n\n'):
             processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-                
+
         txt = u'\n'.join(processed)
         txt = re.sub('[ ]{2,}', ' ', txt)
         html = HTML_TEMPLATE % (title, txt)

From 831ee1fc81b50d9ccd7c771161db322715fa3092 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 16:53:54 -0500
Subject: [PATCH 13/14] TXT Input: Add documentation for the heuristic
 formatting option to the option help.

---
 src/calibre/ebooks/txt/input.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5060e124ff..c8ce389574 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -37,6 +37,8 @@ class TXTInput(InputFormatPlugin):
             help=_('Formatting used within the document.'
                    '* auto: Try to auto detect the document formatting.\n'
                    '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
+                   '* heuristic: Try to detect formatting for elements such as chapter headings '
+                   'and style the elements appropriately.\n'
                    '* markdown: Run the input though the markdown pre-processor. '
                    'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
         OptionRecommendation(name='preserve_spaces', recommended_value=False,

From 12cbaa2304db610ccf101bbd4abe13ff58f68fee Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 8 Jan 2011 17:26:32 -0500
Subject: [PATCH 14/14] TXT Input: Make formatting_type options easier to
 understand.

---
 src/calibre/ebooks/txt/input.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index c8ce389574..e782cd0cd9 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -35,11 +35,12 @@ class TXTInput(InputFormatPlugin):
         OptionRecommendation(name='formatting_type', recommended_value='auto',
             choices=['auto', 'none', 'heuristic', 'markdown'],
             help=_('Formatting used within the document.'
-                   '* auto: Try to auto detect the document formatting.\n'
-                   '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
-                   '* heuristic: Try to detect formatting for elements such as chapter headings '
-                   'and style the elements appropriately.\n'
-                   '* markdown: Run the input though the markdown pre-processor. '
+                   '* auto: Automatically decide which formatting processor to use.\n'
+                   '* none: Do not process the document formatting. Everything is a '
+                   'paragraph and no styling is applied.\n'
+                   '* heuristic: Process using heuristics to determine formatting such '
+                   'as chapter headings and italic text.\n'
+                   '* markdown: Processing using markdown formatting. '
                    'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
         OptionRecommendation(name='preserve_spaces', recommended_value=False,
             help=_('Normally extra spaces are condensed into a single space. '