From da679b885faf07c3218946d072b529259c5e6955 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 11:26:45 +0800
Subject: [PATCH 01/13] chapter heading tweaks

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 4bb96ac088..2090cff12d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,7 @@ class PreProcessor(object):
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
         
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
             [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering

From dd96c645f020cd57682bbeba8501c21b8b77b0b9 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 7 Jan 2011 14:19:12 +0800
Subject: [PATCH 02/13] tied line histogram into txt paragraph structure
 detection

---
 src/calibre/ebooks/conversion/preprocess.py |  2 ++
 src/calibre/ebooks/txt/input.py             |  2 +-
 src/calibre/ebooks/txt/processor.py         | 31 ++++++++++++++-------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 29006ffd9b..97aaa653a9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -78,6 +78,8 @@ class DocAnalysis(object):
             linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        elif format == 'txt':
+            linere = re.compile('.*?\n', re.DOTALL)
         self.lines = linere.findall(raw)
 
     def line_length(self, percent):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 47e92a45a9..7fb22755de 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
             
             # We don't check for block because the processor assumes block.
             # single and print at transformed to block for processing.
-            if options.paragraph_type == 'single':
+            if options.paragraph_type in ('single', 'unformatted'):
                 txt = separate_paragraphs_single_line(txt)
             elif options.paragraph_type == 'print':
                 txt = separate_paragraphs_print_formatted(txt)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index f6d628e7c5..53935584d2 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.markdown import markdown
 from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.conversion.preprocess import DocAnalysis
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
@@ -102,26 +103,36 @@ def detect_paragraph_type(txt):
     print: Each paragraph starts with a 2+ spaces or a tab
            and ends when a new paragraph is reached.
     markdown: Markdown formatting is in the document.
+    unformatted: most lines have hard line breaks, few/no spaces or indents
     
-    returns block, single, print, markdown
+    returns block, single, print, markdown, unformatted
     '''
     txt = txt.replace('\r\n', '\n')
     txt = txt.replace('\r', '\n')
     txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
     
-    # Check for print
-    tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-    if tab_line_count / float(txt_line_count) >= .25:
-        return 'print'
+    # Check for hard line breaks - true if 55% of the doc breaks in the same region
+    docanalysis = DocAnalysis('txt', txt)
+    hardbreaks = docanalysis.line_histogram(.55)
     
-    # Check for block
-    empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-    if empty_line_count / float(txt_line_count) >= .25:
-        return 'block'
+    if hardbreaks:
+        # Check for print
+        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
+        if tab_line_count / float(txt_line_count) >= .25:
+            return 'print'
+        
+        # Check for block
+        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
+        if empty_line_count / float(txt_line_count) >= .25:
+            return 'block'
+
+        # Assume unformatted text with hardbreaks if nothing else matches        
+        return 'unformatted'
     
-    # Nothing else matched to assume single.
+    # return single if hardbreaks is false
     return 'single'
 
+
 def detect_formatting_type(txt):
     # Check for markdown
     # Headings

From 90177a42053f29c302faf7483de6dd3fc455d400 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 8 Jan 2011 09:23:32 +0800
Subject: [PATCH 03/13] tweaked threshold for preprocess

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f367aa02d7..5db920b01d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -194,7 +194,7 @@ class PreProcessor(object):
         totalwords = 0
         totalwords = self.get_word_count(html)
 
-        if totalwords < 20:
+        if totalwords < 50:
             self.log("not enough text, not preprocessing")
             return html
 

From 5854f5308e46d1be747cf85d789d9ca9de78e80b Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 8 Jan 2011 19:45:51 +0800
Subject: [PATCH 04/13] moved punctuation unwrap into a function, tied to txt
 input

---
 src/calibre/ebooks/conversion/utils.py | 20 +++++++++++++++++---
 src/calibre/ebooks/txt/input.py        | 10 ++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5db920b01d..27dacdf5fb 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -184,7 +184,22 @@ class PreProcessor(object):
         self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
         return html
 
-
+    def punctuation_unwrap(self, length, content, format):
+        '''
+        Return content with a space substituted for each line break that is preceded by length non-newline characters and then
+        a lowercase letter, I, A, comma, colon, ")" or semicolon. format 'txt' breaks on newlines, any other value on span/p/div tags.
+        '''
+        # Must be a unicode literal: the class holds non-ASCII letters and a \u escape, which a byte string would not decode.
+        # (?<!\&\w{4});) is a semicolon not part of an entity
+        lookbehind = u"(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))"
+        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
+        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        txt_line_wrap = u"(\u0020|\u0009)*\n"
+        tail = txt_line_wrap if format == 'txt' else line_ending+blanklines+line_opening
+        unwrap = re.compile(lookbehind+tail, re.UNICODE)
+        return unwrap.sub(' ', content)
+       
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
@@ -312,8 +327,7 @@ class PreProcessor(object):
             self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
             #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-            html = unwrap.sub(' ', html)
+            html = self.punctuation_unwrap(length, html, 'html')
             #check any remaining hyphens, but only unwrap if there is a match
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html_cleanup', length)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 7fb22755de..98756c5fa1 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -95,6 +95,16 @@ class TXTInput(InputFormatPlugin):
             elif options.paragraph_type == 'print':
                 txt = separate_paragraphs_print_formatted(txt)
 
+            if options.paragraph_type == 'unformatted':
+                from calibre.ebooks.conversion.utils import PreProcessor
+                from calibre.ebooks.conversion.preprocess import DocAnalysis
+                # get length
+                docanalysis = DocAnalysis('txt', txt)
+                length = docanalysis.line_length(.5)
+                # unwrap lines based on punctuation
+                preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
+                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
+
             flow_size = getattr(options, 'flow_size', 0)
             html = convert_basic(txt, epub_split_size_kb=flow_size)
 

From f88045c16266474ed625a0e38b0a9fa12aded75d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 8 Jan 2011 20:35:19 +0800
Subject: [PATCH 05/13] fixed comments

---
 src/calibre/ebooks/txt/processor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 53935584d2..c6cf1078cd 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -102,10 +102,9 @@ def detect_paragraph_type(txt):
     single: Each line is a paragraph.
     print: Each paragraph starts with a 2+ spaces or a tab
            and ends when a new paragraph is reached.
-    markdown: Markdown formatting is in the document.
     unformatted: most lines have hard line breaks, few/no spaces or indents
     
-    returns block, single, print, markdown, unformatted
+    returns block, single, print, unformatted
     '''
     txt = txt.replace('\r\n', '\n')
     txt = txt.replace('\r', '\n')

From 09ff8524214cc51091f8ec8dca616e2675e40789 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 06:53:24 -0700
Subject: [PATCH 06/13] El Publico by Gerardo Diez. Fixes #405 (New news feed)

---
 resources/recipes/deia.recipe           |  2 +-
 resources/recipes/el_publico.recipe     | 43 +++++++++++++++++++++++++
 resources/recipes/elpais_impreso.recipe |  8 ++---
 3 files changed, 48 insertions(+), 5 deletions(-)
 create mode 100644 resources/recipes/el_publico.recipe

diff --git a/resources/recipes/deia.recipe b/resources/recipes/deia.recipe
index 980d59d3d1..5d39be9a10 100644
--- a/resources/recipes/deia.recipe
+++ b/resources/recipes/deia.recipe
@@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
 	cover_url		='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
 	timefmt			='[%a, %d %b, %Y]'
 	encoding		='utf8'
-	language		='es_ES'
+	language		='es'
 	remove_javascript	=True
 	remove_tags_after	=dict(id='Texto')
 	remove_tags_before	=dict(id='Texto')
diff --git a/resources/recipes/el_publico.recipe b/resources/recipes/el_publico.recipe
new file mode 100644
index 0000000000..d0da739b03
--- /dev/null
+++ b/resources/recipes/el_publico.recipe
@@ -0,0 +1,43 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__author__    = 'Gerardo Diez'
+__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
+description   = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
+__docformat__ = 'restructuredtext en'
+
+'''
+publico.es
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Publico(BasicNewsRecipe):
+    '''Recipe that builds an issue from the section RSS feeds of publico.es.'''
+    title               =u'Publico.es'
+    __author__      ='Gerardo Diez'
+    description     =description  # recipe metadata is read from class attributes, so re-export the module-level text
+    publisher       =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
+    category                ='news, politics, finances, world, spain, science, catalunya'
+    oldest_article      =1
+    max_articles_per_feed   =100
+    simultaneous_downloads  =10
+    cover_url       =u'http://imagenes.publico.es/css/img/logo_publico.gif'
+    timefmt         ='[%a, %d %b, %Y]'
+    encoding        ='utf8'
+    language        ='es'
+    remove_javascript   =True
+    no_stylesheets      =True
+    keep_only_tags      =dict(id='main')
+    remove_tags         =[
+                            dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}),
+                            dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}),
+                            dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}),
+                            dict(name='h5', attrs={'id':'comentarios'})
+                            ]
+    feeds               =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
+                 (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
+                 (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
+                 (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
+                 (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
+                 (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
+                 (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
+                 (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
+                 (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')]
diff --git a/resources/recipes/elpais_impreso.recipe b/resources/recipes/elpais_impreso.recipe
index 130013286c..b22a41dcec 100644
--- a/resources/recipes/elpais_impreso.recipe
+++ b/resources/recipes/elpais_impreso.recipe
@@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
     no_stylesheets        = True
     encoding              = 'cp1252'
     use_embedded_content  = False
-    language              = 'es_ES'
+    language              = 'es'
     remove_empty_feeds    = True
     publication_type      = 'newspaper'
     masthead_url          = 'http://www.elpais.com/im/tit_logo.gif'
@@ -57,14 +57,14 @@ class ElPais_RSS(BasicNewsRecipe):
              ,(u'Madrid'               , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
              ,(u'Pais Vasco'           , u'http://www.elpais.com/rss/feed.html?feedId=17062')
              ,(u'Galicia'              , u'http://www.elpais.com/rss/feed.html?feedId=17063')
-             ,(u'Opinion'              , u'http://www.elpais.com/rss/feed.html?feedId=1003' )             
-             ,(u'Sociedad'             , u'http://www.elpais.com/rss/feed.html?feedId=1004' )             
+             ,(u'Opinion'              , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
+             ,(u'Sociedad'             , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
              ,(u'Deportes'             , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
              ,(u'Cultura'              , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
              ,(u'Cine'                 , u'http://www.elpais.com/rss/feed.html?feedId=17052')
              ,(u'Literatura'           , u'http://www.elpais.com/rss/feed.html?feedId=17053')
              ,(u'Musica'               , u'http://www.elpais.com/rss/feed.html?feedId=17051')
-             ,(u'Arte'                 , u'http://www.elpais.com/rss/feed.html?feedId=17060')             
+             ,(u'Arte'                 , u'http://www.elpais.com/rss/feed.html?feedId=17060')
              ,(u'Tecnologia'           , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
              ,(u'Economia'             , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
              ,(u'Ciencia'              , u'http://www.elpais.com/rss/feed.html?feedId=17068')

From 823cdcc4373bc523a0ba584e0eb82febb7d1f231 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 07:27:08 -0700
Subject: [PATCH 07/13] ...

---
 src/calibre/manual/conversion.rst | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 3a7ae16598..a5aad9b450 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -533,17 +533,22 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag
 Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap
 paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length
 at which a line should be unwrapped. Valid values are a decimal
-between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more
-text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input.
+between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more
+text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
 
 Also, they often have headers and footers as part of the document that will become included with the text.
 Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
 removed from the text it can throw off the paragraph unwrapping.
 
-Some limitations of PDF input is complex, multi-column, and image based documents are not supported.
-Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to
-represent double ll or doubfle ff or fi,etc. Conversion of these may or may not work depending on jusy how they are 
-represented internally in the PDF.
+Some limitations of PDF input are: 
+    
+    * Complex, multi-column, and image based documents are not supported.
+    * Extraction of vector images and tables from within the document is also not supported.
+    * Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
+    * Some PDFs store their images upside down with a rotation instruction. |app| currently doesn't support that instruction, so the images will be rotated in the output as well.
+
+To reiterate, **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
+output ranging anywhere from decent to unusable, depending on the input PDF.
 
 Comic Book Collections
 ~~~~~~~~~~~~~~~~~~~~~~~~~

From 8ac2dd0a65776aafcb8132aca5f256c9fcb4acd4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 07:46:55 -0700
Subject: [PATCH 08/13] Email settings: Before displaying the email test dialog
 warn the user that it will expose their email password

---
 src/calibre/gui2/wizard/send_email.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/calibre/gui2/wizard/send_email.py b/src/calibre/gui2/wizard/send_email.py
index b9b65dc940..5785f52276 100644
--- a/src/calibre/gui2/wizard/send_email.py
+++ b/src/calibre/gui2/wizard/send_email.py
@@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \
 from calibre.gui2.wizard.send_email_ui import Ui_Form
 from calibre.utils.smtp import config as smtp_prefs
 from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog
-from calibre.gui2 import error_dialog
+from calibre.gui2 import error_dialog, question_dialog
 
 class TestEmail(QDialog, TE_Dialog):
 
@@ -92,7 +92,10 @@ class SendEmail(QWidget, Ui_Form):
         pa = self.preferred_to_address()
         to_set = pa is not None
         if self.set_email_settings(to_set):
-          TestEmail(pa, self).exec_()
+            if question_dialog(self, _('OK to proceed?'),
+                    _('This will display your email password on the screen'
+                    '. Is it OK to proceed?'), show_copy_button=False):
+                TestEmail(pa, self).exec_()
 
     def test_email_settings(self, to):
         opts = smtp_prefs().parse()

From 4abfeed6accf655c8f61f05bc7027de6b8ecad27 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 08:29:40 -0700
Subject: [PATCH 09/13] ...

---
 src/calibre/manual/conversion.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index a5aad9b450..4b2b169d72 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -538,7 +538,8 @@ text in the unwrapping. Increase to include less. You can adjust this value in t
 
 Also, they often have headers and footers as part of the document that will become included with the text.
 Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
-removed from the text it can throw off the paragraph unwrapping.
+removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read 
+:ref:`regexptutorial`.
 
 Some limitations of PDF input are: 
     

From 8f7d8c1022533ef5fd07f6162b03672cadafcb92 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 10:17:36 -0700
Subject: [PATCH 10/13] Fix #8241 (Updated recipe for Exiled online)

---
 resources/images/news/exiled.png | Bin 0 -> 1352 bytes
 resources/recipes/exiled.recipe  |  37 ++++++++++++++++---------------
 2 files changed, 19 insertions(+), 18 deletions(-)
 create mode 100644 resources/images/news/exiled.png

diff --git a/resources/images/news/exiled.png b/resources/images/news/exiled.png
new file mode 100644
index 0000000000000000000000000000000000000000..c233aaf132d07704afa1841db6ddb886d0a76593
GIT binary patch
literal 1352
zcmeAS@N?(olHy`uVBq!ia0vp^0w65F1|<EHm6d=LTavfC%YQK7jQD;BD8gCb5m^kR
zJ;2!QWVRhhu&lr_9Y}-qGsGNQdzgWNrO?yGF~sBe)KK4S5pR*bSMPo{NK+H$37q(2
zqrhULK&LCN3M{c5+=h#0H^@i3D!<A<!5<L7BK}LbiQ93~g2)aRW<$fC!o(JdRHM7L
zowmj=-@Sad$|}DwQ(f?0^`2WXyH>y3`~05O_bCBhr!<(hh#fNN<Z15c@ZeBt@W@u=
zS^SfiVWnu3KhvAz=|7@55;o7j7X4W3=<l$tzaAYuJ>8|y#?;i*Ri#D1>tiAti^fsE
znLO;1CULxY`}VbRR`%=HuivV!x<22w`rn-M!iFDA61jr27g^-x=H{xNO3Aw~C#krp
zgF&;?=kwp+{PG<|JAeM%xic+kU8hI(J6{1FzIm3;Dyjbd`TFeUJW7XGTwN8q$S1An
z(^=_7)5MNzPL<dp;NrNU`un<AmuaQ{{#+3Fo$2f1mAO`A!Iam#^d4BK`^~Z8JhD;1
zDQ8<-qJ+cJqWAai?rsPQypq8Xq@}8Q+k(yJ{lVsT{zd~6-_uN|o*q2d$f(da9S9iL
zP7UI^FzfTApr~Soe}XK=#+x@cye=y%D=A1)Qt;$)=18dByEOH-*@7@>AYcySQ}tZA
zWfzx)-@<}75(&3$ZCjZHj%#b%*A+OeooyC7+iF4SB!(cblT&RfKYh^*Sf6fG|Cg1e
zF_Vi?rBk4damTxTwXv}dBDH@k%-Jo@?JQ0|&(ycMT>t<7FPU|69zts$&(OX<=ZAU2
z7SVd8kX08soEOCGtNVLORH4h~vcV2zCx*Zy;X&)m%ii7LwD7C0w&TC@`tjrSafOPW
zmwf)DKi$vxKtAl%43%xMdd%BmHDqkLJWkH(P`a$bB*pvg%&%WpS6^Sp=ykMEhS~Gb
zoq$!G4lDn&D!gq8@?`q)@Amfh_ZY8cMFYWSttBb1D|g0ZYd`qCKHlH2h_Ud`eFmO;
zNr|4i+`RpsS_&>fUY<*)q#Ek-HlP2T=iU9DopIv(`@2kyoR);Fe6s7!{8wJ9IHRXM
z-jTQTkjKu{m`ba--eZp*P1kRDEMu{u-u{6FGxKtP=Im^7Jsw^|OWtN{C6;*3$t4vP
z6%rGScxwO0TWr|&ta7UqbHJ*Z7M@;9=i5h$`^^PLUNbwtEYqWk%$K*#6+FX$fZ?`X
z(URS>za|+^D5<h}|M9VV|F(1gry1V4!Qm_>BImr+W_=WE&hJM@y{FGAe*aro`0-=k
zgMnUNOB;H7J3Bi$4}9KqGiMs(hCG7_@%zi(ym-NIsGzE<YF1;;I}^{F87zzf$Gf|a
zDm}0`e0aKkv9_=9imMg{GUv~nIl~ae-uT}5cBJ{C4RP$NuFf{kzmpKDo@B>)an3BS
zr$0VAF?K9!ZT(ppcFBMxqqMZ7q(o%H+Z?m`>{(lV)Fv~UndzL~#Be{|b8Se`&K$9b
zD65(syQMjrT3cILAG{71Q@DQi`R&^?g_m)%G$zhS3!ZejV%yx)Pd^`c^XBjG@^=hM
z9Cr13VvK92Pv2jkZ@2zsHQR^Z)Akj48BX7x{~id`KmASG7@@LB=ltsH+ly@HPAFOG
zzfr&?@XFP#>HHV(uW?q{)1Y8p3M`3KOI#yLQW8s2t&)pUffR$0fuW(U0T5Y)7#dg^
znp&Bf>l&C_85nFmXte}ILvDUbW?ChR1`{g-3oAnlhz6(dLvMi^7(8A5T-G@yGywq1
Cm_*M2

literal 0
HcmV?d00001

diff --git a/resources/recipes/exiled.recipe b/resources/recipes/exiled.recipe
index 72dfc02e8b..6a65e22edc 100644
--- a/resources/recipes/exiled.recipe
+++ b/resources/recipes/exiled.recipe
@@ -1,7 +1,5 @@
-#!/usr/bin/env  python
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 exiledonline.com
 '''
@@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
     use_embedded_content  = False
     encoding              = 'utf8'
     remove_javascript     = True
-    language = 'en'
-
-    cover_url             = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
-
-    html2lrf_options = [
-                          '--comment'       , description
-                        , '--base-font-size', '10'
-                        , '--category'      , category
-                        , '--publisher'     , publisher
-                        ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    language              = 'en'
+    publication_type      = 'newsblog'
+    masthead_url          = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
+    extra_css             = """
+                               body{font-family: Arial,Helvetica,sans-serif}
+                               #topslug{font-size: xx-large; font-weight: bold; color: red}                               
+                            """
+    
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
 
     keep_only_tags = [dict(name='div', attrs={'id':'main'})]
 
@@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
-        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
-        soup.head.insert(0,mtag)
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+               tstr = alink.string
+               alink.replaceWith(tstr)
         return soup
 
     def get_article_url(self, article):
         raw = article.get('link',  None)
         final = raw + 'all/1/'
         return final
-

From 611c0373573a6ad74cc0ba5b4d4b8a5788760651 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 8 Jan 2011 10:52:29 -0700
Subject: [PATCH 11/13] ...

---
 src/calibre/ebooks/conversion/preprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 97aaa653a9..ae111355e4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -563,8 +563,8 @@ class HTMLPreProcessor(object):
         html = html.replace(start, '<!--')
         html = html.replace(stop, '-->')
         # convert ellipsis to entities to prevent wrapping
-        html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+        html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
         # convert double dashes to em-dash
-        html = re.sub('\s--\s', u'\u2014', html)
+        html = re.sub(r'\s--\s', u'\u2014', html)
         return substitute_entites(html)
 

From c5a679a437c7ab52bb0320c83eef4535c151feb5 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Sat, 8 Jan 2011 11:42:31 -0700
Subject: [PATCH 12/13] GwR patch for bogus cover data

---
 src/calibre/library/catalog.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 0a5d5284e2..1af9c3aa58 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -21,7 +21,7 @@ from calibre.utils.config import config_dir
 from calibre.utils.date import format_date, isoformat, now as nowf
 from calibre.utils.logging import default_log as log
 from calibre.utils.zipfile import ZipFile, ZipInfo
-from calibre.utils.magick.draw import thumbnail
+from calibre.utils.magick.draw import identify_data, thumbnail
 
 FIELDS = ['all', 'author_sort', 'authors', 'comments',
           'cover', 'formats', 'id', 'isbn', 'ondevice', 'pubdate', 'publisher', 'rating',
@@ -2861,11 +2861,19 @@ class EPUB_MOBI(CatalogPlugin):
                 self.updateProgressMicroStep("Thumbnail %d of %d" % \
                     (i,len(self.booksByTitle)),
                         i/float(len(self.booksByTitle)))
-                # Check to see if source file exists
-                if 'cover' in title and os.path.isfile(title['cover']):
+
+                # Confirm existence, integrity of cover image
+                valid_cover = True
+                try:
+                    _w, _h, _fmt = identify_data(open(title['cover'], 'rb').read())
+                except Exception: # e.g. no 'cover' key, unreadable file, or data identify_data rejects; let Ctrl-C/SystemExit propagate
+                    valid_cover = False
+
+                if valid_cover:
                     # Add the thumb spec to thumbs[]
                     thumbs.append("thumbnail_%d.jpg" % int(title['id']))
-
+                    self.generateThumbnail(title, image_dir, thumb_file)
+                    '''
                     # Check to see if thumbnail exists
                     thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id']))
                     thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
@@ -2879,6 +2887,7 @@ class EPUB_MOBI(CatalogPlugin):
                            self.generateThumbnail(title, image_dir, thumb_file)
                     else:
                         self.generateThumbnail(title, image_dir, thumb_file)
+                    '''
                 else:
                     # Use default cover
                     if False and self.verbose:

From 8a44bf07edf1b3282a65edd044421b963d4dd794 Mon Sep 17 00:00:00 2001
From: GRiker <griker@hotmail.com>
Date: Sat, 8 Jan 2011 11:48:41 -0700
Subject: [PATCH 13/13] GwR patch for bogus cover data

---
 src/calibre/library/catalog.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index 1af9c3aa58..df1341fc38 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -2862,7 +2862,7 @@ class EPUB_MOBI(CatalogPlugin):
                     (i,len(self.booksByTitle)),
                         i/float(len(self.booksByTitle)))
 
-                # Confirm existence, integrity of cover image
+                thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
                 valid_cover = True
                 try:
                     _w, _h, _fmt = identify_data(open(title['cover'], 'rb').read())