From 19d7b2116514df5a88bbfb321aaa0f3199ebcbeb Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 15 Sep 2010 15:43:53 +0800 Subject: [PATCH 01/12] enabled preprocesing for LRF input --- src/calibre/ebooks/lrf/input.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py index 1d730ab573..b5591176d1 100644 --- a/src/calibre/ebooks/lrf/input.py +++ b/src/calibre/ebooks/lrf/input.py @@ -12,6 +12,7 @@ from copy import deepcopy from lxml import etree from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.conversion.utils import PreProcessor from calibre import guess_type class Canvas(etree.XSLTExtension): @@ -419,4 +420,9 @@ class LRFInput(InputFormatPlugin): styles.write() return os.path.abspath('content.opf') + def preprocess_html(self, html): + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + return preprocessor(html) + + From 70998858f13ab2131a1f72552e3d54e6e180a1da Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 15 Sep 2010 15:46:04 +0800 Subject: [PATCH 02/12] unwrapping doesn't work correctly if quotes are encded as entities --- src/calibre/ebooks/conversion/preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 4538af96c4..4838e4a054 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -18,7 +18,6 @@ convert_entities = functools.partial(entity_to_unicode, u'<' : '<', u'>' : '>', u"'" : ''', - u'"' : '"', u'&' : '&', }) _span_pat = re.compile('', re.DOTALL|re.IGNORECASE) From 66b443adc52ce4d3eb256b11d238dfafdeeb478a Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 15 Sep 2010 15:58:51 +0800 Subject: [PATCH 03/12] unwrapping doesn't work correctly if quotes are encoded as entities --- src/calibre/ebooks/conversion/preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 4838e4a054..a2ec2912cb 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -17,7 +17,6 @@ convert_entities = functools.partial(entity_to_unicode, result_exceptions = { u'<' : '<', u'>' : '>', - u"'" : ''', u'&' : '&', }) _span_pat = re.compile('', re.DOTALL|re.IGNORECASE) From 569b84e1cb940326f90ddeadde48b066c19ad5bd Mon Sep 17 00:00:00 2001 From: ldolse Date: Thu, 16 Sep 2010 16:44:28 +0800 Subject: [PATCH 04/12] Revert previous changes, now looking for entities in unwrapping rule --- src/calibre/ebooks/conversion/preprocess.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index a2ec2912cb..e72e15c3d9 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -17,6 +17,8 @@ convert_entities = functools.partial(entity_to_unicode, result_exceptions = { u'<' : '<', u'>' : '>', + u"'" : ''', + u'"' : '"', u'&' : '&', }) _span_pat = re.compile('', re.DOTALL|re.IGNORECASE) @@ -349,7 +351,7 @@ class HTMLPreProcessor(object): # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, 
re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: From 8cac314ffe204be4ea946d69dd4c0ac7368db0f1 Mon Sep 17 00:00:00 2001 From: ldolse Date: Thu, 16 Sep 2010 16:48:59 +0800 Subject: [PATCH 05/12] adding html_unwrap_factor --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 9 ++++ src/calibre/ebooks/conversion/utils.py | 16 ++++-- .../gui2/convert/structure_detection.py | 7 ++- .../gui2/convert/structure_detection.ui | 53 ++++++++++++++----- 5 files changed, 67 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 2ef633d0bb..62a941142b 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber): 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', - 'preprocess_html', + 'preprocess_html', 'html_unwrap_factor', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 16282dd28d..c8803fb922 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -362,6 +362,15 @@ OptionRecommendation(name='preprocess_html', ) ), +OptionRecommendation(name='html_unwrap_factor', + recommended_value=0.40, level=OptionRecommendation.LOW, + help=_('Scale used to determine the length at which a line should ' + 'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The ' + 'default is 0.40, just below the median line length. This will unwrap typical books ' + ' with hard line breaks, but should be reduced if the line length is variable.' + ) + ), + OptionRecommendation(name='smarten_punctuation', recommended_value=False, level=OptionRecommendation.LOW, help=_('Convert plain quotes, dashes and ellipsis to their ' diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 5301f70a16..f9d16b428c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -11,10 +11,11 @@ from calibre.utils.logging import default_log class PreProcessor(object): - def __init__(self, log=None): + def __init__(self, log=None, extra_opts=None): self.log = default_log if log is None else log self.html_preprocess_sections = 0 self.found_indents = 0 + self.extra_opts = extra_opts def chapter_head(self, match): chap = match.group('chap') @@ -91,6 +92,7 @@ class PreProcessor(object): # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) blankreg = re.compile(r'\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

</p>', re.IGNORECASE)
+        multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>
){2,}', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) if len(lines) > 1: @@ -146,16 +148,20 @@ class PreProcessor(object): else: format = 'html' - # Calculate Length - length = line_length(format, html, 0.4) + # Calculate Length + #if getattr(self.extra_opts, 'html_unwrap_factor', 0.0) > 0.01: + length = line_length('pdf', html, getattr(self.extra_opts, 'html_unwrap_factor')) + #else: + # length = line_length(format, html, 0.4) + # self.log("#@#%!$@#$ - didn't find unwrap_factor") self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") # # Unwrap and/or delete soft-hyphens, hyphens html = re.sub(u'­\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) - # Unwrap lines using punctation if the median length of all lines is less than 200 - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + # Unwrap lines using punctation and line length + unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) # If still no sections after unwrapping mark split points on lines with no punctuation diff --git a/src/calibre/gui2/convert/structure_detection.py b/src/calibre/gui2/convert/structure_detection.py index f2ca49d1bd..68f820bda4 100644 --- a/src/calibre/gui2/convert/structure_detection.py +++ b/src/calibre/gui2/convert/structure_detection.py @@ -26,7 +26,7 @@ class StructureDetectionWidget(Widget, Ui_Form): 'remove_first_image', 'insert_metadata', 'page_breaks_before', 'preprocess_html', 'remove_header', 'header_regex', - 'remove_footer', 'footer_regex'] + 'remove_footer', 'footer_regex','html_unwrap_factor'] ) self.db, self.book_id = db, book_id for x in ('pagebreak', 'rule', 'both', 'none'): @@ -64,3 +64,8 @@ class StructureDetectionWidget(Widget, Ui_Form): _('The XPath expression %s is invalid.')%x.text).exec_() return False return True + + def set_value_handler(self, g, val): + if val is None and isinstance(g, QDoubleSpinBox): + g.setValue(0.0) + return True \ No newline at end of file diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui index eb2892a07a..54534af950 100644 --- a/src/calibre/gui2/convert/structure_detection.ui +++ b/src/calibre/gui2/convert/structure_detection.ui @@ -48,17 +48,7 @@ - - - - &Preprocess input file to possibly improve structure detection - - - - - - - + Qt::Vertical @@ -88,8 +78,45 @@ - - + + + opt_page_breaks_before + + + + + opt_footer_regex + + + + + + &Preprocess input file to possibly improve structure detection + + + + + + + Qt::RightToLeft + + + Line Un-Wrapping Factor + + + + + + + 1.000000000000000 + + + 0.050000000000000 + + + 0.400000000000000 + + From 307f90457d6a7e9fa1c0fa1a88750b03df3d0386 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Sep 2010 15:17:40 -0600 Subject: [PATCH 06/12] Modify author sort tooltip to explain the color. 
Fix #6836 (Updated recipe for Adventure Gamers) --- resources/recipes/adventuregamers.recipe | 63 ++++++++++++--------- src/calibre/gui2/dialogs/metadata_single.py | 15 +++-- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/resources/recipes/adventuregamers.recipe b/resources/recipes/adventuregamers.recipe index 1cde045953..d08eca1723 100644 --- a/resources/recipes/adventuregamers.recipe +++ b/resources/recipes/adventuregamers.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2010, Darko Miletic ' ''' www.adventuregamers.com ''' @@ -10,14 +8,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdventureGamers(BasicNewsRecipe): title = u'Adventure Gamers' - language = 'en' - + language = 'en' __author__ = 'Darko Miletic' - description = 'Adventure games portal' + description = 'Adventure games portal' publisher = 'Adventure Gamers' - category = 'news, games, adventure, technology' - language = 'en' - + category = 'news, games, adventure, technology' oldest_article = 10 delay = 10 max_articles_per_feed = 100 @@ -26,14 +21,25 @@ class AdventureGamers(BasicNewsRecipe): remove_javascript = True use_embedded_content = False INDEX = u'http://www.adventuregamers.com' - - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + extra_css = """ + .pageheader_type{font-size: x-large; font-weight: bold; color: #828D74} + .pageheader_title{font-size: xx-large; color: #394128} + .pageheader_byline{font-size: small; font-weight: bold; color: #394128} + .score_bg {display: inline; width: 100%; margin-bottom: 2em} + .score_column_1{ padding-left: 10px; font-size: small; width: 50%} + .score_column_2{ padding-left: 10px; font-size: small; width: 50%} + .score_column_3{ padding-left: 10px; font-size: small; width: 50%} + .score_header{font-size: large; color: #50544A} + .bodytext{display: block} + body{font-family: Helvetica,Arial,sans-serif} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } keep_only_tags = [ dict(name='div', attrs={'class':'content_middle'}) @@ -43,14 +49,15 @@ class AdventureGamers(BasicNewsRecipe): dict(name=['object','link','embed','form']) ,dict(name='div', attrs={'class':['related-stories','article_leadout','prev','next','both']}) ] - + remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})] - + remove_attributes = ['width','height'] + feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')] - + def get_article_url(self, article): return article.get('guid', None) - + def append_page(self, soup, appendtag, position): pager = soup.find('div',attrs={'class':'toolbar_fat_next'}) if pager: @@ -59,19 +66,19 @@ class AdventureGamers(BasicNewsRecipe): texttag = soup2.find('div', attrs={'class':'bodytext'}) for it in texttag.findAll(style=True): del it['style'] - newpos = len(texttag.contents) + newpos = len(texttag.contents) self.append_page(soup2,texttag,newpos) texttag.extract() appendtag.insert(position,texttag) - - + + def preprocess_html(self, soup): - mtag = '\n' - soup.head.insert(0,mtag) for item in soup.findAll(style=True): del item['style'] + for item in soup.findAll('div', attrs={'class':'floatright'}): + item.extract() self.append_page(soup, soup.body, 3) pager = 
soup.find('div',attrs={'class':'toolbar_fat'}) if pager: - pager.extract() - return soup + pager.extract() + return self.adeify_images(soup) diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index ac10847f3d..b23baa9de6 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -6,10 +6,7 @@ The dialog used to edit meta information for a book as well as add/remove formats ''' -import os -import re -import time -import traceback +import os, re, time, traceback, textwrap from PyQt4.Qt import SIGNAL, QObject, Qt, QTimer, QThread, QDate, \ QPixmap, QListWidgetItem, QDialog, pyqtSignal @@ -331,6 +328,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): ResizableDialog.__init__(self, window) self.bc_box.layout().setAlignment(self.cover, Qt.AlignCenter|Qt.AlignHCenter) self.cancel_all = False + self.normal_aus_tooltip = unicode(self.author_sort.toolTip()) if cancel_all: self.__abort_button = self.button_box.addButton(self.button_box.Abort) self.__abort_button.setToolTip(_('Abort the editing of all remaining books')) @@ -454,6 +452,9 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): else: self.create_custom_column_editors() self.generate_cover_button.clicked.connect(self.generate_cover) + self.author_sort.setToolTip(textwrap.fill('

<p>'+self.normal_aus_tooltip+'<br><br>
'+ + _(' The green color indicates that the current ' + 'author sort matches the current author'))) def create_custom_column_editors(self): w = self.central_widget.widget(1) @@ -490,6 +491,12 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): col = 'rgb(255, 0, 0, 20%)' self.author_sort.setStyleSheet('QLineEdit { color: black; ' 'background-color: %s; }'%col) + tt = self.normal_aus_tooltip + if not normal: + tt = '

<p>'+textwrap.fill(tt + '<br><br>
'+ + _(' The red color indicates that the current ' + 'author sort does not match the current author')) + self.author_sort.setToolTip(tt) def validate_isbn(self, isbn): isbn = unicode(isbn).strip() From 40ee6a2140c963f575aaec506108448e0fa1ac4c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Sep 2010 15:49:23 -0600 Subject: [PATCH 07/12] Fix author sort tooltip handling --- src/calibre/gui2/dialogs/metadata_single.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index b23baa9de6..d07eac7670 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -328,7 +328,14 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): ResizableDialog.__init__(self, window) self.bc_box.layout().setAlignment(self.cover, Qt.AlignCenter|Qt.AlignHCenter) self.cancel_all = False - self.normal_aus_tooltip = unicode(self.author_sort.toolTip()) + base = unicode(self.author_sort.toolTip()) + self.ok_aus_tooltip = '

<p>' + textwrap.fill(base+'<br><br>
'+ + _(' The green color indicates that the current ' + 'author sort matches the current author')) + self.bad_aus_tooltip = '

<p>'+textwrap.fill(base + '<br><br>
'+ + _(' The red color indicates that the current ' + 'author sort does not match the current author')) + if cancel_all: self.__abort_button = self.button_box.addButton(self.button_box.Abort) self.__abort_button.setToolTip(_('Abort the editing of all remaining books')) @@ -452,9 +459,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): else: self.create_custom_column_editors() self.generate_cover_button.clicked.connect(self.generate_cover) - self.author_sort.setToolTip(textwrap.fill('

<p>'+self.normal_aus_tooltip+'<br><br>
'+ - _(' The green color indicates that the current ' - 'author sort matches the current author'))) def create_custom_column_editors(self): w = self.central_widget.widget(1) @@ -491,11 +495,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): col = 'rgb(255, 0, 0, 20%)' self.author_sort.setStyleSheet('QLineEdit { color: black; ' 'background-color: %s; }'%col) - tt = self.normal_aus_tooltip - if not normal: - tt = '

<p>'+textwrap.fill(tt + '<br><br>
'+ - _(' The red color indicates that the current ' - 'author sort does not match the current author')) + tt = self.ok_aus_tooltip if normal else self.bad_aus_tooltip self.author_sort.setToolTip(tt) def validate_isbn(self, isbn): From 57ae10c5705d82fb2f94da142484e4f2554557ae Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Sep 2010 16:00:10 -0600 Subject: [PATCH 08/12] Fix #6838 (Modifications to AJC recipe) --- resources/images/news/ajc.png | Bin 0 -> 1054 bytes resources/recipes/ajc.recipe | 40 +++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 resources/images/news/ajc.png diff --git a/resources/images/news/ajc.png b/resources/images/news/ajc.png new file mode 100644 index 0000000000000000000000000000000000000000..110f40e03942c07ddb28c5620b32348ec318dd03 GIT binary patch literal 1054 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87?|BXT^vI!PKQp8&6u6a(0cxNUHmM4-)W|* zkqKcDcYSu3>AsbkC6+R~xJluXl5P`^)}y^~JbOO)P1!hm)luoaGo%;yC|bD+n&;{s z%}n2}BI|p7>*gCe%t*UbH)EN_q@0}>sMCp9gP)Y8ci%sisFCjI!ksjEoESD;yCqw zS>Cq2``8OUq$oYmf4J{>NTMR6>!D$vmoD>v z-2P=1v07Q^V?yQ8BbSmS7cyJ*nf*-Ip*rcL%_mFgX{=Y|KQ4$UnNXKvb#TS+NS9qg z+xPuRnkpTw<~iGcn{J)ajzeuHRMy8tCfmd%=uAt{5qtjPgm6^f;h2k$E?iJ{^Hb4Z z^!dt_#}g;!&a?i`FFjkW;_q$WNl#9SPyUl_seHCI z#4SEv>-YKih(iZ%8dOzD2iBi(*tzDxx=9AwTz$@OKEJg1=)`+1?6idR$;7&CJ)O5J zQ_RmqRsPu_Z(f&oY^AYX%pa}hix+cu#2gZ0Df4%^%%Z>$D}3U)$hB}?3$vCUlkn$D z1$O*9E2WnI{KDO>{6SpqmsuGcycPs(C{z!0u*g}k_HK9McBdb?Ygo_Qq~|Q&`TO3H zzc+xxccByhkagZp?;)G?%db6*5BX0 z7-g$dAFn_6*xJHzuB$lLFB^RXv zDF!10Ljzp{OI<^=5JOWdQwu9YBV7YCD+2?oRmGsph|rLmpOTqYiK4;W%GB7(7^-1n SUQ;em1B0ilpUXO@geCx5k Date: Thu, 16 Sep 2010 17:06:32 -0600 Subject: [PATCH 09/12] Brand Eins by Constantin Hofstetter --- resources/recipes/brand_eins.recipe | 125 ++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 resources/recipes/brand_eins.recipe diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe new file mode 100644 index 0000000000..be5b98ffe6 --- /dev/null +++ b/resources/recipes/brand_eins.recipe @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2010, Constantin Hofstetter ' +__version__ = '0.95' + +''' http://brandeins.de - Wirtschaftsmagazin ''' +import re +import string +from calibre.web.feeds.recipes import BasicNewsRecipe + +class BrandEins(BasicNewsRecipe): + + title = u'Brand Eins' + __author__ = 'Constantin Hofstetter' + description = u'Wirtschaftsmagazin' + publisher ='brandeins.de' + category = 'politics, business, wirtschaft, Germany' + use_embedded_content = False + lang = 'de-DE' + no_stylesheets = True + encoding = 'utf-8' + language = 'de' + + # 2 is the last full magazine (default) + # 1 is the newest (but not full) + # 3 is one before 2 etc. 
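+    # (which_ausgabe is used in parse_index below as an offset from the
+    # end of the archive link list, so 1 is always the newest issue)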
+ which_ausgabe = 2 + + keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})] + + ''' + brandeins.de + ''' + + def postprocess_html(self, soup,first): + + # Move the image of the sidebar right below the h3 + first_h3 = soup.find(name='div', attrs={'id':'theContent'}).find('h3') + for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}): + if len(first_h3.findNextSiblings('div', {'class':'intro'})) >= 1: + # first_h3.parent.insert(2, imgdiv) + first_h3.findNextSiblings('div', {'class':'intro'})[0].parent.insert(4, imgdiv) + else: + first_h3.parent.insert(2, imgdiv) + + # Now, remove the sidebar + soup.find(name='div', attrs={'id':'sidebar'}).extract() + + # Remove the rating-image (stars) from the h3 + for img in first_h3.findAll(name='img'): + img.extract() + + # Mark the intro texts as italic + for div in soup.findAll(name='div', attrs={'class':'intro'}): + for p in div.findAll('p'): + content = self.tag_to_string(p) + new_p = "

"+ content +"

" + p.replaceWith(new_p) + + return soup + + def parse_index(self): + feeds = [] + + archive = "http://www.brandeins.de/archiv.html" + + soup = self.index_to_soup(archive) + latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0] + pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe] + url = pre_latest_issue.get('href', False) + # Get the title for the magazin - build it out of the title of the cover - take the issue and year; + self.title = "Brand Eins "+ re.search(r"(?P\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date') + url = 'http://brandeins.de/'+url + + # url = "http://www.brandeins.de/archiv/magazin/tierisch.html" + titles_and_articles = self.brand_eins_parse_latest_issue(url) + if titles_and_articles: + for title, articles in titles_and_articles: + feeds.append((title, articles)) + return feeds + + def brand_eins_parse_latest_issue(self, url): + soup = self.index_to_soup(url) + article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})] + + titles_and_articles = [] + current_articles = [] + chapter_title = "Editorial" + self.log('Found Chapter:', chapter_title) + + # Remove last list of links (thats just the impressum and the 'gewinnspiel') + article_lists[1].findAll('ul')[len(article_lists[1].findAll('ul'))-1].extract() + + for article_list in article_lists: + for chapter in article_list.findAll('ul'): + if len(chapter.findPreviousSiblings('h3')) >= 1: + new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0])) + if new_chapter_title != chapter_title: + titles_and_articles.append([chapter_title, current_articles]) + current_articles = [] + self.log('Found Chapter:', new_chapter_title) + chapter_title = new_chapter_title + for li in chapter.findAll('li'): + a = li.find('a', href = True) + if a is None: + continue + title = self.tag_to_string(a) + url = a.get('href', False) + if not url or not title: + continue + url = 'http://brandeins.de/'+url + if len(a.parent.findNextSiblings('p')) >= 1: + description = self.tag_to_string(a.parent.findNextSiblings('p')[0]) + else: + description = '' + + self.log('\t\tFound article:', title) + self.log('\t\t\t', url) + self.log('\t\t\t', description) + + current_articles.append({'title': title, 'url': url, 'description': description, 'date':''}) + titles_and_articles.append([chapter_title, current_articles]) + return titles_and_articles From 06d4d52fead99546c3c9bc50ae293d1f287e8b77 Mon Sep 17 00:00:00 2001 From: Timothy Legge Date: Thu, 16 Sep 2010 21:12:36 -0300 Subject: [PATCH 10/12] Implement Kobo Im Reading list support --- src/calibre/devices/kobo/driver.py | 121 +++++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 8 deletions(-) diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index f24e00143b..a2be629449 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -5,15 +5,16 @@ __license__ = 'GPL v3' __copyright__ = '2010, Timothy Legge and Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os +import os, time import sqlite3 as sqlite from calibre.devices.usbms.books import BookList from calibre.devices.kobo.books import Book from calibre.devices.kobo.books import ImageWrapper from calibre.devices.mime import mime_type_ext -from calibre.devices.usbms.driver import USBMS +from 
calibre.devices.usbms.driver import USBMS, debug_print from calibre import prints +from calibre.devices.usbms.books import CollectionsBookList class KOBO(USBMS): @@ -21,12 +22,15 @@ class KOBO(USBMS): gui_name = 'Kobo Reader' description = _('Communicate with the Kobo Reader') author = 'Timothy Legge and Kovid Goyal' - version = (1, 0, 4) + version = (1, 0, 6) supported_platforms = ['windows', 'osx', 'linux'] + booklist_class = CollectionsBookList + # Ordered list of supported formats FORMATS = ['epub', 'pdf'] + CAN_SET_METADATA = True VENDOR_ID = [0x2237] PRODUCT_ID = [0x4161] @@ -40,6 +44,12 @@ class KOBO(USBMS): VIRTUAL_BOOK_EXTENSIONS = frozenset(['kobo']) + EXTRA_CUSTOMIZATION_MESSAGE = _('The Kobo supports only one collection ' + 'currently: the \"Im_Reading\" list. Create a tag called \"Im_Reading\" ')+\ + 'for automatic management' + + EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['tags']) + def initialize(self): USBMS.initialize(self) self.book_class = Book @@ -63,6 +73,8 @@ class KOBO(USBMS): self._card_b_prefix if oncard == 'cardb' \ else self._main_prefix + self.booklist_class.rebuild_collections = self.rebuild_collections + # get the metadata cache bl = self.booklist_class(oncard, prefix, self.settings) need_sync = self.parse_metadata_cache(bl, prefix, self.METADATA_CACHE) @@ -85,9 +97,7 @@ class KOBO(USBMS): playlist_map = {} if readstatus == 1: - if lpath not in playlist_map: - playlist_map[lpath] = [] - playlist_map[lpath].append("I\'m Reading") + playlist_map[lpath]= "Im_Reading" path = self.normalize_path(path) # print "Normalized FileName: " + path @@ -104,14 +114,17 @@ class KOBO(USBMS): if self.update_metadata_item(bl[idx]): # print 'update_metadata_item returned true' changed = True - bl[idx].device_collections = playlist_map.get(lpath, []) + if lpath in playlist_map and \ + playlist_map[lpath] not in bl[idx].device_collections: + bl[idx].device_collections.append(playlist_map[lpath]) else: if ContentType == '6': book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576) else: book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID) # print 'Update booklist' - book.device_collections = playlist_map.get(book.lpath, []) + book.device_collections = [playlist_map[lpath]] if lpath in playlist_map else [] + if bl.add_book(book, replace_metadata=False): changed = True except: # Probably a path encoding error @@ -398,3 +411,95 @@ class KOBO(USBMS): size = os.stat(cls.normalize_path(os.path.join(prefix, lpath))).st_size book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=size, other=mi) return book + + def get_device_paths(self): + paths, prefixes = {}, {} + for prefix, path, source_id in [ + ('main', 'metadata.calibre', 0), + ('card_a', 'metadata.calibre', 1), + ('card_b', 'metadata.calibre', 2) + ]: + prefix = getattr(self, '_%s_prefix'%prefix) + if prefix is not None and os.path.exists(prefix): + paths[source_id] = os.path.join(prefix, *(path.split('/'))) + return paths + + def update_device_database_collections(self, booklists, collections_attributes): +# debug_print('Starting update_device_database_collections', collections_attributes) + + # Force collections_attributes to be 'tags' as no other is currently supported +# debug_print('KOBO: overriding the provided collections_attributes:', collections_attributes) + collections_attributes = ['tags'] + + collections = booklists.get_collections(collections_attributes) +# debug_print('Collections', collections) + for category, books in 
collections.items(): + if category == 'Im_Reading': + # Create a connection to the sqlite database + connection = sqlite.connect(self._main_prefix + '.kobo/KoboReader.sqlite') + cursor = connection.cursor() + + # Reset Im_Reading list in the database + query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null' + try: + cursor.execute (query) + except: + debug_print('Database Exception: Unable to reset Im_Reading list') + raise + else: +# debug_print('Commit: Reset Im_Reading list') + connection.commit() + + for book in books: +# debug_print('Title:', book.title, 'lpath:', book.path) + book.device_collections = ['Im_Reading'] + + extension = os.path.splitext(book.path)[1] + ContentType = self.get_content_type_from_extension(extension) + + ContentID = self.contentid_from_path(book.path, ContentType) + datelastread = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime()) + + t = (datelastread,ContentID,) + + try: + cursor.execute('update content set ReadStatus=1,FirstTimeReading=\'false\',DateLastRead=? where BookID is Null and ContentID = ?', t) + except: + debug_print('Database Exception: Unable create Im_Reading list') + raise + else: + connection.commit() + # debug_print('Database: Commit create Im_Reading list') + + cursor.close() + connection.close() + +# debug_print('Finished update_device_database_collections', collections_attributes) + + def sync_booklists(self, booklists, end_session=True): +# debug_print('KOBO: started sync_booklists') + paths = self.get_device_paths() + + blists = {} + for i in paths: + if booklists[i] is not None: + #debug_print('Booklist: ', i) + blists[i] = booklists[i] + opts = self.settings() + if opts.extra_customization: + collections = [x.lower().strip() for x in + opts.extra_customization.split(',')] + else: + collections = [] + + #debug_print('KOBO: collection fields:', collections) + for i, blist in blists.items(): + self.update_device_database_collections(blist, collections) + + USBMS.sync_booklists(self, booklists, end_session=end_session) + #debug_print('KOBO: finished sync_booklists') + + def rebuild_collections(self, booklist, oncard): + collections_attributes = [] + self.update_device_database_collections(booklist, collections_attributes) + From 2215d2b3b5a8552d5503262893add5d43ece0b37 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Sep 2010 18:49:00 -0600 Subject: [PATCH 11/12] Add documentation for the preprocess option to the User Manual --- src/calibre/manual/conversion.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index c8bc3ef665..cfc2871396 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -329,6 +329,17 @@ There are a few more options in this section. of as a separate cover. If you also specify a cover in |app|, then the converted book will have two covers. This option will simply remove the first image from the source document, thereby ensuring that the converted book has only one cover, the one specified in |app|. + +:guilabel:`Preprocess input` + This option activates various algorithms that try to detect and correct common cases of + badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc. + Turn this option on if your input document suffers from bad formatting. But be aware that in + some cases, this option can lead to worse results, so use with care. 
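+    For example, text that originated as a PDF typically has a hard line break at
+    the end of every printed line. Preprocessing uses the line length statistics
+    of the document to detect and remove such breaks, while leaving real
+    paragraph breaks alone.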
+ +:guilabel:`Line-unwrap factor` + This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this + option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less + than the length of 40% of all lines in the document. Table of Contents ------------------ From aef743316075dd034a47f4098fe67bdc5c1f8868 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Sep 2010 19:02:18 -0600 Subject: [PATCH 12/12] Improved recipe for Slate --- resources/recipes/slate.recipe | 192 +++++++++++++++++---------------- 1 file changed, 100 insertions(+), 92 deletions(-) diff --git a/resources/recipes/slate.recipe b/resources/recipes/slate.recipe index 9da1c4da78..f2a5b71e3c 100644 --- a/resources/recipes/slate.recipe +++ b/resources/recipes/slate.recipe @@ -1,7 +1,8 @@ #!/usr/bin/env python +__copyright__ = '2008, Kovid Goyal ' __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' + ''' calibre recipe for slate.com ''' @@ -10,13 +11,12 @@ import re from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag -class PeriodicalNameHere(BasicNewsRecipe): +class Slate(BasicNewsRecipe): # Method variables for customizing downloads - title = 'Slate' description = 'A general-interest publication offering analysis and commentary about politics, news and culture.' - __author__ = 'GRiker and Sujata Raman' - max_articles_per_feed = 20 - oldest_article = 7.0 + __author__ = 'GRiker, Sujata Raman and Nick Redding' + max_articles_per_feed = 100 + oldest_article = 14 recursions = 0 delay = 0 simultaneous_downloads = 5 @@ -27,6 +27,12 @@ class PeriodicalNameHere(BasicNewsRecipe): encoding = None language = 'en' + slate_complete = True + if slate_complete: + title = 'Slate (complete)' + else: + title = 'Slate (weekly)' + # Method variables for customizing feed parsing summary_length = 250 use_embedded_content = None @@ -42,26 +48,15 @@ class PeriodicalNameHere(BasicNewsRecipe): match_regexps = [] # The second entry is for 'Big Money', which comes from a different site, uses different markup - keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body', 'story']}), + keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}), dict(attrs={ 'id':['content']}) ] # The second entry is for 'Big Money', which comes from a different site, uses different markup - remove_tags = [dict(attrs={ 'id':[ - 'add_comments_button', - 'article_bottom_tools', - 'article_bottom_tools_cntr', - 'bizbox_links_bottom', - 'BOXXLE', - 'comments_button', - 'comments-to-fray', - 'fbog_article_bottom_cntr', - 'fray_article_discussion', 'fray_article_links','bottom_sponsored_links','author_bio', - 'insider_ad_wrapper', - 'js_kit_cntr', - 'recommend_tab', - 'ris_links_wrapper', - 'toolbox', - ]}), + remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper', + 'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio', + 'bizbox_links_bottom','ris_links_wrapper','BOXXLE', + 'comments_button','add_comments_button','comments-to-fray','marriott_ad', + 'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}), dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ] excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast'] @@ -72,16 +67,15 @@ class PeriodicalNameHere(BasicNewsRecipe): extra_css = ''' .h1_subhead{font-family:Arial; font-size:small; } 
h1{font-family:Verdana; font-size:large; } - .byline {font-family:Georgia; margin-bottom: 0px; color: #660033;} - .dateline {font-family:Arial; font-size: smaller; height: 0pt; color:#666666;} + .byline {font-family:Georgia; margin-bottom: 0px; } + .dateline {font-family:Arial; font-size: smaller; height: 0pt;} .imagewrapper {font-family:Verdana;font-size:x-small; } .source {font-family:Verdana; font-size:x-small;} .credit {font-family:Verdana; font-size: smaller;} #article_body {font-family:Verdana; } #content {font-family:Arial; } .caption{font-family:Verdana;font-style:italic; font-size:x-small;} - h3{font-family:Arial; color:#666666; font-size:small} - a{color:#0066CC;} + h3{font-family:Arial; font-size:small} ''' # Local variables to extend class @@ -99,32 +93,59 @@ class PeriodicalNameHere(BasicNewsRecipe): if isinstance(item, (NavigableString, CData)): strings.append(item.string) elif isinstance(item, Tag): - res = self.tag_to_string(item) + res = self.tag_to_string(item,use_alt=False) if res: strings.append(res) return strings - - def extract_sections(self): + def extract_named_sections(self): soup = self.index_to_soup( self.baseURL ) - soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'}) + soup_nav_bar = soup.find(True, attrs={'id':'nav'}) + briefing_nav = soup.find('li') + briefing_url = briefing_nav.a['href'] + for section_nav in soup_nav_bar.findAll('li'): + section_name = self.tag_to_string(section_nav,use_alt=False) + self.section_dates.append(section_name) + + soup = self.index_to_soup(briefing_url) + + self.log("Briefing url = %s " % briefing_url) + section_lists = soup.findAll('ul','view_links_list') + + sections = [] + for section in section_lists : + sections.append(section) + return sections + + + def extract_dated_sections(self): + soup = self.index_to_soup( self.baseURL ) + soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'}) + if soup_top_stories: + self.section_dates.append("Top Stories") + self.log("SELECTION TOP STORIES %s" % "Top Stories") + soup = soup.find(True, attrs={'id':'toc_links_container'}) todays_section = soup.find(True, attrs={'class':'todaydateline'}) self.section_dates.append(self.tag_to_string(todays_section,use_alt=False)) + self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False)) older_section_dates = soup.findAll(True, attrs={'class':'maindateline'}) for older_section in older_section_dates : self.section_dates.append(self.tag_to_string(older_section,use_alt=False)) + self.log("SELECTION DATE %s" % self.tag_to_string(older_section,use_alt=False)) if soup_top_stories: - headline_stories = soup_top_stories.find('ul') + headline_stories = soup_top_stories + self.log("HAVE top_stories") else: headline_stories = None + self.log("NO top_stories") section_lists = soup.findAll('ul') # Prepend the headlines to the first section if headline_stories: - section_lists[0].insert(0,headline_stories) + section_lists.insert(0,headline_stories) sections = [] for section in section_lists : @@ -133,9 +154,8 @@ class PeriodicalNameHere(BasicNewsRecipe): def extract_section_articles(self, sections_html) : - # Find the containers with section content - soup = self.index_to_soup(str(sections_html)) - sections = soup.findAll('ul') + # Find the containers with section content + sections = sections_html articles = {} key = None @@ -145,10 +165,25 @@ class PeriodicalNameHere(BasicNewsRecipe): # Get the section name if section.has_key('id') : + self.log("PROCESSING SECTION id = %s" % section['id']) key = 
self.section_dates[i] + if key.startswith("Pod"): + continue + if key.startswith("Blog"): + continue + articles[key] = [] + ans.append(key) + elif self.slate_complete: + key = self.section_dates[i] + if key.startswith("Pod"): + continue + if key.startswith("Blog"): + continue + self.log("PROCESSING SECTION name = %s" % key) articles[key] = [] ans.append(key) else : + self.log("SECTION %d HAS NO id" % i); continue # Get the section article_list @@ -159,8 +194,10 @@ class PeriodicalNameHere(BasicNewsRecipe): bylines = self.tag_to_strings(article) url = article.a['href'] title = bylines[0] - full_title = self.tag_to_string(article) - + full_title = self.tag_to_string(article,use_alt=False) + #self.log("ARTICLE TITLE%s" % title) + #self.log("ARTICLE FULL_TITLE%s" % full_title) + #self.log("URL %s" % url) author = None description = None pubdate = None @@ -191,7 +228,7 @@ class PeriodicalNameHere(BasicNewsRecipe): excluded = re.compile('|'.join(self.excludedDescriptionKeywords)) found_excluded = excluded.search(description) if found_excluded : - if self.verbose : self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) + self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) continue # Skip articles whose title contain excluded keywords @@ -200,7 +237,7 @@ class PeriodicalNameHere(BasicNewsRecipe): #self.log("evaluating full_title: %s" % full_title) found_excluded = excluded.search(full_title) if found_excluded : - if self.verbose : self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) + self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) continue # Skip articles whose author contain excluded keywords @@ -208,7 +245,7 @@ class PeriodicalNameHere(BasicNewsRecipe): excluded = re.compile('|'.join(self.excludedAuthorKeywords)) found_excluded = excluded.search(author) if found_excluded : - if self.verbose : self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) + self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) continue skip_this_article = False @@ -216,6 +253,7 @@ class PeriodicalNameHere(BasicNewsRecipe): for article in articles[key] : if article['url'] == url : skip_this_article = True + self.log("SKIPPING DUP %s" % url) break if skip_this_article : @@ -227,6 +265,8 @@ class PeriodicalNameHere(BasicNewsRecipe): articles[feed] = [] articles[feed].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) + #self.log("KEY %s" % feed) + #self.log("APPENDED %s" % url) # Promote 'newspapers' to top for (i,article) in enumerate(articles[feed]) : if article['description'] is not None : @@ -235,32 +275,6 @@ class PeriodicalNameHere(BasicNewsRecipe): ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - ans = self.remove_duplicates(ans) - return ans - - def flatten_document(self, ans): - flat_articles = [] - for (i,section) in enumerate(ans) : - #self.log("flattening section %s: " % section[0]) - for article in section[1] : - #self.log("moving %s to flat_articles[]" % article['title']) - flat_articles.append(article) - flat_section = ['All Articles', flat_articles] - flat_ans = [flat_section] - return flat_ans - - def remove_duplicates(self, ans): - # Return a stripped ans - for (i,section) in enumerate(ans) : - #self.log("section %s: " % section[0]) - for 
article in section[1] : - #self.log("\t%s" % article['title']) - #self.log("\looking for %s" % article['url']) - for (j,subsequent_section) in enumerate(ans[i+1:]) : - for (k,subsequent_article) in enumerate(subsequent_section[1]) : - if article['url'] == subsequent_article['url'] : - #self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) ) - del subsequent_section[1][k] return ans def print_version(self, url) : @@ -268,13 +282,22 @@ class PeriodicalNameHere(BasicNewsRecipe): # Class methods def parse_index(self) : - sections = self.extract_sections() + if self.slate_complete: + sections = self.extract_named_sections() + else: + sections = self.extract_dated_sections() section_list = self.extract_section_articles(sections) - section_list = self.flatten_document(section_list) return section_list - def get_browser(self) : - return BasicNewsRecipe.get_browser() + def get_masthead_url(self): + masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nMasthead unavailable") + masthead = None + return masthead def stripAnchors(self,soup): body = soup.find('div',attrs={'id':['article_body','content']}) @@ -304,8 +327,8 @@ class PeriodicalNameHere(BasicNewsRecipe): excluded = re.compile('|'.join(self.excludedContentKeywords)) found_excluded = excluded.search(str(soup)) if found_excluded : - print "no allowed content found, removing article" - raise Exception('String error') + print "No allowed content found, removing article" + raise Exception('Rejected article') # Articles from www.thebigmoney.com use different tagging for byline, dateline and body head = soup.find('head') @@ -338,7 +361,6 @@ class PeriodicalNameHere(BasicNewsRecipe): dept_kicker = soup.find('div', attrs={'class':'department_kicker'}) if dept_kicker is not None : kicker_strings = self.tag_to_strings(dept_kicker) - #kicker = kicker_strings[2] + kicker_strings[3] kicker = ''.join(kicker_strings[2:]) kicker = re.sub('\.','',kicker) h3Tag = Tag(soup, "h3") @@ -346,25 +368,11 @@ class PeriodicalNameHere(BasicNewsRecipe): emTag.insert(0,NavigableString(kicker)) h3Tag.insert(0, emTag) dept_kicker.replaceWith(h3Tag) + else: + self.log("No kicker--return null") + return None - # Change

<h1> to <h2 class="headline">
- headline = soup.find("h1") - #tag = headline.find("span") - #tag.name = 'div' - - if headline is not None : - h2tag = Tag(soup, "h2") - h2tag['class'] = "headline" - strs = self.tag_to_strings(headline) - result = '' - for (i,substr) in enumerate(strs) : - result += substr - if i < len(strs) -1 : - result += '
' - #h2tag.insert(0, result) - #headline.replaceWith(h2tag) - - # Fix up the concatenated byline and dateline + # Fix up the concatenated byline and dateline byline = soup.find(True,attrs={'class':'byline'}) if byline is not None : bylineTag = Tag(soup,'div')
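
Note on the unwrapping heuristic added in patches 04, 05 and 11: line lengths are
collected, the length at the percentile given by html_unwrap_factor is taken as a
threshold, and a line break is joined onto the previous line only when that line is
at least threshold characters long and ends as if mid-sentence. Below is a minimal,
self-contained sketch of that idea. It is an illustration, not the patched code: it
works on plain text instead of the HTML the real rules operate on, uses a loop
instead of the fixed-width lookbehind regexes in the patches, and the helper names
are hypothetical (the real percentile helper is line_length() in
src/calibre/ebooks/conversion/preprocess.py).

    import re

    def line_length_at_percentile(lines, percent):
        # Sort the lengths of all non-empty lines and return the length
        # found at the requested percentile (0.0 shortest, 1.0 longest).
        lengths = sorted(len(line.strip()) for line in lines if line.strip())
        if not lengths:
            return 0
        index = min(int(len(lengths) * percent), len(lengths) - 1)
        return lengths[index]

    def unwrap_hard_breaks(text, unwrap_factor=0.40):
        lines = text.split('\n')
        threshold = line_length_at_percentile(lines, unwrap_factor)
        # A line is joined to the next one only if it is "long" (at least
        # the threshold) and ends in a character suggesting the sentence
        # continues: lowercase letter, comma, semicolon, colon, hyphen...
        continues = re.compile(r'[a-z,;:)\-]$')
        out = []
        for line in lines:
            if (out and out[-1] and line and
                    len(out[-1]) >= threshold and continues.search(out[-1])):
                out[-1] += ' ' + line       # unwrap: join onto previous line
            else:
                out.append(line)            # keep the break (e.g. paragraphs)
        return '\n'.join(out)

    sample = ("It was a dark and stormy\n"
              "night; the rain fell in\n"
              "torrents.\n"
              "\n"
              "A new paragraph begins.\n")
    print(unwrap_hard_breaks(sample))

With the default factor of 0.40 the threshold sits just below the median line
length. Reducing the factor lowers the threshold and unwraps more aggressively,
which is what the new option's help text recommends for documents whose line
lengths vary widely.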