From ab2e05b2c765a85b6c64e67f6b6b99976a4491ba Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 4 Oct 2009 09:34:55 -0600
Subject: [PATCH] IGN:Updated recipe for Honolulu Advertiser

---
 src/calibre/devices/usbms/driver.py           |  5 ++
 src/calibre/web/feeds/recipes/__init__.py     |  2 +-
 .../recipes/recipe_honoluluadvertiser.py      | 77 ++++++++++++++-----
 3 files changed, 65 insertions(+), 19 deletions(-)
diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py
index 5650a2f10e..12172f298b 100644
--- a/src/calibre/devices/usbms/driver.py
+++ b/src/calibre/devices/usbms/driver.py
@@ -162,6 +162,7 @@ class USBMS(CLI, Device):
     @classmethod
     def book_from_path(cls, path):
         from calibre.ebooks.metadata.meta import path_to_ext
+        from calibre.ebooks.metadata import MetaInformation
         mime = mime_type_ext(path_to_ext(path))
 
         if cls.settings().read_metadata or cls.MUST_READ_METADATA:
@@ -171,6 +172,10 @@ class USBMS(CLI, Device):
             mi = metadata_from_filename(os.path.basename(path),
                 re.compile(r'^(?P<title>[ \S]+?)[ _]-[ _](?P<author>[ \S]+?)_+\d+'))
 
+        if mi is None:
+            mi = MetaInformation(os.path.splitext(os.path.basename(path))[0],
+                    [_('Unknown')])
+
         authors = authors_to_string(mi.authors)
 
         book = Book(path, mi.title, authors, mime)
diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index cb52d41111..dc5b7664f7 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -6,7 +6,7 @@ Builtin recipes.
 '''
 recipe_modules = ['recipe_' + r for r in (
            'newsweek', 'atlantic', 'economist', 'portfolio', 'the_register',
-           'usatoday', 'outlook_india', 'bbc', 'greader', 'wsj',
+           'usatoday', 'bbc', 'greader', 'wsj', #'outlook_india',
            'wired', 'globe_and_mail', 'smh', 'espn', 'business_week', 'miami_herald',
            'ars_technica', 'upi', 'new_yorker', 'irish_times', 'lanacion',
            'discover_magazine', 'scientific_american', 'new_york_review_of_books',
diff --git a/src/calibre/web/feeds/recipes/recipe_honoluluadvertiser.py b/src/calibre/web/feeds/recipes/recipe_honoluluadvertiser.py
index 99a5d674ae..bc7f8cc874 100644
--- a/src/calibre/web/feeds/recipes/recipe_honoluluadvertiser.py
+++ b/src/calibre/web/feeds/recipes/recipe_honoluluadvertiser.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: cp1252 -*-
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
@@ -10,28 +11,62 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class Honoluluadvertiser(BasicNewsRecipe):
     title                 = 'Honolulu Advertiser'
-    __author__            = 'Darko Miletic'
+    __author__            = 'Darko Miletic and Sujata Raman'
     description           = "Latest national and local Hawaii sports news from The Honolulu Advertiser."
     publisher             = 'Honolulu Advertiser'
     category              = 'news, Honolulu, Hawaii'
     oldest_article        = 2
-    language              = 'en'
+    language = 'en'
+
     max_articles_per_feed = 100
     no_stylesheets        = True
     use_embedded_content  = False
     encoding              = 'cp1252'
+    remove_javascript     = True
+    cover_url             = 'http://www.honoluluadvertiser.com/graphics/frontpage/frontpage.jpg'
 
-    conversion_options = {
-                             'comments'  : description
-                            ,'tags'      : category
-                            ,'language'  : language
-                            ,'publisher' : publisher
-                         }
+    html2lrf_options = [
+                          '--comment'       , description
+                        , '--category'      , category
+                        , '--publisher'     , publisher
+                        ]
 
-    keep_only_tags = [dict(name='td')]
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
 
-    remove_tags = [dict(name=['object','link'])]
-    remove_attributes = ['style']
+    keep_only_tags = [dict(name='div', attrs={'class':["hon_article_top","article-bodytext","hon_article_photo","storyphoto","article"]}),
+                      dict(name='div', attrs={'id':["storycontentleft","article"]})
+                      ]
+
+    remove_tags = [dict(name=['object','link','embed']),
+                   dict(name='div', attrs={'class':["article-tools","titleBar","invisiblespacer","articleflex-container","hon_newslist","categoryheader","columnframe","subHeadline","poster-container"]}),
+                   dict(name='div', attrs={'align':["right"]}),
+                   dict(name='div', attrs={'id':["pluckcomments"]}),
+                   dict(name='td', attrs={'class':["prepsfacts"]}),
+                   dict(name='img', attrs={'height':["1"]}),
+                   dict(name='img', attrs={'alt':["Advertisement"]}),
+                   dict(name='img', attrs={'src':["/gcicommonfiles/sr/graphics/common/adlabel_horz.gif","/gcicommonfiles/sr/graphics/common/icon_whatsthis.gif",]}),
+                   ]
+
+    extra_css = '''
+                    h1{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#000000; }
+                    .hon_article_timestamp{font-family:Arial,Helvetica,sans-serif; font-size:70%; }
+                    .postedStoryDate{font-family:Arial,Helvetica,sans-serif; font-size:30%; }
+                    .postedDate{font-family:Arial,Helvetica,sans-serif; font-size:30%; }
+                    .credit{font-family:Arial,Helvetica,sans-serif; font-size:30%; }
+                    .hon_article_top{font-family:Arial,Helvetica,sans-serif; color:#666666; font-size:30%; font-weight:bold;}
+                    .grayBackground{font-family:Arial,Helvetica,sans-serif; color:#666666; font-size:30%;}
+                    .hon_photocaption{font-family:Arial,Helvetica,sans-serif; font-size:30%; }
+                    .photoCaption{font-family:Arial,Helvetica,sans-serif; font-size:30%; }
+                    .hon_photocredit{font-family:Arial,Helvetica,sans-serif; font-size:30%; color:#666666;}
+                    .storyphoto{font-family:Arial,Helvetica,sans-serif; font-size:30%; color:#666666;}
+                    .article-bodytext{font-family:Arial,Helvetica,sans-serif; font-size:xx-small; }
+                    .storycontentleft{font-family:Arial,Helvetica,sans-serif; font-size:xx-small; }
+                    #article{font-family:Arial,Helvetica,sans-serif; font-size:xx-small; }
+                    .contentarea{font-family:Arial,Helvetica,sans-serif; font-size:xx-small; }
+                    .storytext{font-family:Verdana,Arial,Helvetica,sans-serif; font-size:xx-small;}
+                    .storyHeadline{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#000000; font-weight:bold;}
+                    .source{font-family:Arial,Helvetica,sans-serif; color:#333333; font-style: italic; font-weight:bold; }
+                '''
 
     feeds = [
               (u'Breaking news', u'http://www.honoluluadvertiser.com/apps/pbcs.dll/section?Category=RSS01&MIME=XML' )
@@ -43,13 +78,19 @@ class Honoluluadvertiser(BasicNewsRecipe):
             ]
 
     def preprocess_html(self, soup):
-        st = soup.find('td')
-        if st:
-           st.name = 'div'
+        for item in soup.findAll(style=True):
+            del item['style']
+        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n'
+        soup.head.insert(0,mtag)
+
+        for tag in soup.findAll(name=['span','table','font']):
+               tag.name = 'div'
+
         return soup
 
-    def print_version(self, url):
-        ubody, sep, rest = url.rpartition('?source')
-        root, sep2, article_id = ubody.partition('/article/')
-        return u'http://www.honoluluadvertiser.com/apps/pbcs.dll/article?AID=/' + article_id + '&template=printart'
+
+   # def print_version(self, url):
+   #     ubody, sep, rest = url.rpartition('/-1/')
+   #     root, sep2, article_id = ubody.partition('/article/')
+   #     return u'http://www.honoluluadvertiser.com/apps/pbcs.dll/article?AID=/' + article_id + '&template=printart'