From 686bf72fe1fe67b148db98418392884230ae91d0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 19 May 2009 12:59:51 -0700
Subject: [PATCH] HTML Input: Fix handling of --input-encoding option

---
 src/calibre/ebooks/conversion/plumber.py         |  5 +++--
 src/calibre/ebooks/html/input.py                 | 15 +++++++++++++--
 src/calibre/ebooks/oeb/base.py                   |  2 +-
 src/calibre/gui2/add.py                          |  2 +-
 src/calibre/web/feeds/recipes/recipe_newsweek.py | 14 +++-----------
 todo                                             |  2 ++
 6 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 7387cf158e..eb61e6d988 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -670,7 +670,8 @@ OptionRecommendation(name='list_recipes',
         self.ui_reporter(1.)
         self.log(self.output_fmt.upper(), 'output written to', self.output)
 
-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
+def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+        encoding='utf-8'):
     '''
     Create an OEBBook.
     '''
@@ -678,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
     html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
             opts.preprocess_html)
     oeb = OEBBook(log, html_preprocessor,
-            pretty_print=opts.pretty_print)
+            pretty_print=opts.pretty_print, encoding=encoding)
     # Read OEB Book into OEBBook
     log('Parsing all content...')
     if reader is None:
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 255d975b1e..82c4f795d4 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
 from urllib import unquote
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre import unicode_path
@@ -262,11 +262,19 @@ class HTMLInput(InputFormatPlugin):
         ),
     ])
 
+    def decode(self, raw):
+        '''Decode raw with opts.input_encoding if set, then run xml_to_unicode.'''
+        if self.opts.input_encoding:
+            raw = raw.decode(self.opts.input_encoding, 'replace')
+        return xml_to_unicode(raw, verbose=self.opts.verbose,
+                strip_encoding_pats=True, resolve_entities=True)[0]
+
     def convert(self, stream, opts, file_ext, log,
                 accelerators):
         from calibre.ebooks.metadata.meta import get_metadata
 
         basedir = os.getcwd()
+        self.opts = opts
 
         if hasattr(stream, 'name'):
             basedir = os.path.dirname(stream.name)
@@ -284,11 +292,14 @@ class HTMLInput(InputFormatPlugin):
             mi.render(open('metadata.opf', 'wb'))
             opfpath = os.path.abspath('metadata.opf')
 
+        opf = OPF(opfpath, os.getcwdu())
+
         if opts.dont_package:
             return opfpath
 
         from calibre.ebooks.conversion.plumber import create_oebbook
-        oeb = create_oebbook(log, opfpath, opts, self)
+        oeb = create_oebbook(log, opfpath, opts, self,
+                encoding=opts.input_encoding)
 
         from calibre.ebooks.oeb.transforms.package import Package
         Package(os.getcwdu())(oeb, opts)
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index bdf78f96e4..55cc2f926b 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1590,7 +1590,7 @@ class OEBBook(object):
                 pass
         if self.encoding is not None:
             try:
-                return fix_data(data.decode(self.encoding))
+                return fix_data(data.decode(self.encoding, 'replace'))
             except UnicodeDecodeError:
                 pass
         try:
diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py
index 75c5f721d7..f5c8be4338 100644
--- a/src/calibre/gui2/add.py
+++ b/src/calibre/gui2/add.py
@@ -1,5 +1,5 @@
 '''
-UI for adding books to the database
+UI for adding books to the database and saving books to disk
 '''
 import os
 from Queue import Queue, Empty
diff --git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py
index 863bbb10a4..7d1fc403d4 100644
--- a/src/calibre/web/feeds/recipes/recipe_newsweek.py
+++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py
@@ -2,7 +2,7 @@
 
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import re, time
+import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 
@@ -30,12 +30,12 @@ class Newsweek(BasicNewsRecipe):
 
     def find_title(self, section):
         d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features',
-        None:'Departments'}
+                None:'Departments', 'culture':'Culture'}  # None: no named anchor found
         ans = None
         a = section.find('a', attrs={'name':True})
         if a is not None:
             ans = a['name']
-        return d[ans]
+        return d.get(ans, ans)  # unmapped anchor names are returned unchanged
 
 
     def find_articles(self, section):
@@ -64,14 +64,6 @@ class Newsweek(BasicNewsRecipe):
         soup = self.get_current_issue()
         if not soup:
             raise RuntimeError('Unable to connect to newsweek.com. Try again later.')
-        img = soup.find(alt='Cover')
-        if img is not None and img.has_key('src'):
-            small = img['src']
-            match = re.search(r'(\d+)_', small.rpartition('/')[-1])
-            if match is not None:
-                self.timefmt = strftime(' [%d %b, %Y]', time.strptime(match.group(1), '%y%m%d'))
-            self.cover_url = small.replace('coversmall', 'coverlarge')
-
         sections = soup.findAll('div', attrs={'class':'featurewell'})
         titles = map(self.find_title, sections)
         articles = map(self.find_articles, sections)
diff --git a/todo b/todo
index c98c27ebfd..750d969822 100644
--- a/todo
+++ b/todo
@@ -8,3 +8,5 @@
 * Welcome wizard
 
 * MOBI navigation indexing support
+
+* Move pdf metadata setting into separate process