HTML Input: Fix handling of --input-encoding option

2025-07-09 03:04:10 -04:00 · 2009-05-19 12:59:51 -07:00 · 2009-05-19 12:59:51 -07:00 · 686bf72fe1
commit 686bf72fe1
parent ecc05f20bc
6 changed files with 23 additions and 17 deletions
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -670,7 +670,8 @@ OptionRecommendation(name='list_recipes',
        self.ui_reporter(1.)
        self.log(self.output_fmt.upper(), 'output written to', self.output)

-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
+def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+        encoding='utf-8'):
    '''
    Create an OEBBook.
    '''
@@ -678,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
            opts.preprocess_html)
    oeb = OEBBook(log, html_preprocessor,
-            pretty_print=opts.pretty_print)
+            pretty_print=opts.pretty_print, encoding=encoding)
    # Read OEB Book into OEBBook
    log('Parsing all content...')
    if reader is None:
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
 from urllib import unquote

 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre import unicode_path
@@ -262,11 +262,19 @@ class HTMLInput(InputFormatPlugin):
        ),
    ])

+    def decode(self, raw):
+        if self.opts.input_encoding:
+            raw = raw.decode(self.opts.input_encoding, 'replace')
+        print 111111, type(raw)
+        return xml_to_unicode(raw, verbose=self.opts.verbose,
+                strip_encoding_pats=True, resolve_entities=True)[0]
+
    def convert(self, stream, opts, file_ext, log,
                accelerators):
        from calibre.ebooks.metadata.meta import get_metadata

        basedir = os.getcwd()
+        self.opts = opts

        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
@@ -284,11 +292,14 @@ class HTMLInput(InputFormatPlugin):
            mi.render(open('metadata.opf', 'wb'))
            opfpath = os.path.abspath('metadata.opf')

+        opf = OPF(opfpath, os.getcwdu())
+
        if opts.dont_package:
            return opfpath

        from calibre.ebooks.conversion.plumber import create_oebbook
-        oeb = create_oebbook(log, opfpath, opts, self)
+        oeb = create_oebbook(log, opfpath, opts, self,
+                encoding=opts.input_encoding)

        from calibre.ebooks.oeb.transforms.package import Package
        Package(os.getcwdu())(oeb, opts)
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1590,7 +1590,7 @@ class OEBBook(object):
                pass
        if self.encoding is not None:
            try:
-                return fix_data(data.decode(self.encoding))
+                return fix_data(data.decode(self.encoding, 'replace'))
            except UnicodeDecodeError:
                pass
        try:
--- a/src/calibre/gui2/add.py
+++ b/src/calibre/gui2/add.py
@@ -1,5 +1,5 @@
 '''
-UI for adding books to the database
+UI for adding books to the database and saving books to disk
 '''
 import os
 from Queue import Queue, Empty
--- a/src/calibre/web/feeds/recipes/recipe_newsweek.py
+++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py
@@ -2,7 +2,7 @@

 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import re, time
+import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

@@ -30,12 +30,12 @@ class Newsweek(BasicNewsRecipe):

    def find_title(self, section):
        d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features',
-        None:'Departments'}
+                None:'Departments', 'culture':'Culture'}
        ans = None
        a = section.find('a', attrs={'name':True})
        if a is not None:
            ans = a['name']
-        return d[ans]
+        return d.get(ans, ans)


    def find_articles(self, section):
@@ -64,14 +64,6 @@ class Newsweek(BasicNewsRecipe):
        soup = self.get_current_issue()
        if not soup:
            raise RuntimeError('Unable to connect to newsweek.com. Try again later.')
-        img = soup.find(alt='Cover')
-        if img is not None and img.has_key('src'):
-            small = img['src']
-            match = re.search(r'(\d+)_', small.rpartition('/')[-1])
-            if match is not None:
-                self.timefmt = strftime(' [%d %b, %Y]', time.strptime(match.group(1), '%y%m%d'))
-            self.cover_url = small.replace('coversmall', 'coverlarge')
-
        sections = soup.findAll('div', attrs={'class':'featurewell'})
        titles = map(self.find_title, sections)
        articles = map(self.find_articles, sections)
--- a/2
+++ b/2
@@ -8,3 +8,5 @@
 * Welcome wizard

 * MOBI navigation indexing support
+
+* Move pdf metadata setting into separate process