HTML Input: Fix handling of --input-encoding option

This commit is contained in:
Kovid Goyal 2009-05-19 12:59:51 -07:00
parent ecc05f20bc
commit 686bf72fe1
6 changed files with 23 additions and 17 deletions

View File

@ -670,7 +670,8 @@ OptionRecommendation(name='list_recipes',
self.ui_reporter(1.)
self.log(self.output_fmt.upper(), 'output written to', self.output)
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
encoding='utf-8'):
'''
Create an OEBBook.
'''
@ -678,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html)
oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print)
pretty_print=opts.pretty_print, encoding=encoding)
# Read OEB Book into OEBBook
log('Parsing all content...')
if reader is None:

View File

@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
from urllib import unquote
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path
@ -262,11 +262,19 @@ class HTMLInput(InputFormatPlugin):
),
])
def decode(self, raw):
    '''
    Decode raw HTML data, honouring the user-specified input encoding.

    If ``self.opts.input_encoding`` is set, ``raw`` is first decoded with
    that encoding; undecodable bytes are replaced instead of raising.
    The result is then handed to ``xml_to_unicode`` (called with
    ``strip_encoding_pats=True`` and ``resolve_entities=True``) and the
    first element of its return value is returned.

    ``self.opts`` must have been assigned before this is called.
    '''
    if self.opts.input_encoding:
        # 'replace' keeps the conversion going when the declared encoding
        # does not match every byte of the data.
        raw = raw.decode(self.opts.input_encoding, 'replace')
    # NOTE: a leftover debugging statement that printed type(raw) to stdout
    # on every call has been removed from this spot.
    return xml_to_unicode(raw, verbose=self.opts.verbose,
            strip_encoding_pats=True, resolve_entities=True)[0]
def convert(self, stream, opts, file_ext, log,
accelerators):
from calibre.ebooks.metadata.meta import get_metadata
basedir = os.getcwd()
self.opts = opts
if hasattr(stream, 'name'):
basedir = os.path.dirname(stream.name)
@ -284,11 +292,14 @@ class HTMLInput(InputFormatPlugin):
mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf')
opf = OPF(opfpath, os.getcwdu())
if opts.dont_package:
return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts, self)
oeb = create_oebbook(log, opfpath, opts, self,
encoding=opts.input_encoding)
from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts)

View File

@ -1590,7 +1590,7 @@ class OEBBook(object):
pass
if self.encoding is not None:
try:
return fix_data(data.decode(self.encoding))
return fix_data(data.decode(self.encoding, 'replace'))
except UnicodeDecodeError:
pass
try:

View File

@ -1,5 +1,5 @@
'''
UI for adding books to the database
UI for adding books to the database and saving books to disk
'''
import os
from Queue import Queue, Empty

View File

@ -2,7 +2,7 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import re, time
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
@ -30,12 +30,12 @@ class Newsweek(BasicNewsRecipe):
def find_title(self, section):
    '''
    Return the display title for a section element.

    The section's name is taken from the ``name`` attribute of the first
    ``<a>`` element inside ``section`` that has one. Known names are
    mapped to their display titles; a section with no such anchor maps
    to 'Departments'. A name that is not in the table is returned
    unchanged rather than raising ``KeyError``.

    :param section: object exposing ``find(tag, attrs=...)`` whose result
                    supports item access by attribute name (or is None).
    :return: the display title, or the raw anchor name if unrecognised.
    '''
    d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features',
            None:'Departments', 'culture':'Culture'}
    ans = None
    a = section.find('a', attrs={'name':True})
    if a is not None:
        ans = a['name']
    # Fall back to the raw name so a new, unlisted section does not abort
    # the whole lookup.
    return d.get(ans, ans)
def find_articles(self, section):
@ -64,14 +64,6 @@ class Newsweek(BasicNewsRecipe):
soup = self.get_current_issue()
if not soup:
raise RuntimeError('Unable to connect to newsweek.com. Try again later.')
img = soup.find(alt='Cover')
if img is not None and img.has_key('src'):
small = img['src']
match = re.search(r'(\d+)_', small.rpartition('/')[-1])
if match is not None:
self.timefmt = strftime(' [%d %b, %Y]', time.strptime(match.group(1), '%y%m%d'))
self.cover_url = small.replace('coversmall', 'coverlarge')
sections = soup.findAll('div', attrs={'class':'featurewell'})
titles = map(self.find_title, sections)
articles = map(self.find_articles, sections)

2
todo
View File

@ -8,3 +8,5 @@
* Welcome wizard
* MOBI navigation indexing support
* Move pdf metadata setting into separate process