HTML Input: Fix handling of --input-encoding option

This commit is contained in:
Kovid Goyal 2009-05-19 12:59:51 -07:00
parent ecc05f20bc
commit 686bf72fe1
6 changed files with 23 additions and 17 deletions

View File

@ -670,7 +670,8 @@ OptionRecommendation(name='list_recipes',
self.ui_reporter(1.) self.ui_reporter(1.)
self.log(self.output_fmt.upper(), 'output written to', self.output) self.log(self.output_fmt.upper(), 'output written to', self.output)
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
encoding='utf-8'):
''' '''
Create an OEBBook. Create an OEBBook.
''' '''
@ -678,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html) opts.preprocess_html)
oeb = OEBBook(log, html_preprocessor, oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print) pretty_print=opts.pretty_print, encoding=encoding)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook
log('Parsing all content...') log('Parsing all content...')
if reader is None: if reader is None:

View File

@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse
from urllib import unquote from urllib import unquote
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation from calibre.customize.conversion import OptionRecommendation
from calibre import unicode_path from calibre import unicode_path
@ -262,11 +262,19 @@ class HTMLInput(InputFormatPlugin):
), ),
]) ])
def decode(self, raw):
    '''
    Convert the raw contents of an input file to text.

    If the user supplied an input encoding (``self.opts.input_encoding``),
    `raw` is first decoded with it, substituting the replacement character
    for undecodable bytes instead of raising. The result is then handed to
    xml_to_unicode() with encoding declarations stripped and entities
    resolved, and the first element of its return value is returned.

    Relies on ``self.opts`` having been assigned (convert() does this)
    before this method is called.

    :param raw: The raw data read from the input file. Presumably a
                bytestring -- confirm against the callers.
    :return: The first element of the value returned by xml_to_unicode().
    '''
    if self.opts.input_encoding:
        # 'replace' keeps a wrong user supplied encoding from aborting
        # the whole conversion on the first undecodable byte.
        raw = raw.decode(self.opts.input_encoding, 'replace')
    return xml_to_unicode(raw, verbose=self.opts.verbose,
            strip_encoding_pats=True, resolve_entities=True)[0]
def convert(self, stream, opts, file_ext, log, def convert(self, stream, opts, file_ext, log,
accelerators): accelerators):
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
basedir = os.getcwd() basedir = os.getcwd()
self.opts = opts
if hasattr(stream, 'name'): if hasattr(stream, 'name'):
basedir = os.path.dirname(stream.name) basedir = os.path.dirname(stream.name)
@ -284,11 +292,14 @@ class HTMLInput(InputFormatPlugin):
mi.render(open('metadata.opf', 'wb')) mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf') opfpath = os.path.abspath('metadata.opf')
opf = OPF(opfpath, os.getcwdu())
if opts.dont_package: if opts.dont_package:
return opfpath return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts, self) oeb = create_oebbook(log, opfpath, opts, self,
encoding=opts.input_encoding)
from calibre.ebooks.oeb.transforms.package import Package from calibre.ebooks.oeb.transforms.package import Package
Package(os.getcwdu())(oeb, opts) Package(os.getcwdu())(oeb, opts)

View File

@ -1590,7 +1590,7 @@ class OEBBook(object):
pass pass
if self.encoding is not None: if self.encoding is not None:
try: try:
return fix_data(data.decode(self.encoding)) return fix_data(data.decode(self.encoding, 'replace'))
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
try: try:

View File

@ -1,5 +1,5 @@
''' '''
UI for adding books to the database UI for adding books to the database and saving books to disk
''' '''
import os import os
from Queue import Queue, Empty from Queue import Queue, Empty

View File

@ -2,7 +2,7 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import re, time import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -30,12 +30,12 @@ class Newsweek(BasicNewsRecipe):
def find_title(self, section): def find_title(self, section):
d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features', d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features',
None:'Departments'} None:'Departments', 'culture':'Culture'}
ans = None ans = None
a = section.find('a', attrs={'name':True}) a = section.find('a', attrs={'name':True})
if a is not None: if a is not None:
ans = a['name'] ans = a['name']
return d[ans] return d.get(ans, ans)
def find_articles(self, section): def find_articles(self, section):
@ -64,14 +64,6 @@ class Newsweek(BasicNewsRecipe):
soup = self.get_current_issue() soup = self.get_current_issue()
if not soup: if not soup:
raise RuntimeError('Unable to connect to newsweek.com. Try again later.') raise RuntimeError('Unable to connect to newsweek.com. Try again later.')
img = soup.find(alt='Cover')
if img is not None and img.has_key('src'):
small = img['src']
match = re.search(r'(\d+)_', small.rpartition('/')[-1])
if match is not None:
self.timefmt = strftime(' [%d %b, %Y]', time.strptime(match.group(1), '%y%m%d'))
self.cover_url = small.replace('coversmall', 'coverlarge')
sections = soup.findAll('div', attrs={'class':'featurewell'}) sections = soup.findAll('div', attrs={'class':'featurewell'})
titles = map(self.find_title, sections) titles = map(self.find_title, sections)
articles = map(self.find_articles, sections) articles = map(self.find_articles, sections)

2
todo
View File

@ -8,3 +8,5 @@
* Welcome wizard * Welcome wizard
* MOBI navigation indexing support * MOBI navigation indexing support
* Move pdf metadata setting into separate process