From 686bf72fe1fe67b148db98418392884230ae91d0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 May 2009 12:59:51 -0700 Subject: [PATCH] HTML Input: Fix handling of --input-encoding option --- src/calibre/ebooks/conversion/plumber.py | 5 +++-- src/calibre/ebooks/html/input.py | 15 +++++++++++++-- src/calibre/ebooks/oeb/base.py | 2 +- src/calibre/gui2/add.py | 2 +- src/calibre/web/feeds/recipes/recipe_newsweek.py | 14 +++----------- todo | 2 ++ 6 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 7387cf158e..eb61e6d988 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -670,7 +670,8 @@ OptionRecommendation(name='list_recipes', self.ui_reporter(1.) self.log(self.output_fmt.upper(), 'output written to', self.output) -def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): +def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, + encoding='utf-8'): ''' Create an OEBBook. ''' @@ -678,7 +679,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None): html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, opts.preprocess_html) oeb = OEBBook(log, html_preprocessor, - pretty_print=opts.pretty_print) + pretty_print=opts.pretty_print, encoding=encoding) # Read OEB Book into OEBBook log('Parsing all content...') if reader is None: diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 255d975b1e..82c4f795d4 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -16,7 +16,7 @@ from urlparse import urlparse, urlunparse from urllib import unquote from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.metadata.opf2 import OPFCreator, OPF from calibre.ebooks.chardet import xml_to_unicode from calibre.customize.conversion import OptionRecommendation from calibre import unicode_path @@ -262,11 +262,19 @@ class HTMLInput(InputFormatPlugin): ), ]) + def decode(self, raw): + if self.opts.input_encoding: + raw = raw.decode(self.opts.input_encoding, 'replace') + print 111111, type(raw) + return xml_to_unicode(raw, verbose=self.opts.verbose, + strip_encoding_pats=True, resolve_entities=True)[0] + def convert(self, stream, opts, file_ext, log, accelerators): from calibre.ebooks.metadata.meta import get_metadata basedir = os.getcwd() + self.opts = opts if hasattr(stream, 'name'): basedir = os.path.dirname(stream.name) @@ -284,11 +292,14 @@ class HTMLInput(InputFormatPlugin): mi.render(open('metadata.opf', 'wb')) opfpath = os.path.abspath('metadata.opf') + opf = OPF(opfpath, os.getcwdu()) + if opts.dont_package: return opfpath from calibre.ebooks.conversion.plumber import create_oebbook - oeb = create_oebbook(log, opfpath, opts, self) + oeb = create_oebbook(log, opfpath, opts, self, + encoding=opts.input_encoding) from calibre.ebooks.oeb.transforms.package import Package Package(os.getcwdu())(oeb, opts) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index bdf78f96e4..55cc2f926b 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1590,7 +1590,7 @@ class OEBBook(object): pass if self.encoding is not None: try: - return fix_data(data.decode(self.encoding)) + return fix_data(data.decode(self.encoding, 'replace')) except UnicodeDecodeError: pass try: diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py index 75c5f721d7..f5c8be4338 100644 --- a/src/calibre/gui2/add.py +++ b/src/calibre/gui2/add.py @@ -1,5 +1,5 @@ ''' -UI for adding books to the database +UI for adding books to the database and saving books to disk ''' import os from Queue import Queue, Empty diff --git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py index 863bbb10a4..7d1fc403d4 100644 --- a/src/calibre/web/feeds/recipes/recipe_newsweek.py +++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import re, time +import re from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe @@ -30,12 +30,12 @@ class Newsweek(BasicNewsRecipe): def find_title(self, section): d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features', - None:'Departments'} + None:'Departments', 'culture':'Culture'} ans = None a = section.find('a', attrs={'name':True}) if a is not None: ans = a['name'] - return d[ans] + return d.get(ans, ans) def find_articles(self, section): @@ -64,14 +64,6 @@ class Newsweek(BasicNewsRecipe): soup = self.get_current_issue() if not soup: raise RuntimeError('Unable to connect to newsweek.com. Try again later.') - img = soup.find(alt='Cover') - if img is not None and img.has_key('src'): - small = img['src'] - match = re.search(r'(\d+)_', small.rpartition('/')[-1]) - if match is not None: - self.timefmt = strftime(' [%d %b, %Y]', time.strptime(match.group(1), '%y%m%d')) - self.cover_url = small.replace('coversmall', 'coverlarge') - sections = soup.findAll('div', attrs={'class':'featurewell'}) titles = map(self.find_title, sections) articles = map(self.find_articles, sections) diff --git a/todo b/todo index c98c27ebfd..750d969822 100644 --- a/todo +++ b/todo @@ -8,3 +8,5 @@ * Welcome wizard * MOBI navigation indexing support + +* Move pdf metadata setting into separate process