pdf get_cover returns cover image instead of nothing.

This commit is contained in:
John Schember 2009-04-18 07:54:56 -04:00
commit b104286f61
24 changed files with 405 additions and 210 deletions

View File

@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.txt.output import TXTOutput
@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput] TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')] x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -41,6 +41,11 @@ class ConversionOption(object):
def __eq__(self, other): def __eq__(self, other):
return hash(self) == hash(other) return hash(self) == hash(other)
def clone(self):
    '''
    Return a new ConversionOption built from this option's settings.

    The name, help text, both switches and the choices object are passed
    to the constructor as-is.
    '''
    settings = dict(name=self.name, help=self.help,
                    long_switch=self.long_switch,
                    short_switch=self.short_switch,
                    choices=self.choices)
    return ConversionOption(**settings)
class OptionRecommendation(object): class OptionRecommendation(object):
LOW = 1 LOW = 1
MED = 2 MED = 2
@ -59,6 +64,10 @@ class OptionRecommendation(object):
self.validate_parameters() self.validate_parameters()
def clone(self):
    '''
    Return a new OptionRecommendation with the same recommended value and
    level, wrapping a clone of the underlying option.
    '''
    option_copy = self.option.clone()
    return OptionRecommendation(recommended_value=self.recommended_value,
                                level=self.level, option=option_copy)
def validate_parameters(self): def validate_parameters(self):
if self.option.choices and self.recommended_value not in \ if self.option.choices and self.recommended_value not in \
self.option.choices: self.option.choices:
@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin):
options.debug_input = os.path.abspath(options.debug_input) options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input): if not os.path.exists(options.debug_input):
os.makedirs(options.debug_input) os.makedirs(options.debug_input)
shutil.rmtree(options.debug_input) if isinstance(ret, basestring):
shutil.copytree(output_dir, options.debug_input) shutil.rmtree(options.debug_input)
shutil.copytree(output_dir, options.debug_input)
else:
from calibre.ebooks.oeb.writer import OEBWriter
w = OEBWriter(pretty_print=options.pretty_print)
w(ret, options.debug_input)
log.info('Input debug saved to:', options.debug_input) log.info('Input debug saved to:', options.debug_input)
return ret return ret

View File

@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log):
raise SystemExit(1) raise SystemExit(1)
output = args[2] output = args[2]
if output.startswith('.'): if output.startswith('.') and output != '.':
output = os.path.splitext(os.path.basename(input))[0]+output output = os.path.splitext(os.path.basename(input))[0]+output
output = os.path.abspath(output) output = os.path.abspath(output)
@ -171,7 +171,8 @@ def main(args=sys.argv):
plumber.run() plumber.run()
log(_('Output saved to'), ' ', plumber.output) if plumber.opts.debug_input is None:
log(_('Output saved to'), ' ', plumber.output)
return 0 return 0

View File

@ -32,8 +32,8 @@ class Plumber(object):
:param input: Path to input file. :param input: Path to input file.
:param output: Path to output file/directory :param output: Path to output file/directory
''' '''
self.input = input self.input = os.path.abspath(input)
self.output = output self.output = os.path.abspath(output)
self.log = log self.log = log
# Initialize the conversion options that are independent of input and # Initialize the conversion options that are independent of input and
@ -188,15 +188,15 @@ OptionRecommendation(name='language',
] ]
input_fmt = os.path.splitext(input)[1] input_fmt = os.path.splitext(self.input)[1]
if not input_fmt: if not input_fmt:
raise ValueError('Input file must have an extension') raise ValueError('Input file must have an extension')
input_fmt = input_fmt[1:].lower() input_fmt = input_fmt[1:].lower()
if os.path.exists(output) and os.path.isdir(output): if os.path.exists(self.output) and os.path.isdir(self.output):
output_fmt = 'oeb' output_fmt = 'oeb'
else: else:
output_fmt = os.path.splitext(output)[1] output_fmt = os.path.splitext(self.output)[1]
if not output_fmt: if not output_fmt:
output_fmt = '.oeb' output_fmt = '.oeb'
output_fmt = output_fmt[1:].lower() output_fmt = output_fmt[1:].lower()
@ -323,6 +323,9 @@ OptionRecommendation(name='language',
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts, self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
self.input_fmt, self.log, self.input_fmt, self.log,
accelerators, tdir) accelerators, tdir)
if self.opts.debug_input is not None:
self.log('Debug input called, aborting the rest of the pipeline.')
return
if not hasattr(self.oeb, 'manifest'): if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts) self.oeb = create_oebbook(self.log, self.oeb, self.opts)
@ -365,18 +368,20 @@ OptionRecommendation(name='language',
self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
self.opts, self.log) self.opts, self.log)
def create_oebbook(log, opfpath, opts): def create_oebbook(log, path_or_stream, opts, reader=None):
''' '''
Create an OEBBook from an OPF file. Create an OEBBook.
''' '''
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor() html_preprocessor = HTMLPreProcessor()
reader = OEBReader()
oeb = OEBBook(log, html_preprocessor=html_preprocessor, oeb = OEBBook(log, html_preprocessor=html_preprocessor,
pretty_print=opts.pretty_print) pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook
log.info('Parsing all content...') log('Parsing all content...')
reader(oeb, opfpath) if reader is None:
from calibre.ebooks.oeb.reader import OEBReader
reader = OEBReader
reader()(oeb, path_or_stream)
return oeb return oeb

View File

@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin):
) )
), ),
# dont_package: off by default. When set, convert() returns the path of the
# generated OPF instead of building an OEBBook from it.
OptionRecommendation(name='dont_package',
    recommended_value=False, level=OptionRecommendation.LOW,
    help=_('Normally this input plugin re-arranges all the input '
        'files into a standard folder hierarchy. Only use this option '
        'if you know what you are doing as it can result in various '
        'nasty side effects in the rest of the conversion pipeline.'
    )
),
]) ])
def convert(self, stream, opts, file_ext, log, def convert(self, stream, opts, file_ext, log,
@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin):
mi.render(open('metadata.opf', 'wb')) mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf') opfpath = os.path.abspath('metadata.opf')
if opts.dont_package:
return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts) oeb = create_oebbook(log, opfpath, opts)

View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
class LITInput(InputFormatPlugin):
    '''
    Input plugin that feeds LIT files into the conversion pipeline.
    '''

    name        = 'LIT Input'
    author      = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types  = set(['lit'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Build a book object from the LIT data in `stream`.

        :param stream: Source handed to `create_oebbook` as the input.
        :param options: Conversion options forwarded to `create_oebbook`.
        :param file_ext: Not used by this plugin.
        :param log: Logger forwarded to `create_oebbook`.
        :param accelerators: Not used by this plugin.
        :return: Whatever `create_oebbook` returns when given `LitReader`
                 as its reader.
        '''
        # Imported here rather than at module level, as in the original.
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        book = create_oebbook(log, stream, options, reader=LitReader)
        return book

View File

@ -7,13 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>' 'and Marshall T. Vandegrift <llasram@gmail.com>'
import sys, struct, os import struct, os
import functools import functools
import re import re
from urlparse import urldefrag from urlparse import urldefrag
from cStringIO import StringIO from cStringIO import StringIO
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit import LitError from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.mssha1 as mssha1

View File

@ -1,10 +1,10 @@
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files''' '''Read meta information from PDF files'''
import sys, os, cStringIO import sys, os, cStringIO
from threading import Thread
from calibre import FileWrapper from calibre import FileWrapper
from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ebooks.metadata import MetaInformation, authors_to_string
@ -13,7 +13,8 @@ from pyPdf import PdfFileReader, PdfFileWriter
import Image import Image
try: try:
from calibre.utils.PythonMagickWand import \ from calibre.utils.PythonMagickWand import \
NewMagickWand, MagickReadImage, MagickSetImageFormat, MagickWriteImage NewMagickWand, MagickReadImage, MagickSetImageFormat, \
MagickWriteImage, ImageMagick
_imagemagick_loaded = True _imagemagick_loaded = True
except: except:
_imagemagick_loaded = False _imagemagick_loaded = False
@ -51,9 +52,23 @@ def get_metadata(stream, extract_cover=True):
print >>sys.stderr, msg.encode('utf8') print >>sys.stderr, msg.encode('utf8')
return mi return mi
class MetadataWriter(Thread):
    '''
    Daemon thread that calls ``out_pdf.write(buf)`` in the background.
    '''

    def __init__(self, out_pdf, buf):
        '''
        :param out_pdf: Object whose ``write`` method is called by `run`.
        :param buf: Argument passed unchanged to ``out_pdf.write``.
        '''
        Thread.__init__(self)
        # Daemon threads do not keep the interpreter alive at exit.
        self.daemon = True
        self.out_pdf, self.buf = out_pdf, buf

    def run(self):
        '''Perform the write; a RuntimeError raised by it is ignored.'''
        try:
            self.out_pdf.write(self.buf)
        except RuntimeError:
            # NOTE(review): presumably raised by the writer when the caller
            # aborts it -- confirm against the PDF writer implementation.
            pass
def set_metadata(stream, mi): def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
# Use a cStringIO object for the pdf because we will want to over # Use a StringIO object for the pdf because we will want to over
# write it later and if we are working on the stream directly it # write it later and if we are working on the stream directly it
# could cause some issues. # could cause some issues.
raw = cStringIO.StringIO(stream.read()) raw = cStringIO.StringIO(stream.read())
@ -61,10 +76,18 @@ def set_metadata(stream, mi):
title = mi.title if mi.title else orig_pdf.documentInfo.title title = mi.title if mi.title else orig_pdf.documentInfo.title
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
out_pdf = PdfFileWriter(title=title, author=author) out_pdf = PdfFileWriter(title=title, author=author)
out_str = cStringIO.StringIO()
writer = MetadataWriter(out_pdf, out_str)
for page in orig_pdf.pages: for page in orig_pdf.pages:
out_pdf.addPage(page) out_pdf.addPage(page)
out_str = cStringIO.StringIO() writer.start()
out_pdf.write(out_str) writer.join(10) # Wait 10 secs for writing to complete
out_pdf.killed = True
writer.join()
if out_pdf.killed:
print 'Failed to set metadata: took too long'
return
stream.seek(0) stream.seek(0)
stream.truncate() stream.truncate()
out_str.seek(0) out_str.seek(0)
@ -72,35 +95,32 @@ def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
def get_cover(stream): def get_cover(stream):
stream.seek(0)
data = cStringIO.StringIO() data = cStringIO.StringIO()
try: try:
with FileWrapper(stream) as stream: pdf = PdfFileReader(stream)
pdf = PdfFileReader(stream) output = PdfFileWriter()
output = PdfFileWriter()
if len(pdf.pages) >= 1: if len(pdf.pages) >= 1:
output.addPage(pdf.getPage(0)) output.addPage(pdf.getPage(0))
with TemporaryDirectory('_pdfmeta') as tdir: with TemporaryDirectory('_pdfmeta') as tdir:
cover_path = os.path.join(tdir, 'cover.pdf') cover_path = os.path.join(tdir, 'cover.pdf')
outputStream = file(cover_path, "wb") with open(cover_path, "wb") as outputStream:
output.write(outputStream) output.write(outputStream)
outputStream.close()
with ImageMagick():
wand = NewMagickWand() wand = NewMagickWand()
MagickReadImage(wand, cover_path) MagickReadImage(wand, cover_path)
MagickSetImageFormat(wand, 'JPEG') MagickSetImageFormat(wand, 'JPEG')
MagickWriteImage(wand, '%s.jpg' % cover_path) MagickWriteImage(wand, '%s.jpg' % cover_path)
img = Image.open('%s.jpg' % cover_path) img = Image.open('%s.jpg' % cover_path)
img.save(data, 'JPEG') img.save(data, 'JPEG')
except: except:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return data.getvalue() return data.getvalue()

View File

@ -272,11 +272,7 @@ def XPath(expr):
def xpath(elem, expr): def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP) return elem.xpath(expr, namespaces=XPNSMAP)
def _prepare_xml_for_serialization(root):
pass
def xml2str(root, pretty_print=False, strip_comments=False): def xml2str(root, pretty_print=False, strip_comments=False):
_prepare_xml_for_serialization(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print) pretty_print=pretty_print)
@ -287,7 +283,6 @@ def xml2str(root, pretty_print=False, strip_comments=False):
def xml2unicode(root, pretty_print=False): def xml2unicode(root, pretty_print=False):
_prepare_xml_for_serialization(root)
return etree.tostring(root, pretty_print=pretty_print) return etree.tostring(root, pretty_print=pretty_print)
ASCII_CHARS = set(chr(x) for x in xrange(128)) ASCII_CHARS = set(chr(x) for x in xrange(128))
@ -321,6 +316,25 @@ def urlnormalize(href):
parts = (urlquote(part) for part in parts) parts = (urlquote(part) for part in parts)
return urlunparse(parts) return urlunparse(parts)
class DummyHandler(logging.Handler):
    '''
    Logging handler that relays formatted records to the object stored in
    ``self.log``. That object must provide ``error()`` and ``warn()``.
    Records are silently dropped while ``self.log`` is None.
    '''

    def __init__(self):
        # Handler threshold is WARNING; only the bare message is formatted.
        logging.Handler.__init__(self, logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        self.log = None

    def emit(self, record):
        '''Forward `record` to ``self.log.error`` or ``self.log.warn``.'''
        if self.log is None:
            return
        text = self.format(record)
        if record.levelno >= logging.ERROR:
            forward = self.log.error
        else:
            forward = self.log.warn
        forward(text)
_css_logger = logging.getLogger('calibre.css')
_css_logger.setLevel(logging.WARNING)
_css_log_handler = DummyHandler()
_css_logger.addHandler(_css_log_handler)
class OEBError(Exception): class OEBError(Exception):
"""Generic OEB-processing error.""" """Generic OEB-processing error."""
@ -778,7 +792,8 @@ class Manifest(object):
data = self.oeb.css_preprocessor(data) data = self.oeb.css_preprocessor(data)
data = XHTML_CSS_NAMESPACE + data data = XHTML_CSS_NAMESPACE + data
parser = CSSParser(loglevel=logging.WARNING, parser = CSSParser(loglevel=logging.WARNING,
fetcher=self._fetch_css) fetcher=self._fetch_css,
log=_css_logger)
data = parser.parseString(data, href=self.href) data = parser.parseString(data, href=self.href)
data.namespaces['h'] = XHTML_NS data.namespaces['h'] = XHTML_NS
return data return data
@ -1435,7 +1450,7 @@ class OEBBook(object):
:attr:`pages`: List of "pages," such as indexed to a print edition of :attr:`pages`: List of "pages," such as indexed to a print edition of
the same text. the same text.
""" """
_css_log_handler.log = logger
self.encoding = encoding self.encoding = encoding
self.html_preprocessor = html_preprocessor self.html_preprocessor = html_preprocessor
self.css_preprocessor = css_preprocessor self.css_preprocessor = css_preprocessor
@ -1450,6 +1465,7 @@ class OEBBook(object):
self.guide = Guide(self) self.guide = Guide(self)
self.toc = TOC() self.toc = TOC()
self.pages = PageList() self.pages = PageList()
self.auto_generated_toc = True
@classmethod @classmethod
def generate(cls, opts): def generate(cls, opts):

View File

@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase
from calibre.customize.ui import available_input_formats from calibre.customize.ui import available_input_formats
from calibre.ebooks.epub.from_html import TITLEPAGE from calibre.ebooks.epub.from_html import TITLEPAGE
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.zipfile import safe_replace, ZipFile
from calibre.utils.config import DynamicConfig from calibre.utils.config import DynamicConfig
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre import CurrentDir
def character_count(html): def character_count(html):
''' '''
@ -57,31 +56,21 @@ class FakeOpts(object):
max_levels = 5 max_levels = 5
input_encoding = None input_encoding = None
def html2opf(path, tdir, log):
from calibre.ebooks.html.input import get_filelist
from calibre.ebooks.metadata.meta import get_metadata
with CurrentDir(tdir):
fl = get_filelist(path, tdir, FakeOpts(), log)
mi = get_metadata(open(path, 'rb'), 'html')
mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in fl]
mi.create_manifest(entries)
mi.create_spine([f.path for f in fl])
mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf')
return opfpath
def opf2opf(path, tdir, opts):
return path
def is_supported(path): def is_supported(path):
ext = os.path.splitext(path)[1].replace('.', '').lower() ext = os.path.splitext(path)[1].replace('.', '').lower()
ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
return ext in available_input_formats() return ext in available_input_formats()
def write_oebbook(oeb, path):
    '''
    Write `oeb` to `path` with a default OEBWriter, then return the first
    entry yielded by ``walk(path)`` whose name ends in ``.opf``.

    Returns None when no such entry is found.
    '''
    from calibre.ebooks.oeb.writer import OEBWriter
    from calibre import walk
    writer = OEBWriter()
    writer(oeb, path)
    for candidate in walk(path):
        if not candidate.endswith('.opf'):
            continue
        return candidate
    return None
class EbookIterator(object): class EbookIterator(object):
CHARACTERS_PER_PAGE = 1000 CHARACTERS_PER_PAGE = 1000
@ -131,17 +120,16 @@ class EbookIterator(object):
def __enter__(self): def __enter__(self):
self._tdir = TemporaryDirectory('_ebook_iter') self._tdir = TemporaryDirectory('_ebook_iter')
self.base = self._tdir.__enter__() self.base = self._tdir.__enter__()
if self.ebook_ext == 'opf': from calibre.ebooks.conversion.plumber import Plumber
self.pathtoopf = self.pathtoebook plumber = Plumber(self.pathtoebook, self.base, self.log)
elif self.ebook_ext == 'html': plumber.setup_options()
self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log) if hasattr(plumber.opts, 'dont_package'):
else: plumber.opts.dont_package = True
from calibre.ebooks.conversion.plumber import Plumber self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
plumber = Plumber(self.pathtoebook, self.base, self.log) plumber.opts, plumber.input_fmt, self.log,
plumber.setup_options() {}, self.base)
self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), if hasattr(self.pathtoopf, 'manifest'):
plumber.opts, plumber.input_fmt, self.log, self.pathtoopf = write_oebbook(self.pathtoebook, self._tdir)
{}, self.base)
self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf)) self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))

View File

@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin):
author = 'Kovid Goyal' author = 'Kovid Goyal'
file_type = 'oeb' file_type = 'oeb'
def convert(self, oeb_book, output_path, input_plugin, opts, log): def convert(self, oeb_book, output_path, input_plugin, opts, log):
self.log, self.opts = log, opts self.log, self.opts = log, opts
if not os.path.exists(output_path): if not os.path.exists(output_path):

View File

@ -349,6 +349,7 @@ class OEBReader(object):
def _toc_from_ncx(self, item): def _toc_from_ncx(self, item):
if item is None: if item is None:
return False return False
self.log.debug('Reading TOC from NCX...')
ncx = item.data ncx = item.data
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
title = COLLAPSE_RE.sub(' ', title.strip()) title = COLLAPSE_RE.sub(' ', title.strip())
@ -364,6 +365,7 @@ class OEBReader(object):
result = xpath(opf, 'o2:tours/o2:tour') result = xpath(opf, 'o2:tours/o2:tour')
if not result: if not result:
return False return False
self.log.debug('Reading TOC from tour...')
tour = result[0] tour = result[0]
toc = self.oeb.toc toc = self.oeb.toc
toc.title = tour.get('title') toc.title = tour.get('title')
@ -384,6 +386,7 @@ class OEBReader(object):
def _toc_from_html(self, opf): def _toc_from_html(self, opf):
if 'toc' not in self.oeb.guide: if 'toc' not in self.oeb.guide:
return False return False
self.log.debug('Reading TOC from HTML...')
itempath, frag = urldefrag(self.oeb.guide['toc'].href) itempath, frag = urldefrag(self.oeb.guide['toc'].href)
item = self.oeb.manifest.hrefs[itempath] item = self.oeb.manifest.hrefs[itempath]
html = item.data html = item.data
@ -414,6 +417,7 @@ class OEBReader(object):
return True return True
def _toc_from_spine(self, opf): def _toc_from_spine(self, opf):
self.log.warn('Generating default TOC from spine...')
toc = self.oeb.toc toc = self.oeb.toc
titles = [] titles = []
headers = [] headers = []
@ -441,11 +445,14 @@ class OEBReader(object):
return True return True
def _toc_from_opf(self, opf, item): def _toc_from_opf(self, opf, item):
self.oeb.auto_generated_toc = False
if self._toc_from_ncx(item): return if self._toc_from_ncx(item): return
if self._toc_from_tour(opf): return # Prefer HTML to tour based TOC, since several LIT files
self.logger.warn('No metadata table of contents found') # have good HTML TOCs but bad tour based TOCs
if self._toc_from_html(opf): return if self._toc_from_html(opf): return
if self._toc_from_tour(opf): return
self._toc_from_spine(opf) self._toc_from_spine(opf)
self.oeb.auto_generated_toc = True
def _pages_from_ncx(self, opf, item): def _pages_from_ncx(self, opf, item):
if item is None: if item is None:

View File

@ -51,8 +51,8 @@ class Split(object):
self.log = oeb.log self.log = oeb.log
self.map = {} self.map = {}
self.page_break_selectors = None self.page_break_selectors = None
for item in self.oeb.manifest.items: for item in list(self.oeb.manifest.items):
if etree.iselement(item.data): if item.spine_position is not None and etree.iselement(item.data):
self.split_item(item) self.split_item(item)
self.fix_links() self.fix_links()
@ -74,31 +74,34 @@ class Split(object):
self.page_break_selectors = set([]) self.page_break_selectors = set([])
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES] OEB_STYLES]
page_break_selectors = set([]) for rule in rules(stylesheets):
for rule in rules(stylesheets): before = getattr(rule.style.getPropertyCSSValue(
before = getattr(rule.style.getPropertyCSSValue( 'page-break-before'), 'cssText', '').strip().lower()
'page-break-before'), 'cssText', '').strip().lower() after = getattr(rule.style.getPropertyCSSValue(
after = getattr(rule.style.getPropertyCSSValue( 'page-break-after'), 'cssText', '').strip().lower()
'page-break-after'), 'cssText', '').strip().lower() try:
try: if before and before != 'avoid':
if before and before != 'avoid': self.page_break_selectors.add((CSSSelector(rule.selectorText),
page_break_selectors.add((CSSSelector(rule.selectorText), True))
True)) except:
except: pass
pass try:
try: if after and after != 'avoid':
if after and after != 'avoid': self.page_break_selectors.add((CSSSelector(rule.selectorText),
page_break_selectors.add((CSSSelector(rule.selectorText), False))
False)) except:
except: pass
pass
page_breaks = set([]) page_breaks = set([])
for selector, before in page_break_selectors: for selector, before in self.page_break_selectors:
for elem in selector(item.data): body = item.data.xpath('//h:body', namespaces=NAMESPACES)
if before: if not body:
elem.set('pb_before', '1') continue
page_breaks.add(elem) for elem in selector(body[0]):
if elem not in body:
if before:
elem.set('pb_before', '1')
page_breaks.add(elem)
for i, elem in enumerate(item.data.iter()): for i, elem in enumerate(item.data.iter()):
elem.set('pb_order', str(i)) elem.set('pb_order', str(i))
@ -136,8 +139,10 @@ class Split(object):
if href in self.map: if href in self.map:
anchor_map = self.map[href] anchor_map = self.map[href]
nhref = anchor_map[frag if frag else None] nhref = anchor_map[frag if frag else None]
nhref = self.current_item.relhref(nhref)
if frag: if frag:
nhref = '#'.join(href, frag) nhref = '#'.join((nhref, frag))
return nhref return nhref
return url return url
@ -153,7 +158,7 @@ class FlowSplitter(object):
self.page_breaks = page_breaks self.page_breaks = page_breaks
self.page_break_ids = page_break_ids self.page_break_ids = page_break_ids
self.max_flow_size = max_flow_size self.max_flow_size = max_flow_size
self.base = item.abshref(item.href) self.base = item.href
base, ext = os.path.splitext(self.base) base, ext = os.path.splitext(self.base)
self.base = base.replace('%', '%%')+'_split_%d'+ext self.base = base.replace('%', '%%')+'_split_%d'+ext
@ -192,9 +197,9 @@ class FlowSplitter(object):
self.trees = [] self.trees = []
tree = orig_tree tree = orig_tree
for pattern, before in ordered_ids: for pattern, before in ordered_ids:
self.log.debug('\t\tSplitting on page-break')
elem = pattern(tree) elem = pattern(tree)
if elem: if elem:
self.log.debug('\t\tSplitting on page-break')
before, after = self.do_split(tree, elem[0], before) before, after = self.do_split(tree, elem[0], before)
self.trees.append(before) self.trees.append(before)
tree = after tree = after
@ -414,13 +419,14 @@ class FlowSplitter(object):
elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_ATTR, None)
elem.attrib.pop(SPLIT_POINT_ATTR, '0') elem.attrib.pop(SPLIT_POINT_ATTR, '0')
spine_pos = self.item.spine_pos spine_pos = self.item.spine_position
for current, tree in zip(map(reversed, (self.files, self.trees))): for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
href = a.get('href').strip() href = a.get('href').strip()
if href.startswith('#'): if href.startswith('#'):
anchor = href[1:] anchor = href[1:]
file = self.anchor_map[anchor] file = self.anchor_map[anchor]
file = self.item.relhref(file)
if file != current: if file != current:
a.set('href', file+href) a.set('href', file+href)
@ -430,12 +436,12 @@ class FlowSplitter(object):
self.oeb.spine.insert(spine_pos, new_item, self.item.linear) self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
if self.oeb.guide: if self.oeb.guide:
for ref in self.oeb.guide: for ref in self.oeb.guide.values():
href, frag = urldefrag(ref.href) href, frag = urldefrag(ref.href)
if href == self.item.href: if href == self.item.href:
nhref = self.anchor_map[frag if frag else None] nhref = self.anchor_map[frag if frag else None]
if frag: if frag:
nhref = '#'.join(nhref, frag) nhref = '#'.join((nhref, frag))
ref.href = nhref ref.href = nhref
def fix_toc_entry(toc): def fix_toc_entry(toc):
@ -444,7 +450,7 @@ class FlowSplitter(object):
if href == self.item.href: if href == self.item.href:
nhref = self.anchor_map[frag if frag else None] nhref = self.anchor_map[frag if frag else None]
if frag: if frag:
nhref = '#'.join(nhref, frag) nhref = '#'.join((nhref, frag))
toc.href = nhref toc.href = nhref
for x in toc: for x in toc:
fix_toc_entry(x) fix_toc_entry(x)

View File

@ -49,7 +49,7 @@ class OEBWriter(object):
def __call__(self, oeb, path): def __call__(self, oeb, path):
""" """
Read the book in the :class:`OEBBook` object :param:`oeb` to a file Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
at :param:`path`. at :param:`path`.
""" """
version = int(self.version[0]) version = int(self.version[0])

View File

@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.cover_changed = True self.cover_changed = True
def initialize_series(self): def initialize_series(self):
self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
all_series = self.db.all_series() all_series = self.db.all_series()
all_series.sort(cmp=lambda x, y : cmp(x[1], y[1])) all_series.sort(cmp=lambda x, y : cmp(x[1], y[1]))
series_id = self.db.series_id(self.row) series_id = self.db.series_id(self.row)
@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.series.setCurrentIndex(idx) self.series.setCurrentIndex(idx)
self.enable_series_index() self.enable_series_index()
pl = self.series.parentWidget().layout()
for i in range(pl.count()):
l = pl.itemAt(i).layout()
if l:
l.invalidate()
l.activate()
def initialize_series_and_publisher(self): def initialize_series_and_publisher(self):
self.initialize_series() self.initialize_series()
all_publishers = self.db.all_publishers() all_publishers = self.db.all_publishers()

Binary file not shown.

After

Width:  |  Height:  |  Size: 509 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 637 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 746 B

View File

@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in (
'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet', 'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en', 'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna', 'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
'seattle_times',
)] )]
import re, imp, inspect, time, os import re, imp, inspect, time, os

View File

@ -1,14 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper ''' ''' http://www.derstandard.at - Austrian Newspaper '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe): class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard' title = u'derStandard'
__author__ = 'Gerhard Aigner' __author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'utf-8'
language = _('German')
recursions = 0
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'), feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'), (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
@ -20,14 +43,10 @@ class DerStandardRecipe(BasicNewsRecipe):
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'), (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'), (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')] (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
encoding = 'utf-8'
language = _('German')
recursions = 0
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'), remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')] dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '') (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
] ]
@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe):
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0): if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None return None
return article.link return article.link
def preprocess_html(self, soup):
    """Stamp the parsed page with the recipe's language and encoding.

    Sets both the ``xml:lang`` and ``lang`` attributes of ``soup.html`` to
    ``self.lang``, inserts a ``Content-Type`` meta string naming
    ``self.encoding`` at index 0 of ``soup.head``, and returns the same
    ``soup`` object.
    """
    for attribute in ('xml:lang', 'lang'):
        soup.html[attribute] = self.lang
    content_type_meta = ''.join((
        '<meta http-equiv="Content-Type" content="text/html; charset=',
        self.encoding,
        '">',
    ))
    soup.head.insert(0, content_type_meta)
    return soup

View File

@ -1,18 +1,42 @@
import re # -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.diepresse.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DiePresseRecipe(BasicNewsRecipe): class DiePresseRecipe(BasicNewsRecipe):
title = u'diePresse' title = u'diePresse'
__author__ = 'Gerhard Aigner'
description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.'
publisher ='DiePresse.com'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'ISO-8859-1'
language = _('German')
recursions = 0
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
recursions = 0
language = _('German') html2lrf_options = [
__author__ = 'Gerhard Aigner' '--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'Textversion', re.DOTALL), lambda match: ''), (re.compile(r'Textversion', re.DOTALL), lambda match: ''),
] ]
remove_tags = [dict(name='hr'), remove_tags = [dict(name='hr'),
dict(name='br'), dict(name='br'),
dict(name='small'), dict(name='small'),
@ -21,6 +45,7 @@ class DiePresseRecipe(BasicNewsRecipe):
dict(name='h1', attrs={'class':'titel'}), dict(name='h1', attrs={'class':'titel'}),
dict(name='a', attrs={'class':'print'}), dict(name='a', attrs={'class':'print'}),
dict(name='div', attrs={'class':'hline'})] dict(name='div', attrs={'class':'hline'})]
feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'), feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'), (u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
(u'Europa', u'http://diepresse.com/rss/EU'), (u'Europa', u'http://diepresse.com/rss/EU'),
@ -29,7 +54,7 @@ class DiePresseRecipe(BasicNewsRecipe):
(u'Kultur', u'http://diepresse.com/rss/Kultur'), (u'Kultur', u'http://diepresse.com/rss/Kultur'),
(u'Leben', u'http://diepresse.com/rss/Leben'), (u'Leben', u'http://diepresse.com/rss/Leben'),
(u'Tech', u'http://diepresse.com/rss/Tech'), (u'Tech', u'http://diepresse.com/rss/Tech'),
(u'Science', u'http://diepresse.com/rss/Science'), (u'Wissenschaft', u'http://diepresse.com/rss/Science'),
(u'Bildung', u'http://diepresse.com/rss/Bildung'), (u'Bildung', u'http://diepresse.com/rss/Bildung'),
(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'), (u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
(u'Recht', u'http://diepresse.com/rss/Recht'), (u'Recht', u'http://diepresse.com/rss/Recht'),
@ -38,3 +63,10 @@ class DiePresseRecipe(BasicNewsRecipe):
def print_version(self, url):
    """Return the ``text/home`` variant of an article URL.

    Only the first ``home`` in ``url`` is rewritten to ``text/home``; any
    later occurrence (for example inside an article slug) is left intact,
    so the rest of the address is never altered. A URL that contains no
    ``home`` at all is returned unchanged.
    """
    # count=1: an unbounded replace would also rewrite "home" inside slugs.
    return url.replace('home', 'text/home', 1)
def preprocess_html(self, soup):
    """Stamp the parsed page with the recipe's language and a charset tag.

    Sets both the ``xml:lang`` and ``lang`` attributes of ``soup.html`` to
    ``self.lang``, inserts a ``Content-Type`` meta string at index 0 of
    ``soup.head``, and returns the same ``soup`` object.
    """
    for attribute in ('xml:lang', 'lang'):
        soup.html[attribute] = self.lang
    # The charset here is the literal utf-8, not self.encoding.
    # NOTE(review): presumably the page has been re-encoded by this point -- confirm.
    content_type_meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    soup.head.insert(0, content_type_meta)
    return soup

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
seattletimes.nwsource.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class SeattleTimes(BasicNewsRecipe):
    """News recipe for The Seattle Times (seattletimes.nwsource.com).

    Reads one RSS feed, maps each article link to the site's
    ``PrintStory.pl`` page and removes inline ``style`` attributes from the
    fetched HTML.
    """

    title = 'The Seattle Times'
    __author__ = 'Darko Miletic'
    description = 'News from Seattle and USA'
    publisher = 'The Seattle Times'
    category = 'news, politics, USA'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    language = _('English')

    # Book metadata built from the fields above.
    # NOTE(review): presumably consumed by the html2lrf / html2epub
    # converters -- confirm against BasicNewsRecipe.
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]
    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]

    remove_tags = [
        dict(name=['object', 'link', 'script']),
        dict(name='p', attrs={'class': 'permission'}),
    ]

    def print_version(self, url):
        """Return the printer-friendly URL for an article.

        The article id is the last path segment of ``url`` up to (not
        including) its final ``_``. When ``url`` carries no such id, the
        original ``url`` is returned instead of a ``PrintStory.pl`` address
        with an empty ``document_id``.
        """
        before_underscore = url.rpartition('_')[0]
        article_id = before_underscore.rpartition('/')[2]
        if not article_id:
            # No "<id>_" component: a print URL would point at nothing.
            return url
        return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id

    def preprocess_html(self, soup):
        """Declare the content language and drop inline styles.

        Inserts a ``Content-Language`` meta string at index 0 of
        ``soup.head``, deletes the ``style`` attribute from every element
        that has one, and returns the same ``soup`` object.
        """
        mtag = '<meta http-equiv="Content-Language" content="en-US"/>'
        soup.head.insert(0, mtag)
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

View File

@ -299,7 +299,7 @@ def readStringFromStream(stream):
elif tok == "t": elif tok == "t":
tok = "\t" tok = "\t"
elif tok == "b": elif tok == "b":
tok == "\b" tok = "\b"
elif tok == "f": elif tok == "f":
tok = "\f" tok = "\f"
elif tok == "(": elif tok == "(":

View File

@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net" __author_email__ = "biziqe@mathieu.fenniak.net"
import struct import struct
try: from cStringIO import StringIO
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import filters from generic import DictionaryObject, NameObject, NumberObject, \
import utils createStringObject, ArrayObject, ByteStringObject, StreamObject, \
import warnings IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
from generic import * RectangleObject, DecodedStreamObject
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
# class (typically {@link #PdfFileReader PdfFileReader}). # class (typically {@link #PdfFileReader PdfFileReader}).
class PdfFileWriter(object): class PdfFileWriter(object):
def __init__(self,title=u"Unknown",author=u"Unknown"): def __init__(self,title=u"Unknown",author=u"Unknown"):
self.killed = False
self._header = "%PDF-1.3" self._header = "%PDF-1.3"
self._objects = [] # array of indirect objects self._objects = [] # array of indirect objects
@ -162,7 +160,7 @@ class PdfFileWriter(object):
# @param stream An object to write the file to. The object must support # @param stream An object to write the file to. The object must support
# the write method, and the tell method, similar to a file object. # the write method, and the tell method, similar to a file object.
def write(self, stream): def write(self, stream):
import struct, md5 import md5
externalReferenceMap = {} externalReferenceMap = {}
self.stack = [] self.stack = []
@ -214,6 +212,8 @@ class PdfFileWriter(object):
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)) stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
def _sweepIndirectReferences(self, externMap, data): def _sweepIndirectReferences(self, externMap, data):
if self.killed:
raise RuntimeError('Writer killed')
if isinstance(data, DictionaryObject): if isinstance(data, DictionaryObject):
for key, value in data.items(): for key, value in data.items():
origvalue = value origvalue = value