pdf get_cover returns cover image instead of nothing.

This commit is contained in:
John Schember 2009-04-18 07:54:56 -04:00
commit b104286f61
24 changed files with 405 additions and 210 deletions

View File

@ -263,14 +263,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
def set_metadata(self, stream, mi, type): def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.mobi import set_metadata from calibre.ebooks.metadata.mobi import set_metadata
set_metadata(stream, mi) set_metadata(stream, mi)
class PDFMetadataWriter(MetadataWriterPlugin): class PDFMetadataWriter(MetadataWriterPlugin):
name = 'Set PDF metadata' name = 'Set PDF metadata'
file_types = set(['pdf']) file_types = set(['pdf'])
description = _('Set metadata in %s files') % 'PDF' description = _('Set metadata in %s files') % 'PDF'
author = 'John Schember' author = 'John Schember'
def set_metadata(self, stream, mi, type): def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.pdf import set_metadata from calibre.ebooks.metadata.pdf import set_metadata
set_metadata(stream, mi) set_metadata(stream, mi)
@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.html.input import HTMLInput from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput from calibre.ebooks.txt.output import TXTOutput
@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput, plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput] TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')] x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -41,6 +41,11 @@ class ConversionOption(object):
def __eq__(self, other): def __eq__(self, other):
return hash(self) == hash(other) return hash(self) == hash(other)
def clone(self):
    '''
    Return a new ConversionOption built from this option's name, help,
    long_switch, short_switch and choices.

    The attribute values are passed through as-is to the constructor.
    '''
    copied = ('name', 'help', 'long_switch', 'short_switch', 'choices')
    kwargs = dict((attr, getattr(self, attr)) for attr in copied)
    return ConversionOption(**kwargs)
class OptionRecommendation(object): class OptionRecommendation(object):
LOW = 1 LOW = 1
MED = 2 MED = 2
@ -59,6 +64,10 @@ class OptionRecommendation(object):
self.validate_parameters() self.validate_parameters()
def clone(self):
    '''
    Return a new OptionRecommendation with the same recommended_value and
    level, wrapping a clone of this recommendation's option.
    '''
    value, level = self.recommended_value, self.level
    option_copy = self.option.clone()
    return OptionRecommendation(recommended_value=value, level=level,
            option=option_copy)
def validate_parameters(self): def validate_parameters(self):
if self.option.choices and self.recommended_value not in \ if self.option.choices and self.recommended_value not in \
self.option.choices: self.option.choices:
@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin):
options.debug_input = os.path.abspath(options.debug_input) options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input): if not os.path.exists(options.debug_input):
os.makedirs(options.debug_input) os.makedirs(options.debug_input)
shutil.rmtree(options.debug_input) if isinstance(ret, basestring):
shutil.copytree(output_dir, options.debug_input) shutil.rmtree(options.debug_input)
shutil.copytree(output_dir, options.debug_input)
else:
from calibre.ebooks.oeb.writer import OEBWriter
w = OEBWriter(pretty_print=options.pretty_print)
w(ret, options.debug_input)
log.info('Input debug saved to:', options.debug_input) log.info('Input debug saved to:', options.debug_input)
return ret return ret

View File

@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log):
raise SystemExit(1) raise SystemExit(1)
output = args[2] output = args[2]
if output.startswith('.'): if output.startswith('.') and output != '.':
output = os.path.splitext(os.path.basename(input))[0]+output output = os.path.splitext(os.path.basename(input))[0]+output
output = os.path.abspath(output) output = os.path.abspath(output)
@ -171,7 +171,8 @@ def main(args=sys.argv):
plumber.run() plumber.run()
log(_('Output saved to'), ' ', plumber.output) if plumber.opts.debug_input is None:
log(_('Output saved to'), ' ', plumber.output)
return 0 return 0

View File

@ -32,8 +32,8 @@ class Plumber(object):
:param input: Path to input file. :param input: Path to input file.
:param output: Path to output file/directory :param output: Path to output file/directory
''' '''
self.input = input self.input = os.path.abspath(input)
self.output = output self.output = os.path.abspath(output)
self.log = log self.log = log
# Initialize the conversion options that are independent of input and # Initialize the conversion options that are independent of input and
@ -188,15 +188,15 @@ OptionRecommendation(name='language',
] ]
input_fmt = os.path.splitext(input)[1] input_fmt = os.path.splitext(self.input)[1]
if not input_fmt: if not input_fmt:
raise ValueError('Input file must have an extension') raise ValueError('Input file must have an extension')
input_fmt = input_fmt[1:].lower() input_fmt = input_fmt[1:].lower()
if os.path.exists(output) and os.path.isdir(output): if os.path.exists(self.output) and os.path.isdir(self.output):
output_fmt = 'oeb' output_fmt = 'oeb'
else: else:
output_fmt = os.path.splitext(output)[1] output_fmt = os.path.splitext(self.output)[1]
if not output_fmt: if not output_fmt:
output_fmt = '.oeb' output_fmt = '.oeb'
output_fmt = output_fmt[1:].lower() output_fmt = output_fmt[1:].lower()
@ -323,6 +323,9 @@ OptionRecommendation(name='language',
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts, self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
self.input_fmt, self.log, self.input_fmt, self.log,
accelerators, tdir) accelerators, tdir)
if self.opts.debug_input is not None:
self.log('Debug input called, aborting the rest of the pipeline.')
return
if not hasattr(self.oeb, 'manifest'): if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts) self.oeb = create_oebbook(self.log, self.oeb, self.opts)
@ -365,18 +368,20 @@ OptionRecommendation(name='language',
self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
self.opts, self.log) self.opts, self.log)
def create_oebbook(log, opfpath, opts): def create_oebbook(log, path_or_stream, opts, reader=None):
''' '''
Create an OEBBook from an OPF file. Create an OEBBook.
''' '''
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor() html_preprocessor = HTMLPreProcessor()
reader = OEBReader()
oeb = OEBBook(log, html_preprocessor=html_preprocessor, oeb = OEBBook(log, html_preprocessor=html_preprocessor,
pretty_print=opts.pretty_print) pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook # Read OEB Book into OEBBook
log.info('Parsing all content...') log('Parsing all content...')
reader(oeb, opfpath) if reader is None:
from calibre.ebooks.oeb.reader import OEBReader
reader = OEBReader
reader()(oeb, path_or_stream)
return oeb return oeb

View File

@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin):
) )
), ),
OptionRecommendation(name='dont_package',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Normally this input plugin re-arranges all the input '
'files into a standard folder hierarchy. Only use this option '
'if you know what you are doing as it can result in various '
'nasty side effects in the rest of of the conversion pipeline.'
)
),
]) ])
def convert(self, stream, opts, file_ext, log, def convert(self, stream, opts, file_ext, log,
@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin):
mi.render(open('metadata.opf', 'wb')) mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf') opfpath = os.path.abspath('metadata.opf')
if opts.dont_package:
return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts) oeb = create_oebbook(log, opfpath, opts)

View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
class LITInput(InputFormatPlugin):
    '''
    Input plugin for ``.lit`` files.

    Conversion is delegated to create_oebbook, with LitReader supplied as
    the reader.
    '''

    name        = 'LIT Input'
    author      = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types  = set(['lit'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Build and return a book from `stream`.

        :param stream: The LIT input, passed to create_oebbook unchanged.
        :param options: Conversion options, passed to create_oebbook.
        :param file_ext: Not used by this plugin.
        :param log: Logger passed to create_oebbook.
        :param accelerators: Not used by this plugin.
        :return: Whatever create_oebbook returns.
        '''
        # Imported lazily, inside the method, as in the rest of this commit.
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.lit.reader import LitReader
        book = create_oebbook(log, stream, options, reader=LitReader)
        return book

View File

@ -7,13 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>' 'and Marshall T. Vandegrift <llasram@gmail.com>'
import sys, struct, os import struct, os
import functools import functools
import re import re
from urlparse import urldefrag from urlparse import urldefrag
from cStringIO import StringIO from cStringIO import StringIO
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit import LitError from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.mssha1 as mssha1
@ -29,12 +28,12 @@ __all__ = ["LitReader"]
XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?> XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
""" """
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?> OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE package <!DOCTYPE package
PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN" PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
"http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd"> "http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd">
""" """
HTML_DECL = """<?xml version="1.0" encoding="UTF-8" ?> HTML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC <!DOCTYPE html PUBLIC
"+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Document//EN" "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Document//EN"
"http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd"> "http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd">
""" """
@ -73,7 +72,7 @@ def encint(bytes, remaining):
val <<= 7 val <<= 7
val |= (b & 0x7f) val |= (b & 0x7f)
if b & 0x80 == 0: break if b & 0x80 == 0: break
return val, bytes[pos:], remaining return val, bytes[pos:], remaining
def msguid(bytes): def msguid(bytes):
values = struct.unpack("<LHHBBBBBBBB", bytes[:16]) values = struct.unpack("<LHHBBBBBBBB", bytes[:16])
@ -123,7 +122,7 @@ class UnBinary(object):
CLOSE_ANGLE_RE = re.compile(r'(?<!--)>>(?=>>|[^>])') CLOSE_ANGLE_RE = re.compile(r'(?<!--)>>(?=>>|[^>])')
DOUBLE_ANGLE_RE = re.compile(r'([<>])\1') DOUBLE_ANGLE_RE = re.compile(r'([<>])\1')
EMPTY_ATOMS = ({},{}) EMPTY_ATOMS = ({},{})
def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS): def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS):
self.manifest = manifest self.manifest = manifest
self.tag_map, self.attr_map, self.tag_to_attr_map = map self.tag_map, self.attr_map, self.tag_to_attr_map = map
@ -143,7 +142,7 @@ class UnBinary(object):
raw = self.CLOSE_ANGLE_RE.sub(r'&gt;', raw) raw = self.CLOSE_ANGLE_RE.sub(r'&gt;', raw)
raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw) raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw)
self.raw = raw self.raw = raw
def item_path(self, internal_id): def item_path(self, internal_id):
try: try:
target = self.manifest[internal_id].path target = self.manifest[internal_id].path
@ -159,7 +158,7 @@ class UnBinary(object):
index += 1 index += 1
relpath = (['..'] * (len(base) - index)) + target[index:] relpath = (['..'] * (len(base) - index)) + target[index:]
return '/'.join(relpath) return '/'.join(relpath)
def __unicode__(self): def __unicode__(self):
return self.raw.decode('utf-8') return self.raw.decode('utf-8')
@ -172,11 +171,11 @@ class UnBinary(object):
in_censorship = is_goingdown = False in_censorship = is_goingdown = False
state = 'text' state = 'text'
flags = 0 flags = 0
while index < len(bin): while index < len(bin):
c, index = read_utf8_char(bin, index) c, index = read_utf8_char(bin, index)
oc = ord(c) oc = ord(c)
if state == 'text': if state == 'text':
if oc == 0: if oc == 0:
state = 'get flags' state = 'get flags'
@ -188,14 +187,14 @@ class UnBinary(object):
elif c == '<': elif c == '<':
c = '<<' c = '<<'
buf.write(encode(c)) buf.write(encode(c))
elif state == 'get flags': elif state == 'get flags':
if oc == 0: if oc == 0:
state = 'text' state = 'text'
continue continue
flags = oc flags = oc
state = 'get tag' state = 'get tag'
elif state == 'get tag': elif state == 'get tag':
state = 'text' if oc == 0 else 'get attr' state = 'text' if oc == 0 else 'get attr'
if flags & FLAG_OPENING: if flags & FLAG_OPENING:
@ -226,7 +225,7 @@ class UnBinary(object):
if depth == 0: if depth == 0:
raise LitError('Extra closing tag') raise LitError('Extra closing tag')
return index return index
elif state == 'get attr': elif state == 'get attr':
in_censorship = False in_censorship = False
if oc == 0: if oc == 0:
@ -265,7 +264,7 @@ class UnBinary(object):
state = 'get href length' state = 'get href length'
else: else:
state = 'get value length' state = 'get value length'
elif state == 'get value length': elif state == 'get value length':
if not in_censorship: if not in_censorship:
buf.write('"') buf.write('"')
@ -281,7 +280,7 @@ class UnBinary(object):
continue continue
if count < 0 or count > (len(bin) - index): if count < 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count) raise LitError('Invalid character count %d' % count)
elif state == 'get value': elif state == 'get value':
if count == 0xfffe: if count == 0xfffe:
if not in_censorship: if not in_censorship:
@ -301,7 +300,7 @@ class UnBinary(object):
buf.write('"') buf.write('"')
in_censorship = False in_censorship = False
state = 'get attr' state = 'get attr'
elif state == 'get custom length': elif state == 'get custom length':
count = oc - 1 count = oc - 1
if count <= 0 or count > len(bin)-index: if count <= 0 or count > len(bin)-index:
@ -309,21 +308,21 @@ class UnBinary(object):
dynamic_tag += 1 dynamic_tag += 1
state = 'get custom' state = 'get custom'
tag_name = '' tag_name = ''
elif state == 'get custom': elif state == 'get custom':
tag_name += c tag_name += c
count -= 1 count -= 1
if count == 0: if count == 0:
buf.write(encode(tag_name)) buf.write(encode(tag_name))
state = 'get attr' state = 'get attr'
elif state == 'get attr length': elif state == 'get attr length':
count = oc - 1 count = oc - 1
if count <= 0 or count > (len(bin) - index): if count <= 0 or count > (len(bin) - index):
raise LitError('Invalid character count %d' % count) raise LitError('Invalid character count %d' % count)
buf.write(' ') buf.write(' ')
state = 'get custom attr' state = 'get custom attr'
elif state == 'get custom attr': elif state == 'get custom attr':
buf.write(encode(c)) buf.write(encode(c))
count -= 1 count -= 1
@ -337,7 +336,7 @@ class UnBinary(object):
raise LitError('Invalid character count %d' % count) raise LitError('Invalid character count %d' % count)
href = '' href = ''
state = 'get href' state = 'get href'
elif state == 'get href': elif state == 'get href':
href += c href += c
count -= 1 count -= 1
@ -350,7 +349,7 @@ class UnBinary(object):
buf.write(encode(u'"%s"' % path)) buf.write(encode(u'"%s"' % path))
state = 'get attr' state = 'get attr'
return index return index
class DirectoryEntry(object): class DirectoryEntry(object):
def __init__(self, name, section, offset, size): def __init__(self, name, section, offset, size):
@ -358,11 +357,11 @@ class DirectoryEntry(object):
self.section = section self.section = section
self.offset = offset self.offset = offset
self.size = size self.size = size
def __repr__(self): def __repr__(self):
return "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)" \ return "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)" \
% (repr(self.name), self.section, self.offset, self.size) % (repr(self.name), self.section, self.offset, self.size)
def __str__(self): def __str__(self):
return repr(self) return repr(self)
@ -382,12 +381,12 @@ class ManifestItem(object):
path = os.path.normpath(path).replace('\\', '/') path = os.path.normpath(path).replace('\\', '/')
while path.startswith('../'): path = path[3:] while path.startswith('../'): path = path[3:]
self.path = path self.path = path
def __eq__(self, other): def __eq__(self, other):
if hasattr(other, 'internal'): if hasattr(other, 'internal'):
return self.internal == other.internal return self.internal == other.internal
return self.internal == other return self.internal == other
def __repr__(self): def __repr__(self):
return "ManifestItem(internal=%r, path=%r, mime_type=%r, " \ return "ManifestItem(internal=%r, path=%r, mime_type=%r, " \
"offset=%d, root=%r, state=%r)" \ "offset=%d, root=%r, state=%r)" \
@ -404,7 +403,7 @@ def preserve(function):
self.stream.seek(opos) self.stream.seek(opos)
functools.update_wrapper(wrapper, function) functools.update_wrapper(wrapper, function)
return wrapper return wrapper
class LitFile(object): class LitFile(object):
PIECE_SIZE = 16 PIECE_SIZE = 16
@ -438,14 +437,14 @@ class LitFile(object):
return self.stream.read(8) return self.stream.read(8)
return property(fget=fget) return property(fget=fget)
magic = magic() magic = magic()
def version(): def version():
def fget(self): def fget(self):
self.stream.seek(8) self.stream.seek(8)
return u32(self.stream.read(4)) return u32(self.stream.read(4))
return property(fget=fget) return property(fget=fget)
version = version() version = version()
def hdr_len(): def hdr_len():
@preserve @preserve
def fget(self): def fget(self):
@ -453,7 +452,7 @@ class LitFile(object):
return int32(self.stream.read(4)) return int32(self.stream.read(4))
return property(fget=fget) return property(fget=fget)
hdr_len = hdr_len() hdr_len = hdr_len()
def num_pieces(): def num_pieces():
@preserve @preserve
def fget(self): def fget(self):
@ -461,7 +460,7 @@ class LitFile(object):
return int32(self.stream.read(4)) return int32(self.stream.read(4))
return property(fget=fget) return property(fget=fget)
num_pieces = num_pieces() num_pieces = num_pieces()
def sec_hdr_len(): def sec_hdr_len():
@preserve @preserve
def fget(self): def fget(self):
@ -469,7 +468,7 @@ class LitFile(object):
return int32(self.stream.read(4)) return int32(self.stream.read(4))
return property(fget=fget) return property(fget=fget)
sec_hdr_len = sec_hdr_len() sec_hdr_len = sec_hdr_len()
def guid(): def guid():
@preserve @preserve
def fget(self): def fget(self):
@ -477,7 +476,7 @@ class LitFile(object):
return self.stream.read(16) return self.stream.read(16)
return property(fget=fget) return property(fget=fget)
guid = guid() guid = guid()
def header(): def header():
@preserve @preserve
def fget(self): def fget(self):
@ -488,7 +487,7 @@ class LitFile(object):
return self.stream.read(size) return self.stream.read(size)
return property(fget=fget) return property(fget=fget)
header = header() header = header()
@preserve @preserve
def __len__(self): def __len__(self):
self.stream.seek(0, 2) self.stream.seek(0, 2)
@ -501,7 +500,7 @@ class LitFile(object):
def read_content(self, offset, size): def read_content(self, offset, size):
return self.read_raw(self.content_offset + offset, size) return self.read_raw(self.content_offset + offset, size)
def read_secondary_header(self): def read_secondary_header(self):
offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
bytes = self.read_raw(offset, self.sec_hdr_len) bytes = self.read_raw(offset, self.sec_hdr_len)
@ -526,12 +525,12 @@ class LitFile(object):
if u32(bytes[offset+4+16:]): if u32(bytes[offset+4+16:]):
raise LitError('This file has a 64bit content offset') raise LitError('This file has a 64bit content offset')
self.content_offset = u32(bytes[offset+16:]) self.content_offset = u32(bytes[offset+16:])
self.timestamp = u32(bytes[offset+24:]) self.timestamp = u32(bytes[offset+24:])
self.language_id = u32(bytes[offset+28:]) self.language_id = u32(bytes[offset+28:])
offset += 48 offset += 48
if not hasattr(self, 'content_offset'): if not hasattr(self, 'content_offset'):
raise LitError('Could not figure out the content offset') raise LitError('Could not figure out the content offset')
def read_header_pieces(self): def read_header_pieces(self):
src = self.header[self.hdr_len:] src = self.header[self.hdr_len:]
for i in xrange(self.num_pieces): for i in xrange(self.num_pieces):
@ -556,7 +555,7 @@ class LitFile(object):
self.piece3_guid = piece self.piece3_guid = piece
elif i == 4: elif i == 4:
self.piece4_guid = piece self.piece4_guid = piece
def read_directory(self, piece): def read_directory(self, piece):
if not piece.startswith('IFCM'): if not piece.startswith('IFCM'):
raise LitError('Header piece #1 is not main directory.') raise LitError('Header piece #1 is not main directory.')
@ -760,9 +759,9 @@ class LitFile(object):
raise LitError("Reset table is too short") raise LitError("Reset table is too short")
if u32(reset_table[RESET_UCLENGTH + 4:]) != 0: if u32(reset_table[RESET_UCLENGTH + 4:]) != 0:
raise LitError("Reset table has 64bit value for UCLENGTH") raise LitError("Reset table has 64bit value for UCLENGTH")
result = [] result = []
window_size = 14 window_size = 14
u = u32(control[CONTROL_WINDOW_SIZE:]) u = u32(control[CONTROL_WINDOW_SIZE:])
while u > 0: while u > 0:
@ -847,13 +846,13 @@ class LitContainer(object):
def __init__(self, filename_or_stream): def __init__(self, filename_or_stream):
self._litfile = LitFile(filename_or_stream) self._litfile = LitFile(filename_or_stream)
def namelist(self): def namelist(self):
return self._litfile.paths.keys() return self._litfile.paths.keys()
def exists(self, name): def exists(self, name):
return urlunquote(name) in self._litfile.paths return urlunquote(name) in self._litfile.paths
def read(self, name): def read(self, name):
entry = self._litfile.paths[urlunquote(name)] if name else None entry = self._litfile.paths[urlunquote(name)] if name else None
if entry is None: if entry is None:
@ -869,7 +868,7 @@ class LitContainer(object):
internal = '/'.join(('/data', entry.internal)) internal = '/'.join(('/data', entry.internal))
content = self._litfile.get_file(internal) content = self._litfile.get_file(internal)
return content return content
def _read_meta(self): def _read_meta(self):
path = 'content.opf' path = 'content.opf'
raw = self._litfile.get_file('/meta') raw = self._litfile.get_file('/meta')

View File

@ -1,10 +1,10 @@
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files''' '''Read meta information from PDF files'''
import sys, os, cStringIO import sys, os, cStringIO
from threading import Thread
from calibre import FileWrapper from calibre import FileWrapper
from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ebooks.metadata import MetaInformation, authors_to_string
@ -13,7 +13,8 @@ from pyPdf import PdfFileReader, PdfFileWriter
import Image import Image
try: try:
from calibre.utils.PythonMagickWand import \ from calibre.utils.PythonMagickWand import \
NewMagickWand, MagickReadImage, MagickSetImageFormat, MagickWriteImage NewMagickWand, MagickReadImage, MagickSetImageFormat, \
MagickWriteImage, ImageMagick
_imagemagick_loaded = True _imagemagick_loaded = True
except: except:
_imagemagick_loaded = False _imagemagick_loaded = False
@ -51,9 +52,23 @@ def get_metadata(stream, extract_cover=True):
print >>sys.stderr, msg.encode('utf8') print >>sys.stderr, msg.encode('utf8')
return mi return mi
class MetadataWriter(Thread):
    '''
    Daemon thread that calls ``out_pdf.write(buf)`` in the background.

    :param out_pdf: Object exposing a ``write(buf)`` method.
    :param buf: Destination handed to ``out_pdf.write``.
    '''

    def __init__(self, out_pdf, buf):
        Thread.__init__(self)
        self.daemon = True
        self.out_pdf, self.buf = out_pdf, buf

    def run(self):
        '''Perform the write, silently ignoring a RuntimeError from it.'''
        pdf, target = self.out_pdf, self.buf
        try:
            pdf.write(target)
        except RuntimeError:
            # NOTE(review): presumably the PDF writer raises RuntimeError once
            # its `killed` flag is set by the caller -- confirm against
            # PdfFileWriter before relying on this.
            pass
def set_metadata(stream, mi): def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
# Use a cStringIO object for the pdf because we will want to over # Use a StringIO object for the pdf because we will want to over
# write it later and if we are working on the stream directly it # write it later and if we are working on the stream directly it
# could cause some issues. # could cause some issues.
raw = cStringIO.StringIO(stream.read()) raw = cStringIO.StringIO(stream.read())
@ -61,10 +76,18 @@ def set_metadata(stream, mi):
title = mi.title if mi.title else orig_pdf.documentInfo.title title = mi.title if mi.title else orig_pdf.documentInfo.title
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
out_pdf = PdfFileWriter(title=title, author=author) out_pdf = PdfFileWriter(title=title, author=author)
out_str = cStringIO.StringIO()
writer = MetadataWriter(out_pdf, out_str)
for page in orig_pdf.pages: for page in orig_pdf.pages:
out_pdf.addPage(page) out_pdf.addPage(page)
out_str = cStringIO.StringIO() writer.start()
out_pdf.write(out_str) writer.join(10) # Wait 10 secs for writing to complete
out_pdf.killed = True
writer.join()
if out_pdf.killed:
print 'Failed to set metadata: took too long'
return
stream.seek(0) stream.seek(0)
stream.truncate() stream.truncate()
out_str.seek(0) out_str.seek(0)
@ -72,35 +95,32 @@ def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
def get_cover(stream): def get_cover(stream):
stream.seek(0)
data = cStringIO.StringIO() data = cStringIO.StringIO()
try: try:
with FileWrapper(stream) as stream: pdf = PdfFileReader(stream)
pdf = PdfFileReader(stream) output = PdfFileWriter()
output = PdfFileWriter()
if len(pdf.pages) >= 1:
if len(pdf.pages) >= 1: output.addPage(pdf.getPage(0))
output.addPage(pdf.getPage(0))
with TemporaryDirectory('_pdfmeta') as tdir:
with TemporaryDirectory('_pdfmeta') as tdir: cover_path = os.path.join(tdir, 'cover.pdf')
cover_path = os.path.join(tdir, 'cover.pdf')
with open(cover_path, "wb") as outputStream:
outputStream = file(cover_path, "wb")
output.write(outputStream) output.write(outputStream)
outputStream.close()
with ImageMagick():
wand = NewMagickWand() wand = NewMagickWand()
MagickReadImage(wand, cover_path) MagickReadImage(wand, cover_path)
MagickSetImageFormat(wand, 'JPEG') MagickSetImageFormat(wand, 'JPEG')
MagickWriteImage(wand, '%s.jpg' % cover_path) MagickWriteImage(wand, '%s.jpg' % cover_path)
img = Image.open('%s.jpg' % cover_path) img = Image.open('%s.jpg' % cover_path)
img.save(data, 'JPEG') img.save(data, 'JPEG')
except: except:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return data.getvalue() return data.getvalue()

View File

@ -272,11 +272,7 @@ def XPath(expr):
def xpath(elem, expr): def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP) return elem.xpath(expr, namespaces=XPNSMAP)
def _prepare_xml_for_serialization(root):
pass
def xml2str(root, pretty_print=False, strip_comments=False): def xml2str(root, pretty_print=False, strip_comments=False):
_prepare_xml_for_serialization(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print) pretty_print=pretty_print)
@ -287,7 +283,6 @@ def xml2str(root, pretty_print=False, strip_comments=False):
def xml2unicode(root, pretty_print=False): def xml2unicode(root, pretty_print=False):
_prepare_xml_for_serialization(root)
return etree.tostring(root, pretty_print=pretty_print) return etree.tostring(root, pretty_print=pretty_print)
ASCII_CHARS = set(chr(x) for x in xrange(128)) ASCII_CHARS = set(chr(x) for x in xrange(128))
@ -321,6 +316,25 @@ def urlnormalize(href):
parts = (urlquote(part) for part in parts) parts = (urlquote(part) for part in parts)
return urlunparse(parts) return urlunparse(parts)
class DummyHandler(logging.Handler):
    '''
    Logging handler that relays formatted records to an attached log object.

    The handler is created with level WARNING and a bare ``%(message)s``
    formatter. Until the ``log`` attribute is set to an object with ``error``
    and ``warn`` methods, emitted records are dropped.
    '''

    def __init__(self):
        logging.Handler.__init__(self, logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        # Target log object; None means "nowhere to forward to yet".
        self.log = None

    def emit(self, record):
        '''Forward `record` to ``log.error`` (ERROR and above) or ``log.warn``.'''
        if self.log is None:
            return
        text = self.format(record)
        if record.levelno >= logging.ERROR:
            forward = self.log.error
        else:
            forward = self.log.warn
        forward(text)
# Logger named 'calibre.css', with its threshold set to WARNING.
_css_logger = logging.getLogger('calibre.css')
_css_logger.setLevel(logging.WARNING)
# Single module-level handler instance attached to that logger.
_css_log_handler = DummyHandler()
_css_logger.addHandler(_css_log_handler)
class OEBError(Exception): class OEBError(Exception):
"""Generic OEB-processing error.""" """Generic OEB-processing error."""
@ -778,7 +792,8 @@ class Manifest(object):
data = self.oeb.css_preprocessor(data) data = self.oeb.css_preprocessor(data)
data = XHTML_CSS_NAMESPACE + data data = XHTML_CSS_NAMESPACE + data
parser = CSSParser(loglevel=logging.WARNING, parser = CSSParser(loglevel=logging.WARNING,
fetcher=self._fetch_css) fetcher=self._fetch_css,
log=_css_logger)
data = parser.parseString(data, href=self.href) data = parser.parseString(data, href=self.href)
data.namespaces['h'] = XHTML_NS data.namespaces['h'] = XHTML_NS
return data return data
@ -1435,7 +1450,7 @@ class OEBBook(object):
:attr:`pages`: List of "pages," such as indexed to a print edition of :attr:`pages`: List of "pages," such as indexed to a print edition of
the same text. the same text.
""" """
_css_log_handler.log = logger
self.encoding = encoding self.encoding = encoding
self.html_preprocessor = html_preprocessor self.html_preprocessor = html_preprocessor
self.css_preprocessor = css_preprocessor self.css_preprocessor = css_preprocessor
@ -1450,6 +1465,7 @@ class OEBBook(object):
self.guide = Guide(self) self.guide = Guide(self)
self.toc = TOC() self.toc = TOC()
self.pages = PageList() self.pages = PageList()
self.auto_generated_toc = True
@classmethod @classmethod
def generate(cls, opts): def generate(cls, opts):

View File

@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase
from calibre.customize.ui import available_input_formats from calibre.customize.ui import available_input_formats
from calibre.ebooks.epub.from_html import TITLEPAGE from calibre.ebooks.epub.from_html import TITLEPAGE
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.zipfile import safe_replace, ZipFile from calibre.utils.zipfile import safe_replace, ZipFile
from calibre.utils.config import DynamicConfig from calibre.utils.config import DynamicConfig
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre import CurrentDir
def character_count(html): def character_count(html):
''' '''
@ -57,31 +56,21 @@ class FakeOpts(object):
max_levels = 5 max_levels = 5
input_encoding = None input_encoding = None
def html2opf(path, tdir, log):
from calibre.ebooks.html.input import get_filelist
from calibre.ebooks.metadata.meta import get_metadata
with CurrentDir(tdir):
fl = get_filelist(path, tdir, FakeOpts(), log)
mi = get_metadata(open(path, 'rb'), 'html')
mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in fl]
mi.create_manifest(entries)
mi.create_spine([f.path for f in fl])
mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf')
return opfpath
def opf2opf(path, tdir, opts):
return path
def is_supported(path): def is_supported(path):
ext = os.path.splitext(path)[1].replace('.', '').lower() ext = os.path.splitext(path)[1].replace('.', '').lower()
ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext) ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
return ext in available_input_formats() return ext in available_input_formats()
def write_oebbook(oeb, path):
from calibre.ebooks.oeb.writer import OEBWriter
from calibre import walk
w = OEBWriter()
w(oeb, path)
for f in walk(path):
if f.endswith('.opf'):
return f
class EbookIterator(object): class EbookIterator(object):
CHARACTERS_PER_PAGE = 1000 CHARACTERS_PER_PAGE = 1000
@ -131,17 +120,16 @@ class EbookIterator(object):
def __enter__(self): def __enter__(self):
self._tdir = TemporaryDirectory('_ebook_iter') self._tdir = TemporaryDirectory('_ebook_iter')
self.base = self._tdir.__enter__() self.base = self._tdir.__enter__()
if self.ebook_ext == 'opf': from calibre.ebooks.conversion.plumber import Plumber
self.pathtoopf = self.pathtoebook plumber = Plumber(self.pathtoebook, self.base, self.log)
elif self.ebook_ext == 'html': plumber.setup_options()
self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log) if hasattr(plumber.opts, 'dont_package'):
else: plumber.opts.dont_package = True
from calibre.ebooks.conversion.plumber import Plumber self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
plumber = Plumber(self.pathtoebook, self.base, self.log) plumber.opts, plumber.input_fmt, self.log,
plumber.setup_options() {}, self.base)
self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'), if hasattr(self.pathtoopf, 'manifest'):
plumber.opts, plumber.input_fmt, self.log, self.pathtoopf = write_oebbook(self.pathtoebook, self._tdir)
{}, self.base)
self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf)) self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))

View File

@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin):
author = 'Kovid Goyal' author = 'Kovid Goyal'
file_type = 'oeb' file_type = 'oeb'
def convert(self, oeb_book, output_path, input_plugin, opts, log): def convert(self, oeb_book, output_path, input_plugin, opts, log):
self.log, self.opts = log, opts self.log, self.opts = log, opts
if not os.path.exists(output_path): if not os.path.exists(output_path):

View File

@ -349,6 +349,7 @@ class OEBReader(object):
def _toc_from_ncx(self, item): def _toc_from_ncx(self, item):
if item is None: if item is None:
return False return False
self.log.debug('Reading TOC from NCX...')
ncx = item.data ncx = item.data
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
title = COLLAPSE_RE.sub(' ', title.strip()) title = COLLAPSE_RE.sub(' ', title.strip())
@ -364,6 +365,7 @@ class OEBReader(object):
result = xpath(opf, 'o2:tours/o2:tour') result = xpath(opf, 'o2:tours/o2:tour')
if not result: if not result:
return False return False
self.log.debug('Reading TOC from tour...')
tour = result[0] tour = result[0]
toc = self.oeb.toc toc = self.oeb.toc
toc.title = tour.get('title') toc.title = tour.get('title')
@ -384,6 +386,7 @@ class OEBReader(object):
def _toc_from_html(self, opf): def _toc_from_html(self, opf):
if 'toc' not in self.oeb.guide: if 'toc' not in self.oeb.guide:
return False return False
self.log.debug('Reading TOC from HTML...')
itempath, frag = urldefrag(self.oeb.guide['toc'].href) itempath, frag = urldefrag(self.oeb.guide['toc'].href)
item = self.oeb.manifest.hrefs[itempath] item = self.oeb.manifest.hrefs[itempath]
html = item.data html = item.data
@ -414,6 +417,7 @@ class OEBReader(object):
return True return True
def _toc_from_spine(self, opf): def _toc_from_spine(self, opf):
self.log.warn('Generating default TOC from spine...')
toc = self.oeb.toc toc = self.oeb.toc
titles = [] titles = []
headers = [] headers = []
@ -441,11 +445,14 @@ class OEBReader(object):
return True return True
def _toc_from_opf(self, opf, item): def _toc_from_opf(self, opf, item):
self.oeb.auto_generated_toc = False
if self._toc_from_ncx(item): return if self._toc_from_ncx(item): return
if self._toc_from_tour(opf): return # Prefer HTML to tour based TOC, since several LIT files
self.logger.warn('No metadata table of contents found') # have good HTML TOCs but bad tour based TOCs
if self._toc_from_html(opf): return if self._toc_from_html(opf): return
if self._toc_from_tour(opf): return
self._toc_from_spine(opf) self._toc_from_spine(opf)
self.oeb.auto_generated_toc = True
def _pages_from_ncx(self, opf, item): def _pages_from_ncx(self, opf, item):
if item is None: if item is None:

View File

@ -51,8 +51,8 @@ class Split(object):
self.log = oeb.log self.log = oeb.log
self.map = {} self.map = {}
self.page_break_selectors = None self.page_break_selectors = None
for item in self.oeb.manifest.items: for item in list(self.oeb.manifest.items):
if etree.iselement(item.data): if item.spine_position is not None and etree.iselement(item.data):
self.split_item(item) self.split_item(item)
self.fix_links() self.fix_links()
@ -74,31 +74,34 @@ class Split(object):
self.page_break_selectors = set([]) self.page_break_selectors = set([])
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES] OEB_STYLES]
page_break_selectors = set([]) for rule in rules(stylesheets):
for rule in rules(stylesheets): before = getattr(rule.style.getPropertyCSSValue(
before = getattr(rule.style.getPropertyCSSValue( 'page-break-before'), 'cssText', '').strip().lower()
'page-break-before'), 'cssText', '').strip().lower() after = getattr(rule.style.getPropertyCSSValue(
after = getattr(rule.style.getPropertyCSSValue( 'page-break-after'), 'cssText', '').strip().lower()
'page-break-after'), 'cssText', '').strip().lower() try:
try: if before and before != 'avoid':
if before and before != 'avoid': self.page_break_selectors.add((CSSSelector(rule.selectorText),
page_break_selectors.add((CSSSelector(rule.selectorText), True))
True)) except:
except: pass
pass try:
try: if after and after != 'avoid':
if after and after != 'avoid': self.page_break_selectors.add((CSSSelector(rule.selectorText),
page_break_selectors.add((CSSSelector(rule.selectorText), False))
False)) except:
except: pass
pass
page_breaks = set([]) page_breaks = set([])
for selector, before in page_break_selectors: for selector, before in self.page_break_selectors:
for elem in selector(item.data): body = item.data.xpath('//h:body', namespaces=NAMESPACES)
if before: if not body:
elem.set('pb_before', '1') continue
page_breaks.add(elem) for elem in selector(body[0]):
if elem not in body:
if before:
elem.set('pb_before', '1')
page_breaks.add(elem)
for i, elem in enumerate(item.data.iter()): for i, elem in enumerate(item.data.iter()):
elem.set('pb_order', str(i)) elem.set('pb_order', str(i))
@ -136,8 +139,10 @@ class Split(object):
if href in self.map: if href in self.map:
anchor_map = self.map[href] anchor_map = self.map[href]
nhref = anchor_map[frag if frag else None] nhref = anchor_map[frag if frag else None]
nhref = self.current_item.relhref(nhref)
if frag: if frag:
nhref = '#'.join(href, frag) nhref = '#'.join((nhref, frag))
return nhref return nhref
return url return url
@ -153,7 +158,7 @@ class FlowSplitter(object):
self.page_breaks = page_breaks self.page_breaks = page_breaks
self.page_break_ids = page_break_ids self.page_break_ids = page_break_ids
self.max_flow_size = max_flow_size self.max_flow_size = max_flow_size
self.base = item.abshref(item.href) self.base = item.href
base, ext = os.path.splitext(self.base) base, ext = os.path.splitext(self.base)
self.base = base.replace('%', '%%')+'_split_%d'+ext self.base = base.replace('%', '%%')+'_split_%d'+ext
@ -192,9 +197,9 @@ class FlowSplitter(object):
self.trees = [] self.trees = []
tree = orig_tree tree = orig_tree
for pattern, before in ordered_ids: for pattern, before in ordered_ids:
self.log.debug('\t\tSplitting on page-break')
elem = pattern(tree) elem = pattern(tree)
if elem: if elem:
self.log.debug('\t\tSplitting on page-break')
before, after = self.do_split(tree, elem[0], before) before, after = self.do_split(tree, elem[0], before)
self.trees.append(before) self.trees.append(before)
tree = after tree = after
@ -414,13 +419,14 @@ class FlowSplitter(object):
elem.attrib.pop(SPLIT_ATTR, None) elem.attrib.pop(SPLIT_ATTR, None)
elem.attrib.pop(SPLIT_POINT_ATTR, '0') elem.attrib.pop(SPLIT_POINT_ATTR, '0')
spine_pos = self.item.spine_pos spine_pos = self.item.spine_position
for current, tree in zip(map(reversed, (self.files, self.trees))): for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
href = a.get('href').strip() href = a.get('href').strip()
if href.startswith('#'): if href.startswith('#'):
anchor = href[1:] anchor = href[1:]
file = self.anchor_map[anchor] file = self.anchor_map[anchor]
file = self.item.relhref(file)
if file != current: if file != current:
a.set('href', file+href) a.set('href', file+href)
@ -430,12 +436,12 @@ class FlowSplitter(object):
self.oeb.spine.insert(spine_pos, new_item, self.item.linear) self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
if self.oeb.guide: if self.oeb.guide:
for ref in self.oeb.guide: for ref in self.oeb.guide.values():
href, frag = urldefrag(ref.href) href, frag = urldefrag(ref.href)
if href == self.item.href: if href == self.item.href:
nhref = self.anchor_map[frag if frag else None] nhref = self.anchor_map[frag if frag else None]
if frag: if frag:
nhref = '#'.join(nhref, frag) nhref = '#'.join((nhref, frag))
ref.href = nhref ref.href = nhref
def fix_toc_entry(toc): def fix_toc_entry(toc):
@ -444,7 +450,7 @@ class FlowSplitter(object):
if href == self.item.href: if href == self.item.href:
nhref = self.anchor_map[frag if frag else None] nhref = self.anchor_map[frag if frag else None]
if frag: if frag:
nhref = '#'.join(nhref, frag) nhref = '#'.join((nhref, frag))
toc.href = nhref toc.href = nhref
for x in toc: for x in toc:
fix_toc_entry(x) fix_toc_entry(x)

View File

@ -49,7 +49,7 @@ class OEBWriter(object):
def __call__(self, oeb, path): def __call__(self, oeb, path):
""" """
Read the book in the :class:`OEBBook` object :param:`oeb` to a file Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
at :param:`path`. at :param:`path`.
""" """
version = int(self.version[0]) version = int(self.version[0])

View File

@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.cover_changed = True self.cover_changed = True
def initialize_series(self): def initialize_series(self):
self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
all_series = self.db.all_series() all_series = self.db.all_series()
all_series.sort(cmp=lambda x, y : cmp(x[1], y[1])) all_series.sort(cmp=lambda x, y : cmp(x[1], y[1]))
series_id = self.db.series_id(self.row) series_id = self.db.series_id(self.row)
@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.series.setCurrentIndex(idx) self.series.setCurrentIndex(idx)
self.enable_series_index() self.enable_series_index()
pl = self.series.parentWidget().layout()
for i in range(pl.count()):
l = pl.itemAt(i).layout()
if l:
l.invalidate()
l.activate()
def initialize_series_and_publisher(self): def initialize_series_and_publisher(self):
self.initialize_series() self.initialize_series()
all_publishers = self.db.all_publishers() all_publishers = self.db.all_publishers()

Binary file not shown.

After

Width:  |  Height:  |  Size: 509 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 637 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 746 B

View File

@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in (
'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet', 'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en', 'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna', 'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
'seattle_times',
)] )]
import re, imp, inspect, time, os import re, imp, inspect, time, os

View File

@ -1,14 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper ''' ''' http://www.derstandard.at - Austrian Newspaper '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe): class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard' title = u'derStandard'
__author__ = 'Gerhard Aigner' __author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'utf-8'
language = _('German')
recursions = 0
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'), feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'), (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
@ -20,17 +43,13 @@ class DerStandardRecipe(BasicNewsRecipe):
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'), (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'), (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')] (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
encoding = 'utf-8'
language = _('German')
recursions = 0
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'), remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')] dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '') (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
] ]
def print_version(self, url): def print_version(self, url):
return url.replace('?id=', 'txt/?id=') return url.replace('?id=', 'txt/?id=')
@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe):
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0): if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None return None
return article.link return article.link
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag)
return soup

View File

@ -1,18 +1,42 @@
import re # -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.diepresse.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DiePresseRecipe(BasicNewsRecipe): class DiePresseRecipe(BasicNewsRecipe):
title = u'diePresse' title = u'diePresse'
__author__ = 'Gerhard Aigner'
description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.'
publisher ='DiePresse.com'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'ISO-8859-1'
language = _('German')
recursions = 0
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
recursions = 0
language = _('German') html2lrf_options = [
__author__ = 'Gerhard Aigner' '--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'Textversion', re.DOTALL), lambda match: ''), (re.compile(r'Textversion', re.DOTALL), lambda match: ''),
] ]
remove_tags = [dict(name='hr'), remove_tags = [dict(name='hr'),
dict(name='br'), dict(name='br'),
dict(name='small'), dict(name='small'),
@ -21,6 +45,7 @@ class DiePresseRecipe(BasicNewsRecipe):
dict(name='h1', attrs={'class':'titel'}), dict(name='h1', attrs={'class':'titel'}),
dict(name='a', attrs={'class':'print'}), dict(name='a', attrs={'class':'print'}),
dict(name='div', attrs={'class':'hline'})] dict(name='div', attrs={'class':'hline'})]
feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'), feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'), (u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
(u'Europa', u'http://diepresse.com/rss/EU'), (u'Europa', u'http://diepresse.com/rss/EU'),
@ -29,7 +54,7 @@ class DiePresseRecipe(BasicNewsRecipe):
(u'Kultur', u'http://diepresse.com/rss/Kultur'), (u'Kultur', u'http://diepresse.com/rss/Kultur'),
(u'Leben', u'http://diepresse.com/rss/Leben'), (u'Leben', u'http://diepresse.com/rss/Leben'),
(u'Tech', u'http://diepresse.com/rss/Tech'), (u'Tech', u'http://diepresse.com/rss/Tech'),
(u'Science', u'http://diepresse.com/rss/Science'), (u'Wissenschaft', u'http://diepresse.com/rss/Science'),
(u'Bildung', u'http://diepresse.com/rss/Bildung'), (u'Bildung', u'http://diepresse.com/rss/Bildung'),
(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'), (u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
(u'Recht', u'http://diepresse.com/rss/Recht'), (u'Recht', u'http://diepresse.com/rss/Recht'),
@ -38,3 +63,10 @@ class DiePresseRecipe(BasicNewsRecipe):
def print_version(self, url): def print_version(self, url):
return url.replace('home','text/home') return url.replace('home','text/home')
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag)
return soup

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
seattletimes.nwsource.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class SeattleTimes(BasicNewsRecipe):
title = 'The Seattle Times'
__author__ = 'Darko Miletic'
description = 'News from Seattle and USA'
publisher = 'The Seattle Times'
category = 'news, politics, USA'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
language = _('English')
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='p', attrs={'class':'permission'})
]
def print_version(self, url):
start_url, sep, rest_url = url.rpartition('_')
rurl, rsep, article_id = start_url.rpartition('/')
return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -299,7 +299,7 @@ def readStringFromStream(stream):
elif tok == "t": elif tok == "t":
tok = "\t" tok = "\t"
elif tok == "b": elif tok == "b":
tok == "\b" tok = "\b"
elif tok == "f": elif tok == "f":
tok = "\f" tok = "\f"
elif tok == "(": elif tok == "(":
@ -673,7 +673,7 @@ class RectangleObject(ArrayObject):
def getUpperLeft_x(self): def getUpperLeft_x(self):
return self.getLowerLeft_x() return self.getLowerLeft_x()
def getUpperLeft_y(self): def getUpperLeft_y(self):
return self.getUpperRight_y() return self.getUpperRight_y()

View File

@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net" __author_email__ = "biziqe@mathieu.fenniak.net"
import struct import struct
try: from cStringIO import StringIO
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import filters from generic import DictionaryObject, NameObject, NumberObject, \
import utils createStringObject, ArrayObject, ByteStringObject, StreamObject, \
import warnings IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
from generic import * RectangleObject, DecodedStreamObject
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
# class (typically {@link #PdfFileReader PdfFileReader}). # class (typically {@link #PdfFileReader PdfFileReader}).
class PdfFileWriter(object): class PdfFileWriter(object):
def __init__(self,title=u"Unknown",author=u"Unknown"): def __init__(self,title=u"Unknown",author=u"Unknown"):
self.killed = False
self._header = "%PDF-1.3" self._header = "%PDF-1.3"
self._objects = [] # array of indirect objects self._objects = [] # array of indirect objects
@ -162,7 +160,7 @@ class PdfFileWriter(object):
# @param stream An object to write the file to. The object must support # @param stream An object to write the file to. The object must support
# the write method, and the tell method, similar to a file object. # the write method, and the tell method, similar to a file object.
def write(self, stream): def write(self, stream):
import struct, md5 import md5
externalReferenceMap = {} externalReferenceMap = {}
self.stack = [] self.stack = []
@ -209,11 +207,13 @@ class PdfFileWriter(object):
if hasattr(self, "_encrypt"): if hasattr(self, "_encrypt"):
trailer[NameObject("/Encrypt")] = self._encrypt trailer[NameObject("/Encrypt")] = self._encrypt
trailer.writeToStream(stream, None) trailer.writeToStream(stream, None)
# eof # eof
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)) stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
def _sweepIndirectReferences(self, externMap, data): def _sweepIndirectReferences(self, externMap, data):
if self.killed:
raise RuntimeError('Writer killed')
if isinstance(data, DictionaryObject): if isinstance(data, DictionaryObject):
for key, value in data.items(): for key, value in data.items():
origvalue = value origvalue = value
@ -356,8 +356,8 @@ class PdfFileReader(object):
return self.flattenedPages[pageNumber] return self.flattenedPages[pageNumber]
## ##
# Read-only property that accesses the # Read-only property that accesses the
# {@link #PdfFileReader.getNamedDestinations # {@link #PdfFileReader.getNamedDestinations
# getNamedDestinations} function. # getNamedDestinations} function.
# <p> # <p>
# Stability: Added in v1.10, will exist for all future v1.x releases. # Stability: Added in v1.10, will exist for all future v1.x releases.
@ -374,7 +374,7 @@ class PdfFileReader(object):
if retval == None: if retval == None:
retval = {} retval = {}
catalog = self.trailer["/Root"] catalog = self.trailer["/Root"]
# get the name tree # get the name tree
if catalog.has_key("/Dests"): if catalog.has_key("/Dests"):
tree = catalog["/Dests"] tree = catalog["/Dests"]
@ -382,7 +382,7 @@ class PdfFileReader(object):
names = catalog['/Names'] names = catalog['/Names']
if names.has_key("/Dests"): if names.has_key("/Dests"):
tree = names['/Dests'] tree = names['/Dests']
if tree == None: if tree == None:
return retval return retval
@ -420,17 +420,17 @@ class PdfFileReader(object):
if outlines == None: if outlines == None:
outlines = [] outlines = []
catalog = self.trailer["/Root"] catalog = self.trailer["/Root"]
# get the outline dictionary and named destinations # get the outline dictionary and named destinations
if catalog.has_key("/Outlines"): if catalog.has_key("/Outlines"):
lines = catalog["/Outlines"] lines = catalog["/Outlines"]
if lines.has_key("/First"): if lines.has_key("/First"):
node = lines["/First"] node = lines["/First"]
self._namedDests = self.getNamedDestinations() self._namedDests = self.getNamedDestinations()
if node == None: if node == None:
return outlines return outlines
# see if there are any more outlines # see if there are any more outlines
while 1: while 1:
outline = self._buildOutline(node) outline = self._buildOutline(node)
@ -454,10 +454,10 @@ class PdfFileReader(object):
page, typ = array[0:2] page, typ = array[0:2]
array = array[2:] array = array[2:]
return Destination(title, page, typ, *array) return Destination(title, page, typ, *array)
def _buildOutline(self, node): def _buildOutline(self, node):
dest, title, outline = None, None, None dest, title, outline = None, None, None
if node.has_key("/A") and node.has_key("/Title"): if node.has_key("/A") and node.has_key("/Title"):
# Action, section 8.5 (only type GoTo supported) # Action, section 8.5 (only type GoTo supported)
title = node["/Title"] title = node["/Title"]
@ -951,7 +951,7 @@ class PageObject(DictionaryObject):
def _pushPopGS(contents, pdf): def _pushPopGS(contents, pdf):
# adds a graphics state "push" and "pop" to the beginning and end # adds a graphics state "push" and "pop" to the beginning and end
# of a content stream. This isolates it from changes such as # of a content stream. This isolates it from changes such as
# transformation matricies. # transformation matricies.
stream = ContentStream(contents, pdf) stream = ContentStream(contents, pdf)
stream.operations.insert(0, [[], "q"]) stream.operations.insert(0, [[], "q"])
@ -1291,7 +1291,7 @@ class Destination(DictionaryObject):
self[NameObject("/Title")] = title self[NameObject("/Title")] = title
self[NameObject("/Page")] = page self[NameObject("/Page")] = page
self[NameObject("/Type")] = typ self[NameObject("/Type")] = typ
# from table 8.2 of the PDF 1.6 reference. # from table 8.2 of the PDF 1.6 reference.
if typ == "/XYZ": if typ == "/XYZ":
(self[NameObject("/Left")], self[NameObject("/Top")], (self[NameObject("/Left")], self[NameObject("/Top")],
@ -1307,7 +1307,7 @@ class Destination(DictionaryObject):
pass pass
else: else:
raise utils.PdfReadError("Unknown Destination Type: %r" % typ) raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
## ##
# Read-only property accessing the destination title. # Read-only property accessing the destination title.
# @return A string. # @return A string.
@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
# described in Algorithm 3.2. # described in Algorithm 3.2.
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
# 2. Initialize the MD5 hash function and pass the 32-byte padding string # 2. Initialize the MD5 hash function and pass the 32-byte padding string
# shown in step 1 of Algorithm 3.2 as input to this function. # shown in step 1 of Algorithm 3.2 as input to this function.
import md5 import md5
m = md5.new() m = md5.new()
m.update(_encryption_padding) m.update(_encryption_padding)
# 3. Pass the first element of the file's file identifier array (the value # 3. Pass the first element of the file's file identifier array (the value
# of the ID entry in the document's trailer dictionary; see Table 3.13 on # of the ID entry in the document's trailer dictionary; see Table 3.13 on
# page 73) to the hash function and finish the hash. (See implementation # page 73) to the hash function and finish the hash. (See implementation
# note 25 in Appendix H.) # note 25 in Appendix H.)
m.update(id1_entry) m.update(id1_entry)
md5_hash = m.digest() md5_hash = m.digest()
# 4. Encrypt the 16-byte result of the hash, using an RC4 encryption # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
# function with the encryption key from step 1. # function with the encryption key from step 1.
val = utils.RC4_encrypt(key, md5_hash) val = utils.RC4_encrypt(key, md5_hash)
# 5. Do the following 19 times: Take the output from the previous # 5. Do the following 19 times: Take the output from the previous
# invocation of the RC4 function and pass it as input to a new invocation # invocation of the RC4 function and pass it as input to a new invocation
# of the function; use an encryption key generated by taking each byte of # of the function; use an encryption key generated by taking each byte of
# the original encryption key (obtained in step 2) and performing an XOR # the original encryption key (obtained in step 2) and performing an XOR
# operation between that byte and the single-byte value of the iteration # operation between that byte and the single-byte value of the iteration
# counter (from 1 to 19). # counter (from 1 to 19).
for i in range(1, 20): for i in range(1, 20):
new_key = '' new_key = ''
for l in range(len(key)): for l in range(len(key)):
@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
val = utils.RC4_encrypt(new_key, val) val = utils.RC4_encrypt(new_key, val)
# 6. Append 16 bytes of arbitrary padding to the output from the final # 6. Append 16 bytes of arbitrary padding to the output from the final
# invocation of the RC4 function and store the 32-byte result as the value # invocation of the RC4 function and store the 32-byte result as the value
# of the U entry in the encryption dictionary. # of the U entry in the encryption dictionary.
# (implementator note: I don't know what "arbitrary padding" is supposed to # (implementator note: I don't know what "arbitrary padding" is supposed to
# mean, so I have used null bytes. This seems to match a few other # mean, so I have used null bytes. This seems to match a few other
# people's implementations) # people's implementations)