Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-08 18:54:09 -04:00)

Commit b104286f61: pdf get_cover returns cover image instead of nothing.
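The headline change is to get_cover() in src/calibre/ebooks/metadata/pdf.py: it now renders the first page of the PDF to JPEG via ImageMagick and returns the image bytes instead of nothing. A minimal caller sketch, assuming a local file named book.pdf (hypothetical, not part of this commit):

    # Hypothetical caller of the fixed get_cover(); 'book.pdf' is a placeholder path.
    from calibre.ebooks.metadata.pdf import get_cover

    stream = open('book.pdf', 'rb')
    jpeg_bytes = get_cover(stream)   # now returns JPEG data for page 1, or '' on failure
    stream.close()
    if jpeg_bytes:
        out = open('cover.jpg', 'wb')
        out.write(jpeg_bytes)
        out.close()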
@ -263,14 +263,14 @@ class MOBIMetadataWriter(MetadataWriterPlugin):
|
||||
def set_metadata(self, stream, mi, type):
|
||||
from calibre.ebooks.metadata.mobi import set_metadata
|
||||
set_metadata(stream, mi)
|
||||
|
||||
|
||||
class PDFMetadataWriter(MetadataWriterPlugin):
|
||||
|
||||
name = 'Set PDF metadata'
|
||||
file_types = set(['pdf'])
|
||||
description = _('Set metadata in %s files') % 'PDF'
|
||||
author = 'John Schember'
|
||||
|
||||
|
||||
def set_metadata(self, stream, mi, type):
|
||||
from calibre.ebooks.metadata.pdf import set_metadata
|
||||
set_metadata(stream, mi)
|
||||
@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput
|
||||
from calibre.ebooks.mobi.input import MOBIInput
|
||||
from calibre.ebooks.pdf.input import PDFInput
|
||||
from calibre.ebooks.txt.input import TXTInput
|
||||
from calibre.ebooks.lit.input import LITInput
|
||||
from calibre.ebooks.html.input import HTMLInput
|
||||
from calibre.ebooks.oeb.output import OEBOutput
|
||||
from calibre.ebooks.txt.output import TXTOutput
|
||||
@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput
|
||||
from calibre.customize.profiles import input_profiles, output_profiles
|
||||
|
||||
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
|
||||
TXTInput, OEBOutput, TXTOutput, PDFOutput]
|
||||
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
x.__name__.endswith('MetadataReader')]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
|
@ -41,6 +41,11 @@ class ConversionOption(object):
|
||||
def __eq__(self, other):
|
||||
return hash(self) == hash(other)
|
||||
|
||||
def clone(self):
|
||||
return ConversionOption(name=self.name, help=self.help,
|
||||
long_switch=self.long_switch, short_switch=self.short_switch,
|
||||
choices=self.choices)
|
||||
|
||||
class OptionRecommendation(object):
|
||||
LOW = 1
|
||||
MED = 2
|
||||
@ -59,6 +64,10 @@ class OptionRecommendation(object):
|
||||
|
||||
self.validate_parameters()
|
||||
|
||||
def clone(self):
|
||||
return OptionRecommendation(recommended_value=self.recommended_value,
|
||||
level=self.level, option=self.option.clone())
|
||||
|
||||
def validate_parameters(self):
|
||||
if self.option.choices and self.recommended_value not in \
|
||||
self.option.choices:
|
||||
@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin):
|
||||
options.debug_input = os.path.abspath(options.debug_input)
|
||||
if not os.path.exists(options.debug_input):
|
||||
os.makedirs(options.debug_input)
|
||||
shutil.rmtree(options.debug_input)
|
||||
shutil.copytree(output_dir, options.debug_input)
|
||||
if isinstance(ret, basestring):
|
||||
shutil.rmtree(options.debug_input)
|
||||
shutil.copytree(output_dir, options.debug_input)
|
||||
else:
|
||||
from calibre.ebooks.oeb.writer import OEBWriter
|
||||
w = OEBWriter(pretty_print=options.pretty_print)
|
||||
w(ret, options.debug_input)
|
||||
|
||||
log.info('Input debug saved to:', options.debug_input)
|
||||
|
||||
return ret
|
||||
|
@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log):
|
||||
raise SystemExit(1)
|
||||
|
||||
output = args[2]
|
||||
if output.startswith('.'):
|
||||
if output.startswith('.') and output != '.':
|
||||
output = os.path.splitext(os.path.basename(input))[0]+output
|
||||
output = os.path.abspath(output)
|
||||
|
||||
@ -171,7 +171,8 @@ def main(args=sys.argv):
|
||||
|
||||
plumber.run()
|
||||
|
||||
log(_('Output saved to'), ' ', plumber.output)
|
||||
if plumber.opts.debug_input is None:
|
||||
log(_('Output saved to'), ' ', plumber.output)
|
||||
|
||||
return 0
|
||||
|
||||
|
@ -32,8 +32,8 @@ class Plumber(object):
|
||||
:param input: Path to input file.
|
||||
:param output: Path to output file/directory
|
||||
'''
|
||||
self.input = input
|
||||
self.output = output
|
||||
self.input = os.path.abspath(input)
|
||||
self.output = os.path.abspath(output)
|
||||
self.log = log
|
||||
|
||||
# Initialize the conversion options that are independent of input and
|
||||
@ -188,15 +188,15 @@ OptionRecommendation(name='language',
|
||||
]
|
||||
|
||||
|
||||
input_fmt = os.path.splitext(input)[1]
|
||||
input_fmt = os.path.splitext(self.input)[1]
|
||||
if not input_fmt:
|
||||
raise ValueError('Input file must have an extension')
|
||||
input_fmt = input_fmt[1:].lower()
|
||||
|
||||
if os.path.exists(output) and os.path.isdir(output):
|
||||
if os.path.exists(self.output) and os.path.isdir(self.output):
|
||||
output_fmt = 'oeb'
|
||||
else:
|
||||
output_fmt = os.path.splitext(output)[1]
|
||||
output_fmt = os.path.splitext(self.output)[1]
|
||||
if not output_fmt:
|
||||
output_fmt = '.oeb'
|
||||
output_fmt = output_fmt[1:].lower()
|
||||
@ -323,6 +323,9 @@ OptionRecommendation(name='language',
|
||||
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
|
||||
self.input_fmt, self.log,
|
||||
accelerators, tdir)
|
||||
if self.opts.debug_input is not None:
|
||||
self.log('Debug input called, aborting the rest of the pipeline.')
|
||||
return
|
||||
if not hasattr(self.oeb, 'manifest'):
|
||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts)
|
||||
|
||||
@ -365,18 +368,20 @@ OptionRecommendation(name='language',
|
||||
self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
|
||||
self.opts, self.log)
|
||||
|
||||
def create_oebbook(log, opfpath, opts):
|
||||
def create_oebbook(log, path_or_stream, opts, reader=None):
|
||||
'''
|
||||
Create an OEBBook from an OPF file.
|
||||
Create an OEBBook.
|
||||
'''
|
||||
from calibre.ebooks.oeb.reader import OEBReader
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor()
|
||||
reader = OEBReader()
|
||||
oeb = OEBBook(log, html_preprocessor=html_preprocessor,
|
||||
pretty_print=opts.pretty_print)
|
||||
# Read OEB Book into OEBBook
|
||||
log.info('Parsing all content...')
|
||||
reader(oeb, opfpath)
|
||||
log('Parsing all content...')
|
||||
if reader is None:
|
||||
from calibre.ebooks.oeb.reader import OEBReader
|
||||
reader = OEBReader
|
||||
|
||||
reader()(oeb, path_or_stream)
|
||||
return oeb
|
||||
|
||||
|
@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin):
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='dont_package',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Normally this input plugin re-arranges all the input '
|
||||
'files into a standard folder hierarchy. Only use this option '
|
||||
'if you know what you are doing as it can result in various '
|
||||
'nasty side effects in the rest of the conversion pipeline.'
|
||||
)
|
||||
),
|
||||
])
|
||||
|
||||
def convert(self, stream, opts, file_ext, log,
|
||||
@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin):
|
||||
mi.render(open('metadata.opf', 'wb'))
|
||||
opfpath = os.path.abspath('metadata.opf')
|
||||
|
||||
if opts.dont_package:
|
||||
return opfpath
|
||||
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
oeb = create_oebbook(log, opfpath, opts)
|
||||
|
||||
|
src/calibre/ebooks/lit/input.py (new file, 24 lines)
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
class LITInput(InputFormatPlugin):
|
||||
|
||||
name = 'LIT Input'
|
||||
author = 'Marshall T. Vandegrift'
|
||||
description = 'Convert LIT files to HTML'
|
||||
file_types = set(['lit'])
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.lit.reader import LitReader
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
return create_oebbook(log, stream, options, reader=LitReader)
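The reader argument added to create_oebbook() in the plumber (see the conversion.plumber hunk above) is a class, not an instance: the plumber calls reader()(oeb, path_or_stream) and falls back to OEBReader when nothing is passed, which is how this new LIT plugin hands in LitReader. A minimal sketch of a standalone caller that relies on the default reader, assuming an OPF file on disk and a bare options object (hypothetical, not part of this commit):

    # Hypothetical standalone use of create_oebbook() with the default OEBReader.
    from calibre.utils.logging import Log
    from calibre.ebooks.conversion.plumber import create_oebbook

    class Opts(object):
        pretty_print = False   # the only option create_oebbook() reads in this sketch

    oeb = create_oebbook(Log(), 'metadata.opf', Opts())   # 'metadata.opf' is a placeholder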
|
||||
|
||||
|
@ -7,13 +7,12 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
|
||||
'and Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
import sys, struct, os
|
||||
import struct, os
|
||||
import functools
|
||||
import re
|
||||
from urlparse import urldefrag
|
||||
from cStringIO import StringIO
|
||||
from urllib import unquote as urlunquote
|
||||
from lxml import etree
|
||||
from calibre.ebooks.lit import LitError
|
||||
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
|
||||
import calibre.ebooks.lit.mssha1 as mssha1
|
||||
@ -29,12 +28,12 @@ __all__ = ["LitReader"]
|
||||
XML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
|
||||
"""
|
||||
OPF_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!DOCTYPE package
|
||||
<!DOCTYPE package
|
||||
PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
|
||||
"http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd">
|
||||
"""
|
||||
HTML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!DOCTYPE html PUBLIC
|
||||
<!DOCTYPE html PUBLIC
|
||||
"+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Document//EN"
|
||||
"http://openebook.org/dtds/oeb-1.0.1/oebdoc101.dtd">
|
||||
"""
|
||||
@ -73,7 +72,7 @@ def encint(bytes, remaining):
|
||||
val <<= 7
|
||||
val |= (b & 0x7f)
|
||||
if b & 0x80 == 0: break
|
||||
return val, bytes[pos:], remaining
|
||||
return val, bytes[pos:], remaining
|
||||
|
||||
def msguid(bytes):
|
||||
values = struct.unpack("<LHHBBBBBBBB", bytes[:16])
|
||||
@ -123,7 +122,7 @@ class UnBinary(object):
|
||||
CLOSE_ANGLE_RE = re.compile(r'(?<!--)>>(?=>>|[^>])')
|
||||
DOUBLE_ANGLE_RE = re.compile(r'([<>])\1')
|
||||
EMPTY_ATOMS = ({},{})
|
||||
|
||||
|
||||
def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS):
|
||||
self.manifest = manifest
|
||||
self.tag_map, self.attr_map, self.tag_to_attr_map = map
|
||||
@ -143,7 +142,7 @@ class UnBinary(object):
|
||||
raw = self.CLOSE_ANGLE_RE.sub(r'>', raw)
|
||||
raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw)
|
||||
self.raw = raw
|
||||
|
||||
|
||||
def item_path(self, internal_id):
|
||||
try:
|
||||
target = self.manifest[internal_id].path
|
||||
@ -159,7 +158,7 @@ class UnBinary(object):
|
||||
index += 1
|
||||
relpath = (['..'] * (len(base) - index)) + target[index:]
|
||||
return '/'.join(relpath)
|
||||
|
||||
|
||||
def __unicode__(self):
|
||||
return self.raw.decode('utf-8')
|
||||
|
||||
@ -172,11 +171,11 @@ class UnBinary(object):
|
||||
in_censorship = is_goingdown = False
|
||||
state = 'text'
|
||||
flags = 0
|
||||
|
||||
|
||||
while index < len(bin):
|
||||
c, index = read_utf8_char(bin, index)
|
||||
oc = ord(c)
|
||||
|
||||
|
||||
if state == 'text':
|
||||
if oc == 0:
|
||||
state = 'get flags'
|
||||
@ -188,14 +187,14 @@ class UnBinary(object):
|
||||
elif c == '<':
|
||||
c = '<<'
|
||||
buf.write(encode(c))
|
||||
|
||||
|
||||
elif state == 'get flags':
|
||||
if oc == 0:
|
||||
state = 'text'
|
||||
continue
|
||||
flags = oc
|
||||
state = 'get tag'
|
||||
|
||||
|
||||
elif state == 'get tag':
|
||||
state = 'text' if oc == 0 else 'get attr'
|
||||
if flags & FLAG_OPENING:
|
||||
@ -226,7 +225,7 @@ class UnBinary(object):
|
||||
if depth == 0:
|
||||
raise LitError('Extra closing tag')
|
||||
return index
|
||||
|
||||
|
||||
elif state == 'get attr':
|
||||
in_censorship = False
|
||||
if oc == 0:
|
||||
@ -265,7 +264,7 @@ class UnBinary(object):
|
||||
state = 'get href length'
|
||||
else:
|
||||
state = 'get value length'
|
||||
|
||||
|
||||
elif state == 'get value length':
|
||||
if not in_censorship:
|
||||
buf.write('"')
|
||||
@ -281,7 +280,7 @@ class UnBinary(object):
|
||||
continue
|
||||
if count < 0 or count > (len(bin) - index):
|
||||
raise LitError('Invalid character count %d' % count)
|
||||
|
||||
|
||||
elif state == 'get value':
|
||||
if count == 0xfffe:
|
||||
if not in_censorship:
|
||||
@ -301,7 +300,7 @@ class UnBinary(object):
|
||||
buf.write('"')
|
||||
in_censorship = False
|
||||
state = 'get attr'
|
||||
|
||||
|
||||
elif state == 'get custom length':
|
||||
count = oc - 1
|
||||
if count <= 0 or count > len(bin)-index:
|
||||
@ -309,21 +308,21 @@ class UnBinary(object):
|
||||
dynamic_tag += 1
|
||||
state = 'get custom'
|
||||
tag_name = ''
|
||||
|
||||
|
||||
elif state == 'get custom':
|
||||
tag_name += c
|
||||
count -= 1
|
||||
if count == 0:
|
||||
buf.write(encode(tag_name))
|
||||
state = 'get attr'
|
||||
|
||||
|
||||
elif state == 'get attr length':
|
||||
count = oc - 1
|
||||
if count <= 0 or count > (len(bin) - index):
|
||||
raise LitError('Invalid character count %d' % count)
|
||||
buf.write(' ')
|
||||
state = 'get custom attr'
|
||||
|
||||
|
||||
elif state == 'get custom attr':
|
||||
buf.write(encode(c))
|
||||
count -= 1
|
||||
@ -337,7 +336,7 @@ class UnBinary(object):
|
||||
raise LitError('Invalid character count %d' % count)
|
||||
href = ''
|
||||
state = 'get href'
|
||||
|
||||
|
||||
elif state == 'get href':
|
||||
href += c
|
||||
count -= 1
|
||||
@ -350,7 +349,7 @@ class UnBinary(object):
|
||||
buf.write(encode(u'"%s"' % path))
|
||||
state = 'get attr'
|
||||
return index
|
||||
|
||||
|
||||
|
||||
class DirectoryEntry(object):
|
||||
def __init__(self, name, section, offset, size):
|
||||
@ -358,11 +357,11 @@ class DirectoryEntry(object):
|
||||
self.section = section
|
||||
self.offset = offset
|
||||
self.size = size
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return "DirectoryEntry(name=%s, section=%d, offset=%d, size=%d)" \
|
||||
% (repr(self.name), self.section, self.offset, self.size)
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return repr(self)
|
||||
|
||||
@ -382,12 +381,12 @@ class ManifestItem(object):
|
||||
path = os.path.normpath(path).replace('\\', '/')
|
||||
while path.startswith('../'): path = path[3:]
|
||||
self.path = path
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
if hasattr(other, 'internal'):
|
||||
return self.internal == other.internal
|
||||
return self.internal == other
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return "ManifestItem(internal=%r, path=%r, mime_type=%r, " \
|
||||
"offset=%d, root=%r, state=%r)" \
|
||||
@ -404,7 +403,7 @@ def preserve(function):
|
||||
self.stream.seek(opos)
|
||||
functools.update_wrapper(wrapper, function)
|
||||
return wrapper
|
||||
|
||||
|
||||
class LitFile(object):
|
||||
PIECE_SIZE = 16
|
||||
|
||||
@ -438,14 +437,14 @@ class LitFile(object):
|
||||
return self.stream.read(8)
|
||||
return property(fget=fget)
|
||||
magic = magic()
|
||||
|
||||
|
||||
def version():
|
||||
def fget(self):
|
||||
self.stream.seek(8)
|
||||
return u32(self.stream.read(4))
|
||||
return property(fget=fget)
|
||||
version = version()
|
||||
|
||||
|
||||
def hdr_len():
|
||||
@preserve
|
||||
def fget(self):
|
||||
@ -453,7 +452,7 @@ class LitFile(object):
|
||||
return int32(self.stream.read(4))
|
||||
return property(fget=fget)
|
||||
hdr_len = hdr_len()
|
||||
|
||||
|
||||
def num_pieces():
|
||||
@preserve
|
||||
def fget(self):
|
||||
@ -461,7 +460,7 @@ class LitFile(object):
|
||||
return int32(self.stream.read(4))
|
||||
return property(fget=fget)
|
||||
num_pieces = num_pieces()
|
||||
|
||||
|
||||
def sec_hdr_len():
|
||||
@preserve
|
||||
def fget(self):
|
||||
@ -469,7 +468,7 @@ class LitFile(object):
|
||||
return int32(self.stream.read(4))
|
||||
return property(fget=fget)
|
||||
sec_hdr_len = sec_hdr_len()
|
||||
|
||||
|
||||
def guid():
|
||||
@preserve
|
||||
def fget(self):
|
||||
@ -477,7 +476,7 @@ class LitFile(object):
|
||||
return self.stream.read(16)
|
||||
return property(fget=fget)
|
||||
guid = guid()
|
||||
|
||||
|
||||
def header():
|
||||
@preserve
|
||||
def fget(self):
|
||||
@ -488,7 +487,7 @@ class LitFile(object):
|
||||
return self.stream.read(size)
|
||||
return property(fget=fget)
|
||||
header = header()
|
||||
|
||||
|
||||
@preserve
|
||||
def __len__(self):
|
||||
self.stream.seek(0, 2)
|
||||
@ -501,7 +500,7 @@ class LitFile(object):
|
||||
|
||||
def read_content(self, offset, size):
|
||||
return self.read_raw(self.content_offset + offset, size)
|
||||
|
||||
|
||||
def read_secondary_header(self):
|
||||
offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
|
||||
bytes = self.read_raw(offset, self.sec_hdr_len)
|
||||
@ -526,12 +525,12 @@ class LitFile(object):
|
||||
if u32(bytes[offset+4+16:]):
|
||||
raise LitError('This file has a 64bit content offset')
|
||||
self.content_offset = u32(bytes[offset+16:])
|
||||
self.timestamp = u32(bytes[offset+24:])
|
||||
self.timestamp = u32(bytes[offset+24:])
|
||||
self.language_id = u32(bytes[offset+28:])
|
||||
offset += 48
|
||||
if not hasattr(self, 'content_offset'):
|
||||
raise LitError('Could not figure out the content offset')
|
||||
|
||||
|
||||
def read_header_pieces(self):
|
||||
src = self.header[self.hdr_len:]
|
||||
for i in xrange(self.num_pieces):
|
||||
@ -556,7 +555,7 @@ class LitFile(object):
|
||||
self.piece3_guid = piece
|
||||
elif i == 4:
|
||||
self.piece4_guid = piece
|
||||
|
||||
|
||||
def read_directory(self, piece):
|
||||
if not piece.startswith('IFCM'):
|
||||
raise LitError('Header piece #1 is not main directory.')
|
||||
@ -760,9 +759,9 @@ class LitFile(object):
|
||||
raise LitError("Reset table is too short")
|
||||
if u32(reset_table[RESET_UCLENGTH + 4:]) != 0:
|
||||
raise LitError("Reset table has 64bit value for UCLENGTH")
|
||||
|
||||
|
||||
result = []
|
||||
|
||||
|
||||
window_size = 14
|
||||
u = u32(control[CONTROL_WINDOW_SIZE:])
|
||||
while u > 0:
|
||||
@ -847,13 +846,13 @@ class LitContainer(object):
|
||||
|
||||
def __init__(self, filename_or_stream):
|
||||
self._litfile = LitFile(filename_or_stream)
|
||||
|
||||
|
||||
def namelist(self):
|
||||
return self._litfile.paths.keys()
|
||||
|
||||
def exists(self, name):
|
||||
return urlunquote(name) in self._litfile.paths
|
||||
|
||||
|
||||
def read(self, name):
|
||||
entry = self._litfile.paths[urlunquote(name)] if name else None
|
||||
if entry is None:
|
||||
@ -869,7 +868,7 @@ class LitContainer(object):
|
||||
internal = '/'.join(('/data', entry.internal))
|
||||
content = self._litfile.get_file(internal)
|
||||
return content
|
||||
|
||||
|
||||
def _read_meta(self):
|
||||
path = 'content.opf'
|
||||
raw = self._litfile.get_file('/meta')
|
||||
|
@ -1,10 +1,10 @@
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''Read meta information from PDF files'''
|
||||
|
||||
import sys, os, cStringIO
|
||||
from threading import Thread
|
||||
|
||||
from calibre import FileWrapper
|
||||
from calibre.ebooks.metadata import MetaInformation, authors_to_string
|
||||
@ -13,7 +13,8 @@ from pyPdf import PdfFileReader, PdfFileWriter
|
||||
import Image
|
||||
try:
|
||||
from calibre.utils.PythonMagickWand import \
|
||||
NewMagickWand, MagickReadImage, MagickSetImageFormat, MagickWriteImage
|
||||
NewMagickWand, MagickReadImage, MagickSetImageFormat, \
|
||||
MagickWriteImage, ImageMagick
|
||||
_imagemagick_loaded = True
|
||||
except:
|
||||
_imagemagick_loaded = False
|
||||
@ -51,9 +52,23 @@ def get_metadata(stream, extract_cover=True):
|
||||
print >>sys.stderr, msg.encode('utf8')
|
||||
return mi
|
||||
|
||||
class MetadataWriter(Thread):
|
||||
|
||||
def __init__(self, out_pdf, buf):
|
||||
self.out_pdf = out_pdf
|
||||
self.buf = buf
|
||||
Thread.__init__(self)
|
||||
self.daemon = True
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
self.out_pdf.write(self.buf)
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
def set_metadata(stream, mi):
|
||||
stream.seek(0)
|
||||
# Use a cStringIO object for the pdf because we will want to over
|
||||
# Use a StringIO object for the pdf because we will want to over
|
||||
# write it later and if we are working on the stream directly it
|
||||
# could cause some issues.
|
||||
raw = cStringIO.StringIO(stream.read())
|
||||
@ -61,10 +76,18 @@ def set_metadata(stream, mi):
|
||||
title = mi.title if mi.title else orig_pdf.documentInfo.title
|
||||
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
|
||||
out_pdf = PdfFileWriter(title=title, author=author)
|
||||
out_str = cStringIO.StringIO()
|
||||
writer = MetadataWriter(out_pdf, out_str)
|
||||
for page in orig_pdf.pages:
|
||||
out_pdf.addPage(page)
|
||||
out_str = cStringIO.StringIO()
|
||||
out_pdf.write(out_str)
|
||||
writer.start()
|
||||
writer.join(10) # Wait 10 secs for writing to complete
|
||||
out_pdf.killed = True
|
||||
writer.join()
|
||||
if out_pdf.killed:
|
||||
print 'Failed to set metadata: took too long'
|
||||
return
|
||||
|
||||
stream.seek(0)
|
||||
stream.truncate()
|
||||
out_str.seek(0)
|
||||
@ -72,35 +95,32 @@ def set_metadata(stream, mi):
|
||||
stream.seek(0)
|
||||
|
||||
def get_cover(stream):
|
||||
stream.seek(0)
|
||||
|
||||
data = cStringIO.StringIO()
|
||||
|
||||
try:
|
||||
with FileWrapper(stream) as stream:
|
||||
pdf = PdfFileReader(stream)
|
||||
output = PdfFileWriter()
|
||||
|
||||
if len(pdf.pages) >= 1:
|
||||
output.addPage(pdf.getPage(0))
|
||||
|
||||
with TemporaryDirectory('_pdfmeta') as tdir:
|
||||
cover_path = os.path.join(tdir, 'cover.pdf')
|
||||
|
||||
outputStream = file(cover_path, "wb")
|
||||
pdf = PdfFileReader(stream)
|
||||
output = PdfFileWriter()
|
||||
|
||||
if len(pdf.pages) >= 1:
|
||||
output.addPage(pdf.getPage(0))
|
||||
|
||||
with TemporaryDirectory('_pdfmeta') as tdir:
|
||||
cover_path = os.path.join(tdir, 'cover.pdf')
|
||||
|
||||
with open(cover_path, "wb") as outputStream:
|
||||
output.write(outputStream)
|
||||
outputStream.close()
|
||||
|
||||
|
||||
with ImageMagick():
|
||||
wand = NewMagickWand()
|
||||
MagickReadImage(wand, cover_path)
|
||||
MagickSetImageFormat(wand, 'JPEG')
|
||||
MagickWriteImage(wand, '%s.jpg' % cover_path)
|
||||
|
||||
|
||||
img = Image.open('%s.jpg' % cover_path)
|
||||
|
||||
img.save(data, 'JPEG')
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
return data.getvalue()
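The MetadataWriter thread added to set_metadata() above is a watchdog for pyPdf: the write runs on a daemon thread, the caller waits at most 10 seconds, then sets out_pdf.killed so that _sweepIndirectReferences() (see the pyPdf hunk further down) raises RuntimeError and aborts the write. A self-contained sketch of the same cooperative-kill pattern, not calibre code:

    import threading
    import time

    class KillableTask(threading.Thread):
        # Runs work() on a daemon thread; work() polls self.killed cooperatively.
        def __init__(self):
            threading.Thread.__init__(self)
            self.daemon = True
            self.killed = False

        def run(self):
            try:
                self.work()
            except RuntimeError:
                pass   # the task noticed self.killed and bailed out

        def work(self):
            for _ in range(100):        # stand-in for out_pdf.write(buf)
                if self.killed:
                    raise RuntimeError('killed')
                time.sleep(0.1)

    task = KillableTask()
    task.start()
    task.join(10)        # like writer.join(10): wait up to 10 seconds
    task.killed = True   # ask a still-running task to stop
    task.join()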
|
||||
|
||||
|
@ -272,11 +272,7 @@ def XPath(expr):
|
||||
def xpath(elem, expr):
|
||||
return elem.xpath(expr, namespaces=XPNSMAP)
|
||||
|
||||
def _prepare_xml_for_serialization(root):
|
||||
pass
|
||||
|
||||
def xml2str(root, pretty_print=False, strip_comments=False):
|
||||
_prepare_xml_for_serialization(root)
|
||||
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
|
||||
pretty_print=pretty_print)
|
||||
|
||||
@ -287,7 +283,6 @@ def xml2str(root, pretty_print=False, strip_comments=False):
|
||||
|
||||
|
||||
def xml2unicode(root, pretty_print=False):
|
||||
_prepare_xml_for_serialization(root)
|
||||
return etree.tostring(root, pretty_print=pretty_print)
|
||||
|
||||
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
||||
@ -321,6 +316,25 @@ def urlnormalize(href):
|
||||
parts = (urlquote(part) for part in parts)
|
||||
return urlunparse(parts)
|
||||
|
||||
class DummyHandler(logging.Handler):
|
||||
|
||||
def __init__(self):
|
||||
logging.Handler.__init__(self, logging.WARNING)
|
||||
self.setFormatter(logging.Formatter('%(message)s'))
|
||||
self.log = None
|
||||
|
||||
def emit(self, record):
|
||||
if self.log is not None:
|
||||
msg = self.format(record)
|
||||
f = self.log.error if record.levelno >= logging.ERROR \
|
||||
else self.log.warn
|
||||
f(msg)
|
||||
|
||||
|
||||
_css_logger = logging.getLogger('calibre.css')
|
||||
_css_logger.setLevel(logging.WARNING)
|
||||
_css_log_handler = DummyHandler()
|
||||
_css_logger.addHandler(_css_log_handler)
|
||||
|
||||
class OEBError(Exception):
|
||||
"""Generic OEB-processing error."""
|
||||
@ -778,7 +792,8 @@ class Manifest(object):
|
||||
data = self.oeb.css_preprocessor(data)
|
||||
data = XHTML_CSS_NAMESPACE + data
|
||||
parser = CSSParser(loglevel=logging.WARNING,
|
||||
fetcher=self._fetch_css)
|
||||
fetcher=self._fetch_css,
|
||||
log=_css_logger)
|
||||
data = parser.parseString(data, href=self.href)
|
||||
data.namespaces['h'] = XHTML_NS
|
||||
return data
|
||||
@ -1435,7 +1450,7 @@ class OEBBook(object):
|
||||
:attr:`pages`: List of "pages," such as indexed to a print edition of
|
||||
the same text.
|
||||
"""
|
||||
|
||||
_css_log_handler.log = logger
|
||||
self.encoding = encoding
|
||||
self.html_preprocessor = html_preprocessor
|
||||
self.css_preprocessor = css_preprocessor
|
||||
@ -1450,6 +1465,7 @@ class OEBBook(object):
|
||||
self.guide = Guide(self)
|
||||
self.toc = TOC()
|
||||
self.pages = PageList()
|
||||
self.auto_generated_toc = True
|
||||
|
||||
@classmethod
|
||||
def generate(cls, opts):
|
||||
|
@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase
|
||||
|
||||
from calibre.customize.ui import available_input_formats
|
||||
from calibre.ebooks.epub.from_html import TITLEPAGE
|
||||
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.zipfile import safe_replace, ZipFile
|
||||
from calibre.utils.config import DynamicConfig
|
||||
from calibre.utils.logging import Log
|
||||
from calibre import CurrentDir
|
||||
|
||||
def character_count(html):
|
||||
'''
|
||||
@ -57,31 +56,21 @@ class FakeOpts(object):
|
||||
max_levels = 5
|
||||
input_encoding = None
|
||||
|
||||
def html2opf(path, tdir, log):
|
||||
from calibre.ebooks.html.input import get_filelist
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
with CurrentDir(tdir):
|
||||
fl = get_filelist(path, tdir, FakeOpts(), log)
|
||||
mi = get_metadata(open(path, 'rb'), 'html')
|
||||
mi = OPFCreator(os.getcwdu(), mi)
|
||||
mi.guide = None
|
||||
entries = [(f.path, 'application/xhtml+xml') for f in fl]
|
||||
mi.create_manifest(entries)
|
||||
mi.create_spine([f.path for f in fl])
|
||||
|
||||
mi.render(open('metadata.opf', 'wb'))
|
||||
opfpath = os.path.abspath('metadata.opf')
|
||||
|
||||
return opfpath
|
||||
|
||||
def opf2opf(path, tdir, opts):
|
||||
return path
|
||||
|
||||
def is_supported(path):
|
||||
ext = os.path.splitext(path)[1].replace('.', '').lower()
|
||||
ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
|
||||
return ext in available_input_formats()
|
||||
|
||||
|
||||
def write_oebbook(oeb, path):
|
||||
from calibre.ebooks.oeb.writer import OEBWriter
|
||||
from calibre import walk
|
||||
w = OEBWriter()
|
||||
w(oeb, path)
|
||||
for f in walk(path):
|
||||
if f.endswith('.opf'):
|
||||
return f
|
||||
|
||||
class EbookIterator(object):
|
||||
|
||||
CHARACTERS_PER_PAGE = 1000
|
||||
@ -131,17 +120,16 @@ class EbookIterator(object):
|
||||
def __enter__(self):
|
||||
self._tdir = TemporaryDirectory('_ebook_iter')
|
||||
self.base = self._tdir.__enter__()
|
||||
if self.ebook_ext == 'opf':
|
||||
self.pathtoopf = self.pathtoebook
|
||||
elif self.ebook_ext == 'html':
|
||||
self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log)
|
||||
else:
|
||||
from calibre.ebooks.conversion.plumber import Plumber
|
||||
plumber = Plumber(self.pathtoebook, self.base, self.log)
|
||||
plumber.setup_options()
|
||||
self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
|
||||
plumber.opts, plumber.input_fmt, self.log,
|
||||
{}, self.base)
|
||||
from calibre.ebooks.conversion.plumber import Plumber
|
||||
plumber = Plumber(self.pathtoebook, self.base, self.log)
|
||||
plumber.setup_options()
|
||||
if hasattr(plumber.opts, 'dont_package'):
|
||||
plumber.opts.dont_package = True
|
||||
self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
|
||||
plumber.opts, plumber.input_fmt, self.log,
|
||||
{}, self.base)
|
||||
if hasattr(self.pathtoopf, 'manifest'):
|
||||
self.pathtoopf = write_oebbook(self.pathtoebook, self._tdir)
|
||||
|
||||
|
||||
self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
|
||||
|
@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin):
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'oeb'
|
||||
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
self.log, self.opts = log, opts
|
||||
if not os.path.exists(output_path):
|
||||
|
@ -349,6 +349,7 @@ class OEBReader(object):
|
||||
def _toc_from_ncx(self, item):
|
||||
if item is None:
|
||||
return False
|
||||
self.log.debug('Reading TOC from NCX...')
|
||||
ncx = item.data
|
||||
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
|
||||
title = COLLAPSE_RE.sub(' ', title.strip())
|
||||
@ -364,6 +365,7 @@ class OEBReader(object):
|
||||
result = xpath(opf, 'o2:tours/o2:tour')
|
||||
if not result:
|
||||
return False
|
||||
self.log.debug('Reading TOC from tour...')
|
||||
tour = result[0]
|
||||
toc = self.oeb.toc
|
||||
toc.title = tour.get('title')
|
||||
@ -384,6 +386,7 @@ class OEBReader(object):
|
||||
def _toc_from_html(self, opf):
|
||||
if 'toc' not in self.oeb.guide:
|
||||
return False
|
||||
self.log.debug('Reading TOC from HTML...')
|
||||
itempath, frag = urldefrag(self.oeb.guide['toc'].href)
|
||||
item = self.oeb.manifest.hrefs[itempath]
|
||||
html = item.data
|
||||
@ -414,6 +417,7 @@ class OEBReader(object):
|
||||
return True
|
||||
|
||||
def _toc_from_spine(self, opf):
|
||||
self.log.warn('Generating default TOC from spine...')
|
||||
toc = self.oeb.toc
|
||||
titles = []
|
||||
headers = []
|
||||
@ -441,11 +445,14 @@ class OEBReader(object):
|
||||
return True
|
||||
|
||||
def _toc_from_opf(self, opf, item):
|
||||
self.oeb.auto_generated_toc = False
|
||||
if self._toc_from_ncx(item): return
|
||||
if self._toc_from_tour(opf): return
|
||||
self.logger.warn('No metadata table of contents found')
|
||||
# Prefer HTML to tour based TOC, since several LIT files
|
||||
# have good HTML TOCs but bad tour based TOCs
|
||||
if self._toc_from_html(opf): return
|
||||
if self._toc_from_tour(opf): return
|
||||
self._toc_from_spine(opf)
|
||||
self.oeb.auto_generated_toc = True
|
||||
|
||||
def _pages_from_ncx(self, opf, item):
|
||||
if item is None:
|
||||
|
@ -51,8 +51,8 @@ class Split(object):
|
||||
self.log = oeb.log
|
||||
self.map = {}
|
||||
self.page_break_selectors = None
|
||||
for item in self.oeb.manifest.items:
|
||||
if etree.iselement(item.data):
|
||||
for item in list(self.oeb.manifest.items):
|
||||
if item.spine_position is not None and etree.iselement(item.data):
|
||||
self.split_item(item)
|
||||
|
||||
self.fix_links()
|
||||
@ -74,31 +74,34 @@ class Split(object):
|
||||
self.page_break_selectors = set([])
|
||||
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
|
||||
OEB_STYLES]
|
||||
page_break_selectors = set([])
|
||||
for rule in rules(stylesheets):
|
||||
before = getattr(rule.style.getPropertyCSSValue(
|
||||
'page-break-before'), 'cssText', '').strip().lower()
|
||||
after = getattr(rule.style.getPropertyCSSValue(
|
||||
'page-break-after'), 'cssText', '').strip().lower()
|
||||
try:
|
||||
if before and before != 'avoid':
|
||||
page_break_selectors.add((CSSSelector(rule.selectorText),
|
||||
True))
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
if after and after != 'avoid':
|
||||
page_break_selectors.add((CSSSelector(rule.selectorText),
|
||||
False))
|
||||
except:
|
||||
pass
|
||||
for rule in rules(stylesheets):
|
||||
before = getattr(rule.style.getPropertyCSSValue(
|
||||
'page-break-before'), 'cssText', '').strip().lower()
|
||||
after = getattr(rule.style.getPropertyCSSValue(
|
||||
'page-break-after'), 'cssText', '').strip().lower()
|
||||
try:
|
||||
if before and before != 'avoid':
|
||||
self.page_break_selectors.add((CSSSelector(rule.selectorText),
|
||||
True))
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
if after and after != 'avoid':
|
||||
self.page_break_selectors.add((CSSSelector(rule.selectorText),
|
||||
False))
|
||||
except:
|
||||
pass
|
||||
|
||||
page_breaks = set([])
|
||||
for selector, before in page_break_selectors:
|
||||
for elem in selector(item.data):
|
||||
if before:
|
||||
elem.set('pb_before', '1')
|
||||
page_breaks.add(elem)
|
||||
for selector, before in self.page_break_selectors:
|
||||
body = item.data.xpath('//h:body', namespaces=NAMESPACES)
|
||||
if not body:
|
||||
continue
|
||||
for elem in selector(body[0]):
|
||||
if elem not in body:
|
||||
if before:
|
||||
elem.set('pb_before', '1')
|
||||
page_breaks.add(elem)
|
||||
|
||||
for i, elem in enumerate(item.data.iter()):
|
||||
elem.set('pb_order', str(i))
|
||||
@ -136,8 +139,10 @@ class Split(object):
|
||||
if href in self.map:
|
||||
anchor_map = self.map[href]
|
||||
nhref = anchor_map[frag if frag else None]
|
||||
nhref = self.current_item.relhref(nhref)
|
||||
if frag:
|
||||
nhref = '#'.join(href, frag)
|
||||
nhref = '#'.join((nhref, frag))
|
||||
|
||||
return nhref
|
||||
return url
|
||||
|
||||
@ -153,7 +158,7 @@ class FlowSplitter(object):
|
||||
self.page_breaks = page_breaks
|
||||
self.page_break_ids = page_break_ids
|
||||
self.max_flow_size = max_flow_size
|
||||
self.base = item.abshref(item.href)
|
||||
self.base = item.href
|
||||
|
||||
base, ext = os.path.splitext(self.base)
|
||||
self.base = base.replace('%', '%%')+'_split_%d'+ext
|
||||
@ -192,9 +197,9 @@ class FlowSplitter(object):
|
||||
self.trees = []
|
||||
tree = orig_tree
|
||||
for pattern, before in ordered_ids:
|
||||
self.log.debug('\t\tSplitting on page-break')
|
||||
elem = pattern(tree)
|
||||
if elem:
|
||||
self.log.debug('\t\tSplitting on page-break')
|
||||
before, after = self.do_split(tree, elem[0], before)
|
||||
self.trees.append(before)
|
||||
tree = after
|
||||
@ -414,13 +419,14 @@ class FlowSplitter(object):
|
||||
elem.attrib.pop(SPLIT_ATTR, None)
|
||||
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
||||
|
||||
spine_pos = self.item.spine_pos
|
||||
for current, tree in zip(map(reversed, (self.files, self.trees))):
|
||||
spine_pos = self.item.spine_position
|
||||
for current, tree in zip(*map(reversed, (self.files, self.trees))):
|
||||
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
|
||||
href = a.get('href').strip()
|
||||
if href.startswith('#'):
|
||||
anchor = href[1:]
|
||||
file = self.anchor_map[anchor]
|
||||
file = self.item.relhref(file)
|
||||
if file != current:
|
||||
a.set('href', file+href)
|
||||
|
||||
@ -430,12 +436,12 @@ class FlowSplitter(object):
|
||||
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
|
||||
|
||||
if self.oeb.guide:
|
||||
for ref in self.oeb.guide:
|
||||
for ref in self.oeb.guide.values():
|
||||
href, frag = urldefrag(ref.href)
|
||||
if href == self.item.href:
|
||||
nhref = self.anchor_map[frag if frag else None]
|
||||
if frag:
|
||||
nhref = '#'.join(nhref, frag)
|
||||
nhref = '#'.join((nhref, frag))
|
||||
ref.href = nhref
|
||||
|
||||
def fix_toc_entry(toc):
|
||||
@ -444,7 +450,7 @@ class FlowSplitter(object):
|
||||
if href == self.item.href:
|
||||
nhref = self.anchor_map[frag if frag else None]
|
||||
if frag:
|
||||
nhref = '#'.join(nhref, frag)
|
||||
nhref = '#'.join((nhref, frag))
|
||||
toc.href = nhref
|
||||
for x in toc:
|
||||
fix_toc_entry(x)
|
||||
|
@ -49,7 +49,7 @@ class OEBWriter(object):
|
||||
|
||||
def __call__(self, oeb, path):
|
||||
"""
|
||||
Read the book in the :class:`OEBBook` object :param:`oeb` to a file
|
||||
Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
|
||||
at :param:`path`.
|
||||
"""
|
||||
version = int(self.version[0])
|
||||
|
@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
|
||||
self.cover_changed = True
|
||||
|
||||
def initialize_series(self):
|
||||
self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
|
||||
all_series = self.db.all_series()
|
||||
all_series.sort(cmp=lambda x, y : cmp(x[1], y[1]))
|
||||
series_id = self.db.series_id(self.row)
|
||||
@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
|
||||
self.series.setCurrentIndex(idx)
|
||||
self.enable_series_index()
|
||||
|
||||
pl = self.series.parentWidget().layout()
|
||||
for i in range(pl.count()):
|
||||
l = pl.itemAt(i).layout()
|
||||
if l:
|
||||
l.invalidate()
|
||||
l.activate()
|
||||
|
||||
def initialize_series_and_publisher(self):
|
||||
self.initialize_series()
|
||||
all_publishers = self.db.all_publishers()
|
||||
|
BIN src/calibre/gui2/images/news/der_standard.png (new binary file, 509 B)
BIN src/calibre/gui2/images/news/diepresse.png (new binary file, 637 B)
BIN src/calibre/gui2/images/news/seattle_times.png (new binary file, 746 B)
@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in (
|
||||
'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
|
||||
'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
|
||||
'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
|
||||
'seattle_times',
|
||||
)]
|
||||
|
||||
import re, imp, inspect, time, os
|
||||
|
@ -1,14 +1,37 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
|
||||
|
||||
''' http://www.derstandard.at - Austrian Newspaper '''
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DerStandardRecipe(BasicNewsRecipe):
|
||||
title = u'derStandard'
|
||||
__author__ = 'Gerhard Aigner'
|
||||
|
||||
title = u'derStandard'
|
||||
__author__ = 'Gerhard Aigner'
|
||||
description = u'Nachrichten aus Österreich'
|
||||
publisher ='derStandard.at'
|
||||
category = 'news, politics, nachrichten, Austria'
|
||||
use_embedded_content = False
|
||||
remove_empty_feeds = True
|
||||
lang = 'de-AT'
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
language = _('German')
|
||||
recursions = 0
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment' , description
|
||||
, '--category' , category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
|
||||
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
|
||||
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
|
||||
@ -20,17 +43,13 @@ class DerStandardRecipe(BasicNewsRecipe):
|
||||
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
|
||||
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
|
||||
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
|
||||
|
||||
encoding = 'utf-8'
|
||||
language = _('German')
|
||||
recursions = 0
|
||||
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
|
||||
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
|
||||
]
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('?id=', 'txt/?id=')
|
||||
|
||||
@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe):
|
||||
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
|
||||
return None
|
||||
return article.link
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
soup.html['xml:lang'] = self.lang
|
||||
soup.html['lang'] = self.lang
|
||||
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
|
||||
soup.head.insert(0,mtag)
|
||||
return soup
|
@ -1,18 +1,42 @@
|
||||
import re
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
|
||||
|
||||
''' http://www.diepresse.at - Austrian Newspaper '''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DiePresseRecipe(BasicNewsRecipe):
|
||||
title = u'diePresse'
|
||||
title = u'diePresse'
|
||||
__author__ = 'Gerhard Aigner'
|
||||
description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.'
|
||||
publisher ='DiePresse.com'
|
||||
category = 'news, politics, nachrichten, Austria'
|
||||
use_embedded_content = False
|
||||
remove_empty_feeds = True
|
||||
lang = 'de-AT'
|
||||
no_stylesheets = True
|
||||
encoding = 'ISO-8859-1'
|
||||
language = _('German')
|
||||
recursions = 0
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
recursions = 0
|
||||
language = _('German')
|
||||
__author__ = 'Gerhard Aigner'
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment' , description
|
||||
, '--category' , category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'Textversion', re.DOTALL), lambda match: ''),
|
||||
]
|
||||
|
||||
remove_tags = [dict(name='hr'),
|
||||
dict(name='br'),
|
||||
dict(name='small'),
|
||||
@ -21,6 +45,7 @@ class DiePresseRecipe(BasicNewsRecipe):
|
||||
dict(name='h1', attrs={'class':'titel'}),
|
||||
dict(name='a', attrs={'class':'print'}),
|
||||
dict(name='div', attrs={'class':'hline'})]
|
||||
|
||||
feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
|
||||
(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
|
||||
(u'Europa', u'http://diepresse.com/rss/EU'),
|
||||
@ -29,7 +54,7 @@ class DiePresseRecipe(BasicNewsRecipe):
|
||||
(u'Kultur', u'http://diepresse.com/rss/Kultur'),
|
||||
(u'Leben', u'http://diepresse.com/rss/Leben'),
|
||||
(u'Tech', u'http://diepresse.com/rss/Tech'),
|
||||
(u'Science', u'http://diepresse.com/rss/Science'),
|
||||
(u'Wissenschaft', u'http://diepresse.com/rss/Science'),
|
||||
(u'Bildung', u'http://diepresse.com/rss/Bildung'),
|
||||
(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
|
||||
(u'Recht', u'http://diepresse.com/rss/Recht'),
|
||||
@ -38,3 +63,10 @@ class DiePresseRecipe(BasicNewsRecipe):
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('home','text/home')
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
soup.html['xml:lang'] = self.lang
|
||||
soup.html['lang'] = self.lang
|
||||
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
|
||||
soup.head.insert(0,mtag)
|
||||
return soup
|
src/calibre/web/feeds/recipes/recipe_seattle_times.py (new file, 50 lines)
@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
seattletimes.nwsource.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class SeattleTimes(BasicNewsRecipe):
|
||||
title = 'The Seattle Times'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'News from Seattle and USA'
|
||||
publisher = 'The Seattle Times'
|
||||
category = 'news, politics, USA'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'cp1252'
|
||||
language = _('English')
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment' , description
|
||||
, '--category' , category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link','script'])
|
||||
,dict(name='p', attrs={'class':'permission'})
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
start_url, sep, rest_url = url.rpartition('_')
|
||||
rurl, rsep, article_id = start_url.rpartition('/')
|
||||
return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
mtag = '<meta http-equiv="Content-Language" content="en-US"/>'
|
||||
soup.head.insert(0,mtag)
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
|
@ -299,7 +299,7 @@ def readStringFromStream(stream):
|
||||
elif tok == "t":
|
||||
tok = "\t"
|
||||
elif tok == "b":
|
||||
tok == "\b"
|
||||
tok = "\b"
|
||||
elif tok == "f":
|
||||
tok = "\f"
|
||||
elif tok == "(":
|
||||
@ -673,7 +673,7 @@ class RectangleObject(ArrayObject):
|
||||
|
||||
def getUpperLeft_x(self):
|
||||
return self.getLowerLeft_x()
|
||||
|
||||
|
||||
def getUpperLeft_y(self):
|
||||
return self.getUpperRight_y()
|
||||
|
||||
|
@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
import struct
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from cStringIO import StringIO
|
||||
|
||||
import filters
|
||||
import utils
|
||||
import warnings
|
||||
from generic import *
|
||||
from generic import DictionaryObject, NameObject, NumberObject, \
|
||||
createStringObject, ArrayObject, ByteStringObject, StreamObject, \
|
||||
IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
|
||||
RectangleObject, DecodedStreamObject
|
||||
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
|
||||
|
||||
|
||||
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
|
||||
# class (typically {@link #PdfFileReader PdfFileReader}).
|
||||
class PdfFileWriter(object):
|
||||
def __init__(self,title=u"Unknown",author=u"Unknown"):
|
||||
self.killed = False
|
||||
self._header = "%PDF-1.3"
|
||||
self._objects = [] # array of indirect objects
|
||||
|
||||
@ -162,7 +160,7 @@ class PdfFileWriter(object):
|
||||
# @param stream An object to write the file to. The object must support
|
||||
# the write method, and the tell method, similar to a file object.
|
||||
def write(self, stream):
|
||||
import struct, md5
|
||||
import md5
|
||||
|
||||
externalReferenceMap = {}
|
||||
self.stack = []
|
||||
@ -209,11 +207,13 @@ class PdfFileWriter(object):
|
||||
if hasattr(self, "_encrypt"):
|
||||
trailer[NameObject("/Encrypt")] = self._encrypt
|
||||
trailer.writeToStream(stream, None)
|
||||
|
||||
|
||||
# eof
|
||||
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
|
||||
|
||||
def _sweepIndirectReferences(self, externMap, data):
|
||||
if self.killed:
|
||||
raise RuntimeError('Writer killed')
|
||||
if isinstance(data, DictionaryObject):
|
||||
for key, value in data.items():
|
||||
origvalue = value
|
||||
@ -356,8 +356,8 @@ class PdfFileReader(object):
|
||||
return self.flattenedPages[pageNumber]
|
||||
|
||||
##
|
||||
# Read-only property that accesses the
|
||||
# {@link #PdfFileReader.getNamedDestinations
|
||||
# Read-only property that accesses the
|
||||
# {@link #PdfFileReader.getNamedDestinations
|
||||
# getNamedDestinations} function.
|
||||
# <p>
|
||||
# Stability: Added in v1.10, will exist for all future v1.x releases.
|
||||
@ -374,7 +374,7 @@ class PdfFileReader(object):
|
||||
if retval == None:
|
||||
retval = {}
|
||||
catalog = self.trailer["/Root"]
|
||||
|
||||
|
||||
# get the name tree
|
||||
if catalog.has_key("/Dests"):
|
||||
tree = catalog["/Dests"]
|
||||
@ -382,7 +382,7 @@ class PdfFileReader(object):
|
||||
names = catalog['/Names']
|
||||
if names.has_key("/Dests"):
|
||||
tree = names['/Dests']
|
||||
|
||||
|
||||
if tree == None:
|
||||
return retval
|
||||
|
||||
@ -420,17 +420,17 @@ class PdfFileReader(object):
|
||||
if outlines == None:
|
||||
outlines = []
|
||||
catalog = self.trailer["/Root"]
|
||||
|
||||
|
||||
# get the outline dictionary and named destinations
|
||||
if catalog.has_key("/Outlines"):
|
||||
lines = catalog["/Outlines"]
|
||||
if lines.has_key("/First"):
|
||||
node = lines["/First"]
|
||||
self._namedDests = self.getNamedDestinations()
|
||||
|
||||
|
||||
if node == None:
|
||||
return outlines
|
||||
|
||||
|
||||
# see if there are any more outlines
|
||||
while 1:
|
||||
outline = self._buildOutline(node)
|
||||
@ -454,10 +454,10 @@ class PdfFileReader(object):
|
||||
page, typ = array[0:2]
|
||||
array = array[2:]
|
||||
return Destination(title, page, typ, *array)
|
||||
|
||||
|
||||
def _buildOutline(self, node):
|
||||
dest, title, outline = None, None, None
|
||||
|
||||
|
||||
if node.has_key("/A") and node.has_key("/Title"):
|
||||
# Action, section 8.5 (only type GoTo supported)
|
||||
title = node["/Title"]
|
||||
@ -951,7 +951,7 @@ class PageObject(DictionaryObject):
|
||||
|
||||
def _pushPopGS(contents, pdf):
|
||||
# adds a graphics state "push" and "pop" to the beginning and end
|
||||
# of a content stream. This isolates it from changes such as
|
||||
# of a content stream. This isolates it from changes such as
|
||||
# transformation matricies.
|
||||
stream = ContentStream(contents, pdf)
|
||||
stream.operations.insert(0, [[], "q"])
|
||||
@ -1291,7 +1291,7 @@ class Destination(DictionaryObject):
|
||||
self[NameObject("/Title")] = title
|
||||
self[NameObject("/Page")] = page
|
||||
self[NameObject("/Type")] = typ
|
||||
|
||||
|
||||
# from table 8.2 of the PDF 1.6 reference.
|
||||
if typ == "/XYZ":
|
||||
(self[NameObject("/Left")], self[NameObject("/Top")],
|
||||
@ -1307,7 +1307,7 @@ class Destination(DictionaryObject):
|
||||
pass
|
||||
else:
|
||||
raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
|
||||
|
||||
|
||||
##
|
||||
# Read-only property accessing the destination title.
|
||||
# @return A string.
|
||||
@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
|
||||
# described in Algorithm 3.2.
|
||||
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
|
||||
# 2. Initialize the MD5 hash function and pass the 32-byte padding string
|
||||
# shown in step 1 of Algorithm 3.2 as input to this function.
|
||||
# shown in step 1 of Algorithm 3.2 as input to this function.
|
||||
import md5
|
||||
m = md5.new()
|
||||
m.update(_encryption_padding)
|
||||
# 3. Pass the first element of the file's file identifier array (the value
|
||||
# of the ID entry in the document's trailer dictionary; see Table 3.13 on
|
||||
# page 73) to the hash function and finish the hash. (See implementation
|
||||
# note 25 in Appendix H.)
|
||||
# note 25 in Appendix H.)
|
||||
m.update(id1_entry)
|
||||
md5_hash = m.digest()
|
||||
# 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
|
||||
# function with the encryption key from step 1.
|
||||
# function with the encryption key from step 1.
|
||||
val = utils.RC4_encrypt(key, md5_hash)
|
||||
# 5. Do the following 19 times: Take the output from the previous
|
||||
# invocation of the RC4 function and pass it as input to a new invocation
|
||||
# of the function; use an encryption key generated by taking each byte of
|
||||
# the original encryption key (obtained in step 2) and performing an XOR
|
||||
# operation between that byte and the single-byte value of the iteration
|
||||
# counter (from 1 to 19).
|
||||
# counter (from 1 to 19).
|
||||
for i in range(1, 20):
|
||||
new_key = ''
|
||||
for l in range(len(key)):
|
||||
@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
|
||||
val = utils.RC4_encrypt(new_key, val)
|
||||
# 6. Append 16 bytes of arbitrary padding to the output from the final
|
||||
# invocation of the RC4 function and store the 32-byte result as the value
|
||||
# of the U entry in the encryption dictionary.
|
||||
# of the U entry in the encryption dictionary.
|
||||
# (implementator note: I don't know what "arbitrary padding" is supposed to
|
||||
# mean, so I have used null bytes. This seems to match a few other
|
||||
# people's implementations)
|