pdf get_cover returns cover image instead of nothing.

This commit is contained in:
John Schember 2009-04-18 07:54:56 -04:00
commit b104286f61
24 changed files with 405 additions and 210 deletions

View File

@ -280,6 +280,7 @@ from calibre.ebooks.epub.input import EPUBInput
from calibre.ebooks.mobi.input import MOBIInput
from calibre.ebooks.pdf.input import PDFInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lit.input import LITInput
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.txt.output import TXTOutput
@ -287,7 +288,7 @@ from calibre.ebooks.pdf.output import PDFOutput
from calibre.customize.profiles import input_profiles, output_profiles
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput]
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View File

@ -41,6 +41,11 @@ class ConversionOption(object):
def __eq__(self, other):
return hash(self) == hash(other)
def clone(self):
    '''
    Return a new :class:`ConversionOption` carrying this option's name,
    help text, long/short switches and choices. The attribute values
    themselves are passed through as-is, not copied.
    '''
    attrs = ('name', 'help', 'long_switch', 'short_switch', 'choices')
    kwargs = dict((attr, getattr(self, attr)) for attr in attrs)
    return ConversionOption(**kwargs)
class OptionRecommendation(object):
LOW = 1
MED = 2
@ -59,6 +64,10 @@ class OptionRecommendation(object):
self.validate_parameters()
def clone(self):
    '''
    Return a new :class:`OptionRecommendation` with the same recommended
    value and level, wrapping a clone of the underlying option.
    '''
    value, level = self.recommended_value, self.level
    option_copy = self.option.clone()
    return OptionRecommendation(recommended_value=value, level=level,
            option=option_copy)
def validate_parameters(self):
if self.option.choices and self.recommended_value not in \
self.option.choices:
@ -170,8 +179,14 @@ class InputFormatPlugin(Plugin):
options.debug_input = os.path.abspath(options.debug_input)
if not os.path.exists(options.debug_input):
os.makedirs(options.debug_input)
if isinstance(ret, basestring):
shutil.rmtree(options.debug_input)
shutil.copytree(output_dir, options.debug_input)
else:
from calibre.ebooks.oeb.writer import OEBWriter
w = OEBWriter(pretty_print=options.pretty_print)
w(ret, options.debug_input)
log.info('Input debug saved to:', options.debug_input)
return ret

View File

@ -57,7 +57,7 @@ def check_command_line_options(parser, args, log):
raise SystemExit(1)
output = args[2]
if output.startswith('.'):
if output.startswith('.') and output != '.':
output = os.path.splitext(os.path.basename(input))[0]+output
output = os.path.abspath(output)
@ -171,6 +171,7 @@ def main(args=sys.argv):
plumber.run()
if plumber.opts.debug_input is None:
log(_('Output saved to'), ' ', plumber.output)
return 0

View File

@ -32,8 +32,8 @@ class Plumber(object):
:param input: Path to input file.
:param output: Path to output file/directory
'''
self.input = input
self.output = output
self.input = os.path.abspath(input)
self.output = os.path.abspath(output)
self.log = log
# Initialize the conversion options that are independent of input and
@ -188,15 +188,15 @@ OptionRecommendation(name='language',
]
input_fmt = os.path.splitext(input)[1]
input_fmt = os.path.splitext(self.input)[1]
if not input_fmt:
raise ValueError('Input file must have an extension')
input_fmt = input_fmt[1:].lower()
if os.path.exists(output) and os.path.isdir(output):
if os.path.exists(self.output) and os.path.isdir(self.output):
output_fmt = 'oeb'
else:
output_fmt = os.path.splitext(output)[1]
output_fmt = os.path.splitext(self.output)[1]
if not output_fmt:
output_fmt = '.oeb'
output_fmt = output_fmt[1:].lower()
@ -323,6 +323,9 @@ OptionRecommendation(name='language',
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
self.input_fmt, self.log,
accelerators, tdir)
if self.opts.debug_input is not None:
self.log('Debug input called, aborting the rest of the pipeline.')
return
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts)
@ -365,18 +368,20 @@ OptionRecommendation(name='language',
self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
self.opts, self.log)
def create_oebbook(log, opfpath, opts):
def create_oebbook(log, path_or_stream, opts, reader=None):
'''
Create an OEBBook from an OPF file.
Create an OEBBook.
'''
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor()
reader = OEBReader()
oeb = OEBBook(log, html_preprocessor=html_preprocessor,
pretty_print=opts.pretty_print)
# Read OEB Book into OEBBook
log.info('Parsing all content...')
reader(oeb, opfpath)
log('Parsing all content...')
if reader is None:
from calibre.ebooks.oeb.reader import OEBReader
reader = OEBReader
reader()(oeb, path_or_stream)
return oeb

View File

@ -252,6 +252,14 @@ class HTMLInput(InputFormatPlugin):
)
),
# Opt-out switch: when it is set, convert() returns the path of the
# generated OPF instead of building an OEBBook from it.
OptionRecommendation(name='dont_package',
    recommended_value=False, level=OptionRecommendation.LOW,
    help=_('Normally this input plugin re-arranges all the input '
        'files into a standard folder hierarchy. Only use this option '
        'if you know what you are doing as it can result in various '
        'nasty side effects in the rest of the conversion pipeline.'
    )
),
])
def convert(self, stream, opts, file_ext, log,
@ -276,6 +284,9 @@ class HTMLInput(InputFormatPlugin):
mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf')
if opts.dont_package:
return opfpath
from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, opfpath, opts)

View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
class LITInput(InputFormatPlugin):
    '''Input plugin that turns a LIT file into an OEBBook.'''

    name        = 'LIT Input'
    author      = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types  = set(['lit'])

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Build the book by handing ``stream`` to ``create_oebbook`` with
        :class:`LitReader` as the reader class.

        ``file_ext`` and ``accelerators`` are accepted but not used here.
        '''
        # Both imports stay local to the call, as in the original plugin.
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        book = create_oebbook(log, stream, options, reader=LitReader)
        return book

View File

@ -7,13 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
'and Marshall T. Vandegrift <llasram@gmail.com>'
import sys, struct, os
import struct, os
import functools
import re
from urlparse import urldefrag
from cStringIO import StringIO
from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1

View File

@ -1,10 +1,10 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files'''
import sys, os, cStringIO
from threading import Thread
from calibre import FileWrapper
from calibre.ebooks.metadata import MetaInformation, authors_to_string
@ -13,7 +13,8 @@ from pyPdf import PdfFileReader, PdfFileWriter
import Image
try:
from calibre.utils.PythonMagickWand import \
NewMagickWand, MagickReadImage, MagickSetImageFormat, MagickWriteImage
NewMagickWand, MagickReadImage, MagickSetImageFormat, \
MagickWriteImage, ImageMagick
_imagemagick_loaded = True
except:
_imagemagick_loaded = False
@ -51,9 +52,23 @@ def get_metadata(stream, extract_cover=True):
print >>sys.stderr, msg.encode('utf8')
return mi
class MetadataWriter(Thread):
    '''
    Daemon thread that writes a PDF writer object into a buffer.

    Doing the write on its own thread lets the caller join with a
    timeout rather than block until the write completes.
    '''

    def __init__(self, out_pdf, buf):
        Thread.__init__(self)
        self.daemon = True
        self.out_pdf, self.buf = out_pdf, buf

    def run(self):
        '''Call ``out_pdf.write(buf)``, ignoring a RuntimeError from it.'''
        try:
            self.out_pdf.write(self.buf)
        except RuntimeError:
            # The writer raises RuntimeError once it has been flagged as
            # killed; that simply ends this thread quietly.
            return
def set_metadata(stream, mi):
stream.seek(0)
# Use a cStringIO object for the pdf because we will want to over
# Use a StringIO object for the pdf because we will want to over
# write it later and if we are working on the stream directly it
# could cause some issues.
raw = cStringIO.StringIO(stream.read())
@ -61,10 +76,18 @@ def set_metadata(stream, mi):
title = mi.title if mi.title else orig_pdf.documentInfo.title
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
out_pdf = PdfFileWriter(title=title, author=author)
out_str = cStringIO.StringIO()
writer = MetadataWriter(out_pdf, out_str)
for page in orig_pdf.pages:
out_pdf.addPage(page)
out_str = cStringIO.StringIO()
out_pdf.write(out_str)
writer.start()
writer.join(10) # Wait 10 secs for writing to complete
out_pdf.killed = True
writer.join()
if out_pdf.killed:
print 'Failed to set metadata: took too long'
return
stream.seek(0)
stream.truncate()
out_str.seek(0)
@ -72,12 +95,9 @@ def set_metadata(stream, mi):
stream.seek(0)
def get_cover(stream):
stream.seek(0)
data = cStringIO.StringIO()
try:
with FileWrapper(stream) as stream:
pdf = PdfFileReader(stream)
output = PdfFileWriter()
@ -87,20 +107,20 @@ def get_cover(stream):
with TemporaryDirectory('_pdfmeta') as tdir:
cover_path = os.path.join(tdir, 'cover.pdf')
outputStream = file(cover_path, "wb")
with open(cover_path, "wb") as outputStream:
output.write(outputStream)
outputStream.close()
with ImageMagick():
wand = NewMagickWand()
MagickReadImage(wand, cover_path)
MagickSetImageFormat(wand, 'JPEG')
MagickWriteImage(wand, '%s.jpg' % cover_path)
img = Image.open('%s.jpg' % cover_path)
img.save(data, 'JPEG')
except:
import traceback
traceback.print_exc()
return data.getvalue()

View File

@ -272,11 +272,7 @@ def XPath(expr):
def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP)
def _prepare_xml_for_serialization(root):
pass
def xml2str(root, pretty_print=False, strip_comments=False):
_prepare_xml_for_serialization(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print)
@ -287,7 +283,6 @@ def xml2str(root, pretty_print=False, strip_comments=False):
def xml2unicode(root, pretty_print=False):
_prepare_xml_for_serialization(root)
return etree.tostring(root, pretty_print=pretty_print)
ASCII_CHARS = set(chr(x) for x in xrange(128))
@ -321,6 +316,25 @@ def urlnormalize(href):
parts = (urlquote(part) for part in parts)
return urlunparse(parts)
class DummyHandler(logging.Handler):
    '''
    Logging handler (level WARNING) that relays formatted records to the
    object stored in :attr:`log`. While :attr:`log` is None, records are
    dropped.
    '''

    def __init__(self):
        logging.Handler.__init__(self, logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        # Target log object; assigned later by whoever owns the handler.
        self.log = None

    def emit(self, record):
        '''Send ``record`` to ``log.error`` or ``log.warn`` by severity.'''
        target = self.log
        if target is None:
            return
        text = self.format(record)
        if record.levelno >= logging.ERROR:
            sink = target.error
        else:
            sink = target.warn
        sink(text)
# Dedicated logger handed to the CSS parser; it is set to WARNING level.
_css_logger = logging.getLogger('calibre.css')
_css_logger.setLevel(logging.WARNING)
# Single module-wide handler. OEBBook assigns its own logger to the
# handler's ``log`` attribute so that CSS messages reach the book's log.
_css_log_handler = DummyHandler()
_css_logger.addHandler(_css_log_handler)
class OEBError(Exception):
"""Generic OEB-processing error."""
@ -778,7 +792,8 @@ class Manifest(object):
data = self.oeb.css_preprocessor(data)
data = XHTML_CSS_NAMESPACE + data
parser = CSSParser(loglevel=logging.WARNING,
fetcher=self._fetch_css)
fetcher=self._fetch_css,
log=_css_logger)
data = parser.parseString(data, href=self.href)
data.namespaces['h'] = XHTML_NS
return data
@ -1435,7 +1450,7 @@ class OEBBook(object):
:attr:`pages`: List of "pages," such as indexed to a print edition of
the same text.
"""
_css_log_handler.log = logger
self.encoding = encoding
self.html_preprocessor = html_preprocessor
self.css_preprocessor = css_preprocessor
@ -1450,6 +1465,7 @@ class OEBBook(object):
self.guide = Guide(self)
self.toc = TOC()
self.pages = PageList()
self.auto_generated_toc = True
@classmethod
def generate(cls, opts):

View File

@ -13,13 +13,12 @@ from PyQt4.Qt import QFontDatabase
from calibre.customize.ui import available_input_formats
from calibre.ebooks.epub.from_html import TITLEPAGE
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.zipfile import safe_replace, ZipFile
from calibre.utils.config import DynamicConfig
from calibre.utils.logging import Log
from calibre import CurrentDir
def character_count(html):
'''
@ -57,31 +56,21 @@ class FakeOpts(object):
max_levels = 5
input_encoding = None
def html2opf(path, tdir, log):
from calibre.ebooks.html.input import get_filelist
from calibre.ebooks.metadata.meta import get_metadata
with CurrentDir(tdir):
fl = get_filelist(path, tdir, FakeOpts(), log)
mi = get_metadata(open(path, 'rb'), 'html')
mi = OPFCreator(os.getcwdu(), mi)
mi.guide = None
entries = [(f.path, 'application/xhtml+xml') for f in fl]
mi.create_manifest(entries)
mi.create_spine([f.path for f in fl])
mi.render(open('metadata.opf', 'wb'))
opfpath = os.path.abspath('metadata.opf')
return opfpath
def opf2opf(path, tdir, opts):
return path
def is_supported(path):
    '''
    Return True if the extension of ``path`` is one of the available
    input formats. The extension is lower-cased, stripped of dots, and
    any ``htm``/``html``/``xhtm``/``xhtml`` run in it is rewritten to
    ``html`` before the lookup.
    '''
    suffix = os.path.splitext(path)[1]
    fmt = suffix.replace('.', '').lower()
    fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html', fmt)
    return fmt in available_input_formats()
def write_oebbook(oeb, path):
    '''
    Write ``oeb`` to the folder ``path`` using a default
    :class:`OEBWriter`, then return the first path yielded by ``walk``
    that ends in ``.opf``. Returns None if no such file is found.
    '''
    from calibre.ebooks.oeb.writer import OEBWriter
    from calibre import walk
    writer = OEBWriter()
    writer(oeb, path)
    for candidate in walk(path):
        if not candidate.endswith('.opf'):
            continue
        return candidate
    return None
class EbookIterator(object):
CHARACTERS_PER_PAGE = 1000
@ -131,17 +120,16 @@ class EbookIterator(object):
def __enter__(self):
self._tdir = TemporaryDirectory('_ebook_iter')
self.base = self._tdir.__enter__()
if self.ebook_ext == 'opf':
self.pathtoopf = self.pathtoebook
elif self.ebook_ext == 'html':
self.pathtoopf = html2opf(self.pathtoebook, self.base, self.log)
else:
from calibre.ebooks.conversion.plumber import Plumber
plumber = Plumber(self.pathtoebook, self.base, self.log)
plumber.setup_options()
if hasattr(plumber.opts, 'dont_package'):
plumber.opts.dont_package = True
self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
plumber.opts, plumber.input_fmt, self.log,
{}, self.base)
if hasattr(self.pathtoopf, 'manifest'):
self.pathtoopf = write_oebbook(self.pathtoebook, self._tdir)
self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))

View File

@ -16,7 +16,6 @@ class OEBOutput(OutputFormatPlugin):
author = 'Kovid Goyal'
file_type = 'oeb'
def convert(self, oeb_book, output_path, input_plugin, opts, log):
self.log, self.opts = log, opts
if not os.path.exists(output_path):

View File

@ -349,6 +349,7 @@ class OEBReader(object):
def _toc_from_ncx(self, item):
if item is None:
return False
self.log.debug('Reading TOC from NCX...')
ncx = item.data
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
title = COLLAPSE_RE.sub(' ', title.strip())
@ -364,6 +365,7 @@ class OEBReader(object):
result = xpath(opf, 'o2:tours/o2:tour')
if not result:
return False
self.log.debug('Reading TOC from tour...')
tour = result[0]
toc = self.oeb.toc
toc.title = tour.get('title')
@ -384,6 +386,7 @@ class OEBReader(object):
def _toc_from_html(self, opf):
if 'toc' not in self.oeb.guide:
return False
self.log.debug('Reading TOC from HTML...')
itempath, frag = urldefrag(self.oeb.guide['toc'].href)
item = self.oeb.manifest.hrefs[itempath]
html = item.data
@ -414,6 +417,7 @@ class OEBReader(object):
return True
def _toc_from_spine(self, opf):
self.log.warn('Generating default TOC from spine...')
toc = self.oeb.toc
titles = []
headers = []
@ -441,11 +445,14 @@ class OEBReader(object):
return True
def _toc_from_opf(self, opf, item):
self.oeb.auto_generated_toc = False
if self._toc_from_ncx(item): return
if self._toc_from_tour(opf): return
self.logger.warn('No metadata table of contents found')
# Prefer HTML to tour based TOC, since several LIT files
# have good HTML TOCs but bad tour based TOCs
if self._toc_from_html(opf): return
if self._toc_from_tour(opf): return
self._toc_from_spine(opf)
self.oeb.auto_generated_toc = True
def _pages_from_ncx(self, opf, item):
if item is None:

View File

@ -51,8 +51,8 @@ class Split(object):
self.log = oeb.log
self.map = {}
self.page_break_selectors = None
for item in self.oeb.manifest.items:
if etree.iselement(item.data):
for item in list(self.oeb.manifest.items):
if item.spine_position is not None and etree.iselement(item.data):
self.split_item(item)
self.fix_links()
@ -74,7 +74,6 @@ class Split(object):
self.page_break_selectors = set([])
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
OEB_STYLES]
page_break_selectors = set([])
for rule in rules(stylesheets):
before = getattr(rule.style.getPropertyCSSValue(
'page-break-before'), 'cssText', '').strip().lower()
@ -82,20 +81,24 @@ class Split(object):
'page-break-after'), 'cssText', '').strip().lower()
try:
if before and before != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText),
self.page_break_selectors.add((CSSSelector(rule.selectorText),
True))
except:
pass
try:
if after and after != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText),
self.page_break_selectors.add((CSSSelector(rule.selectorText),
False))
except:
pass
page_breaks = set([])
for selector, before in page_break_selectors:
for elem in selector(item.data):
for selector, before in self.page_break_selectors:
body = item.data.xpath('//h:body', namespaces=NAMESPACES)
if not body:
continue
for elem in selector(body[0]):
if elem not in body:
if before:
elem.set('pb_before', '1')
page_breaks.add(elem)
@ -136,8 +139,10 @@ class Split(object):
if href in self.map:
anchor_map = self.map[href]
nhref = anchor_map[frag if frag else None]
nhref = self.current_item.relhref(nhref)
if frag:
nhref = '#'.join(href, frag)
nhref = '#'.join((nhref, frag))
return nhref
return url
@ -153,7 +158,7 @@ class FlowSplitter(object):
self.page_breaks = page_breaks
self.page_break_ids = page_break_ids
self.max_flow_size = max_flow_size
self.base = item.abshref(item.href)
self.base = item.href
base, ext = os.path.splitext(self.base)
self.base = base.replace('%', '%%')+'_split_%d'+ext
@ -192,9 +197,9 @@ class FlowSplitter(object):
self.trees = []
tree = orig_tree
for pattern, before in ordered_ids:
self.log.debug('\t\tSplitting on page-break')
elem = pattern(tree)
if elem:
self.log.debug('\t\tSplitting on page-break')
before, after = self.do_split(tree, elem[0], before)
self.trees.append(before)
tree = after
@ -414,13 +419,14 @@ class FlowSplitter(object):
elem.attrib.pop(SPLIT_ATTR, None)
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
spine_pos = self.item.spine_pos
for current, tree in zip(map(reversed, (self.files, self.trees))):
spine_pos = self.item.spine_position
for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
href = a.get('href').strip()
if href.startswith('#'):
anchor = href[1:]
file = self.anchor_map[anchor]
file = self.item.relhref(file)
if file != current:
a.set('href', file+href)
@ -430,12 +436,12 @@ class FlowSplitter(object):
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
if self.oeb.guide:
for ref in self.oeb.guide:
for ref in self.oeb.guide.values():
href, frag = urldefrag(ref.href)
if href == self.item.href:
nhref = self.anchor_map[frag if frag else None]
if frag:
nhref = '#'.join(nhref, frag)
nhref = '#'.join((nhref, frag))
ref.href = nhref
def fix_toc_entry(toc):
@ -444,7 +450,7 @@ class FlowSplitter(object):
if href == self.item.href:
nhref = self.anchor_map[frag if frag else None]
if frag:
nhref = '#'.join(nhref, frag)
nhref = '#'.join((nhref, frag))
toc.href = nhref
for x in toc:
fix_toc_entry(x)

View File

@ -49,7 +49,7 @@ class OEBWriter(object):
def __call__(self, oeb, path):
"""
Read the book in the :class:`OEBBook` object :param:`oeb` to a file
Write the book in the :class:`OEBBook` object :param:`oeb` to a folder
at :param:`path`.
"""
version = int(self.version[0])

View File

@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.cover_changed = True
def initialize_series(self):
self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
all_series = self.db.all_series()
all_series.sort(cmp=lambda x, y : cmp(x[1], y[1]))
series_id = self.db.series_id(self.row)
@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.series.setCurrentIndex(idx)
self.enable_series_index()
pl = self.series.parentWidget().layout()
for i in range(pl.count()):
l = pl.itemAt(i).layout()
if l:
l.invalidate()
l.activate()
def initialize_series_and_publisher(self):
self.initialize_series()
all_publishers = self.db.all_publishers()

Binary file not shown.

After

Width:  |  Height:  |  Size: 509 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 637 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 746 B

View File

@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in (
'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
'seattle_times',
)]
import re, imp, inspect, time, os

View File

@ -1,3 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper '''
import re
@ -6,9 +11,27 @@ from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard'
__author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'utf-8'
language = _('German')
recursions = 0
oldest_article = 1
max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
@ -20,14 +43,10 @@ class DerStandardRecipe(BasicNewsRecipe):
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
encoding = 'utf-8'
language = _('German')
recursions = 0
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [
(re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
]
@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe):
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None
return article.link
def preprocess_html(self, soup):
    '''
    Stamp the recipe language on the ``<html>`` element and put a
    Content-Type meta tag naming the recipe encoding first in ``<head>``.
    Returns the same ``soup`` object.
    '''
    for attr in ('xml:lang', 'lang'):
        soup.html[attr] = self.lang
    pieces = ('<meta http-equiv="Content-Type" content="text/html; charset=',
              self.encoding, '">')
    soup.head.insert(0, ''.join(pieces))
    return soup

View File

@ -1,18 +1,42 @@
import re
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.diepresse.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DiePresseRecipe(BasicNewsRecipe):
title = u'diePresse'
__author__ = 'Gerhard Aigner'
description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.'
publisher ='DiePresse.com'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'ISO-8859-1'
language = _('German')
recursions = 0
oldest_article = 1
max_articles_per_feed = 100
recursions = 0
language = _('German')
__author__ = 'Gerhard Aigner'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [
(re.compile(r'Textversion', re.DOTALL), lambda match: ''),
]
remove_tags = [dict(name='hr'),
dict(name='br'),
dict(name='small'),
@ -21,6 +45,7 @@ class DiePresseRecipe(BasicNewsRecipe):
dict(name='h1', attrs={'class':'titel'}),
dict(name='a', attrs={'class':'print'}),
dict(name='div', attrs={'class':'hline'})]
feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
(u'Europa', u'http://diepresse.com/rss/EU'),
@ -29,7 +54,7 @@ class DiePresseRecipe(BasicNewsRecipe):
(u'Kultur', u'http://diepresse.com/rss/Kultur'),
(u'Leben', u'http://diepresse.com/rss/Leben'),
(u'Tech', u'http://diepresse.com/rss/Tech'),
(u'Science', u'http://diepresse.com/rss/Science'),
(u'Wissenschaft', u'http://diepresse.com/rss/Science'),
(u'Bildung', u'http://diepresse.com/rss/Bildung'),
(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
(u'Recht', u'http://diepresse.com/rss/Recht'),
@ -38,3 +63,10 @@ class DiePresseRecipe(BasicNewsRecipe):
def print_version(self, url):
    '''
    Return ``url`` with every occurrence of ``home`` replaced by
    ``text/home``.
    '''
    segments = url.split('home')
    return 'text/home'.join(segments)
def preprocess_html(self, soup):
    '''
    Stamp the recipe language on the ``<html>`` element and put a fixed
    utf-8 Content-Type meta tag first in ``<head>``. Returns the same
    ``soup`` object.
    '''
    for attr in ('xml:lang', 'lang'):
        soup.html[attr] = self.lang
    soup.head.insert(
        0, '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">')
    return soup

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
seattletimes.nwsource.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class SeattleTimes(BasicNewsRecipe):
    '''
    Recipe for The Seattle Times, driven by one RSS feed. Article URLs
    are rewritten to the site's PrintStory pages before download.
    '''

    title                 = 'The Seattle Times'
    __author__            = 'Darko Miletic'
    description           = 'News from Seattle and USA'
    publisher             = 'The Seattle Times'
    category              = 'news, politics, USA'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'cp1252'
    language              = _('English')

    # Description, category and publisher repeated in the two option
    # formats (argument list for html2lrf, key="value" lines for html2epub).
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    feeds = [
        (u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml'),
    ]

    remove_tags = [
        dict(name=['object', 'link', 'script']),
        dict(name='p', attrs={'class': 'permission'}),
    ]

    def print_version(self, url):
        '''
        Build the PrintStory URL for an article. The document id is the
        text after the final ``/`` of whatever precedes the last ``_``
        in ``url``.
        '''
        before_underscore = url.rpartition('_')[0]
        article_id = before_underscore.rpartition('/')[2]
        return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id

    def preprocess_html(self, soup):
        '''
        Put a Content-Language meta tag first in ``<head>`` and remove
        the ``style`` attribute from every element that has one.
        '''
        soup.head.insert(
            0, '<meta http-equiv="Content-Language" content="en-US"/>')
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

View File

@ -299,7 +299,7 @@ def readStringFromStream(stream):
elif tok == "t":
tok = "\t"
elif tok == "b":
tok == "\b"
tok = "\b"
elif tok == "f":
tok = "\f"
elif tok == "(":

View File

@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
import struct
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import filters
import utils
import warnings
from generic import *
from generic import DictionaryObject, NameObject, NumberObject, \
createStringObject, ArrayObject, ByteStringObject, StreamObject, \
IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
RectangleObject, DecodedStreamObject
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
# class (typically {@link #PdfFileReader PdfFileReader}).
class PdfFileWriter(object):
def __init__(self,title=u"Unknown",author=u"Unknown"):
self.killed = False
self._header = "%PDF-1.3"
self._objects = [] # array of indirect objects
@ -162,7 +160,7 @@ class PdfFileWriter(object):
# @param stream An object to write the file to. The object must support
# the write method, and the tell method, similar to a file object.
def write(self, stream):
import struct, md5
import md5
externalReferenceMap = {}
self.stack = []
@ -214,6 +212,8 @@ class PdfFileWriter(object):
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
def _sweepIndirectReferences(self, externMap, data):
if self.killed:
raise RuntimeError('Writer killed')
if isinstance(data, DictionaryObject):
for key, value in data.items():
origvalue = value