Pull from trunk

This commit is contained in:
Kovid Goyal 2009-04-18 01:03:52 -07:00
commit 7dd20f593b
9 changed files with 145 additions and 50 deletions

View File

@ -4,7 +4,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files''' '''Read meta information from PDF files'''
import sys, os, StringIO import sys, os, cStringIO
from threading import Thread
from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ebooks.metadata import MetaInformation, authors_to_string
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
@ -49,20 +50,42 @@ def get_metadata(stream, extract_cover=True):
print >>sys.stderr, msg.encode('utf8') print >>sys.stderr, msg.encode('utf8')
return mi return mi
class MetadataWriter(Thread):
    '''
    Background thread that asks a PDF writer object to serialise itself
    into a buffer, so that the caller can bound the time it waits.

    :param out_pdf: object whose ``write(buf)`` method produces the output
    :param buf: object passed unchanged to ``out_pdf.write``
    '''

    def __init__(self, out_pdf, buf):
        Thread.__init__(self)
        # Daemon thread: a write that never finishes must not block
        # interpreter exit.
        self.daemon = True
        self.out_pdf, self.buf = out_pdf, buf

    def run(self):
        # RuntimeError is ignored on purpose: it is what the PDF writer
        # raises ('Writer killed') once its ``killed`` flag has been set.
        try:
            self.out_pdf.write(self.buf)
        except RuntimeError:
            pass
def set_metadata(stream, mi): def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
# Use a StringIO object for the pdf because we will want to over # Use a StringIO object for the pdf because we will want to over
# write it later and if we are working on the stream directly it # write it later and if we are working on the stream directly it
# could cause some issues. # could cause some issues.
raw = StringIO.StringIO(stream.read()) raw = cStringIO.StringIO(stream.read())
orig_pdf = PdfFileReader(raw) orig_pdf = PdfFileReader(raw)
title = mi.title if mi.title else orig_pdf.documentInfo.title title = mi.title if mi.title else orig_pdf.documentInfo.title
author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
out_pdf = PdfFileWriter(title=title, author=author) out_pdf = PdfFileWriter(title=title, author=author)
out_str = cStringIO.StringIO()
writer = MetadataWriter(out_pdf, out_str)
for page in orig_pdf.pages: for page in orig_pdf.pages:
out_pdf.addPage(page) out_pdf.addPage(page)
out_str = StringIO.StringIO() writer.start()
out_pdf.write(out_str) writer.join(10) # Wait 10 secs for writing to complete
out_pdf.killed = True
writer.join()
if out_pdf.killed:
print 'Failed to set metadata: took too long'
return
stream.seek(0) stream.seek(0)
stream.truncate() stream.truncate()
out_str.seek(0) out_str.seek(0)
@ -70,7 +93,7 @@ def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
def get_cover(stream): def get_cover(stream):
data = StringIO.StringIO() data = cStringIO.StringIO()
try: try:
pdf = PdfFileReader(stream) pdf = PdfFileReader(stream)
@ -99,3 +122,4 @@ def get_cover(stream):
traceback.print_exc() traceback.print_exc()
return data.getvalue() return data.getvalue()

View File

@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.cover_changed = True self.cover_changed = True
def initialize_series(self): def initialize_series(self):
self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow)
all_series = self.db.all_series() all_series = self.db.all_series()
all_series.sort(cmp=lambda x, y : cmp(x[1], y[1])) all_series.sort(cmp=lambda x, y : cmp(x[1], y[1]))
series_id = self.db.series_id(self.row) series_id = self.db.series_id(self.row)
@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.series.setCurrentIndex(idx) self.series.setCurrentIndex(idx)
self.enable_series_index() self.enable_series_index()
pl = self.series.parentWidget().layout()
for i in range(pl.count()):
l = pl.itemAt(i).layout()
if l:
l.invalidate()
l.activate()
def initialize_series_and_publisher(self): def initialize_series_and_publisher(self):
self.initialize_series() self.initialize_series()
all_publishers = self.db.all_publishers() all_publishers = self.db.all_publishers()

Binary file not shown.

After

Width:  |  Height:  |  Size: 509 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 746 B

View File

@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in (
'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet', 'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en', 'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna', 'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
'seattle_times',
)] )]
import re, imp, inspect, time, os import re, imp, inspect, time, os

View File

@ -1,14 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper ''' ''' http://www.derstandard.at - Austrian Newspaper '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe): class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard' title = u'derStandard'
__author__ = 'Gerhard Aigner' __author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'utf-8'
language = _('German')
recursions = 0
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'), feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'), (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
@ -20,17 +43,13 @@ class DerStandardRecipe(BasicNewsRecipe):
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'), (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'), (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')] (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
encoding = 'utf-8'
language = _('German')
recursions = 0
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'), remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')] dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '') (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
] ]
def print_version(self, url): def print_version(self, url):
return url.replace('?id=', 'txt/?id=') return url.replace('?id=', 'txt/?id=')
@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe):
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0): if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None return None
return article.link return article.link
def preprocess_html(self, soup):
    '''
    Stamp the recipe language on the parsed page and declare its charset.

    Sets both the ``xml:lang`` and ``lang`` attributes of the ``<html>``
    element to ``self.lang`` and inserts a Content-Type declaration built
    from ``self.encoding`` at position 0 of ``<head>``.

    :param soup: parsed document exposing ``html`` and ``head``
    :return: the same ``soup`` object, modified in place
    '''
    for attr in ('xml:lang', 'lang'):
        soup.html[attr] = self.lang
    charset_meta = ''.join((
        '<meta http-equiv="Content-Type" content="text/html; charset=',
        self.encoding,
        '">',
    ))
    soup.head.insert(0, charset_meta)
    return soup

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
seattletimes.nwsource.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class SeattleTimes(BasicNewsRecipe):
    '''
    Recipe for The Seattle Times, driven by a single RSS feed.

    Article links are rewritten to the site's PrintStory.pl print view, and
    fetched pages get a Content-Language declaration and lose their inline
    ``style`` attributes.
    '''

    # Descriptive metadata; also reused for the conversion options below
    title = 'The Seattle Times'
    __author__ = 'Darko Miletic'
    description = 'News from Seattle and USA'
    publisher = 'The Seattle Times'
    category = 'news, politics, USA'

    # Fetch settings
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    language = _('English')

    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')]

    remove_tags = [
        dict(name=['object', 'link', 'script']),
        dict(name='p', attrs={'class': 'permission'}),
    ]

    def print_version(self, url):
        '''
        Return the print-view URL for the article at ``url``.

        The document id is whatever follows the last '/' in the part of
        ``url`` that precedes its last '_'.
        '''
        before_underscore = url.rpartition('_')[0]
        article_id = before_underscore.rpartition('/')[2]
        return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id

    def preprocess_html(self, soup):
        '''
        Insert a Content-Language declaration at position 0 of ``<head>``
        and delete the ``style`` attribute from every element that has one.
        Returns the same ``soup`` object.
        '''
        soup.head.insert(0, '<meta http-equiv="Content-Language" content="en-US"/>')
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

View File

@ -299,7 +299,7 @@ def readStringFromStream(stream):
elif tok == "t": elif tok == "t":
tok = "\t" tok = "\t"
elif tok == "b": elif tok == "b":
tok == "\b" tok = "\b"
elif tok == "f": elif tok == "f":
tok = "\f" tok = "\f"
elif tok == "(": elif tok == "(":
@ -673,7 +673,7 @@ class RectangleObject(ArrayObject):
def getUpperLeft_x(self): def getUpperLeft_x(self):
return self.getLowerLeft_x() return self.getLowerLeft_x()
def getUpperLeft_y(self): def getUpperLeft_y(self):
return self.getUpperRight_y() return self.getUpperRight_y()

View File

@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net" __author_email__ = "biziqe@mathieu.fenniak.net"
import struct import struct
try: from cStringIO import StringIO
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import filters from generic import DictionaryObject, NameObject, NumberObject, \
import utils createStringObject, ArrayObject, ByteStringObject, StreamObject, \
import warnings IndirectObject, utils, readObject, TextStringObject, BooleanObject, \
from generic import * RectangleObject, DecodedStreamObject
from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt
# class (typically {@link #PdfFileReader PdfFileReader}). # class (typically {@link #PdfFileReader PdfFileReader}).
class PdfFileWriter(object): class PdfFileWriter(object):
def __init__(self,title=u"Unknown",author=u"Unknown"): def __init__(self,title=u"Unknown",author=u"Unknown"):
self.killed = False
self._header = "%PDF-1.3" self._header = "%PDF-1.3"
self._objects = [] # array of indirect objects self._objects = [] # array of indirect objects
@ -162,7 +160,7 @@ class PdfFileWriter(object):
# @param stream An object to write the file to. The object must support # @param stream An object to write the file to. The object must support
# the write method, and the tell method, similar to a file object. # the write method, and the tell method, similar to a file object.
def write(self, stream): def write(self, stream):
import struct, md5 import md5
externalReferenceMap = {} externalReferenceMap = {}
self.stack = [] self.stack = []
@ -209,11 +207,13 @@ class PdfFileWriter(object):
if hasattr(self, "_encrypt"): if hasattr(self, "_encrypt"):
trailer[NameObject("/Encrypt")] = self._encrypt trailer[NameObject("/Encrypt")] = self._encrypt
trailer.writeToStream(stream, None) trailer.writeToStream(stream, None)
# eof # eof
stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)) stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))
def _sweepIndirectReferences(self, externMap, data): def _sweepIndirectReferences(self, externMap, data):
if self.killed:
raise RuntimeError('Writer killed')
if isinstance(data, DictionaryObject): if isinstance(data, DictionaryObject):
for key, value in data.items(): for key, value in data.items():
origvalue = value origvalue = value
@ -356,8 +356,8 @@ class PdfFileReader(object):
return self.flattenedPages[pageNumber] return self.flattenedPages[pageNumber]
## ##
# Read-only property that accesses the # Read-only property that accesses the
# {@link #PdfFileReader.getNamedDestinations # {@link #PdfFileReader.getNamedDestinations
# getNamedDestinations} function. # getNamedDestinations} function.
# <p> # <p>
# Stability: Added in v1.10, will exist for all future v1.x releases. # Stability: Added in v1.10, will exist for all future v1.x releases.
@ -374,7 +374,7 @@ class PdfFileReader(object):
if retval == None: if retval == None:
retval = {} retval = {}
catalog = self.trailer["/Root"] catalog = self.trailer["/Root"]
# get the name tree # get the name tree
if catalog.has_key("/Dests"): if catalog.has_key("/Dests"):
tree = catalog["/Dests"] tree = catalog["/Dests"]
@ -382,7 +382,7 @@ class PdfFileReader(object):
names = catalog['/Names'] names = catalog['/Names']
if names.has_key("/Dests"): if names.has_key("/Dests"):
tree = names['/Dests'] tree = names['/Dests']
if tree == None: if tree == None:
return retval return retval
@ -420,17 +420,17 @@ class PdfFileReader(object):
if outlines == None: if outlines == None:
outlines = [] outlines = []
catalog = self.trailer["/Root"] catalog = self.trailer["/Root"]
# get the outline dictionary and named destinations # get the outline dictionary and named destinations
if catalog.has_key("/Outlines"): if catalog.has_key("/Outlines"):
lines = catalog["/Outlines"] lines = catalog["/Outlines"]
if lines.has_key("/First"): if lines.has_key("/First"):
node = lines["/First"] node = lines["/First"]
self._namedDests = self.getNamedDestinations() self._namedDests = self.getNamedDestinations()
if node == None: if node == None:
return outlines return outlines
# see if there are any more outlines # see if there are any more outlines
while 1: while 1:
outline = self._buildOutline(node) outline = self._buildOutline(node)
@ -454,10 +454,10 @@ class PdfFileReader(object):
page, typ = array[0:2] page, typ = array[0:2]
array = array[2:] array = array[2:]
return Destination(title, page, typ, *array) return Destination(title, page, typ, *array)
def _buildOutline(self, node): def _buildOutline(self, node):
dest, title, outline = None, None, None dest, title, outline = None, None, None
if node.has_key("/A") and node.has_key("/Title"): if node.has_key("/A") and node.has_key("/Title"):
# Action, section 8.5 (only type GoTo supported) # Action, section 8.5 (only type GoTo supported)
title = node["/Title"] title = node["/Title"]
@ -951,7 +951,7 @@ class PageObject(DictionaryObject):
def _pushPopGS(contents, pdf): def _pushPopGS(contents, pdf):
# adds a graphics state "push" and "pop" to the beginning and end # adds a graphics state "push" and "pop" to the beginning and end
# of a content stream. This isolates it from changes such as # of a content stream. This isolates it from changes such as
# transformation matricies. # transformation matricies.
stream = ContentStream(contents, pdf) stream = ContentStream(contents, pdf)
stream.operations.insert(0, [[], "q"]) stream.operations.insert(0, [[], "q"])
@ -1291,7 +1291,7 @@ class Destination(DictionaryObject):
self[NameObject("/Title")] = title self[NameObject("/Title")] = title
self[NameObject("/Page")] = page self[NameObject("/Page")] = page
self[NameObject("/Type")] = typ self[NameObject("/Type")] = typ
# from table 8.2 of the PDF 1.6 reference. # from table 8.2 of the PDF 1.6 reference.
if typ == "/XYZ": if typ == "/XYZ":
(self[NameObject("/Left")], self[NameObject("/Top")], (self[NameObject("/Left")], self[NameObject("/Top")],
@ -1307,7 +1307,7 @@ class Destination(DictionaryObject):
pass pass
else: else:
raise utils.PdfReadError("Unknown Destination Type: %r" % typ) raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
## ##
# Read-only property accessing the destination title. # Read-only property accessing the destination title.
# @return A string. # @return A string.
@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
# described in Algorithm 3.2. # described in Algorithm 3.2.
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
# 2. Initialize the MD5 hash function and pass the 32-byte padding string # 2. Initialize the MD5 hash function and pass the 32-byte padding string
# shown in step 1 of Algorithm 3.2 as input to this function. # shown in step 1 of Algorithm 3.2 as input to this function.
import md5 import md5
m = md5.new() m = md5.new()
m.update(_encryption_padding) m.update(_encryption_padding)
# 3. Pass the first element of the file's file identifier array (the value # 3. Pass the first element of the file's file identifier array (the value
# of the ID entry in the document's trailer dictionary; see Table 3.13 on # of the ID entry in the document's trailer dictionary; see Table 3.13 on
# page 73) to the hash function and finish the hash. (See implementation # page 73) to the hash function and finish the hash. (See implementation
# note 25 in Appendix H.) # note 25 in Appendix H.)
m.update(id1_entry) m.update(id1_entry)
md5_hash = m.digest() md5_hash = m.digest()
# 4. Encrypt the 16-byte result of the hash, using an RC4 encryption # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
# function with the encryption key from step 1. # function with the encryption key from step 1.
val = utils.RC4_encrypt(key, md5_hash) val = utils.RC4_encrypt(key, md5_hash)
# 5. Do the following 19 times: Take the output from the previous # 5. Do the following 19 times: Take the output from the previous
# invocation of the RC4 function and pass it as input to a new invocation # invocation of the RC4 function and pass it as input to a new invocation
# of the function; use an encryption key generated by taking each byte of # of the function; use an encryption key generated by taking each byte of
# the original encryption key (obtained in step 2) and performing an XOR # the original encryption key (obtained in step 2) and performing an XOR
# operation between that byte and the single-byte value of the iteration # operation between that byte and the single-byte value of the iteration
# counter (from 1 to 19). # counter (from 1 to 19).
for i in range(1, 20): for i in range(1, 20):
new_key = '' new_key = ''
for l in range(len(key)): for l in range(len(key)):
@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
val = utils.RC4_encrypt(new_key, val) val = utils.RC4_encrypt(new_key, val)
# 6. Append 16 bytes of arbitrary padding to the output from the final # 6. Append 16 bytes of arbitrary padding to the output from the final
# invocation of the RC4 function and store the 32-byte result as the value # invocation of the RC4 function and store the 32-byte result as the value
# of the U entry in the encryption dictionary. # of the U entry in the encryption dictionary.
# (implementator note: I don't know what "arbitrary padding" is supposed to # (implementator note: I don't know what "arbitrary padding" is supposed to
# mean, so I have used null bytes. This seems to match a few other # mean, so I have used null bytes. This seems to match a few other
# people's implementations) # people's implementations)