diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 4476eb0847..1a664e638d 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -4,7 +4,8 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' '''Read meta information from PDF files''' -import sys, os, StringIO +import sys, os, cStringIO +from threading import Thread from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory @@ -49,20 +50,42 @@ def get_metadata(stream, extract_cover=True): print >>sys.stderr, msg.encode('utf8') return mi +class MetadataWriter(Thread): + + def __init__(self, out_pdf, buf): + self.out_pdf = out_pdf + self.buf = buf + Thread.__init__(self) + self.daemon = True + + def run(self): + try: + self.out_pdf.write(self.buf) + except RuntimeError: + pass + def set_metadata(stream, mi): stream.seek(0) # Use a StringIO object for the pdf because we will want to over # write it later and if we are working on the stream directly it # could cause some issues. - raw = StringIO.StringIO(stream.read()) + raw = cStringIO.StringIO(stream.read()) orig_pdf = PdfFileReader(raw) title = mi.title if mi.title else orig_pdf.documentInfo.title author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author out_pdf = PdfFileWriter(title=title, author=author) + out_str = cStringIO.StringIO() + writer = MetadataWriter(out_pdf, out_str) for page in orig_pdf.pages: out_pdf.addPage(page) - out_str = StringIO.StringIO() - out_pdf.write(out_str) + writer.start() + writer.join(10) # Wait 10 secs for writing to complete + out_pdf.killed = True + writer.join() + if out_pdf.killed: + print 'Failed to set metadata: took too long' + return + stream.seek(0) stream.truncate() out_str.seek(0) @@ -70,7 +93,7 @@ def set_metadata(stream, mi): stream.seek(0) def get_cover(stream): - data = StringIO.StringIO() + data = cStringIO.StringIO() try: pdf = PdfFileReader(stream) @@ -99,3 +122,4 @@ def get_cover(stream): traceback.print_exc() return data.getvalue() + diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index c48c7c3640..4a74c87097 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.cover_changed = True def initialize_series(self): + self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow) all_series = self.db.all_series() all_series.sort(cmp=lambda x, y : cmp(x[1], y[1])) series_id = self.db.series_id(self.row) @@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.series.setCurrentIndex(idx) self.enable_series_index() - pl = self.series.parentWidget().layout() - for i in range(pl.count()): - l = pl.itemAt(i).layout() - if l: - l.invalidate() - l.activate() - def initialize_series_and_publisher(self): self.initialize_series() all_publishers = self.db.all_publishers() diff --git a/src/calibre/gui2/images/news/der_standard.png b/src/calibre/gui2/images/news/der_standard.png new file mode 100644 index 0000000000..4d750fe5a8 Binary files /dev/null and b/src/calibre/gui2/images/news/der_standard.png differ diff --git a/src/calibre/gui2/images/news/seattle_times.png b/src/calibre/gui2/images/news/seattle_times.png new file mode 100644 index 0000000000..b885684992 Binary files /dev/null and b/src/calibre/gui2/images/news/seattle_times.png differ diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index 9e2ef1969d..405fd265a7 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -40,6 +40,7 @@ recipe_modules = ['recipe_' + r for r in ( 'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet', 'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en', 'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna', + 'seattle_times', )] import re, imp, inspect, time, os diff --git a/src/calibre/web/feeds/recipes/recipe_der_standard.py b/src/calibre/web/feeds/recipes/recipe_der_standard.py index eec4c4e74d..c053d74cfb 100644 --- a/src/calibre/web/feeds/recipes/recipe_der_standard.py +++ b/src/calibre/web/feeds/recipes/recipe_der_standard.py @@ -1,14 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, Gerhard Aigner ' ''' http://www.derstandard.at - Austrian Newspaper ''' import re from calibre.web.feeds.news import BasicNewsRecipe class DerStandardRecipe(BasicNewsRecipe): - title = u'derStandard' - __author__ = 'Gerhard Aigner' - + title = u'derStandard' + __author__ = 'Gerhard Aigner' + description = u'Nachrichten aus Österreich' + publisher ='derStandard.at' + category = 'news, politics, nachrichten, Austria' + use_embedded_content = False + remove_empty_feeds = True + lang = 'de-AT' + no_stylesheets = True + encoding = 'utf-8' + language = _('German') + recursions = 0 oldest_article = 1 max_articles_per_feed = 100 + + html2lrf_options = [ + '--comment' , description + , '--category' , category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'), (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'), @@ -20,17 +43,13 @@ class DerStandardRecipe(BasicNewsRecipe): (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'), (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'), (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')] - - encoding = 'utf-8' - language = _('German') - recursions = 0 remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'), dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')] preprocess_regexps = [ - (re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '') ] - + def print_version(self, url): return url.replace('?id=', 'txt/?id=') @@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe): if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0): return None return article.link + + def preprocess_html(self, soup): + soup.html['xml:lang'] = self.lang + soup.html['lang'] = self.lang + mtag = '' + soup.head.insert(0,mtag) + return soup \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_seattle_times.py b/src/calibre/web/feeds/recipes/recipe_seattle_times.py new file mode 100644 index 0000000000..695a82b5b4 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_seattle_times.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +seattletimes.nwsource.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class SeattleTimes(BasicNewsRecipe): + title = 'The Seattle Times' + __author__ = 'Darko Miletic' + description = 'News from Seattle and USA' + publisher = 'The Seattle Times' + category = 'news, politics, USA' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'cp1252' + language = _('English') + + html2lrf_options = [ + '--comment' , description + , '--category' , category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + feeds = [(u'Articles', u'http://seattletimes.nwsource.com/rss/seattletimes.xml')] + + remove_tags = [ + dict(name=['object','link','script']) + ,dict(name='p', attrs={'class':'permission'}) + ] + + def print_version(self, url): + start_url, sep, rest_url = url.rpartition('_') + rurl, rsep, article_id = start_url.rpartition('/') + return u'http://seattletimes.nwsource.com/cgi-bin/PrintStory.pl?document_id=' + article_id + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + diff --git a/src/pyPdf/generic.py b/src/pyPdf/generic.py index fb75ef3b3f..5447ef5fbc 100644 --- a/src/pyPdf/generic.py +++ b/src/pyPdf/generic.py @@ -299,7 +299,7 @@ def readStringFromStream(stream): elif tok == "t": tok = "\t" elif tok == "b": - tok == "\b" + tok = "\b" elif tok == "f": tok = "\f" elif tok == "(": @@ -673,7 +673,7 @@ class RectangleObject(ArrayObject): def getUpperLeft_x(self): return self.getLowerLeft_x() - + def getUpperLeft_y(self): return self.getUpperRight_y() diff --git a/src/pyPdf/pdf.py b/src/pyPdf/pdf.py index 362879a39a..710d128ad0 100644 --- a/src/pyPdf/pdf.py +++ b/src/pyPdf/pdf.py @@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" import struct -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO +from cStringIO import StringIO -import filters -import utils -import warnings -from generic import * +from generic import DictionaryObject, NameObject, NumberObject, \ +createStringObject, ArrayObject, ByteStringObject, StreamObject, \ +IndirectObject, utils, readObject, TextStringObject, BooleanObject, \ +RectangleObject, DecodedStreamObject from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList @@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt # class (typically {@link #PdfFileReader PdfFileReader}). class PdfFileWriter(object): def __init__(self,title=u"Unknown",author=u"Unknown"): + self.killed = False self._header = "%PDF-1.3" self._objects = [] # array of indirect objects @@ -162,7 +160,7 @@ class PdfFileWriter(object): # @param stream An object to write the file to. The object must support # the write method, and the tell method, similar to a file object. def write(self, stream): - import struct, md5 + import md5 externalReferenceMap = {} self.stack = [] @@ -209,11 +207,13 @@ class PdfFileWriter(object): if hasattr(self, "_encrypt"): trailer[NameObject("/Encrypt")] = self._encrypt trailer.writeToStream(stream, None) - + # eof stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)) def _sweepIndirectReferences(self, externMap, data): + if self.killed: + raise RuntimeError('Writer killed') if isinstance(data, DictionaryObject): for key, value in data.items(): origvalue = value @@ -356,8 +356,8 @@ class PdfFileReader(object): return self.flattenedPages[pageNumber] ## - # Read-only property that accesses the - # {@link #PdfFileReader.getNamedDestinations + # Read-only property that accesses the + # {@link #PdfFileReader.getNamedDestinations # getNamedDestinations} function. #

# Stability: Added in v1.10, will exist for all future v1.x releases. @@ -374,7 +374,7 @@ class PdfFileReader(object): if retval == None: retval = {} catalog = self.trailer["/Root"] - + # get the name tree if catalog.has_key("/Dests"): tree = catalog["/Dests"] @@ -382,7 +382,7 @@ class PdfFileReader(object): names = catalog['/Names'] if names.has_key("/Dests"): tree = names['/Dests'] - + if tree == None: return retval @@ -420,17 +420,17 @@ class PdfFileReader(object): if outlines == None: outlines = [] catalog = self.trailer["/Root"] - + # get the outline dictionary and named destinations if catalog.has_key("/Outlines"): lines = catalog["/Outlines"] if lines.has_key("/First"): node = lines["/First"] self._namedDests = self.getNamedDestinations() - + if node == None: return outlines - + # see if there are any more outlines while 1: outline = self._buildOutline(node) @@ -454,10 +454,10 @@ class PdfFileReader(object): page, typ = array[0:2] array = array[2:] return Destination(title, page, typ, *array) - + def _buildOutline(self, node): dest, title, outline = None, None, None - + if node.has_key("/A") and node.has_key("/Title"): # Action, section 8.5 (only type GoTo supported) title = node["/Title"] @@ -951,7 +951,7 @@ class PageObject(DictionaryObject): def _pushPopGS(contents, pdf): # adds a graphics state "push" and "pop" to the beginning and end - # of a content stream. This isolates it from changes such as + # of a content stream. This isolates it from changes such as # transformation matricies. stream = ContentStream(contents, pdf) stream.operations.insert(0, [[], "q"]) @@ -1291,7 +1291,7 @@ class Destination(DictionaryObject): self[NameObject("/Title")] = title self[NameObject("/Page")] = page self[NameObject("/Type")] = typ - + # from table 8.2 of the PDF 1.6 reference. if typ == "/XYZ": (self[NameObject("/Left")], self[NameObject("/Top")], @@ -1307,7 +1307,7 @@ class Destination(DictionaryObject): pass else: raise utils.PdfReadError("Unknown Destination Type: %r" % typ) - + ## # Read-only property accessing the destination title. # @return A string. @@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr # described in Algorithm 3.2. key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) # 2. Initialize the MD5 hash function and pass the 32-byte padding string - # shown in step 1 of Algorithm 3.2 as input to this function. + # shown in step 1 of Algorithm 3.2 as input to this function. import md5 m = md5.new() m.update(_encryption_padding) # 3. Pass the first element of the file's file identifier array (the value # of the ID entry in the document's trailer dictionary; see Table 3.13 on # page 73) to the hash function and finish the hash. (See implementation - # note 25 in Appendix H.) + # note 25 in Appendix H.) m.update(id1_entry) md5_hash = m.digest() # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption - # function with the encryption key from step 1. + # function with the encryption key from step 1. val = utils.RC4_encrypt(key, md5_hash) # 5. Do the following 19 times: Take the output from the previous # invocation of the RC4 function and pass it as input to a new invocation # of the function; use an encryption key generated by taking each byte of # the original encryption key (obtained in step 2) and performing an XOR # operation between that byte and the single-byte value of the iteration - # counter (from 1 to 19). + # counter (from 1 to 19). for i in range(1, 20): new_key = '' for l in range(len(key)): @@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr val = utils.RC4_encrypt(new_key, val) # 6. Append 16 bytes of arbitrary padding to the output from the final # invocation of the RC4 function and store the 32-byte result as the value - # of the U entry in the encryption dictionary. + # of the U entry in the encryption dictionary. # (implementator note: I don't know what "arbitrary padding" is supposed to # mean, so I have used null bytes. This seems to match a few other # people's implementations)