From 4e1f851a445737575725e0c3cd7b0f34d0bb9fcb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Apr 2009 14:39:17 -0700 Subject: [PATCH 1/3] Add a timeout to the PDF metadata writer as it hangs on some PDF files --- src/calibre/ebooks/metadata/pdf.py | 48 +++++++++++++++++++------- src/pyPdf/generic.py | 4 +-- src/pyPdf/pdf.py | 54 +++++++++++++++--------------- 3 files changed, 64 insertions(+), 42 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 80cdc82070..54d52f0b58 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -2,7 +2,8 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' '''Read meta information from PDF files''' -import sys, os, StringIO +import sys, os, cStringIO +from threading import Thread from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser from pyPdf import PdfFileReader, PdfFileWriter @@ -29,25 +30,46 @@ def get_metadata(stream): print >>sys.stderr, msg.encode('utf8') return mi +class MetadataWriter(Thread): + + def __init__(self, out_pdf, buf): + self.out_pdf = out_pdf + self.buf = buf + Thread.__init__(self) + self.daemon = True + + def run(self): + try: + self.out_pdf.write(self.buf) + except RuntimeError: + pass + def set_metadata(stream, mi): stream.seek(0) - + # Use a StringIO object for the pdf because we will want to over # write it later and if we are working on the stream directly it # could cause some issues. - raw = StringIO.StringIO(stream.read()) + raw = cStringIO.StringIO(stream.read()) orig_pdf = PdfFileReader(raw) - + title = mi.title if mi.title else orig_pdf.documentInfo.title author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author - + out_pdf = PdfFileWriter(title=title, author=author) + out_str = cStringIO.StringIO() + writer = MetadataWriter(out_pdf, out_str) for page in orig_pdf.pages: out_pdf.addPage(page) - - out_str = StringIO.StringIO() - out_pdf.write(out_str) - + + writer.start() + writer.join(10) # Wait 10 secs for writing to complete + out_pdf.killed = True + writer.join() + if out_pdf.killed: + print 'Failed to set metadata: took too long' + return + stream.seek(0) stream.truncate() out_str.seek(0) @@ -59,7 +81,7 @@ def option_parser(): p.remove_option('--category') p.remove_option('--comment') return p - + def main(args=sys.argv): #p = option_parser() #opts, args = p.parse_args(args) @@ -67,14 +89,14 @@ def main(args=sys.argv): print >>sys.stderr, _('Usage: pdf-meta file.pdf') print >>sys.stderr, _('No filename specified.') return 1 - + stream = open(os.path.abspath(os.path.expanduser(args[1])), 'r+b') #mi = MetaInformation(opts.title, opts.authors) #if mi.title or mi.authors: # set_metadata(stream, mi) print unicode(get_metadata(stream)).encode('utf-8') - + return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/src/pyPdf/generic.py b/src/pyPdf/generic.py index fb75ef3b3f..5447ef5fbc 100644 --- a/src/pyPdf/generic.py +++ b/src/pyPdf/generic.py @@ -299,7 +299,7 @@ def readStringFromStream(stream): elif tok == "t": tok = "\t" elif tok == "b": - tok == "\b" + tok = "\b" elif tok == "f": tok = "\f" elif tok == "(": @@ -673,7 +673,7 @@ class RectangleObject(ArrayObject): def getUpperLeft_x(self): return self.getLowerLeft_x() - + def getUpperLeft_y(self): return self.getUpperRight_y() diff --git a/src/pyPdf/pdf.py b/src/pyPdf/pdf.py index 362879a39a..710d128ad0 100644 --- a/src/pyPdf/pdf.py +++ b/src/pyPdf/pdf.py @@ -39,15 +39,12 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" import struct -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO +from cStringIO import StringIO -import filters -import utils -import warnings -from generic import * +from generic import DictionaryObject, NameObject, NumberObject, \ +createStringObject, ArrayObject, ByteStringObject, StreamObject, \ +IndirectObject, utils, readObject, TextStringObject, BooleanObject, \ +RectangleObject, DecodedStreamObject from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList @@ -56,6 +53,7 @@ from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirt # class (typically {@link #PdfFileReader PdfFileReader}). class PdfFileWriter(object): def __init__(self,title=u"Unknown",author=u"Unknown"): + self.killed = False self._header = "%PDF-1.3" self._objects = [] # array of indirect objects @@ -162,7 +160,7 @@ class PdfFileWriter(object): # @param stream An object to write the file to. The object must support # the write method, and the tell method, similar to a file object. def write(self, stream): - import struct, md5 + import md5 externalReferenceMap = {} self.stack = [] @@ -209,11 +207,13 @@ class PdfFileWriter(object): if hasattr(self, "_encrypt"): trailer[NameObject("/Encrypt")] = self._encrypt trailer.writeToStream(stream, None) - + # eof stream.write("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)) def _sweepIndirectReferences(self, externMap, data): + if self.killed: + raise RuntimeError('Writer killed') if isinstance(data, DictionaryObject): for key, value in data.items(): origvalue = value @@ -356,8 +356,8 @@ class PdfFileReader(object): return self.flattenedPages[pageNumber] ## - # Read-only property that accesses the - # {@link #PdfFileReader.getNamedDestinations + # Read-only property that accesses the + # {@link #PdfFileReader.getNamedDestinations # getNamedDestinations} function. #

# Stability: Added in v1.10, will exist for all future v1.x releases. @@ -374,7 +374,7 @@ class PdfFileReader(object): if retval == None: retval = {} catalog = self.trailer["/Root"] - + # get the name tree if catalog.has_key("/Dests"): tree = catalog["/Dests"] @@ -382,7 +382,7 @@ class PdfFileReader(object): names = catalog['/Names'] if names.has_key("/Dests"): tree = names['/Dests'] - + if tree == None: return retval @@ -420,17 +420,17 @@ class PdfFileReader(object): if outlines == None: outlines = [] catalog = self.trailer["/Root"] - + # get the outline dictionary and named destinations if catalog.has_key("/Outlines"): lines = catalog["/Outlines"] if lines.has_key("/First"): node = lines["/First"] self._namedDests = self.getNamedDestinations() - + if node == None: return outlines - + # see if there are any more outlines while 1: outline = self._buildOutline(node) @@ -454,10 +454,10 @@ class PdfFileReader(object): page, typ = array[0:2] array = array[2:] return Destination(title, page, typ, *array) - + def _buildOutline(self, node): dest, title, outline = None, None, None - + if node.has_key("/A") and node.has_key("/Title"): # Action, section 8.5 (only type GoTo supported) title = node["/Title"] @@ -951,7 +951,7 @@ class PageObject(DictionaryObject): def _pushPopGS(contents, pdf): # adds a graphics state "push" and "pop" to the beginning and end - # of a content stream. This isolates it from changes such as + # of a content stream. This isolates it from changes such as # transformation matricies. stream = ContentStream(contents, pdf) stream.operations.insert(0, [[], "q"]) @@ -1291,7 +1291,7 @@ class Destination(DictionaryObject): self[NameObject("/Title")] = title self[NameObject("/Page")] = page self[NameObject("/Type")] = typ - + # from table 8.2 of the PDF 1.6 reference. if typ == "/XYZ": (self[NameObject("/Left")], self[NameObject("/Top")], @@ -1307,7 +1307,7 @@ class Destination(DictionaryObject): pass else: raise utils.PdfReadError("Unknown Destination Type: %r" % typ) - + ## # Read-only property accessing the destination title. # @return A string. @@ -1474,25 +1474,25 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr # described in Algorithm 3.2. key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) # 2. Initialize the MD5 hash function and pass the 32-byte padding string - # shown in step 1 of Algorithm 3.2 as input to this function. + # shown in step 1 of Algorithm 3.2 as input to this function. import md5 m = md5.new() m.update(_encryption_padding) # 3. Pass the first element of the file's file identifier array (the value # of the ID entry in the document's trailer dictionary; see Table 3.13 on # page 73) to the hash function and finish the hash. (See implementation - # note 25 in Appendix H.) + # note 25 in Appendix H.) m.update(id1_entry) md5_hash = m.digest() # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption - # function with the encryption key from step 1. + # function with the encryption key from step 1. val = utils.RC4_encrypt(key, md5_hash) # 5. Do the following 19 times: Take the output from the previous # invocation of the RC4 function and pass it as input to a new invocation # of the function; use an encryption key generated by taking each byte of # the original encryption key (obtained in step 2) and performing an XOR # operation between that byte and the single-byte value of the iteration - # counter (from 1 to 19). + # counter (from 1 to 19). for i in range(1, 20): new_key = '' for l in range(len(key)): @@ -1500,7 +1500,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr val = utils.RC4_encrypt(new_key, val) # 6. Append 16 bytes of arbitrary padding to the output from the final # invocation of the RC4 function and store the 32-byte result as the value - # of the U entry in the encryption dictionary. + # of the U entry in the encryption dictionary. # (implementator note: I don't know what "arbitrary padding" is supposed to # mean, so I have used null bytes. This seems to match a few other # people's implementations) From d6022d93fc7b6fd7a62da922c00a6ce7c1b4f05a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Apr 2009 21:37:36 -0700 Subject: [PATCH 2/3] New recipe for the Seattle Times by Darko Miletic --- .../gui2/images/news/seattle_times.png | Bin 0 -> 746 bytes src/calibre/web/feeds/recipes/__init__.py | 1 + .../web/feeds/recipes/recipe_seattle_times.py | 50 ++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 src/calibre/gui2/images/news/seattle_times.png create mode 100644 src/calibre/web/feeds/recipes/recipe_seattle_times.py diff --git a/src/calibre/gui2/images/news/seattle_times.png b/src/calibre/gui2/images/news/seattle_times.png new file mode 100644 index 0000000000000000000000000000000000000000..b885684992a4b40df7a7eeabbe39de2caff2dda9 GIT binary patch literal 746 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87?^}TT^vI!PA{Eom?4}f(KdgJ@A=n`Z8ugY z@qGz0UgDcP^}znAL9Y_Fcw0*x^jn&JU~MZOA8Vn|s}~su{bE;@8ElD_;S=@pIXPj; z_B)*w;_Rx&jHg<}6mJVy`hwfu{y+Eq-}~+#|KG}BG^T$dd@N?$$_;hZ6_y`!uiR3= z9vf`f^?Sy)!}BkcZdtz7S2-X&MRP~L)jW*_CxnWPlQlLSX;#h0~3`g;~% z{L<9_' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + From 0cb7a49d112d5194301497f0f78e61b42975bd3b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Apr 2009 11:37:52 -0700 Subject: [PATCH 3/3] IGN:Updated Der standard and fix series box being too long initially on OS X --- src/calibre/gui2/dialogs/metadata_single.py | 8 +--- src/calibre/gui2/images/news/der_standard.png | Bin 0 -> 509 bytes .../web/feeds/recipes/recipe_der_standard.py | 44 ++++++++++++++---- 3 files changed, 36 insertions(+), 16 deletions(-) create mode 100644 src/calibre/gui2/images/news/der_standard.png diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index c48c7c3640..4a74c87097 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -319,6 +319,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.cover_changed = True def initialize_series(self): + self.series.setSizeAdjustPolicy(self.series.AdjustToContentsOnFirstShow) all_series = self.db.all_series() all_series.sort(cmp=lambda x, y : cmp(x[1], y[1])) series_id = self.db.series_id(self.row) @@ -335,13 +336,6 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.series.setCurrentIndex(idx) self.enable_series_index() - pl = self.series.parentWidget().layout() - for i in range(pl.count()): - l = pl.itemAt(i).layout() - if l: - l.invalidate() - l.activate() - def initialize_series_and_publisher(self): self.initialize_series() all_publishers = self.db.all_publishers() diff --git a/src/calibre/gui2/images/news/der_standard.png b/src/calibre/gui2/images/news/der_standard.png new file mode 100644 index 0000000000000000000000000000000000000000..4d750fe5a8583839440916aae93cdb7bb2222f1e GIT binary patch literal 509 zcmeAS@N?(olHy`uVBq!ia0vp^1|ZDA1|-9oezpTCwj^(N7l!{JxM1({$v_d#0*}aI zAngIhZYQ(tK!Rljj_E)ete@fByPj|c21Yec7srr{dv9l4Zm$Q`4`|5=^vZrxz z8eZ>ND&RhK+v$Bp6<_+@J*DRcDYB|rZ9L|o>9|dKiu2-Nt&}5M{Px72__J|->a%in zec>6$k{K?>Jl!o|q@`K5WZRPn^?qT2hqea-)aN>{>`vzsirXJ!U>eC00bvTFR#$>6;u(RStv{wqZd@sV~& zt=pP(&A$^D8Lavo@}h_-`k&l&p}2+5ykq7XM$MIR_EqTddnkH}VfRTeDy)@S+IJz5 zDaN#J5?AEmi^obC@*b-{6s=-NzvQ02-DlP|VDPDyxJHyD7o{ear0S*s2?iqr14~^4 z6J0~|5CcmqQ$s6LQ(XgdD+7b?)=w>A8glbfGSe#2G?;}LT3DHwS(#WsG@J;T-vQLX N;OXk;vd$@?2>=Kjy)^&; literal 0 HcmV?d00001 diff --git a/src/calibre/web/feeds/recipes/recipe_der_standard.py b/src/calibre/web/feeds/recipes/recipe_der_standard.py index eec4c4e74d..c053d74cfb 100644 --- a/src/calibre/web/feeds/recipes/recipe_der_standard.py +++ b/src/calibre/web/feeds/recipes/recipe_der_standard.py @@ -1,14 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2009, Gerhard Aigner ' ''' http://www.derstandard.at - Austrian Newspaper ''' import re from calibre.web.feeds.news import BasicNewsRecipe class DerStandardRecipe(BasicNewsRecipe): - title = u'derStandard' - __author__ = 'Gerhard Aigner' - + title = u'derStandard' + __author__ = 'Gerhard Aigner' + description = u'Nachrichten aus Österreich' + publisher ='derStandard.at' + category = 'news, politics, nachrichten, Austria' + use_embedded_content = False + remove_empty_feeds = True + lang = 'de-AT' + no_stylesheets = True + encoding = 'utf-8' + language = _('German') + recursions = 0 oldest_article = 1 max_articles_per_feed = 100 + + html2lrf_options = [ + '--comment' , description + , '--category' , category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'), (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'), @@ -20,17 +43,13 @@ class DerStandardRecipe(BasicNewsRecipe): (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'), (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'), (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')] - - encoding = 'utf-8' - language = _('German') - recursions = 0 remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'), dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')] preprocess_regexps = [ - (re.compile(r'\[[\d*]\]', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '') ] - + def print_version(self, url): return url.replace('?id=', 'txt/?id=') @@ -40,3 +59,10 @@ class DerStandardRecipe(BasicNewsRecipe): if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0): return None return article.link + + def preprocess_html(self, soup): + soup.html['xml:lang'] = self.lang + soup.html['lang'] = self.lang + mtag = '' + soup.head.insert(0,mtag) + return soup \ No newline at end of file