From 0f138cd43cd91249253f115568fbe7ee17c0c78f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 24 Sep 2011 10:52:22 -0600 Subject: [PATCH 1/5] Fix Times of India --- recipes/toi.recipe | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/recipes/toi.recipe b/recipes/toi.recipe index 8a772b6f9d..a44979dd4a 100644 --- a/recipes/toi.recipe +++ b/recipes/toi.recipe @@ -9,11 +9,12 @@ class TimesOfIndia(BasicNewsRecipe): max_articles_per_feed = 25 no_stylesheets = True - keep_only_tags = [{'class':['maintable12', 'prttabl']}] + remove_attributes = ['style'] + keep_only_tags = [{'class':re.compile(r'maintable12|prttabl')}] remove_tags = [ - dict(style=lambda x: x and 'float' in x), - {'class':['prvnxtbg', 'footbdrin', 'bcclftr']}, - ] + {'class':re.compile('tabsintbgshow|prvnxtbg')}, + {'id':['fbrecommend', 'relmaindiv']} + ] feeds = [ ('Top Stories', @@ -41,6 +42,8 @@ class TimesOfIndia(BasicNewsRecipe): ] def get_article_url(self, article): + # Times of India sometimes serves an ad page instead of the article, + # this code detects and circumvents that url = BasicNewsRecipe.get_article_url(self, article) if '/0Ltimesofindia' in url: url = url.partition('/0L')[-1] @@ -61,6 +64,3 @@ class TimesOfIndia(BasicNewsRecipe): return url - - def preprocess_html(self, soup): - return soup From de794e1403900f121482a5c5f1a52cb81a9fd593 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 24 Sep 2011 11:00:48 -0600 Subject: [PATCH 2/5] ... 
--- recipes/toi.recipe | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/recipes/toi.recipe b/recipes/toi.recipe index a44979dd4a..9d05b583a7 100644 --- a/recipes/toi.recipe +++ b/recipes/toi.recipe @@ -10,7 +10,11 @@ class TimesOfIndia(BasicNewsRecipe): no_stylesheets = True remove_attributes = ['style'] - keep_only_tags = [{'class':re.compile(r'maintable12|prttabl')}] + keep_only_tags = [ + {'class':re.compile(r'maintable12|prttabl')}, + {'id':['mod-article-header', + 'mod-a-body-after-first-para', 'mod-a-body-first-para']}, + ] remove_tags = [ {'class':re.compile('tabsintbgshow|prvnxtbg')}, {'id':['fbrecommend', 'relmaindiv']} From f30efc907f7acd828a608c5b5a4c6b6339ca5af0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 24 Sep 2011 13:00:44 -0600 Subject: [PATCH 3/5] Fix DNA recipe --- recipes/dna.recipe | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/recipes/dna.recipe b/recipes/dna.recipe index 83c19f3a13..69ac090ca4 100644 --- a/recipes/dna.recipe +++ b/recipes/dna.recipe @@ -1,6 +1,3 @@ -''' -dnaindia.com -''' import re from calibre.web.feeds.news import BasicNewsRecipe @@ -12,6 +9,10 @@ class DNAIndia(BasicNewsRecipe): language = 'en_IN' encoding = 'cp1252' + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True feeds = [ ('Top News', 'http://www.dnaindia.com/syndication/rss_topnews.xml'), @@ -22,15 +23,10 @@ class DNAIndia(BasicNewsRecipe): ('World', 'http://www.dnaindia.com/syndication/rss,catid-9.xml'), ('Money', 'http://www.dnaindia.com/syndication/rss,catid-4.xml'), ('Sports', 'http://www.dnaindia.com/syndication/rss,catid-6.xml'), - ('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml'), - ('Digital Life', 'http://www.dnaindia.com/syndication/rss,catid-1089741.xml'), + ('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml') ] - remove_tags = [{'id':['footer', 'lhs-col']}, {'class':['bottom', 'categoryHead', - 'article_tools']}] - 
keep_only_tags = dict(id='middle-col') - remove_tags_after=[dict(attrs={'id':'story'})] - remove_attributes=['style'] - no_stylesheets = True + + def print_version(self, url): match = re.search(r'newsid=(\d+)', url) From cf823dff2658869aede781a8f8485236206ecf55 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 24 Sep 2011 15:47:54 -0400 Subject: [PATCH 4/5] PDF output fixes: Raise an exception for input so corrupt that it cannot be rendered. Fix issue with enormous PDFs being created. Due to how PyPDF works this brings back the out of file descriptors bug. --- src/calibre/ebooks/pdf/writer.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 05d874c9c3..632ccf230a 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -173,6 +173,10 @@ class PDFWriter(QObject): # {{{ printer.setOutputFormat(QPrinter.NativeFormat) self.view.print_(printer) printer.abort() + else: + # The document is so corrupt that we can't render the page. + self.loop.exit(0) + raise Exception('Document cannot be rendered.') self._render_book() def _delete_tmpdir(self): @@ -207,11 +211,14 @@ class PDFWriter(QObject): # {{{ try: outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: - with open(item, 'rb') as item_stream: - inputPDF = PdfFileReader(item_stream) - for page in inputPDF.pages: - outPDF.addPage(page) - outPDF.write(self.out_stream) + # The input PDF stream must remain open until the final PDF + # is written to disk. PyPDF references pages added to the + # final PDF from the input PDF on disk. It does not store + # the pages in memory so we can't close the input PDF. 
+ inputPDF = PdfFileReader(open(item, 'rb')) + for page in inputPDF.pages: + outPDF.addPage(page) + outPDF.write(self.out_stream) finally: self._delete_tmpdir() self.loop.exit(0) From 90bb4382e009585f94d700fa92b4a7db38cdb30c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 24 Sep 2011 15:53:08 -0600 Subject: [PATCH 5/5] Amazon metadata download plugin: Add option to download metadata from amazon.es --- src/calibre/ebooks/metadata/sources/amazon.py | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index fa72766a0a..122e3ac19b 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -73,6 +73,20 @@ class Worker(Thread): # Get details {{{ 8: ['août'], 9: ['sept'], 12: ['déc'], + }, + 'es': { + 1: ['enero'], + 2: ['febrero'], + 3: ['marzo'], + 4: ['abril'], + 5: ['mayo'], + 6: ['junio'], + 7: ['julio'], + 8: ['agosto'], + 9: ['septiembre', 'setiembre'], + 10: ['octubre'], + 11: ['noviembre'], + 12: ['diciembre'], }, 'jp': { 1: [u'1月'], @@ -101,13 +115,16 @@ class Worker(Thread): # Get details {{{ text()="Dettagli prodotto" or \ text()="Product details" or \ text()="Détails sur le produit" or \ + text()="Detalles del producto" or \ text()="登録情報"]/../div[@class="content"] ''' + # Editor: is for Spanish self.publisher_xpath = ''' descendant::*[starts-with(text(), "Publisher:") or \ starts-with(text(), "Verlag:") or \ starts-with(text(), "Editore:") or \ starts-with(text(), "Editeur") or \ + starts-with(text(), "Editor:") or \ starts-with(text(), "出版社:")] ''' self.language_xpath = ''' @@ -116,12 +133,14 @@ class Worker(Thread): # Get details {{{ or text() = "Language" \ or text() = "Sprache:" \ or text() = "Lingua:" \ + or text() = "Idioma:" \ or starts-with(text(), "Langue") \ or starts-with(text(), "言語") \ ] ''' + self.ratings_pat = re.compile( - r'([0-9.]+) ?(out of|von|su|étoiles 
sur|つ星のうち) ([\d\.]+)( (stars|Sternen|stelle)){0,1}') + r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち|de un máximo de) ([\d\.]+)( (stars|Sternen|stelle|estrellas)){0,1}') lm = { 'eng': ('English', 'Englisch'), @@ -143,6 +162,7 @@ class Worker(Thread): # Get details {{{ for i, vals in self.months.iteritems(): for x in vals: ans = ans.replace(x, self.english_months[i]) + ans = ans.replace(' de ', ' ') return ans def run(self): @@ -422,6 +442,7 @@ class Amazon(Source): 'uk' : _('UK'), 'it' : _('Italy'), 'jp' : _('Japan'), + 'es' : _('Spain'), } options = ( @@ -789,6 +810,16 @@ if __name__ == '__main__': # tests {{{ ), ] # }}} + es_tests = [ # {{{ + ( + {'identifiers':{'isbn': '8483460831'}}, + [title_test('Tiempos Interesantes', + exact=True), authors_test(['Terry Pratchett']) + ] + + ), + ] # }}} + jp_tests = [ # {{{ ( # isbn -> title, authors {'identifiers':{'isbn': '9784101302720' }}, @@ -804,6 +835,6 @@ if __name__ == '__main__': # tests {{{ ] # }}} test_identify_plugin(Amazon.name, com_tests) - #test_identify_plugin(Amazon.name, jp_tests) + #test_identify_plugin(Amazon.name, es_tests) # }}}