diff --git a/recipes/dna.recipe b/recipes/dna.recipe index 83c19f3a13..69ac090ca4 100644 --- a/recipes/dna.recipe +++ b/recipes/dna.recipe @@ -1,6 +1,3 @@ -''' -dnaindia.com -''' import re from calibre.web.feeds.news import BasicNewsRecipe @@ -12,6 +9,10 @@ class DNAIndia(BasicNewsRecipe): language = 'en_IN' encoding = 'cp1252' + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True feeds = [ ('Top News', 'http://www.dnaindia.com/syndication/rss_topnews.xml'), @@ -22,15 +23,10 @@ class DNAIndia(BasicNewsRecipe): ('World', 'http://www.dnaindia.com/syndication/rss,catid-9.xml'), ('Money', 'http://www.dnaindia.com/syndication/rss,catid-4.xml'), ('Sports', 'http://www.dnaindia.com/syndication/rss,catid-6.xml'), - ('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml'), - ('Digital Life', 'http://www.dnaindia.com/syndication/rss,catid-1089741.xml'), + ('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml') ] - remove_tags = [{'id':['footer', 'lhs-col']}, {'class':['bottom', 'categoryHead', - 'article_tools']}] - keep_only_tags = dict(id='middle-col') - remove_tags_after=[dict(attrs={'id':'story'})] - remove_attributes=['style'] - no_stylesheets = True + + def print_version(self, url): match = re.search(r'newsid=(\d+)', url) diff --git a/recipes/toi.recipe b/recipes/toi.recipe index 8a772b6f9d..9d05b583a7 100644 --- a/recipes/toi.recipe +++ b/recipes/toi.recipe @@ -9,11 +9,16 @@ class TimesOfIndia(BasicNewsRecipe): max_articles_per_feed = 25 no_stylesheets = True - keep_only_tags = [{'class':['maintable12', 'prttabl']}] + remove_attributes = ['style'] + keep_only_tags = [ + {'class':re.compile(r'maintable12|prttabl')}, + {'id':['mod-article-header', + 'mod-a-body-after-first-para', 'mod-a-body-first-para']}, + ] remove_tags = [ - dict(style=lambda x: x and 'float' in x), - {'class':['prvnxtbg', 'footbdrin', 'bcclftr']}, - ] + {'class':re.compile('tabsintbgshow|prvnxtbg')}, + {'id':['fbrecommend', 'relmaindiv']} + ] 
feeds = [ ('Top Stories', @@ -41,6 +46,8 @@ class TimesOfIndia(BasicNewsRecipe): ] def get_article_url(self, article): + # Times of India sometimes serves an ad page instead of the article; + # this code detects and circumvents that url = BasicNewsRecipe.get_article_url(self, article) if '/0Ltimesofindia' in url: url = url.partition('/0L')[-1] @@ -61,6 +68,3 @@ class TimesOfIndia(BasicNewsRecipe): return url - - def preprocess_html(self, soup): - return soup diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index fa72766a0a..122e3ac19b 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -73,6 +73,20 @@ class Worker(Thread): # Get details {{{ 8: ['août'], 9: ['sept'], 12: ['déc'], + }, + 'es': { + 1: ['enero'], + 2: ['febrero'], + 3: ['marzo'], + 4: ['abril'], + 5: ['mayo'], + 6: ['junio'], + 7: ['julio'], + 8: ['agosto'], + 9: ['septiembre', 'setiembre'], + 10: ['octubre'], + 11: ['noviembre'], + 12: ['diciembre'], }, 'jp': { 1: [u'1月'], @@ -101,13 +115,16 @@ class Worker(Thread): # Get details {{{ text()="Dettagli prodotto" or \ text()="Product details" or \ text()="Détails sur le produit" or \ + text()="Detalles del producto" or \ text()="登録情報"]/../div[@class="content"] ''' + # Editor: is for Spanish self.publisher_xpath = ''' descendant::*[starts-with(text(), "Publisher:") or \ starts-with(text(), "Verlag:") or \ starts-with(text(), "Editore:") or \ starts-with(text(), "Editeur") or \ + starts-with(text(), "Editor:") or \ starts-with(text(), "出版社:")] ''' self.language_xpath = ''' @@ -116,12 +133,14 @@ class Worker(Thread): # Get details {{{ or text() = "Language" \ or text() = "Sprache:" \ or text() = "Lingua:" \ + or text() = "Idioma:" \ or starts-with(text(), "Langue") \ or starts-with(text(), "言語") \ ] ''' + self.ratings_pat = re.compile( - r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち) ([\d\.]+)( (stars|Sternen|stelle)){0,1}') + r'([0-9.]+) 
?(out of|von|su|étoiles sur|つ星のうち|de un máximo de) ([\d\.]+)( (stars|Sternen|stelle|estrellas)){0,1}') lm = { 'eng': ('English', 'Englisch'), @@ -143,6 +162,7 @@ class Worker(Thread): # Get details {{{ for i, vals in self.months.iteritems(): for x in vals: ans = ans.replace(x, self.english_months[i]) + ans = ans.replace(' de ', ' ') return ans def run(self): @@ -422,6 +442,7 @@ class Amazon(Source): 'uk' : _('UK'), 'it' : _('Italy'), 'jp' : _('Japan'), + 'es' : _('Spain'), } options = ( @@ -789,6 +810,16 @@ if __name__ == '__main__': # tests {{{ ), ] # }}} + es_tests = [ # {{{ + ( + {'identifiers':{'isbn': '8483460831'}}, + [title_test('Tiempos Interesantes', + exact=True), authors_test(['Terry Pratchett']) + ] + + ), + ] # }}} + jp_tests = [ # {{{ ( # isbn -> title, authors {'identifiers':{'isbn': '9784101302720' }}, @@ -804,6 +835,6 @@ if __name__ == '__main__': # tests {{{ ] # }}} test_identify_plugin(Amazon.name, com_tests) - #test_identify_plugin(Amazon.name, jp_tests) + #test_identify_plugin(Amazon.name, es_tests) # }}} diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 05d874c9c3..632ccf230a 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -173,6 +173,10 @@ class PDFWriter(QObject): # {{{ printer.setOutputFormat(QPrinter.NativeFormat) self.view.print_(printer) printer.abort() + else: + # The document is so corrupt that we can't render the page. + self.loop.exit(0) + raise Exception('Document cannot be rendered.') self._render_book() def _delete_tmpdir(self): @@ -207,11 +211,14 @@ class PDFWriter(QObject): # {{{ try: outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: - with open(item, 'rb') as item_stream: - inputPDF = PdfFileReader(item_stream) - for page in inputPDF.pages: - outPDF.addPage(page) - outPDF.write(self.out_stream) + # The input PDF stream must remain open until the final PDF + # is written to disk. 
PyPDF references pages added to the + # final PDF from the input PDF on disk. It does not store + # the pages in memory so we can't close the input PDF. + inputPDF = PdfFileReader(open(item, 'rb')) + for page in inputPDF.pages: + outPDF.addPage(page) + outPDF.write(self.out_stream) finally: self._delete_tmpdir() self.loop.exit(0)