Merge from trunk

This commit is contained in:
Charles Haley 2011-09-25 09:01:33 +02:00
commit b2dceb1a7a
4 changed files with 63 additions and 25 deletions

View File

@ -1,6 +1,3 @@
'''
dnaindia.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
@ -12,6 +9,10 @@ class DNAIndia(BasicNewsRecipe):
language = 'en_IN'
encoding = 'cp1252'
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('Top News', 'http://www.dnaindia.com/syndication/rss_topnews.xml'),
@ -22,15 +23,10 @@ class DNAIndia(BasicNewsRecipe):
('World', 'http://www.dnaindia.com/syndication/rss,catid-9.xml'),
('Money', 'http://www.dnaindia.com/syndication/rss,catid-4.xml'),
('Sports', 'http://www.dnaindia.com/syndication/rss,catid-6.xml'),
('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml'),
('Digital Life', 'http://www.dnaindia.com/syndication/rss,catid-1089741.xml'),
('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml')
]
remove_tags = [{'id':['footer', 'lhs-col']}, {'class':['bottom', 'categoryHead',
'article_tools']}]
keep_only_tags = dict(id='middle-col')
remove_tags_after=[dict(attrs={'id':'story'})]
remove_attributes=['style']
no_stylesheets = True
def print_version(self, url):
match = re.search(r'newsid=(\d+)', url)

View File

@ -9,11 +9,16 @@ class TimesOfIndia(BasicNewsRecipe):
max_articles_per_feed = 25
no_stylesheets = True
keep_only_tags = [{'class':['maintable12', 'prttabl']}]
remove_attributes = ['style']
keep_only_tags = [
{'class':re.compile(r'maintable12|prttabl')},
{'id':['mod-article-header',
'mod-a-body-after-first-para', 'mod-a-body-first-para']},
]
remove_tags = [
dict(style=lambda x: x and 'float' in x),
{'class':['prvnxtbg', 'footbdrin', 'bcclftr']},
]
{'class':re.compile('tabsintbgshow|prvnxtbg')},
{'id':['fbrecommend', 'relmaindiv']}
]
feeds = [
('Top Stories',
@ -41,6 +46,8 @@ class TimesOfIndia(BasicNewsRecipe):
]
def get_article_url(self, article):
# Times of India sometimes serves an ad page instead of the article;
# this code detects and circumvents that
url = BasicNewsRecipe.get_article_url(self, article)
if '/0Ltimesofindia' in url:
url = url.partition('/0L')[-1]
@ -61,6 +68,3 @@ class TimesOfIndia(BasicNewsRecipe):
return url
def preprocess_html(self, soup):
return soup

View File

@ -73,6 +73,20 @@ class Worker(Thread): # Get details {{{
8: ['août'],
9: ['sept'],
12: ['déc'],
},
'es': {
1: ['enero'],
2: ['febrero'],
3: ['marzo'],
4: ['abril'],
5: ['mayo'],
6: ['junio'],
7: ['julio'],
8: ['agosto'],
9: ['septiembre', 'setiembre'],
10: ['octubre'],
11: ['noviembre'],
12: ['diciembre'],
},
'jp': {
1: [u'1月'],
@ -101,13 +115,16 @@ class Worker(Thread): # Get details {{{
text()="Dettagli prodotto" or \
text()="Product details" or \
text()="Détails sur le produit" or \
text()="Detalles del producto" or \
text()="登録情報"]/../div[@class="content"]
'''
# The "Editor:" label is the Spanish-site publisher heading
self.publisher_xpath = '''
descendant::*[starts-with(text(), "Publisher:") or \
starts-with(text(), "Verlag:") or \
starts-with(text(), "Editore:") or \
starts-with(text(), "Editeur") or \
starts-with(text(), "Editor:") or \
starts-with(text(), "出版社:")]
'''
self.language_xpath = '''
@ -116,12 +133,14 @@ class Worker(Thread): # Get details {{{
or text() = "Language" \
or text() = "Sprache:" \
or text() = "Lingua:" \
or text() = "Idioma:" \
or starts-with(text(), "Langue") \
or starts-with(text(), "言語") \
]
'''
self.ratings_pat = re.compile(
r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち) ([\d\.]+)( (stars|Sternen|stelle)){0,1}')
r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち|de un máximo de) ([\d\.]+)( (stars|Sternen|stelle|estrellas)){0,1}')
lm = {
'eng': ('English', 'Englisch'),
@ -143,6 +162,7 @@ class Worker(Thread): # Get details {{{
for i, vals in self.months.iteritems():
for x in vals:
ans = ans.replace(x, self.english_months[i])
ans = ans.replace(' de ', ' ')
return ans
def run(self):
@ -422,6 +442,7 @@ class Amazon(Source):
'uk' : _('UK'),
'it' : _('Italy'),
'jp' : _('Japan'),
'es' : _('Spain'),
}
options = (
@ -789,6 +810,16 @@ if __name__ == '__main__': # tests {{{
),
] # }}}
es_tests = [ # {{{
(
{'identifiers':{'isbn': '8483460831'}},
[title_test('Tiempos Interesantes',
exact=True), authors_test(['Terry Pratchett'])
]
),
] # }}}
jp_tests = [ # {{{
( # isbn -> title, authors
{'identifiers':{'isbn': '9784101302720' }},
@ -804,6 +835,6 @@ if __name__ == '__main__': # tests {{{
] # }}}
test_identify_plugin(Amazon.name, com_tests)
#test_identify_plugin(Amazon.name, jp_tests)
#test_identify_plugin(Amazon.name, es_tests)
# }}}

View File

@ -173,6 +173,10 @@ class PDFWriter(QObject): # {{{
printer.setOutputFormat(QPrinter.NativeFormat)
self.view.print_(printer)
printer.abort()
else:
# The document is so corrupt that we can't render the page.
self.loop.exit(0)
raise Exception('Document cannot be rendered.')
self._render_book()
def _delete_tmpdir(self):
@ -207,11 +211,14 @@ class PDFWriter(QObject): # {{{
try:
outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author)
for item in self.combine_queue:
with open(item, 'rb') as item_stream:
inputPDF = PdfFileReader(item_stream)
for page in inputPDF.pages:
outPDF.addPage(page)
outPDF.write(self.out_stream)
# The input PDF stream must remain open until the final PDF is
# written to disk: PyPDF keeps references to pages in the input
# PDF on disk rather than copying them into memory, so closing
# the input stream before the final write would break the output.
inputPDF = PdfFileReader(open(item, 'rb'))
for page in inputPDF.pages:
outPDF.addPage(page)
outPDF.write(self.out_stream)
finally:
self._delete_tmpdir()
self.loop.exit(0)