Pull from trunk

2025-08-11 09:13:57 -04:00 · 2009-02-21 20:42:54 -08:00 · 2009-02-21 20:42:54 -08:00 · 1d6a6586a9
commit 1d6a6586a9
parent 87ff17b50f 6c8d6a4edb
20 changed files with 278 additions and 38 deletions
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@ -2,7 +2,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = 'calibre'
-__version__   = '0.4.138'
+__version__   = '0.4.139'
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 '''
 Various run time constants.
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -233,7 +233,7 @@ class RTFMetadataWriter(MetadataWriterPlugin):
 class MOBIMetadataWriter(MetadataWriterPlugin):
    name        = 'Set MOBI metadata'
-    file_types  = set(['mobi', 'prc'])
+    file_types  = set(['mobi', 'prc', 'azw'])
    description = _('Set metadata in %s files')%'MOBI'
    author      = 'Marshall T. Vandegrift'
--- a/src/calibre/devices/cybookg3/driver.py
+++ b/src/calibre/devices/cybookg3/driver.py
@ -33,6 +33,7 @@ class CYBOOKG3(USBMS):
    EBOOK_DIR_MAIN = "eBooks"
    EBOOK_DIR_CARD = "eBooks"
    THUMBNAIL_HEIGHT = 144
    SUPPORTS_SUB_DIRS = True
    def upload_books(self, files, names, on_card=False, end_session=True, 
--- a/src/calibre/devices/cybookg3/t2b.py
+++ b/src/calibre/devices/cybookg3/t2b.py
@ -30,7 +30,7 @@ def write_t2b(t2bfile, coverdata=None):
    if coverdata != None:
        coverdata = StringIO.StringIO(coverdata)
        cover = Image.open(coverdata).convert("L")
-        cover.thumbnail((96, 144))
+        cover.thumbnail((96, 144), Image.ANTIALIAS)
        t2bcover = Image.new('L', (96, 144), 'white')
        x, y = cover.size
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -205,9 +205,8 @@ class HTMLProcessor(Processor, Rationalizer):
    def save(self):
        for meta in list(self.root.xpath('//meta')):
            meta.getparent().remove(meta)
-        #for img in self.root.xpath('//img[@src]'):
+        # Strip all comments since Adobe DE is petrified of them
-        #    self.convert_image(img)
+        Processor.save(self, strip_comments=True)
        Processor.save(self)
    def remove_first_image(self):
        images = self.root.xpath('//img')
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@ -331,9 +331,8 @@ class PreProcessor(object):
                  # Convert all entities, since lxml doesn't handle them well
                  (re.compile(r'&(\S+?);'), convert_entities),
                  # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-                  (re.compile(r'(?i)<{0,1}!\[(end){0,1}if[^>]*>'), lambda match: ''),
+                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), 
-                  # Strip all comments since Adobe DE is petrified of them
+                   lambda match: ''),
                  (re.compile(r'<!--[^>]*>'), lambda match : ''),
                  ]
    # Fix pdftohtml markup
@ -447,7 +446,7 @@ class Parser(PreProcessor, LoggingInterface):
    def save_path(self):
        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
-    def save(self):
+    def save(self, strip_comments=False):
        '''
        Save processed HTML into the content directory.
        Should be called after all HTML processing is finished.
@ -458,7 +457,11 @@ class Parser(PreProcessor, LoggingInterface):
            svg.set('xmlns', 'http://www.w3.org/2000/svg')
        ans = tostring(self.root, pretty_print=self.opts.pretty_print)
-        ans = re.compile(r'<head>', re.IGNORECASE).sub('<head>\n\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
+        ans = re.compile(r'<head>', re.IGNORECASE).sub(
            '<head>\n\t<meta http-equiv="Content-Type" '
            'content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
        if strip_comments:
            ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
        with open(self.save_path(), 'wb') as f:
            f.write(ans)
            return f.name
@ -594,7 +597,7 @@ class Processor(Parser):
                mark = etree.Element('hr', style=page_break_before)
            elem.addprevious(mark)
-    def save(self):
+    def save(self, strip_comments=False):
        style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
        for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
            if sheet is not None:
@ -608,7 +611,7 @@ class Processor(Parser):
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                open(path, 'wb').write(raw)
-        return Parser.save(self)
+        return Parser.save(self, strip_comments=strip_comments)
    def populate_toc(self, toc):
        '''
--- a/src/calibre/ebooks/lrf/init.py
+++ b/src/calibre/ebooks/lrf/init.py
@ -30,6 +30,7 @@ preferred_source_formats = [
                            'XHTML',
                            'PRC',
                            'AZW',
                            'FB2',
                            'RTF',
                            'PDF',
                            'TXT',
--- a/src/calibre/ebooks/lrf/fb2/convert_from.py
+++ b/src/calibre/ebooks/lrf/fb2/convert_from.py
@ -38,6 +38,7 @@ def extract_embedded_content(doc):
            open(fname, 'wb').write(data)
 def to_html(fb2file, tdir):
    fb2file = os.path.abspath(fb2file)
    cwd = os.getcwd()
    try:
        os.chdir(tdir)
@ -52,7 +53,7 @@ def to_html(fb2file, tdir):
        result = transform(doc)
        open('index.html', 'wb').write(transform.tostring(result))
        try:
-            mi = get_metadata(open(fb2file, 'rb'))
+            mi = get_metadata(open(fb2file, 'rb'), 'fb2')
        except:
            mi = MetaInformation(None, None)
        if not mi.title:
--- a/src/calibre/gui2/images/news/e_novine.png
+++ b/src/calibre/gui2/images/news/e_novine.png
--- a/src/calibre/trac/plugins/templates/linux.html
+++ b/src/calibre/trac/plugins/templates/linux.html
@ -114,10 +114,13 @@ sudo python -c "import urllib2; exec urllib2.urlopen('http://calibre.kovidgoyal.
 wget -O- http://calibre.kovidgoyal.net/downloads/${app}-${version}.tar.gz | tar xvz 
 cd calibre*
 python setup.py build &amp;&amp; sudo python setup.py install
 sudo calibre_postinstall
                                </pre>
                                Note that if your distribution does not have a
                                correctly compiled libunrar.so, ${app} will not 
-                                support rar files.
+                                support rar files. The calibre_postinstall step 
                                is required for device detection and integration
                                with your desktop environment.
                            </p>
                        </div>
                    </td>
--- a/src/calibre/web/feeds/init.py
+++ b/src/calibre/web/feeds/init.py
@ -5,10 +5,11 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging, traceback, copy
+import time, logging, traceback, copy, re
 from datetime import datetime
 from calibre.web.feeds.feedparser import parse
 from calibre import entity_to_unicode
 from lxml import html
 class Article(object):
@ -19,6 +20,11 @@ class Article(object):
        self.downloaded = False
        self.id = id
        self.title = title.strip() if title else title
        try:
            self.title = re.sub(r'&(\S+);', 
                entity_to_unicode, self.title)
        except:
            pass
        self.url = url
        self.summary = summary
        if summary and not isinstance(summary, unicode):
@ -38,6 +44,7 @@ class Article(object):
        self.utctime = datetime(*self.date[:6])
        self.localtime = self.utctime + self.time_offset
    def __repr__(self):
        return \
 (u'''\
@ -92,6 +99,7 @@ class Feed(object):
                break
            self.parse_article(item)
    def populate_from_preparsed_feed(self, title, articles, oldest_article=7, 
                           max_articles_per_feed=100):
        self.title      = title if title else _('Unknown feed')
--- a/src/calibre/web/feeds/recipes/init.py
+++ b/src/calibre/web/feeds/recipes/init.py
@ -30,7 +30,8 @@ recipe_modules = ['recipe_' + r for r in (
           'honoluluadvertiser', 'starbulletin', 'exiled', 'indy_star', 'dna',
           'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices',
           'hindu', 'cincinnati_enquirer', 'physics_world', 'pressonline',
-           'la_republica', 'physics_today',
+           'la_republica', 'physics_today', 'chicago_tribune', 'e_novine',
           'al_jazeera', 'winsupersite', 
          )]
 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_al_jazeera.py
+++ b/src/calibre/web/feeds/recipes/recipe_al_jazeera.py
@ -0,0 +1,50 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
 aljazeera.net
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class AlJazeera(BasicNewsRecipe):
    '''
    News recipe for the English edition of Al Jazeera (aljazeera.net).

    Articles come from a single RSS feed; only the main content ``div``
    of each page is kept and inline styling is stripped.
    '''
    title                  = 'Al Jazeera in English'
    __author__             = 'Darko Miletic'
    description            = 'News from Middle East'
    publisher              = 'Al Jazeera'
    category               = 'news, politics, middle east'
    # One download at a time with a pause between requests
    # (NOTE(review): delay is presumably in seconds -- confirm).
    simultaneous_downloads = 1
    delay                  = 4
    oldest_article         = 1
    max_articles_per_feed  = 100
    no_stylesheets         = True
    encoding               = 'iso-8859-1'
    remove_javascript      = True
    use_embedded_content   = False
    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        , '--ignore-tables'
                        ]
    # The option is spelled ``linearize_tables`` (plural), matching the
    # other recipes; the singular form previously used here looks like a typo.
    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
    # Keep only the article container and drop embedded objects, <link>
    # tags and the "most active" table cells.
    keep_only_tags = [dict(name='div', attrs={'id':'ctl00_divContent'})]
    remove_tags = [
                     dict(name=['object','link'])
                    ,dict(name='td', attrs={'class':['MostActiveDescHeader','MostActiveDescBody']})
                  ]
    feeds = [(u'AL JAZEERA ENGLISH (AJE)', u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989' )]
    def preprocess_html(self, soup):
        '''
        Remove the ``style`` and ``face`` attributes from every tag that
        carries them and return the modified soup.
        '''
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(face=True):
            del item['face']
        return soup
--- a/src/calibre/web/feeds/recipes/recipe_chicago_tribune.py
+++ b/src/calibre/web/feeds/recipes/recipe_chicago_tribune.py
@ -0,0 +1,82 @@
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re
 from urlparse import urlparse, urlunparse
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
 from threading import RLock
 class ChicagoTribune(BasicNewsRecipe):
    '''
    News recipe for the Chicago Tribune.

    Each article page is fetched by :meth:`get_obfuscated_article`, which
    prefers the print version of the page when a print link can be found
    and otherwise builds a minimal document from the page title and the
    article body.
    '''
    title       = 'Chicago Tribune'
    __author__  = 'Kovid Goyal'
    description = 'Politics, local and business news from Chicago'
    language    = _('English')
    use_embedded_content    = False
    # NOTE(review): presumably this flag is what makes the framework call
    # get_obfuscated_article() for every article -- confirm in BasicNewsRecipe.
    articles_are_obfuscated = True
    remove_tags_before      = dict(name='h1')
    # Held for the whole of get_obfuscated_article().
    obfuctation_lock        = RLock()
    feeds = [
             ('Latest news', 'http://feeds.chicagotribune.com/chicagotribune/news/'),
             ('Local news', 'http://feeds.chicagotribune.com/chicagotribune/news/local/'),
             ('Nation/world', 'http://feeds.chicagotribune.com/chicagotribune/news/nationworld/'),
             ('Hot topics', 'http://feeds.chicagotribune.com/chicagotribune/hottopics/'),
             ('Most E-mailed stories', 'http://feeds.chicagotribune.com/chicagotribune/email/'),
             ('Opinion', 'http://feeds.chicagotribune.com/chicagotribune/opinion/'),
             ('Off Topic', 'http://feeds.chicagotribune.com/chicagotribune/offtopic/'),
             ('Politics', 'http://feeds.chicagotribune.com/chicagotribune/politics/'),
             ('Special Reports', 'http://feeds.chicagotribune.com/chicagotribune/special/'),
             ('Religion News', 'http://feeds.chicagotribune.com/chicagotribune/religion/'),
             ('Business news', 'http://feeds.chicagotribune.com/chicagotribune/business/'),
             ('Jobs and Careers', 'http://feeds.chicagotribune.com/chicagotribune/career/'),
             ('Local scene', 'http://feeds.chicagotribune.com/chicagohomes/localscene/'),
             ('Phil Rosenthal', 'http://feeds.chicagotribune.com/chicagotribune/rosenthal/'),
             ('Tech Buzz', 'http://feeds.chicagotribune.com/chicagotribune/techbuzz/'),
             ('Your Money', 'http://feeds.chicagotribune.com/chicagotribune/yourmoney/'),
             ('Jon Hilkevitch - Getting around', 'http://feeds.chicagotribune.com/chicagotribune/gettingaround/'),
             ('Jon Yates - What\'s your problem?', 'http://feeds.chicagotribune.com/chicagotribune/problem/'),
             ('Garisson Keillor', 'http://feeds.chicagotribune.com/chicagotribune/keillor/'),
             ('Marks Jarvis - On Money', 'http://feeds.chicagotribune.com/chicagotribune/marksjarvisonmoney/'),
             ('Sports', 'http://feeds.chicagotribune.com/chicagotribune/sports/'),
             ('Arts and Architecture', 'http://feeds.chicagotribune.com/chicagotribune/arts/'),
             ('Books', 'http://feeds.chicagotribune.com/chicagotribune/books/'),
             ('Magazine', 'http://feeds.chicagotribune.com/chicagotribune/magazine/'),
             ('Movies', 'http://feeds.chicagotribune.com/chicagotribune/movies/'),
             # NOTE(review): same URL as 'Movies' above -- looks like a
             # copy/paste slip; confirm the real music feed URL.
             ('Music', 'http://feeds.chicagotribune.com/chicagotribune/movies/'),
             ('TV', 'http://feeds.chicagotribune.com/chicagotribune/tv/'),
             ('Hypertext', 'http://feeds.chicagotribune.com/chicagotribune/hypertext/'),
             ('iPhone Blog', 'http://feeds.feedburner.com/redeye/iphoneblog'),
             ('Julie\'s Health Club', 'http://feeds.chicagotribune.com/chicagotribune_julieshealthclub/'),
             ]
    # Every temporary file created by get_obfuscated_article() is appended
    # here. Class-level list, so it is shared by all instances.
    temp_files = []
    def get_article_url(self, article):
        '''
        Return the article URL: ``feedburner_origlink`` if present,
        otherwise ``guid``, otherwise ``link``.
        '''
        return article.get('feedburner_origlink', article.get('guid', article.get('link')))
    def get_obfuscated_article(self, url, logger):
        '''
        Download the article at `url` and save it to a temporary file.

        If the page has an image with ``alt="Print"`` whose parent contains
        a link, the linked (print) page is used and its site-relative image
        URLs are made absolute. Otherwise a minimal HTML document is built
        from the ``page-title`` element and the ``asset-content`` element.

        :param url: URL of the article page.
        :param logger: Unused here.
        :return: Path of the temporary file holding the UTF-8 encoded HTML.
        '''
        with self.obfuctation_lock:
            soup = self.index_to_soup(url)
            print_icon = soup.find('img', alt='Print')
            # The icon may exist without a usable link next to it; treat
            # that the same as "no print version" instead of failing on
            # a None subscript.
            a = None
            if print_icon is not None:
                a = print_icon.parent.find('a', href=True)
            if a is not None:
                purl = urlparse(url)
                # Reuse scheme and host of the article, path from the link
                xurl = urlunparse(purl[:2] + (a['href'], '', '', ''))
                soup = self.index_to_soup(xurl)
                for img in soup.findAll('img', src=True):
                    if img['src'].startswith('/'):
                        img['src'] = urlunparse(purl[:2]+(img['src'], '', '', ''))
                html = unicode(soup)
            else:
                h1 = soup.find(id='page-title')
                body = soup.find(attrs={'class':re.compile('asset-content')})
                # Skip elements that were not found so the literal text
                # "None" never ends up in the generated document.
                content = u''.join(unicode(x) for x in (h1, body) if x is not None)
                html = u'<html><head/><body>%s</body></html>'%content
            self.temp_files.append(PersistentTemporaryFile('_chicago_tribune.xhtml'))
            self.temp_files[-1].write(html.encode('utf-8'))
            self.temp_files[-1].close()
            return self.temp_files[-1].name
--- a/src/calibre/web/feeds/recipes/recipe_e_novine.py
+++ b/src/calibre/web/feeds/recipes/recipe_e_novine.py
@ -0,0 +1,58 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
 e-novine.com
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class E_novine(BasicNewsRecipe):
    '''
    News recipe for e-novine.com.

    Articles come from a single RSS feed; only the article container
    ``div`` is kept and the pages are tagged as ``sr-Latn-ME``.
    '''
    title                 = 'E-Novine'
    __author__            = 'Darko Miletic'
    description           = 'News from Serbia'
    publisher             = 'E-novine'
    category              = 'news, politics, Balcans'
    oldest_article        = 1
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'cp1250'
    cover_url             = 'http://www.e-novine.com/slike/slike_3/r1/g2008/m03/y3165525326702598.jpg'
    remove_javascript     = True
    use_embedded_content  = False
    language              = _('Serbian')
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]
    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
    # Replace U+0110 with U+00D0 in the downloaded markup
    # (NOTE(review): presumably a glyph-availability workaround -- confirm).
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
    keep_only_tags = [dict(name='div', attrs={'id':['css_47_0_2844H']})]
    remove_tags = [dict(name=['object','link','embed','iframe'])]
    feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml' )]
    def preprocess_html(self, soup):
        '''
        Mark the document as ``sr-Latn-ME``, strip inline ``style``
        attributes and tidy the article container.

        Inside the container ``div`` the first child ``div`` is kept at the
        top and the ``div`` that followed it is removed. Containers with
        fewer than two ``div`` children are handled without raising.

        :return: The modified soup.
        '''
        soup.html['xml:lang'] = 'sr-Latn-ME'
        soup.html['lang']     = 'sr-Latn-ME'
        mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        ftag = soup.find('div', attrs={'id':'css_47_0_2844H'})
        if ftag:
            it = ftag.div
            if it is not None:
                # Temporarily detach the first div so that ftag.div now
                # resolves to the div that came after it.
                it.extract()
                unwanted = ftag.div
                if unwanted is not None:
                    unwanted.extract()
                ftag.insert(0,it)
        return soup
--- a/src/calibre/web/feeds/recipes/recipe_infobae.py
+++ b/src/calibre/web/feeds/recipes/recipe_infobae.py
@ -19,7 +19,7 @@ class Infobae(BasicNewsRecipe):
    no_stylesheets        = True
    use_embedded_content  = False
    language              = _('Spanish')
-    encoding              = 'iso-8859-1'
+    encoding              = 'cp1252'
    cover_url             = 'http://www.infobae.com/imgs/header/header.gif'
    remove_javascript     = True
@ -28,6 +28,7 @@ class Infobae(BasicNewsRecipe):
                        , '--category' , category
                        , '--publisher', publisher
                        , '--ignore-tables'
                        , '--ignore-colors'
                        ]
    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
--- a/src/calibre/web/feeds/recipes/recipe_la_segunda.py
+++ b/src/calibre/web/feeds/recipes/recipe_la_segunda.py
@ -21,14 +21,16 @@ class LaSegunda(BasicNewsRecipe):
    encoding              = 'cp1252'
    cover_url             = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
    remove_javascript     = True
    language              = _('Spanish')    
    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        , '--ignore-tables'
                        ]
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} "' 
    keep_only_tags = [dict(name='table')]
@ -52,10 +54,7 @@ class LaSegunda(BasicNewsRecipe):
    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
        soup.head.insert(0,mtag)
        for item in soup.findAll(name='table', width=True):
            del item['width']
        for item in soup.findAll(style=True):
            del item['style']
        return soup
    language = _('Spanish')    
--- a/src/calibre/web/feeds/recipes/recipe_pagina12.py
+++ b/src/calibre/web/feeds/recipes/recipe_pagina12.py
@ -7,11 +7,10 @@ pagina12.com.ar
 '''
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 class Pagina12(BasicNewsRecipe):
-    title                 = u'Pagina/12'
+    title                 = 'Pagina/12'
    __author__            = 'Darko Miletic'
    description           = 'Noticias de Argentina y el resto del mundo'
    publisher             = 'La Pagina S.A.'
@ -20,9 +19,11 @@ class Pagina12(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'cp1252'
-    cover_url             = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/TAPAN.jpg')
+    cover_url             = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/tapagn.jpg')
    remove_javascript     = True
    use_embedded_content  = False
    language              = _('Spanish')    
    html2lrf_options = [
                          '--comment', description
@ -50,5 +51,3 @@ class Pagina12(BasicNewsRecipe):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
    language = _('Spanish')    
--- a/src/calibre/web/feeds/recipes/recipe_winsupersite.py
+++ b/src/calibre/web/feeds/recipes/recipe_winsupersite.py
@ -0,0 +1,28 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class Winsupersite(BasicNewsRecipe):
    '''
    News recipe for Paul Thurrott's SuperSite for Windows.

    Articles come from the site's RSS feed. Everything from the
    "--Paul Thurrott" paragraph to the end of the body is cut from each
    page.
    '''
    # Descriptive metadata
    title                 = u'Supersite for Windows'
    __author__            = 'Hypernova'
    description           = u'Paul Thurrott SuperSite for Windows'
    publisher             = 'Paul Thurrott'
    language              = _('English')
    # Download limits
    oldest_article        = 30
    max_articles_per_feed = 100
    feeds = [
        (u'Supersite for Windows',
         u'http://www.winsupersite.com/supersite.xml'),
    ]
    # Clean-up and conversion settings
    use_embedded_content  = False
    no_stylesheets        = True
    remove_javascript     = True
    remove_tags_before    = dict(name='h1')
    html2lrf_options      = ['--ignore-tables']
    html2epub_options     = 'linearize_tables = True'
    preprocess_regexps = [
        (
            re.compile(r'<p>--Paul Thurrott.*</body>', re.IGNORECASE | re.DOTALL),
            lambda m: '</body>',
        ),
    ]
    def get_browser(self):
        '''
        Return the standard recipe browser after it has opened the site's
        front page once.
        '''
        browser = BasicNewsRecipe.get_browser()
        browser.open('http://www.winsupersite.com')
        return browser
--- a/upload.py
+++ b/upload.py
@ -284,7 +284,13 @@ class gui(OptionlessCommand):
                manifest = '<RCC>\n<qresource prefix="/">\n%s\n</qresource>\n</RCC>'%'\n'.join(files)
                with open('images.qrc', 'wb') as f:
                    f.write(manifest)
                try:
                    check_call(['pyrcc4', '-o', images, 'images.qrc'])
                except:
                    import traceback
                    traceback.print_exc()
                    raise Exception('You do not have pyrcc4 in your PATH. '
                                    'Install the PyQt4 development tools.')
            else:
                print 'Images are up to date'
        finally: