Merge from trunk

2025-06-23 15:30:45 -04:00 · 2013-02-02 08:52:27 +01:00 · 2013-02-02 08:52:27 +01:00 · 8839bf5459
commit 8839bf5459
parent cd87a4825d cbe0e8bd17
10 changed files with 186 additions and 96 deletions
--- a/manual/faq.rst
+++ b/manual/faq.rst
@ -663,7 +663,7 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c
 |app| freezes/crashes occasionally?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-There are three possible things I know of, that can cause this:
+There are five possible things I know of, that can cause this:

    * You recently connected an external monitor or TV to your computer. In
      this case, whenever |app| opens a new window like the edit metadata
--- a/recipes/globe_and_mail.recipe
+++ b/recipes/globe_and_mail.recipe
@ -21,6 +21,10 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
    encoding               = 'utf8'
    publisher              = 'Globe & Mail'
    language               = 'en_CA'
+    use_embedded_content = False
+
+    no_stylesheets = True
+    auto_cleanup = True
    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'

    feeds          = [
@ -44,12 +48,12 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
        ]

-    remove_tags_before = dict(name='h1')
-    remove_tags = [
-            dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
-            dict(href=lambda x: x and 'tracking=' in x),
-            {'class':['articleTools', 'pagination', 'Ads', 'topad',
-                'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]
+    #remove_tags_before = dict(name='h1')
+    #remove_tags = [
+            #dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
+            #dict(href=lambda x: x and 'tracking=' in x),
+            #{'class':['articleTools', 'pagination', 'Ads', 'topad',
+                #'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
--- a/recipes/icons/libertad_digital.png
+++ b/recipes/icons/libertad_digital.png
--- a/recipes/japan_times.recipe
+++ b/recipes/japan_times.recipe
@ -1,5 +1,5 @@
 __license__   = 'GPL v3'
-__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
 japantimes.co.jp
 '''
@ -13,59 +13,41 @@ class JapanTimes(BasicNewsRecipe):
    language              = 'en_JP'
    category              = 'news, politics, japan'
    publisher             = 'The Japan Times'
-    oldest_article        = 5
+    oldest_article        = 2
    max_articles_per_feed = 150
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf8'
    publication_type      = 'newspaper'
-    masthead_url          = 'http://search.japantimes.co.jp/images/header_title.gif'
+    masthead_url          = 'http://www.japantimes.co.jp/wp-content/themes/jt_theme/library/img/logo-japan-times.png'
    extra_css             = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}'

    conversion_options = {
-                          'comment'          : description
-                        , 'tags'             : category
-                        , 'publisher'        : publisher
-                        , 'language'         : language
-                        , 'linearize_tables' : True
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
                        }

-
-    keep_only_tags    = [dict(name='div', attrs={'id':'printresult'})]
-    remove_tags       = [
-                          dict(name=['iframe','meta','link','embed','object','base'])
-                         ,dict(attrs={'id':'searchfooter'})
-                        ]
-    feeds             = [(u'The Japan Times', u'http://feeds.feedburner.com/japantimes')]
-    remove_attributes = ['border']
+    remove_tags_after  = dict(name='div', attrs={'class':'entry'})
+    keep_only_tags     = [dict(name='div', attrs={'class':'padding_block'})]
+    remove_tags        = [
+                           dict(name=['iframe','embed','object','base'])
+                          ,dict(attrs={'class':['meta_extras','related_articles']})
+                          ,dict(attrs={'id':'content_footer_menu'})
+                         ]
+    feeds              = [  # (section title, feed URL) pairs, one per site section
+                            (u'News'     , u'http://www.japantimes.co.jp/news/feed/'     )
+                           ,(u'Opinion'  , u'http://www.japantimes.co.jp/opinion/feed/'  )
+                           ,(u'Life'     , u'http://www.japantimes.co.jp/life/feed/'     )
+                           ,(u'Community', u'http://www.japantimes.co.jp/community/feed/')
+                           ,(u'Culture'  , u'http://www.japantimes.co.jp/culture/feed/'  )
+                           ,(u'Sports'   , u'http://www.japantimes.co.jp/sports/feed/'   )
+                         ]

    def get_article_url(self, article):
        rurl = BasicNewsRecipe.get_article_url(self, article)
        return rurl.partition('?')[0]
-
-    def print_version(self, url):
-        if '/rss/' in url:
-            return url.replace('.jp/rss/','.jp/print/')
-        if '/text/' in url:
-            return url.replace('.jp/text/','.jp/print/')
-        return url
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):
-               item['alt'] = 'image'
-        for item in soup.findAll('photo'):
-            item.name = 'div'
-        for item in soup.head.findAll('paragraph'):
-            item.extract()
-        for item in soup.findAll('wwfilename'):
-            item.extract()
-        for item in soup.findAll('jtcategory'):
-            item.extract()
-        for item in soup.findAll('nomooter'):
-            item.extract()
-        for item in soup.body.findAll('paragraph'):
-            item.name = 'p'
-        return soup
+        
+    def preprocess_raw_html(self, raw, url):  # swap everything up to </head> for an empty head; pages without </head> pass through unchanged
+        return ('<html><head>' + raw[raw.find('</head>'):]) if '</head>' in raw else raw
--- a/recipes/le_monde_sub.recipe
+++ b/recipes/le_monde_sub.recipe
@ -1,15 +1,16 @@
 #!/usr/bin/env  python

 __license__   = 'GPL v3'
-__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
+__copyright__ = '2012, 2013, Rémi Vanicat <vanicat at debian.org>'
 '''
 Lemonde.fr: Version abonnée
 '''


 import os, zipfile, re, time
+from urllib2 import HTTPError
+from calibre.constants import preferred_encoding

-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ptempfile import PersistentTemporaryFile
@ -20,28 +21,38 @@ class LeMondeAbonne(BasicNewsRecipe):
    __author__            = u'Rémi Vanicat'
    description           = u'Actualités'
    category              = u'Actualités, France, Monde'
+    publisher             = 'Le Monde'
    language              = 'fr'
    needs_subscription    = True
+    no_stylesheets        = True
+    smarten_punctuation   = True
+    remove_attributes     = [ 'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
+    extra_css = ''' li{margin:6pt 0}
+                    ul{margin:0}

-    no_stylesheets         = True
+                    div.photo img{max-width:100%; border:0px transparent solid;}
+                    div.photo{font-family:inherit; color:#333; text-align:center;}
+                    div.photo p{text-align:justify;font-size:.9em; line-height:.9em;}

-    extra_css = u'''
-                    h1{font-size:130%;}
-                    .ariane{font-size:xx-small;}
-                    .source{font-size:xx-small;}
-                    .href{font-size:xx-small;}
-                    .LM_caption{color:#666666; font-size:x-small;}
-                    .main-article-info{font-family:Arial,Helvetica,sans-serif;}
-                    #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
-                    #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
-                '''
+                    @page{margin:10pt}
+                    .ar-txt {color:#000; text-align:justify;}
+                    h1{text-align:left; font-size:1.25em;}
+
+                    .auteur{text-align:right; font-weight:bold}
+                    .feed{text-align:right; font-weight:bold}
+                    .po-ti2{font-weight:bold}
+                    .fen-tt{font-weight:bold;font-size:1.1em}
+    '''

    zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip'
    coverurl_format = '/img/%y%m%d01.jpg'
    path_format = "%y%m%d"
    login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'

-    keep_only_tags = [ dict(name="div", attrs={ 'class': 'po-prti' }),  dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ]
+    keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ]
+
+
+    remove_tags = [ dict(name='div', attrs={ 'class': 'po-ti' }),dict(name='div', attrs={ 'class': 'po-copy' })]

    article_id_pattern = re.compile("[0-9]+\\.html")
    article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'
@ -67,12 +78,16 @@ class LeMondeAbonne(BasicNewsRecipe):

        second = time.time()
        second += self.decalage
-        ltime = self.ltime = time.gmtime(second)
-        url = time.strftime(self.zipurl_format, ltime)

-        self.timefmt=strftime(" %A %d %B %Y", ltime)
-
-        response = browser.open(url)
+        for i in range(7):  # look back at most 7 days for the newest available edition
+            self.ltime = time.gmtime(second)
+            self.timefmt=time.strftime(" %A %d %B %Y",self.ltime).decode(preferred_encoding)
+            url = time.strftime(self.zipurl_format,self.ltime)
+            try:
+                response = browser.open(url)
+                break  # edition found: stop instead of re-requesting the same URL
+            except HTTPError:
+                second -= 24*60*60  # not available for that date: retry with the previous day

        tmp = PersistentTemporaryFile(suffix='.zip')
        self.report_progress(0.1,_('downloading zip file'))
@ -85,7 +100,7 @@ class LeMondeAbonne(BasicNewsRecipe):
        zfile.extractall(self.output_dir)
        zfile.close()

-        path = os.path.join(self.output_dir, time.strftime(self.path_format, ltime), "data")
+        path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data")

        self.articles_path = path

@ -95,13 +110,33 @@ class LeMondeAbonne(BasicNewsRecipe):

        flux = []

-        article_url = time.strftime(self.article_url_format, ltime)
+        article_url = time.strftime(self.article_url_format, self.ltime)

        for i in range(nb_index_files):
            filename = os.path.join(path, "selection_%d.html" % (i + 1))
            tmp = open(filename,'r')
-            soup=BeautifulSoup(tmp)
+            soup=BeautifulSoup(tmp,convertEntities=BeautifulSoup.HTML_ENTITIES)
            title=soup.find('span').contents[0]
+            if title=="Une":
+                title="À la une"
+            if title=="Evenement":
+                title="L'événement"
+            if title=="Planete":
+                title="Planète"
+            if title=="Economie - Entreprises":
+                title="Économie"
+            if title=="L'Oeil du Monde":
+                title="L'œil du Monde"
+            if title=="Enquete":
+                title="Enquête"
+            if title=="Editorial - Analyses":
+                title="Analyses"
+            if title=="Le Monde Economie":
+                title="Économie"
+            if title=="Le Monde Culture et idées":
+                title="Idées"
+            if title=="Le Monde Géo et politique":
+                title="Géopolitique"
            tmp.close()

            filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1))
@ -114,7 +149,7 @@ class LeMondeAbonne(BasicNewsRecipe):
                article = {
                    'title': link.contents[0],
                    'url': article_url + article_id,
-                    'descripion': '',
+                    'description': '',
                    'content': ''
                    }
                articles.append(article)
@ -129,4 +164,3 @@ class LeMondeAbonne(BasicNewsRecipe):
 # Local Variables:
 # mode: python
 # End:
-
--- a/recipes/libertad_digital.recipe
+++ b/recipes/libertad_digital.recipe
@ -0,0 +1,65 @@
+__license__   = 'GPL v3'
+__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.libertaddigital.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LibertadDigital(BasicNewsRecipe):  # news recipe for www.libertaddigital.com, built from its per-section feeds
+    title                 = 'Libertad Digital'
+    __author__            = 'Darko Miletic'
+    description           = 'En Libertad Digital encontraras noticias y opinion sobre: España, el Mundo, Internet, sociedad, economia y deportes'
+    publisher             = 'Libertad Digital S.A.'
+    category              = 'noticias, ultima hora, españa, internet, mundo, economia, sociedad, Libertad Digital'
+    oldest_article        = 2
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'cp1252'
+    use_embedded_content  = False
+    language              = 'es'
+    remove_empty_feeds    = True
+    publication_type      = 'website'
+    masthead_url          = 'http://s.libertaddigital.com/images/logo.gif'
+    extra_css             = """
+                               body{font-family: Verdana,sans-serif }
+                               img{margin-bottom: 0.4em; display:block}
+                            """
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    remove_tags = [
+                     dict(name=['meta','link','iframe','embed','object'])
+                    ,dict(name='p', attrs={'class':'copyright'})
+                  ]
+    remove_attributes=['lang']
+
+    # (section title, feed URL) pairs; each section must point at its own feed
+    feeds = [
+              (u'Portada'      , u'http://feeds2.feedburner.com/libertaddigital/portada'      )
+             ,(u'Opinion'      , u'http://feeds2.feedburner.com/libertaddigital/opinion'      )
+             ,(u'España'       , u'http://feeds2.feedburner.com/libertaddigital/nacional'     )
+             ,(u'Internacional', u'http://feeds2.feedburner.com/libertaddigital/internacional')
+             ,(u'Libre Mercado', u'http://feeds2.feedburner.com/libertaddigital/economia'     )
+             ,(u'Chic'         , u'http://feeds2.feedburner.com/libertaddigital/el-candelabro')
+             ,(u'Internet'     , u'http://feeds2.feedburner.com/libertaddigital/internet'     )
+             ,(u'Deportes'     , u'http://feeds2.feedburner.com/libertaddigital/deportes'     )
+            ]
+
+    def get_article_url(self, article):  # the feed entry's guid is used as the article URL (None when absent)
+        return article.get('guid',  None)
+
+    def print_version(self, url):  # build the printer-friendly URL from the article id embedded in the URL
+        art, sep, rest = url.rpartition('/')  # drop the last path segment
+        aart, asep, artid = art.rpartition('-')  # the id is whatever follows the last '-' of the remainder
+        return 'http://www.libertaddigital.com/c.php?op=imprimir&id=' + artid
+
+    def preprocess_html(self, soup):  # strip inline style attributes so only extra_css applies
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
--- a/recipes/thestar.recipe
+++ b/recipes/thestar.recipe
@ -1,7 +1,5 @@
-#!/usr/bin/env  python
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.thestar.com
 '''
@ -11,18 +9,17 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class TheTorontoStar(BasicNewsRecipe):
    title                 = 'The Toronto Star'
    __author__            = 'Darko Miletic'
-    description           = "Canada's largest daily newspaper"
+    description           = "Thestar.com is Canada's largest online news site. Stay current with our sports, business entertainment news and more from the Toronto Star and thestar.com"
    oldest_article        = 2
    language              = 'en_CA'
    max_articles_per_feed = 100
    no_stylesheets        = True
-    #auto_cleanup = True
-    #auto_cleanup_keep = '//div[@class="topsContent topsContentActive"]'
    use_embedded_content  = False
    delay                 = 2
    publisher             = 'The Toronto Star'
    category              = "Toronto Star,Canada's largest daily newspaper,breaking news,classifieds,careers,GTA,Toronto Maple Leafs,sports,Toronto,news,editorial,The Star,Ontario,information,columnists,business,entertainment,births,deaths,automotive,rentals,weather,archives,Torstar,technology,Joseph Atkinson"
    encoding              = 'utf-8'
+    masthead_url          = 'http://www.thestar.com/etc/designs/thestar/images/general/logoLrg.png'

    conversion_options = {
                             'comments'    : description
@ -30,23 +27,18 @@ class TheTorontoStar(BasicNewsRecipe):
                            ,'publisher'   : publisher
                         }

-    #keep_only_tags = [dict(name='div', attrs={'class':'ts-article'})]
-    #remove_tags_before = dict(name='div',attrs={'id':'ts-article_header'})
+    remove_tags_before = dict(name='div',attrs={'class':'article-headline'})

    feeds          = [
-                        (u'News'         , u'http://www.thestar.com/rss/?categories=293'    )
-                       ,(u'Opinion'     ,  u'http://www.thestar.com/rss/?categories=303'    )
-                       ,(u'Business'     , u'http://www.thestar.com/rss/?categories=294'    )
-                       ,(u'Sports'       , u'http://www.thestar.com/rss/?categories=295'    )
-                       ,(u'Entertainment', u'http://www.toronto.com/rss?categories=6298'    )
-                       ,(u'Living'       , u'http://www.thestar.com/rss/?categories=297'    )
-                       ,(u'Travel'       , u'http://www.thestar.com/rss/list/1042246?'              )
-                       ,(u'Science'      , u'http://www.thestar.com/rss?categories=6481')
+                        (u'News'         , u'http://www.thestar.com/feeds.articles.news.rss'           )
+                       ,(u'Opinion'      , u'http://www.thestar.com/feeds.articles.opinion.rss'        )
+                       ,(u'Business'     , u'http://www.thestar.com/feeds.articles.business.rss'       )
+                       ,(u'Sports'       , u'http://www.thestar.com/feeds.articles.sports.rss'         )
+                       ,(u'Entertainment', u'http://www.thestar.com/feeds.articles.entertainment.rss'  )
+                       ,(u'Living'       , u'http://www.thestar.com/feeds.articles.life.rss'           )
+                       ,(u'Travel'       , u'http://www.thestar.com/feeds.articles.life.travel.rss'    )
+                       ,(u'Technology'   , u'http://www.thestar.com/feeds.articles.life.technology.rss')
                     ]

    def print_version(self, url):
-        artl = url.rpartition('--')[0]
-        artid = artl.rpartition('/')[2]
-        return 'http://www.thestar.com/printarticle/' + artid
-
- 
+        return url.replace('.html', '.print.html')
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@ -452,6 +452,13 @@ class SamsungGalaxy(TabletOutput):
            'a resolution of 600x1280')
    screen_size = comic_screen_size = (600, 1280)

+class NookHD(TabletOutput):  # tablet output profile for the Nook HD+ (1080x1920 screen)
+    name = 'Nook HD+'
+    short_name = 'nook_hd_plus'
+    description = _('Intended for the Nook HD+ and similar tablet devices with '
+            'a resolution of 1080x1920')
+    screen_size = comic_screen_size = (1080, 1920)  # comics use the same size as regular pages
+
 class SonyReaderOutput(OutputProfile):

    name        = 'Sony Reader'
@ -786,7 +793,7 @@ output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output,
        SonyReader900Output, MSReaderOutput, MobipocketOutput, HanlinV3Output,
        HanlinV5Output, CybookG3Output, CybookOpusOutput, KindleOutput,
        iPadOutput, iPad3Output, KoboReaderOutput, TabletOutput, SamsungGalaxy,
-        SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput,
+        SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput, NookHD,
        IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput,
        BambookOutput, NookColorOutput, PocketBook900Output, PocketBookPro912Output,
        GenericEink, GenericEinkLarge, KindleFireOutput, KindlePaperWhiteOutput]
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@ -241,6 +241,11 @@ class KF8Writer(object):
            j = 0
            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
+                if id_ is None:
+                    # Can happen during tweaking
+                    id_ = tag.attrib.get('name', None)
+                    if id_ is not None:
+                        tag.attrib['id'] = id_
                if id_ is not None or barename(tag.tag).lower() in aid_able_tags:
                    aid = aidbase + j
                    tag.attrib['aid'] = to_base(aid, base=32)
--- a/src/calibre/gui2/wizard/init.py
+++ b/src/calibre/gui2/wizard/init.py
@ -198,6 +198,7 @@ class NookColor(Nook):
 class NookTablet(NookColor):
    id = 'nook_tablet'
    name = 'Nook Tablet/HD'
+    output_profile = 'nook_hd_plus'  # matches short_name of the NookHD output profile

 class CybookG3(Device):