Merge from trunk

2025-08-30 23:00:21 -04:00 · 2011-01-25 09:53:01 +00:00 · 2011-01-25 09:53:01 +00:00 · d9c9accdda
commit d9c9accdda
parent d3cfeb56a2 559ff8c59f
10 changed files with 339 additions and 21 deletions
--- a/resources/recipes/20_minutos.recipe
+++ b/resources/recipes/20_minutos.recipe
@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1295310874(BasicNewsRecipe):
+    """Recipe for the free Spanish-language newspaper 20 Minutos.
+
+    Purely declarative: publication metadata, download limits and the list
+    of RSS feeds to fetch.
+    """
+
+    # --- Publication metadata ---
+    title = u'20 Minutos (Boletin)'
+    __author__ = 'Luis Hernandez'
+    description = 'Periódico gratuito en español'
+    language = 'es'
+    cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
+
+    # --- Download limits ---
+    oldest_article = 2
+    max_articles_per_feed = 50
+
+    # --- Sections, as (section title, RSS URL) pairs ---
+    feeds = [
+        (u'VESPERTINO', u'http://20minutos.feedsportal.com/c/32489/f/478284/index.rss'),
+        (u'DEPORTES', u'http://20minutos.feedsportal.com/c/32489/f/478286/index.rss'),
+        (u'CULTURA', u'http://www.20minutos.es/rss/ocio/'),
+        (u'TV', u'http://20minutos.feedsportal.com/c/32489/f/490877/index.rss'),
+    ]
--- a/resources/recipes/abc.recipe
+++ b/resources/recipes/abc.recipe
@ -0,0 +1,43 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ABCRecipe(BasicNewsRecipe):
+    """Recipe for the Czech-language site ABC Linuxu (abclinuxu.cz).
+
+    Articles are requested through the URL built by print_version() and
+    then trimmed with the tag filters declared below.
+    """
+
+    # --- Publication metadata ---
+    title = u'ABC Linuxu'
+    __author__ = 'Funthomas'
+    language = 'cs'
+
+    # --- Download limits ---
+    oldest_article = 5
+    max_articles_per_feed = 3  # 5
+
+    # --- Sections, as (section title, RSS URL) pairs ---
+    feeds = [
+        # Disabled by the recipe author:
+        # (u'Blogy', u'http://www.abclinuxu.cz/auto/blogDigest.rss'),
+        (u'Články', u'http://www.abclinuxu.cz/auto/abc.rss'),
+        (u'Zprávičky', 'http://www.abclinuxu.cz/auto/zpravicky.rss'),
+    ]
+
+    # --- Generic clean-up switches ---
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = ['width', 'height']
+
+    # Anchor the start of the kept content at the first <h1>.
+    remove_tags_before = dict(name='h1')
+
+    # Elements stripped from the article, selected by CSS class.
+    remove_tags = [
+        dict(attrs={'class': ['meta-vypis', 'page_tools', 'cl_perex']}),
+        dict(attrs={'class': ['cl_nadpis-link', 'komix-nav']}),
+    ]
+
+    # Markers used to cut off the tail of the page.
+    remove_tags_after = [
+        dict(name='div', attrs={'class': ['cl_perex', 'komix-nav']}),
+        dict(attrs={'class': ['meta-vypis', 'page_tools']}),
+        # NOTE(review): empty selector kept verbatim from the original; it
+        # looks like a leftover placeholder -- confirm before removing.
+        dict(name='', attrs={'': ''}),
+    ]
+
+    # Collapse whatever sits between a closing </div> and the "perex"
+    # paragraph (DOTALL lets the match span several lines).
+    preprocess_regexps = [
+        (
+            re.compile(r'</div>.*<p class="perex">', re.DOTALL),
+            lambda match: '</div><p class="perex">',
+        ),
+    ]
+
+    extra_css = '''
+            h1 {font-size:130%; font-weight:bold}
+            h3 {font-size:111%; font-weight:bold}
+        '''
+
+    def print_version(self, url):
+        """Return *url* with the site's fixed print query string appended."""
+        print_query = '?varianta=print&noDiz'
+        return url + print_query
--- a/resources/recipes/idnes.recipe
+++ b/resources/recipes/idnes.recipe
@ -0,0 +1,54 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class iHeuteRecipe(BasicNewsRecipe):
+    """Recipe for the Czech news site iDnes.cz.
+
+    Covers the news, sport, tech, mobile, economy, travel and comics feeds.
+    Where a printable page can be derived from the feed URL it is used
+    instead of the regular article page; see print_version().
+    """
+
+    # --- Publication metadata ---
+    __author__ = 'FunThomas'
+    title = u'iDnes.cz'
+    publisher = u'MAFRA a.s.'
+    description = 'iDNES.cz Zprávy, Technet, Komiksy a další'
+    encoding = 'cp1250'
+    language = 'cs'
+    cover_url = 'http://g.idnes.cz/u/loga-n4/idnes.gif'
+
+    # --- Download limits ---
+    oldest_article = 3
+    max_articles_per_feed = 2
+
+    # --- Sections, as (section title, RSS URL) pairs ---
+    feeds = [
+        (u'Zprávy', u'http://servis.idnes.cz/rss.asp?c=zpravodaj'),
+        (u'Sport', u'http://servis.idnes.cz/rss.asp?c=sport'),
+        (u'Technet', u'http://servis.idnes.cz/rss.asp?c=technet'),
+        (u'Mobil', u'http://servis.idnes.cz/rss.asp?c=mobil'),
+        (u'Ekonomika', u'http://servis.idnes.cz/rss.asp?c=ekonomikah'),
+        # Disabled by the recipe author:
+        # (u'Kultura', u'http://servis.idnes.cz/rss.asp?c=kultura'),
+        (u'Cestování', u'http://servis.idnes.cz/rss.asp?c=iglobe'),
+        # Disabled by the recipe author:
+        # (u'Kavárna', u'http://servis.idnes.cz/rss.asp?r=kavarna'),
+        (u'Komixy', u'http://servis.idnes.cz/rss.asp?c=komiksy'),
+    ]
+
+    # --- Generic clean-up switches ---
+    remove_javascript = True
+    no_stylesheets = True
+    remove_attributes = ['width', 'height']
+
+    # Containers that hold the article body (or the comic strip).
+    keep_only_tags = [
+        dict(name='div', attrs={'class': ['art-full adwords-text', 'dil-day']}),
+        dict(name='table', attrs={'class': ['kemel-box']}),
+    ]
+
+    # Elements stripped from the kept content.
+    remove_tags = [
+        dict(name='div', attrs={'id': ['zooming']}),
+        dict(name='div', attrs={'class': ['related', 'mapa-wrapper']}),
+        dict(name='table', attrs={'id': ['opener-img', 'portal']}),
+        dict(name='table', attrs={'class': ['video-16ku9']}),
+    ]
+    remove_tags_after = [dict(name='div', attrs={'id': ['related', 'related2']})]
+
+    extra_css = '''
+                  h1 {font-size:125%; font-weight:bold}
+                  h3 {font-size:110%; font-weight:bold}
+                '''
+
+    def print_version(self, url):
+        """Map an article URL taken from a feed to the URL to download.
+
+        * Dilbert strips ('dilbert.asp' in the part before '?'): rewritten
+          to the GIF print view.
+        * Kemel strips ('kemel.asp' in the part before '?'): returned
+          unchanged, because the Kemel print page does not work.
+        * Anything else: the query string is handed to tiskni.asp.
+
+        A URL that contains no '?' has no query string to hand over and is
+        returned unchanged. The original code indexed the missing query
+        part and raised IndexError for such URLs.
+        """
+        parts = url.split('?')
+        path = parts[0]
+        if 'dilbert.asp' in path:
+            # Dilbert comic strip
+            return url.replace('.htm', '.gif&tisk=1').replace('.asp', '.aspx')
+        if 'kemel.asp' in path or len(parts) < 2:
+            # Kemel strip, or nothing after '?' to build a print URL from
+            return url
+        return 'http://zpravy.idnes.cz/tiskni.asp?' + parts[1]
--- a/resources/recipes/la_tribuna.recipe
+++ b/resources/recipes/la_tribuna.recipe
@ -0,0 +1,29 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294946868(BasicNewsRecipe):
+    """Recipe for La Tribuna de Talavera, the daily of Talavera de la Reina.
+
+    Purely declarative: publication metadata, download limits, tag filters
+    and the single front-page feed.
+    """
+
+    # --- Publication metadata ---
+    title = u'La Tribuna de Talavera'
+    __author__ = 'Luis Hernández'
+    description = 'Diario de Talavera de la Reina'
+    cover_url = 'http://www.latribunadetalavera.es/entorno/mancheta.gif'
+    language = 'es'
+    encoding = 'utf-8'
+    timefmt = '[%a, %d %b, %Y]'
+
+    # --- Download limits ---
+    oldest_article = 5
+    max_articles_per_feed = 50
+
+    # --- Generic clean-up switches ---
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+
+    # Containers that hold the article, its photo and its text.
+    keep_only_tags = [
+        dict(name='div', attrs={'id': ['articulo']}),
+        dict(name='div', attrs={'class': ['foto']}),
+        dict(name='p', attrs={'id': ['texto']}),
+    ]
+
+    # Boundary markers for trimming the head and tail of the page.
+    remove_tags_before = dict(name='div', attrs={'class': ['comparte']})
+    remove_tags_after = dict(name='div', attrs={'id': ['relacionadas']})
+
+    # --- Sections, as (section title, RSS URL) pairs ---
+    feeds = [
+        (u'Portada', u'http://www.latribunadetalavera.es/rss.html'),
+    ]
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@ -1,6 +1,5 @@
 #!/usr/bin/env  python
 # -*- coding: utf-8 -*-
-
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
@ -28,6 +27,10 @@ class NYTimes(BasicNewsRecipe):
    # previous paid versions of the new york times to best sent to the back issues folder on the kindle
    replaceKindleVersion = False

+    # download higher resolution images than the small thumbnails typically included in the article
+    # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
+    useHighResImages = True
+
    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
@ -90,7 +93,6 @@ class NYTimes(BasicNewsRecipe):
                    (u'Sunday Magazine',u'magazine'),
                    (u'Week in Review',u'weekinreview')]

-
    if headlinesOnly:
        title='New York Times Headlines'
        description = 'Headlines from the New York Times'
@ -127,7 +129,7 @@ class NYTimes(BasicNewsRecipe):

    earliest_date = date.today() - timedelta(days=oldest_article)

-    __author__  = 'GRiker/Kovid Goyal/Nick Redding'
+    __author__  = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier'
    language = 'en'
    requires_version = (0, 7, 5)

@ -149,7 +151,7 @@ class NYTimes(BasicNewsRecipe):
                            'dottedLine',
                            'entry-meta',
                            'entry-response module',
-                            'icon enlargeThis',
+                            #'icon enlargeThis', #removed to provide option for high res images
                            'leftNavTabs',
                            'metaFootnote',
                            'module box nav',
@ -163,7 +165,23 @@ class NYTimes(BasicNewsRecipe):
                            'entry-tags', #added for DealBook
                            'footer promos clearfix', #added for DealBook
                            'footer links clearfix', #added for DealBook
-                            'inlineImage module', #added for DealBook
+                            'tabsContainer', #added for other blog downloads
+                            'column lastColumn', #added for other blog downloads
+                            'pageHeaderWithLabel', #added for other gadgetwise downloads
+                            'column two', #added for other blog downloads
+                            'column two last', #added for other blog downloads
+                            'column three', #added for other blog downloads
+                            'column three last', #added for other blog downloads
+                            'column four',#added for other blog downloads
+                            'column four last',#added for other blog downloads
+                            'column last', #added for other blog downloads
+                            'timestamp published', #added for other blog downloads
+                            'entry entry-related',
+                            'subNavigation tabContent active', #caucus blog navigation
+                            'columnGroup doubleRule',
+                            'mediaOverlay slideshow',
+                            'headlinesOnly multiline flush',
+                            'wideThumb',
                            re.compile('^subNavigation'),
                            re.compile('^leaderboard'),
                            re.compile('^module'),
@ -254,7 +272,7 @@ class NYTimes(BasicNewsRecipe):
    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
-        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook
            return True
        if 'nytimes.com' not in url:
            return True
@ -480,7 +498,7 @@ class NYTimes(BasicNewsRecipe):
                for lidiv in div.findAll('li'):
                    if not skipping:
                        self.handle_article(lidiv)
-
+            
        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)

@ -591,20 +609,85 @@ class NYTimes(BasicNewsRecipe):
                if article_date < self.earliest_date:
                    self.log("Skipping article dated %s" % date_str)
                    return None
+                    
+        #all articles are from today, no need to print the date on every page
+        try:
+            if not self.webEdition:
+                date_tag = soup.find(True,attrs={'class': ['dateline','date']})
+                if date_tag:
+                    date_tag.extract()
+        except:
+            self.log("Error removing the published date")

-        kicker_tag = soup.find(attrs={'class':'kicker'})
-        if kicker_tag: # remove Op_Ed author head shots
-            tagline = self.tag_to_string(kicker_tag)
-            if tagline=='Op-Ed Columnist':
-                img_div = soup.find('div','inlineImage module')
-                if img_div:
-                    img_div.extract()
-
+        if self.useHighResImages:
+            try:
+                #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
+                if enlargeThisList:
+                    for popupref in enlargeThisList:
+                        popupreflink = popupref.find('a')
+                        if popupreflink:
+                            reflinkstring = str(popupreflink['href'])
+                            refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
+                            refend = reflinkstring.find(".html", refstart) + len(".html")
+                            reflinkstring = reflinkstring[refstart:refend]
+                            
+                            popuppage = self.browser.open(reflinkstring)
+                            popuphtml = popuppage.read()
+                            popuppage.close()
+                            if popuphtml:
+                                st = time.localtime()
+                                year = str(st.tm_year)
+                                month = "%.2d" % st.tm_mon
+                                day = "%.2d" % st.tm_mday
+                                imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/')                                
+                                highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+                                popupSoup = BeautifulSoup(popuphtml)
+                                highResTag = popupSoup.find('img', {'src':highResImageLink})
+                                if highResTag:
+                                    try:
+                                        newWidth = highResTag['width']
+                                        newHeight = highResTag['height']
+                                        imageTag = popupref.parent.find("img")
+                                    except:
+                                        self.log("Error: finding width and height of img")
+                                    popupref.extract()
+                                    if imageTag:
+                                        try:
+                                            imageTag['src'] = highResImageLink
+                                            imageTag['width'] = newWidth
+                                            imageTag['height'] = newHeight
+                                        except:
+                                            self.log("Error setting the src width and height parameters")
+            except Exception as e:
+                self.log("Error pulling high resolution images")
+                
+            try:
+                #remove "Related content" bar
+                runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline']})
+                if runAroundsFound:
+                    for runAround in runAroundsFound:
+                        #find all section headers
+                        hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']})
+                        if hlines:
+                            for hline in hlines:
+                                hline.extract()
+            except:
+                self.log("Error removing related content bar")
+     
+                
+            try:
+                #in case pulling images failed, delete the enlarge this text
+                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
+                if enlargeThisList:
+                    for popupref in enlargeThisList:
+                        popupref.extract()
+            except:
+                self.log("Error removing Enlarge this text")

        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):
-
        try:
                if self.one_picture_per_article:
                        # Remove all images after first
@ -766,6 +849,8 @@ class NYTimes(BasicNewsRecipe):
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
+                if not articlebodies: #added to account for blog formats
+                    articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
@ -774,13 +859,14 @@ class NYTimes(BasicNewsRecipe):
                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                                #account for blank paragraphs and short paragraphs by appending them to longer ones
                                if len(refparagraph) > 0:
-                                    if len(refparagraph) > 70: #approximately one line of text
+                                    if len(refparagraph) > 140: #approximately two lines of text
                                        article.summary = article.text_summary = shortparagraph + refparagraph
                                        return
                                    else:
                                        shortparagraph = refparagraph + " "
                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                            shortparagraph = shortparagraph + "- "
+
        except:
            self.log("Error creating article descriptions")
            return
--- a/resources/recipes/root.recipe
+++ b/resources/recipes/root.recipe
@ -0,0 +1,39 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1289939440(BasicNewsRecipe):
+    """Recipe for the Czech-language magazine Root.cz (articles and news).
+
+    The page is first rewritten by preprocess_regexps and then reduced to
+    the heading, author link, intro paragraph and the 'urs' container.
+    """
+
+    # --- Publication metadata ---
+    __author__ = 'FunThomas'
+    title = u'Root.cz'
+    description = u'Zprávičky a články z Root.cz'
+    publisher = u'Internet Info, s.r.o'
+    publication_type = u'magazine'
+    language = u'cs'
+    cover_url = u'http://i.iinfo.cz/urs/logo-root-bila-oranzova-cerna-111089527143118.gif'
+
+    # --- Download limits ---
+    oldest_article = 2  # maximum article age, in days
+    max_articles_per_feed = 50  # maximum number of articles per feed
+
+    # --- Sections, as (section title, RSS URL) pairs ---
+    feeds = [
+        (u'Články', u'http://www.root.cz/rss/clanky/'),
+        (u'Zprávičky', u'http://www.root.cz/rss/zpravicky/'),
+    ]
+
+    # --- Generic clean-up switches ---
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = ['width', 'height', 'href']
+
+    # Only these elements survive in the output.
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(name='a', attrs={'class': 'author'}),
+        dict(name='p', attrs={'class': 'intro'}),
+        dict(name='div', attrs={'class': 'urs'}),
+    ]
+
+    preprocess_regexps = [
+        # Replace the opening "perex" paragraph tag, together with the image
+        # that leads it, by a plain <p class="intro"> (kept by the filter above).
+        (
+            re.compile(u'<p class="perex[^"]*">[^<]*<img[^>]*>', re.DOTALL),
+            lambda match: '<p class="intro">',
+        ),
+        # Drop everything from the "tucnak" heading through </body>.
+        (
+            re.compile(u'<h3><a name="tucnak">Tričko tučňák.*</body>', re.DOTALL),
+            lambda match: '<!--deleted-->',
+        ),
+    ]
+
+    extra_css = '''
+                h1 {font-size:130%; font-weight:bold}
+                h3 {font-size:111%; font-weight:bold}
+              '''
--- a/resources/recipes/sinfest.recipe
+++ b/resources/recipes/sinfest.recipe
@ -0,0 +1,33 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Nadid <nadid.skywalker at gmail.com>'
+'''
+http://www.sinfest.net
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class SinfestBig(BasicNewsRecipe):
+    """Recipe for the Sinfest web comic, read from a scraper-built RSS feed.
+
+    The feed's embedded content is used directly (use_embedded_content).
+    """
+
+    # --- Publication metadata ---
+    title = 'Sinfest'
+    __author__ = 'nadid'
+    description = 'Sinfest'
+    publisher = 'Tatsuya Ishida/Museworks'
+    category = 'comic'
+    language = 'en'
+    encoding = 'utf-8'
+
+    # --- Download behaviour ---
+    reverse_article_order = False
+    oldest_article = 5
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = True
+
+    # Metadata forwarded to the conversion step; built from the class
+    # attributes above, so it must stay below them.
+    conversion_options = {
+        'comments': description,
+        'tags': category,
+        'language': language,
+        'publisher': publisher,
+    }
+
+    # --- Sections, as (section title, RSS URL) pairs ---
+    feeds = [
+        (u'SinFest', u'http://henrik.nyh.se/scrapers/sinfest.rss'),
+    ]
+
+    def get_article_url(self, article):
+        """Return the feed entry's 'link' value (None when it has none)."""
+        link = article.get('link')
+        return link
+
--- a/src/calibre/ebooks/metadata/sources/__init__.py
+++ b/src/calibre/ebooks/metadata/sources/__init__.py
@ -0,0 +1,9 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@ -42,9 +42,15 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
    def break_cycles(self):
        Widget.break_cycles(self)

-        self.opt_sr1_search.doc_update.disconnect()
-        self.opt_sr2_search.doc_update.disconnect()
-        self.opt_sr3_search.doc_update.disconnect()
+        def d(x):
+            """Best-effort call of x.disconnect(); failures are ignored.
+
+            Only Exception subclasses are swallowed, so KeyboardInterrupt
+            and SystemExit still propagate (the bare except hid them).
+            """
+            # NOTE(review): the code this replaces called
+            # x.doc_update.disconnect(); this calls disconnect() on the
+            # widget itself -- confirm that is the intended target.
+            try:
+                x.disconnect()
+            except Exception:
+                pass
+
+        d(self.opt_sr1_search)
+        d(self.opt_sr2_search)
+        d(self.opt_sr3_search)

        self.opt_sr1_search.break_cycles()
        self.opt_sr2_search.break_cycles()
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -310,7 +310,9 @@ What formats does |app| read metadata from?

 Where are the book files stored?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When you first run |app|, it will ask you for a folder in which to store your books. Whenever you add a book to |app|, it will copy the book into that folder. Books in the folder are nicely arranged into sub-folders by Author and Title. Metadata about the books is stored in the file ``metadata.db`` (which is a sqlite database).
+When you first run |app|, it will ask you for a folder in which to store your books. Whenever you add a book to |app|, it will copy the book into that folder. Books in the folder are nicely arranged into sub-folders by Author and Title. Note that the contents of this folder are automatically managed by |app|; **do not** add any files/folders manually to this folder, as they may be automatically deleted. If you want to add a file associated with a particular book, use the top right area of the :guilabel:`Edit metadata` dialog to do so. Then, |app| will automatically put that file into the correct folder and move it around when the title/author changes.
+
+Metadata about the books is stored in the file ``metadata.db`` at the top level of the library folder. This file is a sqlite database. When backing up your library, make sure you copy the entire folder and all its sub-folders.

 Why doesn't |app| let me store books in my own directory structure?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~