Merge from trunk

2025-07-09 03:04:10 -04:00 · 2010-10-21 21:11:17 +01:00 · 2010-10-21 21:11:17 +01:00 · cda0402d1c
commit cda0402d1c
parent 77284f7528 91f8c368c1
7 changed files with 93 additions and 46 deletions
--- a/resources/images/news/theecocolapse.png
+++ b/resources/images/news/theecocolapse.png
--- a/resources/recipes/el_pais.recipe
+++ b/resources/recipes/el_pais.recipe
@ -2,7 +2,7 @@
 __license__   = 'GPL v3'
 __author__    = 'Jordi Balcells, based on an earlier version by Lorenzo Vigentini & Kovid Goyal'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-description   = 'Main daily newspaper from Spain - v1.03 (03, September 2010)'
+description   = 'Main daily newspaper from Spain - v1.04 (19, October 2010)'
 __docformat__ = 'restructuredtext en'

 '''
@ -32,19 +32,16 @@ class ElPais(BasicNewsRecipe):
    remove_javascript = True
    no_stylesheets = True

-    keep_only_tags = [ dict(name='div', attrs={'class':['cabecera_noticia','cabecera_noticia_reportaje','cabecera_noticia_opinion','contenido_noticia','caja_despiece','presentacion']})]
+    keep_only_tags = [ dict(name='div', attrs={'class':['cabecera_noticia_reportaje estirar','cabecera_noticia_opinion estirar','cabecera_noticia estirar','contenido_noticia','caja_despiece']})]

-    extra_css      = '''
-                        p{style:normal size:12 serif}
-
-                    '''
+    extra_css             = ' p{text-align: justify; font-size: 100%} body{ text-align: left; font-family: serif; font-size: 100% } h1{ font-family: sans-serif; font-size:200%; font-weight: bolder; text-align: justify; } h2{ font-family: sans-serif; font-size:150%; font-weight: 500; text-align: justify } h3{ font-family: sans-serif; font-size:125%; font-weight: 500; text-align: justify } img{margin-bottom: 0.4em} '

    remove_tags    = [
                        dict(name='div', attrs={'class':['zona_superior','pie_enlaces_inferiores','contorno_f','ampliar']}),
-                        dict(name='div', attrs={'class':['limpiar','mod_apoyo','borde_sup','votos','info_complementa','info_relacionada','buscador_m','nav_ant_sig']}),
+                        dict(name='div', attrs={'class':['limpiar','mod_apoyo','borde_sup','votos estirar','info_complementa','info_relacionada','buscador_m','nav_ant_sig']}),
                        dict(name='div', attrs={'id':['suscribirse suscrito','google_noticia','utilidades','coment','foros_not','pie','lomas','calendar']}),
                        dict(name='p', attrs={'class':'nav_meses'}),
-                        dict(attrs={'class':['enlaces_m','miniaturas_m']})
+                        dict(attrs={'class':['enlaces_m','miniaturas_m','nav_miniaturas_m']})
                    ]

    feeds          = [
--- a/resources/recipes/foxnews.recipe
+++ b/resources/recipes/foxnews.recipe
@ -4,7 +4,6 @@ __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
 foxnews.com
 '''

-import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class FoxNews(BasicNewsRecipe):
@ -21,11 +20,10 @@ class FoxNews(BasicNewsRecipe):
    language              = 'en'
    publication_type      = 'newsportal'
    remove_empty_feeds    = True
-    extra_css             = ' body{font-family: Arial,sans-serif } img{margin-bottom: 0.4em} .caption{font-size: x-small} '
-
-    preprocess_regexps = [
-       (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
-    ]
+    extra_css             = """
+                                body{font-family: Arial,sans-serif }
+                                .caption{font-size: x-small}
+                            """

    conversion_options = {
                          'comment'   : description
@ -34,27 +32,15 @@ class FoxNews(BasicNewsRecipe):
                        , 'language'  : language
                        }

-    remove_attributes = ['xmlns']
-
-    keep_only_tags      = [
-                            dict(name='div', attrs={'id'   :['story','browse-story-content']})
-                           ,dict(name='div', attrs={'class':['posts articles','slideshow']})
-                           ,dict(name='h4' , attrs={'class':'storyDate'})
-                           ,dict(name='h1' , attrs={'xmlns:functx':'http://www.functx.com'})
-                           ,dict(name='div', attrs={'class':'authInfo'})
-                           ,dict(name='div', attrs={'id':'articleCont'})
-                          ]
+    remove_attributes = ['xmlns','lang']

    remove_tags = [
-                     dict(name='div', attrs={'class':['share-links','quigo quigo2','share-text','storyControls','socShare','btm-links']})
-                    ,dict(name='div', attrs={'id'   :['otherMedia','loomia_display','img-all-path','story-vcmId','story-url','pane-browse-story-comments','story_related']})
-                    ,dict(name='ul' , attrs={'class':['tools','tools alt','tools alt2','tabs']})
-                    ,dict(name='a' , attrs={'class':'join-discussion'})
-                    ,dict(name='ul' , attrs={'class':['tools','tools alt','tools alt2']})
-                    ,dict(name='p' , attrs={'class':'see_fullarchive'})
-                    ,dict(name=['object','embed','link','script'])
+                     dict(name=['object','embed','link','script','iframe','meta','base'])
+                    ,dict(attrs={'class':['user-control','url-description','ad-context']})
                  ]

+    remove_tags_before=dict(name='h1')
+    remove_tags_after =dict(attrs={'class':'url-description'})

    feeds = [
              (u'Latest Headlines', u'http://feeds.foxnews.com/foxnews/latest'        )
@ -67,8 +53,5 @@ class FoxNews(BasicNewsRecipe):
             ,(u'Entertainment'   , u'http://feeds.foxnews.com/foxnews/entertainment' )
            ]

-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return self.adeify_images(soup)
-
+    def print_version(self, url):  # Returns the article URL with 'print' appended.
+        return url + 'print'  # NOTE(review): assumes url already ends with '/' — TODO confirm
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@ -8,11 +8,11 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class NewScientist(BasicNewsRecipe):
-    title                 = 'New Scientist - Online News'
+    title                 = 'New Scientist - Online News w. subscription'
    __author__            = 'Darko Miletic'
    description           = 'Science news and science articles from New Scientist.'
    language              = 'en'
-    publisher             = 'New Scientist'
+    publisher             = 'Reed Business Information Ltd.'
    category              = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
    oldest_article        = 7
    max_articles_per_feed = 100
@ -21,7 +21,12 @@ class NewScientist(BasicNewsRecipe):
    cover_url             = 'http://www.newscientist.com/currentcover.jpg'
    masthead_url          = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
    encoding              = 'utf-8'
-    extra_css             = ' body{font-family: Arial,sans-serif} img{margin-bottom: 0.8em} '
+    needs_subscription    = 'optional'
+    extra_css             = """
+                                 body{font-family: Arial,sans-serif}
+                                 img{margin-bottom: 0.8em}
+                                 .quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em}
+                            """

    conversion_options = {
                          'comment'          : description
@ -33,15 +38,27 @@ class NewScientist(BasicNewsRecipe):

    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]

+    def get_browser(self):  # Build the browser and log in only when credentials are set.
+        br = BasicNewsRecipe.get_browser()
+        br.open('http://www.newscientist.com/')  # visit the home page before any login attempt
+        if self.username is not None and self.password is not None:  # both are required; otherwise login is skipped
+            br.open('https://www.newscientist.com/user/login?redirectURL=')
+            br.select_form(nr=2)  # NOTE(review): form chosen by position, presumably the login form; verify
+            br['loginId' ] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br  # returned whether or not a login was performed
+
    remove_tags = [
                     dict(name='div'  , attrs={'class':['hldBd','adline','pnl','infotext' ]})
                    ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial','sharebtns']})
                    ,dict(name='p'    , attrs={'class':['marker','infotext'               ]})
                    ,dict(name='meta' , attrs={'name' :'description'                       })
                    ,dict(name='a'    , attrs={'rel'  :'tag'                               })
+                    ,dict(name=['link','base','meta','iframe','object','embed'])
                  ]
    remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
-    remove_attributes = ['height','width']
+    remove_attributes = ['height','width','lang']

    feeds          = [
                        (u'Latest Headlines'        , u'http://feeds.newscientist.com/science-news'              )
@ -62,6 +79,8 @@ class NewScientist(BasicNewsRecipe):
        return url + '?full=true&print=true'

    def preprocess_html(self, soup):
+        for item in soup.findAll(['quote','quotetext']):
+            item.name='p'
        for tg in soup.findAll('a'):
            if tg.string == 'Home':
                tg.parent.extract()
--- a/resources/recipes/theecocolapse.recipe
+++ b/resources/recipes/theecocolapse.recipe
@ -0,0 +1,46 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+theeconomiccollapseblog.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+class TheEconomicCollapse(BasicNewsRecipe):  # Recipe for theeconomiccollapseblog.com, fed by a single 'Posts' feed.
+    title                 = 'The Economic Collapse'
+    __author__            = 'Darko Miletic'
+    description           = 'Are You Prepared For The Coming Economic Collapse And The Next Great Depression?'
+    publisher             = 'The Economic Collapse'
+    category              = 'news, politics, USA, economy'
+    oldest_article        = 2
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'utf8'
+    use_embedded_content  = False
+    language              = 'en'
+    remove_empty_feeds    = True
+    extra_css             = """
+                                body{font-family: Tahoma,Arial,sans-serif }
+                                img{margin-bottom: 0.4em}
+                            """
+
+    conversion_options = {  # reuses the class attributes defined above
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    remove_tags = [
+                     dict(attrs={'class':'sociable'})
+                    ,dict(name=['iframe','object','embed','meta','link','base'])
+                  ]
+    remove_attributes=['lang','onclick','width','height']
+    keep_only_tags=[dict(attrs={'class':['post-headline','post-bodycopy clearfix','']})]  # NOTE(review): the trailing '' entry in the class list looks unintended — confirm
+
+    feeds = [(u'Posts', u'http://theeconomiccollapseblog.com/feed')]
+
+    def preprocess_html(self, soup):  # Drops inline styles, then returns the result of adeify_images.
+        for item in soup.findAll(style=True):  # every element that carries a style attribute
+            del item['style']
+        return self.adeify_images(soup)
+
--- a/resources/recipes/theeconomictimes_india.recipe
+++ b/resources/recipes/theeconomictimes_india.recipe
@ -21,8 +21,9 @@ class TheEconomicTimes(BasicNewsRecipe):
    language               = 'en_IN'
    publication_type       = 'newspaper'
    masthead_url           = 'http://economictimes.indiatimes.com/photo/2676871.cms'
-    extra_css              = """ body{font-family: Arial,Helvetica,sans-serif}
-                                .heading1{font-size: xx-large; font-weight: bold} """
+    extra_css              = """
+                                 body{font-family: Arial,Helvetica,sans-serif}
+                             """

    conversion_options = {
                          'comment'          : description
@ -31,8 +32,9 @@ class TheEconomicTimes(BasicNewsRecipe):
                        , 'language'         : language
                        }

-    keep_only_tags = [dict(attrs={'class':['heading1','headingnext','Normal']})]
+    keep_only_tags = [dict(attrs={'class':'printdiv'})]
    remove_tags    = [dict(name=['object','link','embed','iframe','base','table','meta'])]
+    remove_attributes = ['name']

    feeds          = [(u'All articles', u'http://economictimes.indiatimes.com/rssfeedsdefault.cms')]

--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -748,10 +748,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        return False

    def find_identical_books(self, mi):
-        fuzzy_title_patterns = [(re.compile(pat), repl) for pat, repl in
+        fuzzy_title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
                [
                    (r'[\[\](){}<>\'";,:#]', ''),
-                    (r'^(the|a|an) ', ''),
+                    (tweaks.get('title_sort_articles', r'^(a|the|an)\s+'), ''),
                    (r'[-._]', ' '),
                    (r'\s+', ' ')
                ]