Merge from trunk

2025-07-09 03:04:10 -04:00 · 2010-06-16 20:57:58 +01:00 · 2010-06-16 20:57:58 +01:00 · 468dcea634
commit 468dcea634
parent e4660d788c 61504a8527
4 changed files with 188 additions and 101 deletions
--- a/resources/images/news/akter.png
+++ b/resources/images/news/akter.png
--- a/resources/recipes/akter.recipe
+++ b/resources/recipes/akter.recipe
@ -0,0 +1,78 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+akter.co.rs
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Akter(BasicNewsRecipe):
+    # Recipe for akter.co.rs: parse_index scrapes the section pages listed in `feeds`.
+    title                 = 'AKTER'
+    __author__            = 'Darko Miletic'
+    description           = 'AKTER - nedeljni politicki magazin savremene Srbije'
+    publisher             = 'Akter Media Group d.o.o.'
+    category              = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics'
+    oldest_article        = 8
+    max_articles_per_feed = 100
+    no_stylesheets        = False
+    use_embedded_content  = False
+    encoding              = 'utf-8'
+    masthead_url          = 'http://www.akter.co.rs/templates/gk_thenews2/images/style2/logo.png'
+    language              = 'sr'
+    publication_type      = 'magazine'
+    remove_empty_feeds    = True
+    PREFIX                 = 'http://www.akter.co.rs'
+    extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
+                                @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+                                .article_description,body,.lokacija{font-family: Arial,Helvetica,sans1,sans-serif}
+                                .color-2{display:block; margin-bottom: 10px; padding: 5px 10px;
+                                border-left: 1px solid #D00000; color: #D00000}
+                                img{margin-bottom: 0.8em} """
+
+    conversion_options = {
+                          'comment'          : description
+                        , 'tags'             : category
+                        , 'publisher'        : publisher
+                        , 'language'         : language
+                        , 'linearize_tables' : True
+                        }
+
+    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+
+    feeds          = [
+                        (u'Politika'   , u'http://www.akter.co.rs/index.php/politikaprint.html' )
+                       ,(u'Ekonomija'  , u'http://www.akter.co.rs/index.php/ekonomijaprint.html')
+                       ,(u'Life&Style' , u'http://www.akter.co.rs/index.php/lsprint.html'       )
+                       ,(u'Sport'      , u'http://www.akter.co.rs/index.php/sportprint.html'    )
+                     ]
+
+    def preprocess_html(self, soup):
+        """Delete the style attribute from every tag that has one; return adeify_images(soup)."""
+        for item in soup.findAll(style=True):
+            del item['style']
+        return self.adeify_images(soup)
+
+    def print_version(self, url):
+        """Return url with the '?tmpl=component&print=1&page=' query string appended."""
+        return url + '?tmpl=component&print=1&page='
+
+    def parse_index(self):
+        """Return a list of (feed title, list of article dicts), one entry per feed page."""
+        totalfeeds = []
+        for feedtitle, feedurl in self.get_feeds():
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            articles = []
+            soup = self.index_to_soup(feedurl)
+            for item in soup.findAll(attrs={'class':['sectiontableentry1','sectiontableentry2']}):
+                link = item.find('a')
+                # A row without an anchor or href would raise below and abort the whole index.
+                if link is None or not link.get('href'):
+                    continue
+                url = self.PREFIX + link['href']
+                articles.append({'title': self.tag_to_string(link), 'date': '',
+                                 'url': url, 'description': ''})
+            totalfeeds.append((feedtitle, articles))
+        return totalfeeds
+
--- a/resources/recipes/wsj.recipe
+++ b/resources/recipes/wsj.recipe
@ -3,9 +3,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'

-
+import string
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre import strftime

 # http://online.wsj.com/page/us_in_todays_paper.html

@ -72,56 +71,61 @@ class WallStreetJournal(BasicNewsRecipe):
    def parse_index(self):
        soup = self.wsj_get_index()

-            year = strftime('%Y')
-            for x in soup.findAll('td', height='25', attrs={'class':'b14'}):
-                txt = self.tag_to_string(x).strip()
-                txt = txt.replace(u'\xa0', ' ')
-                txt = txt.encode('ascii', 'ignore')
-                if year in txt:
-                    self.timefmt = ' [%s]'%txt
-                    break
+        date = soup.find('span', attrs={'class':'date-date'})
+        if date is not None:
+            self.timefmt = ' [%s]'%self.tag_to_string(date)

-            left_column = soup.find(
-                    text=lambda t: 'begin ITP Left Column' in str(t))
-
-            table = left_column.findNext('table')
-
-            current_section = None
-            current_articles = []
-            feeds = []
-            for x in table.findAllNext(True):
-                if x.name == 'td' and x.get('class', None) == 'b13':
-                    if current_articles and current_section:
-                        feeds.append((current_section, current_articles))
-                    current_section = self.tag_to_string(x.a).strip()
-                    current_articles = []
-                    self.log('\tProcessing section:', current_section)
-                if current_section is not None and x.name == 'a' and \
-                        x.get('class', None) == 'bold80':
-                    title = self.tag_to_string(x)
-                    url = x.get('href', False)
-                    if not url or not title:
-                        continue
-                    url = url.partition('#')[0]
+        sections = {}
+        sec_order = []
+        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
+            container = a.findParent(['li', 'div'])
+            if container.name == 'div':
+                section = 'Page One'
+            else:
+                section = ''
+                sec = container.find('a', href=lambda x: x and '/search?' in x)
+                if sec is not None:
+                    section = self.tag_to_string(sec).strip()
+                if not section:
+                    h = container.find(['h1','h2','h3','h4','h5','h6'])
+                    section = self.tag_to_string(h)
+            section = string.capitalize(section).replace('U.s.', 'U.S.')
+            if section not in sections:
+                sections[section] = []
+                sec_order.append(section)
+            meta = a.find(attrs={'class':'meta_sectionName'})
+            if meta is not None:
+                meta.extract()
+            title = self.tag_to_string(a).strip() + ' [%s]'%self.tag_to_string(meta)
+            url = 'http://online.wsj.com'+a['href']
            desc = ''
-                    d = x.findNextSibling(True)
-                    if d is not None and d.get('class', None) == 'arialResize':
-                        desc = self.tag_to_string(d)
-                        desc = desc.partition(u'\u2022')[0]
-                    self.log('\t\tFound article:', title)
-                    self.log('\t\t\t', url)
-                    if url.startswith('/'):
-                        url = 'http://online.wsj.com'+url
-                    if desc:
-                        self.log('\t\t\t', desc)
-                    current_articles.append({'title': title, 'url':url,
+            p = container.find('p')
+            if p is not None:
+                desc = self.tag_to_string(p)
+
+            sections[section].append({'title':title, 'url':url,
                'description':desc, 'date':''})

-            if current_articles and current_section:
-                feeds.append((current_section, current_articles))
+            self.log('Found article:', title)

+            a.extract()
+            for a in container.findAll('a', href=lambda x: x and '/article/'
+                    in x):
+                url = a['href']
+                if not url.startswith('http:'):
+                    url = 'http://online.wsj.com'+url
+                title = self.tag_to_string(a).strip()
+                if not title or title.startswith('['): continue
+                if title:
+                    sections[section].append({'title':self.tag_to_string(a),
+                        'url':url, 'description':'', 'date':''})
+                    self.log('\tFound related:', title)
+
+
+        feeds = [(sec, sections[sec]) for sec in sec_order]
        return feeds

+
    def cleanup(self):
        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')

--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@ -61,6 +61,7 @@ class FormatState(object):
        self.italic = False
        self.bold = False
        self.strikethrough = False
+        self.underline = False
        self.preserve = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
@ -79,7 +80,8 @@ class FormatState(object):
               and self.family == other.family \
               and self.bgcolor == other.bgcolor \
               and self.fgcolor == other.fgcolor \
-               and self.strikethrough == other.strikethrough
+               and self.strikethrough == other.strikethrough \
+               and self.underline == other.underline

    def __ne__(self, other):
        return not self.__eq__(other)
@ -251,6 +253,8 @@ class MobiMLizer(object):
                        color=unicode(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, XHTML('s'))
+            if istate.underline:
+                inline = etree.SubElement(inline, XHTML('u'))
            bstate.inline = inline
        bstate.istate = istate
        inline = bstate.inline
@ -330,6 +334,7 @@ class MobiMLizer(object):
        istate.bgcolor  = style['background-color']
        istate.fgcolor  = style['color']
        istate.strikethrough = style['text-decoration'] == 'line-through'
+        istate.underline = style['text-decoration'] == 'underline'
        if 'monospace' in style['font-family']:
            istate.family = 'monospace'
        elif 'sans-serif' in style['font-family']: