Merge from trunk

2025-07-09 03:04:10 -04:00 · 2010-06-16 20:57:58 +01:00 · 2010-06-16 20:57:58 +01:00 · 468dcea634
commit 468dcea634
parent e4660d788c 61504a8527
4 changed files with 188 additions and 101 deletions
--- a/resources/images/news/akter.png
+++ b/resources/images/news/akter.png
--- a/resources/recipes/akter.recipe
+++ b/resources/recipes/akter.recipe
@ -0,0 +1,78 @@
 __license__   = 'GPL v3'
 __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 akter.co.rs
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class Akter(BasicNewsRecipe):
    """Recipe for AKTER (akter.co.rs).

    The section feeds point at the site's "print" listing pages, so the
    article index is built by scraping those pages in ``parse_index``.
    """
    title                 = 'AKTER'
    __author__            = 'Darko Miletic'
    description           = 'AKTER - nedeljni politicki magazin savremene Srbije'
    publisher             = 'Akter Media Group d.o.o.'
    category              = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics'
    oldest_article        = 8
    max_articles_per_feed = 100
    no_stylesheets        = False
    use_embedded_content  = False
    encoding              = 'utf-8'
    masthead_url          = 'http://www.akter.co.rs/templates/gk_thenews2/images/style2/logo.png'
    language              = 'sr'
    publication_type      = 'magazine'
    remove_empty_feeds    = True
    # Site root prepended to the hrefs scraped in parse_index.
    PREFIX                 = 'http://www.akter.co.rs'
    # NOTE: the padding shorthand takes space-separated values; the previous
    # "5px, 10px" (with a comma) is invalid CSS and the declaration was dropped.
    extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
                                @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                                .article_description,body,.lokacija{font-family: Arial,Helvetica,sans1,sans-serif}
                                .color-2{display:block; margin-bottom: 10px; padding: 5px 10px;
                                border-left: 1px solid #D00000; color: #D00000}
                                img{margin-bottom: 0.8em} """
    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }
    # Replace U+0110 with U+00D0 in the downloaded markup.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
    # (section title, listing page URL) pairs consumed by parse_index.
    feeds          = [
                        (u'Politika'   , u'http://www.akter.co.rs/index.php/politikaprint.html' )
                       ,(u'Ekonomija'  , u'http://www.akter.co.rs/index.php/ekonomijaprint.html')
                       ,(u'Life&Style' , u'http://www.akter.co.rs/index.php/lsprint.html'       )
                       ,(u'Sport'      , u'http://www.akter.co.rs/index.php/sportprint.html'    )
                     ]
    def preprocess_html(self, soup):
        """Strip every inline ``style`` attribute, then return the result of
        ``adeify_images`` applied to the soup."""
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)
    def print_version(self, url):
        """Return ``url`` with the site's print-view query string appended."""
        return url + '?tmpl=component&print=1&page='
    def parse_index(self):
        """Build the article index from each feed's listing page.

        Returns a list of ``(feed title, articles)`` tuples; each article is
        a dict with the keys ``title``, ``date``, ``url`` and ``description``.
        Listing rows without a usable link are skipped.
        """
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll(attrs={'class':['sectiontableentry1','sectiontableentry2']}):
                link = item.find('a')
                # A row with no anchor (or an anchor without href) would
                # otherwise raise and abort the whole download; skip it.
                if link is None or not link.get('href'):
                    continue
                url         = self.PREFIX + link['href']
                title       = self.tag_to_string(link)
                articles.append({
                                      'title'      :title
                                     ,'date'       :''
                                     ,'url'        :url
                                     ,'description':''
                                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds
--- a/resources/recipes/wsj.recipe
+++ b/resources/recipes/wsj.recipe
@ -3,126 +3,130 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
-
+import string
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre import strftime
 # http://online.wsj.com/page/us_in_todays_paper.html
 class WallStreetJournal(BasicNewsRecipe):
-        title = 'The Wall Street Journal (US)'
+    title = 'The Wall Street Journal (US)'
-        __author__ = 'Kovid Goyal and Sujata Raman'
+    __author__ = 'Kovid Goyal and Sujata Raman'
-        description = 'News and current affairs'
+    description = 'News and current affairs'
-        needs_subscription = True
+    needs_subscription = True
-        language = 'en'
+    language = 'en'
-        max_articles_per_feed = 1000
+    max_articles_per_feed = 1000
-        timefmt  = ' [%a, %b %d, %Y]'
+    timefmt  = ' [%a, %b %d, %Y]'
-        no_stylesheets = True
+    no_stylesheets = True
-        extra_css      = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
+    extra_css      = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
-                        h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
+                    h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
-                        .subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
+                    .subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
-                        .insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
+                    .insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
-                        .targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
+                    .targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
-                        .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
+                    .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
-                        .tagline {color:#333333; font-size:xx-small}
+                    .tagline {color:#333333; font-size:xx-small}
-                        .dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
+                    .dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
-                         h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
+                        h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
-                         .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
+                        .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
-                         h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
+                        h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
-                        .paperLocation{color:#666666; font-size:xx-small}'''
+                    .paperLocation{color:#666666; font-size:xx-small}'''
-        remove_tags_before = dict(name='h1')
+    remove_tags_before = dict(name='h1')
-        remove_tags = [
+    remove_tags = [
-                       dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow"]),
+                    dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow"]),
-                       {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
+                    {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
-                       dict(rel='shortcut icon'),
+                    dict(rel='shortcut icon'),
-                      ]
+                    ]
-        remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
+    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
-        def get_browser(self):
+    def get_browser(self):
-            br = BasicNewsRecipe.get_browser()
+        br = BasicNewsRecipe.get_browser()
-            if self.username is not None and self.password is not None:
+        if self.username is not None and self.password is not None:
-                br.open('http://commerce.wsj.com/auth/login')
+            br.open('http://commerce.wsj.com/auth/login')
-                br.select_form(nr=0)
+            br.select_form(nr=0)
-                br['user']   = self.username
+            br['user']   = self.username
-                br['password'] = self.password
+            br['password'] = self.password
-                res = br.submit()
+            res = br.submit()
-                raw = res.read()
+            raw = res.read()
-                if 'Welcome,' not in raw:
+            if 'Welcome,' not in raw:
-                    raise ValueError('Failed to log in to wsj.com, check your '
+                raise ValueError('Failed to log in to wsj.com, check your '
-                            'username and password')
+                        'username and password')
-            return br
+        return br
-        def postprocess_html(self, soup, first):
+    def postprocess_html(self, soup, first):
-            for tag in soup.findAll(name=['table', 'tr', 'td']):
+        for tag in soup.findAll(name=['table', 'tr', 'td']):
-                tag.name = 'div'
+            tag.name = 'div'
-            for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
+        for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
-                tag.extract()
+            tag.extract()
-            return soup
+        return soup
-        def wsj_get_index(self):
+    def wsj_get_index(self):
-            return self.index_to_soup('http://online.wsj.com/page/us_in_todays_paper.html')
+        return self.index_to_soup('http://online.wsj.com/page/us_in_todays_paper.html')
-        def parse_index(self):
+    def parse_index(self):
-            soup = self.wsj_get_index()
+        soup = self.wsj_get_index()
-            year = strftime('%Y')
+        date = soup.find('span', attrs={'class':'date-date'})
-            for x in soup.findAll('td', height='25', attrs={'class':'b14'}):
+        if date is not None:
-                txt = self.tag_to_string(x).strip()
+            self.timefmt = ' [%s]'%self.tag_to_string(date)
                txt = txt.replace(u'\xa0', ' ')
                txt = txt.encode('ascii', 'ignore')
                if year in txt:
                    self.timefmt = ' [%s]'%txt
                    break
-            left_column = soup.find(
+        sections = {}
-                    text=lambda t: 'begin ITP Left Column' in str(t))
+        sec_order = []
        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
            container = a.findParent(['li', 'div'])
            if container.name == 'div':
                section = 'Page One'
            else:
                section = ''
                sec = container.find('a', href=lambda x: x and '/search?' in x)
                if sec is not None:
                    section = self.tag_to_string(sec).strip()
                if not section:
                    h = container.find(['h1','h2','h3','h4','h5','h6'])
                    section = self.tag_to_string(h)
            section = string.capitalize(section).replace('U.s.', 'U.S.')
            if section not in sections:
                sections[section] = []
                sec_order.append(section)
            meta = a.find(attrs={'class':'meta_sectionName'})
            if meta is not None:
                meta.extract()
            title = self.tag_to_string(a).strip() + ' [%s]'%self.tag_to_string(meta)
            url = 'http://online.wsj.com'+a['href']
            desc = ''
            p = container.find('p')
            if p is not None:
                desc = self.tag_to_string(p)
-            table = left_column.findNext('table')
+            sections[section].append({'title':title, 'url':url,
                'description':desc, 'date':''})
-            current_section = None
+            self.log('Found article:', title)
            current_articles = []
            feeds = []
            for x in table.findAllNext(True):
                if x.name == 'td' and x.get('class', None) == 'b13':
                    if current_articles and current_section:
                        feeds.append((current_section, current_articles))
                    current_section = self.tag_to_string(x.a).strip()
                    current_articles = []
                    self.log('\tProcessing section:', current_section)
                if current_section is not None and x.name == 'a' and \
                        x.get('class', None) == 'bold80':
                    title = self.tag_to_string(x)
                    url = x.get('href', False)
                    if not url or not title:
                        continue
                    url = url.partition('#')[0]
                    desc = ''
                    d = x.findNextSibling(True)
                    if d is not None and d.get('class', None) == 'arialResize':
                        desc = self.tag_to_string(d)
                        desc = desc.partition(u'\u2022')[0]
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    if url.startswith('/'):
                        url = 'http://online.wsj.com'+url
                    if desc:
                        self.log('\t\t\t', desc)
                    current_articles.append({'title': title, 'url':url,
                        'description':desc, 'date':''})
-            if current_articles and current_section:
+            a.extract()
-                feeds.append((current_section, current_articles))
+            for a in container.findAll('a', href=lambda x: x and '/article/'
-
+                    in x):
-            return feeds
+                url = a['href']
-
+                if not url.startswith('http:'):
-        def cleanup(self):
+                    url = 'http://online.wsj.com'+url
-            self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
+                title = self.tag_to_string(a).strip()
                if not title or title.startswith('['): continue
                if title:
                    sections[section].append({'title':self.tag_to_string(a),
                        'url':url, 'description':'', 'date':''})
                    self.log('\tFound related:', title)
        feeds = [(sec, sections[sec]) for sec in sec_order]
        return feeds
    def cleanup(self):
        """Open the wsj.com logout URL with the recipe's browser."""
        logout_url = 'http://online.wsj.com/logout?url=http://online.wsj.com'
        self.browser.open(logout_url)
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@ -61,6 +61,7 @@ class FormatState(object):
        self.italic = False
        self.bold = False
        self.strikethrough = False
        self.underline = False
        self.preserve = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
@ -79,7 +80,8 @@ class FormatState(object):
               and self.family == other.family \
               and self.bgcolor == other.bgcolor \
               and self.fgcolor == other.fgcolor \
-               and self.strikethrough == other.strikethrough
+               and self.strikethrough == other.strikethrough \
               and self.underline == other.underline
    def __ne__(self, other):
        """Return the logical negation of ``self.__eq__(other)``."""
        is_equal = self.__eq__(other)
        return not is_equal
@ -251,6 +253,8 @@ class MobiMLizer(object):
                        color=unicode(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, XHTML('s'))
            if istate.underline:
                inline = etree.SubElement(inline, XHTML('u'))
            bstate.inline = inline
        bstate.istate = istate
        inline = bstate.inline
@ -330,6 +334,7 @@ class MobiMLizer(object):
        istate.bgcolor  = style['background-color']
        istate.fgcolor  = style['color']
        istate.strikethrough = style['text-decoration'] == 'line-through'
        istate.underline = style['text-decoration'] == 'underline'
        if 'monospace' in style['font-family']:
            istate.family = 'monospace'
        elif 'sans-serif' in style['font-family']: