diff --git a/resources/images/news/akter.png b/resources/images/news/akter.png
new file mode 100644
index 0000000000..60c352849e
Binary files /dev/null and b/resources/images/news/akter.png differ
diff --git a/resources/recipes/akter.recipe b/resources/recipes/akter.recipe
new file mode 100644
index 0000000000..3959fff717
--- /dev/null
+++ b/resources/recipes/akter.recipe
@@ -0,0 +1,78 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic'
+'''
+akter.co.rs
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Akter(BasicNewsRecipe):
+    title = 'AKTER'
+    __author__ = 'Darko Miletic'
+    description = 'AKTER - nedeljni politicki magazin savremene Srbije'
+    publisher = 'Akter Media Group d.o.o.'
+    category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics'
+    oldest_article = 8
+    max_articles_per_feed = 100
+    no_stylesheets = False
+    use_embedded_content = False
+    encoding = 'utf-8'
+    masthead_url = 'http://www.akter.co.rs/templates/gk_thenews2/images/style2/logo.png'
+    language = 'sr'
+    publication_type = 'magazine'
+    remove_empty_feeds = True
+    PREFIX = 'http://www.akter.co.rs'
+    extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
+                    @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+                    .article_description,body,.lokacija{font-family: Arial,Helvetica,sans1,sans-serif}
+                    .color-2{display:block; margin-bottom: 10px; padding: 5px 10px;
+                             border-left: 1px solid #D00000; color: #D00000}
+                    img{margin-bottom: 0.8em} """
+
+    conversion_options = {
+                          'comment'          : description
+                        , 'tags'             : category
+                        , 'publisher'        : publisher
+                        , 'language'         : language
+                        , 'linearize_tables' : True
+                        }
+
+    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+
+    feeds = [
+              (u'Politika'  , u'http://www.akter.co.rs/index.php/politikaprint.html')
+             ,(u'Ekonomija' , u'http://www.akter.co.rs/index.php/ekonomijaprint.html')
+             ,(u'Life&Style', u'http://www.akter.co.rs/index.php/lsprint.html')
+             ,(u'Sport'     , u'http://www.akter.co.rs/index.php/sportprint.html')
+            ]
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return self.adeify_images(soup)
+
+    def print_version(self, url):
+        return url + '?tmpl=component&print=1&page='
+
+    def parse_index(self):
+        totalfeeds = []
+        lfeeds = self.get_feeds()
+        for feedobj in lfeeds:
+            feedtitle, feedurl = feedobj
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            articles = []
+            soup = self.index_to_soup(feedurl)
+            for item in soup.findAll(attrs={'class':['sectiontableentry1','sectiontableentry2']}):
+                link = item.find('a')
+                url = self.PREFIX + link['href']
+                title = self.tag_to_string(link)
+                articles.append({
+                                  'title'       : title
+                                 ,'date'        : ''
+                                 ,'url'         : url
+                                 ,'description' : ''
+                                })
+            totalfeeds.append((feedtitle, articles))
+        return totalfeeds
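
Note on the recipe above: print_version rewrites each article URL to Joomla's stripped print template, so the recipe never has to scrape the full article page. A minimal standalone sketch of that rewrite, with a made-up article URL for illustration:

    # Sketch of the Joomla print-view rewrite used by Akter.print_version.
    # tmpl=component&print=1 requests the printer-friendly rendering of the
    # same article; the trailing empty page= parameter mirrors the recipe.
    def print_version(url):
        return url + '?tmpl=component&print=1&page='

    # hypothetical URL, for illustration only
    print(print_version('http://www.akter.co.rs/index.php/politika/primer.html'))
    # http://www.akter.co.rs/index.php/politika/primer.html?tmpl=component&print=1&page=
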
diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe
index 25f175f78b..e8e29505c4 100644
--- a/resources/recipes/wsj.recipe
+++ b/resources/recipes/wsj.recipe
@@ -3,126 +3,130 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
-
+import string
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre import strftime

 # http://online.wsj.com/page/us_in_todays_paper.html

 class WallStreetJournal(BasicNewsRecipe):

     title = 'The Wall Street Journal (US)'
     __author__ = 'Kovid Goyal and Sujata Raman'
     description = 'News and current affairs'
     needs_subscription = True
     language = 'en'

     max_articles_per_feed = 1000
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True

     extra_css = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
                 h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
                 .subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
                 .insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
                 .targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
                 .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
                 .tagline {color:#333333; font-size:xx-small}
                 .dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
                 h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
                 .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
                 h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
                 .paperLocation{color:#666666; font-size:xx-small}'''

     remove_tags_before = dict(name='h1')
     remove_tags = [
         dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive", "articleTabs_tab_video", "articleTabs_tab_map", "articleTabs_tab_slideshow"]),
         {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
         dict(rel='shortcut icon'),
     ]
     remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
             br.open('http://commerce.wsj.com/auth/login')
             br.select_form(nr=0)
             br['user'] = self.username
             br['password'] = self.password
             res = br.submit()
             raw = res.read()
             if 'Welcome,' not in raw:
                 raise ValueError('Failed to log in to wsj.com, check your '
                         'username and password')
         return br

     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name=['table', 'tr', 'td']):
             tag.name = 'div'

         for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
             tag.extract()

         return soup

     def wsj_get_index(self):
         return self.index_to_soup('http://online.wsj.com/page/us_in_todays_paper.html')

     def parse_index(self):
         soup = self.wsj_get_index()

-        year = strftime('%Y')
-        for x in soup.findAll('td', height='25', attrs={'class':'b14'}):
-            txt = self.tag_to_string(x).strip()
-            txt = txt.replace(u'\xa0', ' ')
-            txt = txt.encode('ascii', 'ignore')
-            if year in txt:
-                self.timefmt = ' [%s]'%txt
-                break
+        date = soup.find('span', attrs={'class':'date-date'})
+        if date is not None:
+            self.timefmt = ' [%s]'%self.tag_to_string(date)

-        left_column = soup.find(
-                text=lambda t: 'begin ITP Left Column' in str(t))
+        sections = {}
+        sec_order = []
+        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
+            container = a.findParent(['li', 'div'])
+            if container.name == 'div':
+                section = 'Page One'
+            else:
+                section = ''
+                sec = container.find('a', href=lambda x: x and '/search?' in x)
+                if sec is not None:
+                    section = self.tag_to_string(sec).strip()
+                if not section:
+                    h = container.find(['h1','h2','h3','h4','h5','h6'])
+                    section = self.tag_to_string(h)
+                section = string.capitalize(section).replace('U.s.', 'U.S.')
+            if section not in sections:
+                sections[section] = []
+                sec_order.append(section)
+            meta = a.find(attrs={'class':'meta_sectionName'})
+            if meta is not None:
+                meta.extract()
+            title = self.tag_to_string(a).strip() + ' [%s]'%self.tag_to_string(meta)
+            url = 'http://online.wsj.com'+a['href']
+            desc = ''
+            p = container.find('p')
+            if p is not None:
+                desc = self.tag_to_string(p)

-        table = left_column.findNext('table')
+            sections[section].append({'title':title, 'url':url,
+                'description':desc, 'date':''})

-        current_section = None
-        current_articles = []
-        feeds = []
-        for x in table.findAllNext(True):
-            if x.name == 'td' and x.get('class', None) == 'b13':
-                if current_articles and current_section:
-                    feeds.append((current_section, current_articles))
-                current_section = self.tag_to_string(x.a).strip()
-                current_articles = []
-                self.log('\tProcessing section:', current_section)
-            if current_section is not None and x.name == 'a' and \
-                    x.get('class', None) == 'bold80':
-                title = self.tag_to_string(x)
-                url = x.get('href', False)
-                if not url or not title:
-                    continue
-                url = url.partition('#')[0]
-                desc = ''
-                d = x.findNextSibling(True)
-                if d is not None and d.get('class', None) == 'arialResize':
-                    desc = self.tag_to_string(d)
-                    desc = desc.partition(u'\u2022')[0]
-                self.log('\t\tFound article:', title)
-                self.log('\t\t\t', url)
-                if url.startswith('/'):
-                    url = 'http://online.wsj.com'+url
-                if desc:
-                    self.log('\t\t\t', desc)
-                current_articles.append({'title': title, 'url':url,
-                    'description':desc, 'date':''})
+            self.log('Found article:', title)

-        if current_articles and current_section:
-            feeds.append((current_section, current_articles))
-
-        return feeds
-
+            a.extract()
+            for a in container.findAll('a', href=lambda x: x and '/article/' in x):
+                url = a['href']
+                if not url.startswith('http:'):
+                    url = 'http://online.wsj.com'+url
+                title = self.tag_to_string(a).strip()
+                if not title or title.startswith('['):
+                    continue
+                sections[section].append({'title':title,
+                    'url':url, 'description':'', 'date':''})
+                self.log('\tFound related:', title)
+
+        feeds = [(sec, sections[sec]) for sec in sec_order]
+        return feeds

     def cleanup(self):
         self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
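
Note on the rewritten parse_index above: articles are bucketed per section with a plain dict for lookup plus a separate list that records first-seen section order, the usual order-preserving grouping idiom on Python 2 (collections.OrderedDict was not yet available to recipes of this era). A standalone sketch of the idiom, with made-up article data:

    # dict gives O(1) section lookup; the list preserves first-seen order
    sections = {}
    sec_order = []

    def add(section, article):
        if section not in sections:
            sections[section] = []
            sec_order.append(section)
        sections[section].append(article)

    add('Page One', {'title': 'A', 'url': 'u1', 'description': '', 'date': ''})
    add('Opinion',  {'title': 'B', 'url': 'u2', 'description': '', 'date': ''})
    add('Page One', {'title': 'C', 'url': 'u3', 'description': '', 'date': ''})

    # the exact shape parse_index must return: [(section, [article, ...]), ...]
    feeds = [(sec, sections[sec]) for sec in sec_order]
    # [('Page One', [A, C]), ('Opinion', [B])]
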
diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py
index 9361d52d31..bfa8758c85 100644
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -61,6 +61,7 @@ class FormatState(object):
         self.italic = False
         self.bold = False
         self.strikethrough = False
+        self.underline = False
         self.preserve = False
         self.family = 'serif'
         self.bgcolor = 'transparent'
@@ -79,7 +80,8 @@ class FormatState(object):
             and self.family == other.family \
             and self.bgcolor == other.bgcolor \
             and self.fgcolor == other.fgcolor \
-            and self.strikethrough == other.strikethrough
+            and self.strikethrough == other.strikethrough \
+            and self.underline == other.underline

     def __ne__(self, other):
         return not self.__eq__(other)
@@ -251,6 +253,8 @@ class MobiMLizer(object):
                                     color=unicode(istate.fgcolor))
             if istate.strikethrough:
                 inline = etree.SubElement(inline, XHTML('s'))
+            if istate.underline:
+                inline = etree.SubElement(inline, XHTML('u'))
             bstate.inline = inline
             bstate.istate = istate
         inline = bstate.inline
@@ -330,6 +334,7 @@ class MobiMLizer(object):
             istate.bgcolor = style['background-color']
             istate.fgcolor = style['color']
             istate.strikethrough = style['text-decoration'] == 'line-through'
+            istate.underline = style['text-decoration'] == 'underline'
             if 'monospace' in style['font-family']:
                 istate.family = 'monospace'
             elif 'sans-serif' in style['font-family']:
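
Note on the mobiml.py change above: it follows the existing strikethrough pattern, in which each active inline format wraps the current element in one more XHTML tag, so combined formats nest naturally. One caveat visible in the last hunk: each flag compares the full text-decoration value, so a combined declaration such as 'underline line-through' would set neither flag. A standalone sketch of the nesting technique (the XHTML helper here is a rough stand-in for calibre's, not its actual implementation):

    from lxml import etree

    XHTML_NS = 'http://www.w3.org/1999/xhtml'

    def XHTML(name):
        # qualify a tag name with the XHTML namespace, lxml-style
        return '{%s}%s' % (XHTML_NS, name)

    parent = etree.Element(XHTML('span'))
    inline = parent
    strikethrough, underline = True, True  # stand-ins for the istate flags
    if strikethrough:
        inline = etree.SubElement(inline, XHTML('s'))
    if underline:  # the case this patch adds
        inline = etree.SubElement(inline, XHTML('u'))
    inline.text = 'struck and underlined'
    print(etree.tostring(parent))
    # <span xmlns="..."><s><u>struck and underlined</u></s></span>
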