Merge from trunk

This commit is contained in:
Charles Haley 2010-06-16 20:57:58 +01:00
commit 468dcea634
4 changed files with 188 additions and 101 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 429 B

View File

@@ -0,0 +1,78 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
akter.co.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Akter(BasicNewsRecipe):
    """
    Calibre news recipe for AKTER (akter.co.rs), a weekly Serbian political
    magazine.  The issue is assembled by scraping the print-friendly section
    listing pages (see ``feeds``) rather than RSS, so ``parse_index`` does the
    article discovery itself.
    """
    title                 = 'AKTER'
    __author__            = 'Darko Miletic'
    description           = 'AKTER - nedeljni politicki magazin savremene Srbije'
    publisher             = 'Akter Media Group d.o.o.'
    category              = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics'
    oldest_article        = 8
    max_articles_per_feed = 100
    no_stylesheets        = False
    use_embedded_content  = False
    encoding              = 'utf-8'
    masthead_url          = 'http://www.akter.co.rs/templates/gk_thenews2/images/style2/logo.png'
    language              = 'sr'
    publication_type      = 'magazine'
    remove_empty_feeds    = True
    # Site root, prepended to the relative article hrefs found in the listings.
    PREFIX                = 'http://www.akter.co.rs'
    # Sony reader font fallbacks plus pull-quote styling.
    # FIX: original had 'padding: 5px, 10px' - commas are invalid in the CSS
    # padding shorthand, so the declaration was silently dropped by renderers.
    extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
                                @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                                .article_description,body,.lokacija{font-family: Arial,Helvetica,sans1,sans-serif}
                                .color-2{display:block; margin-bottom: 10px; padding: 5px 10px;
                                border-left: 1px solid #D00000; color: #D00000}
                                img{margin-bottom: 0.8em} """

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }

    # Replace U+0110 (Dj with stroke) with U+00D0 (Eth) - presumably a
    # workaround for a missing glyph on the target device; verify before
    # removing.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    feeds = [
              (u'Politika'   , u'http://www.akter.co.rs/index.php/politikaprint.html'  )
             ,(u'Ekonomija'  , u'http://www.akter.co.rs/index.php/ekonomijaprint.html' )
             ,(u'Life&Style' , u'http://www.akter.co.rs/index.php/lsprint.html'        )
             ,(u'Sport'      , u'http://www.akter.co.rs/index.php/sportprint.html'     )
            ]

    def preprocess_html(self, soup):
        """Strip inline style attributes and normalise images for e-readers."""
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)

    def print_version(self, url):
        """Return the print-template variant of an article URL."""
        return url + '?tmpl=component&print=1&page='

    def parse_index(self):
        """
        Scrape each section listing page and collect its articles.

        Returns a list of ``(section_title, [article_dict, ...])`` tuples in
        the shape calibre expects from ``parse_index``.
        """
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll(attrs={'class':['sectiontableentry1','sectiontableentry2']}):
                # FIX: guard against rows without a hyperlink - the original
                # did item.find('a') and then link['href'], which raised
                # TypeError (link is None) or KeyError (anchor without href)
                # and aborted the whole fetch on one malformed row.
                link = item.find('a', href=True)
                if link is None:
                    continue
                url   = self.PREFIX + link['href']
                title = self.tag_to_string(link)
                articles.append({
                                  'title'       : title
                                 ,'date'        : ''
                                 ,'url'         : url
                                 ,'description' : ''
                                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

View File

@@ -3,9 +3,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import string
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
# http://online.wsj.com/page/us_in_todays_paper.html # http://online.wsj.com/page/us_in_todays_paper.html
@@ -72,56 +71,61 @@ class WallStreetJournal(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
soup = self.wsj_get_index() soup = self.wsj_get_index()
year = strftime('%Y') date = soup.find('span', attrs={'class':'date-date'})
for x in soup.findAll('td', height='25', attrs={'class':'b14'}): if date is not None:
txt = self.tag_to_string(x).strip() self.timefmt = ' [%s]'%self.tag_to_string(date)
txt = txt.replace(u'\xa0', ' ')
txt = txt.encode('ascii', 'ignore')
if year in txt:
self.timefmt = ' [%s]'%txt
break
left_column = soup.find( sections = {}
text=lambda t: 'begin ITP Left Column' in str(t)) sec_order = []
for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
table = left_column.findNext('table') container = a.findParent(['li', 'div'])
if container.name == 'div':
current_section = None section = 'Page One'
current_articles = [] else:
feeds = [] section = ''
for x in table.findAllNext(True): sec = container.find('a', href=lambda x: x and '/search?' in x)
if x.name == 'td' and x.get('class', None) == 'b13': if sec is not None:
if current_articles and current_section: section = self.tag_to_string(sec).strip()
feeds.append((current_section, current_articles)) if not section:
current_section = self.tag_to_string(x.a).strip() h = container.find(['h1','h2','h3','h4','h5','h6'])
current_articles = [] section = self.tag_to_string(h)
self.log('\tProcessing section:', current_section) section = string.capitalize(section).replace('U.s.', 'U.S.')
if current_section is not None and x.name == 'a' and \ if section not in sections:
x.get('class', None) == 'bold80': sections[section] = []
title = self.tag_to_string(x) sec_order.append(section)
url = x.get('href', False) meta = a.find(attrs={'class':'meta_sectionName'})
if not url or not title: if meta is not None:
continue meta.extract()
url = url.partition('#')[0] title = self.tag_to_string(a).strip() + ' [%s]'%self.tag_to_string(meta)
url = 'http://online.wsj.com'+a['href']
desc = '' desc = ''
d = x.findNextSibling(True) p = container.find('p')
if d is not None and d.get('class', None) == 'arialResize': if p is not None:
desc = self.tag_to_string(d) desc = self.tag_to_string(p)
desc = desc.partition(u'\u2022')[0]
self.log('\t\tFound article:', title) sections[section].append({'title':title, 'url':url,
self.log('\t\t\t', url)
if url.startswith('/'):
url = 'http://online.wsj.com'+url
if desc:
self.log('\t\t\t', desc)
current_articles.append({'title': title, 'url':url,
'description':desc, 'date':''}) 'description':desc, 'date':''})
if current_articles and current_section: self.log('Found article:', title)
feeds.append((current_section, current_articles))
a.extract()
for a in container.findAll('a', href=lambda x: x and '/article/'
in x):
url = a['href']
if not url.startswith('http:'):
url = 'http://online.wsj.com'+url
title = self.tag_to_string(a).strip()
if not title or title.startswith('['): continue
if title:
sections[section].append({'title':self.tag_to_string(a),
'url':url, 'description':'', 'date':''})
self.log('\tFound related:', title)
feeds = [(sec, sections[sec]) for sec in sec_order]
return feeds return feeds
def cleanup(self): def cleanup(self):
self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com') self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')

View File

@@ -61,6 +61,7 @@ class FormatState(object):
self.italic = False self.italic = False
self.bold = False self.bold = False
self.strikethrough = False self.strikethrough = False
self.underline = False
self.preserve = False self.preserve = False
self.family = 'serif' self.family = 'serif'
self.bgcolor = 'transparent' self.bgcolor = 'transparent'
@@ -79,7 +80,8 @@ class FormatState(object):
and self.family == other.family \ and self.family == other.family \
and self.bgcolor == other.bgcolor \ and self.bgcolor == other.bgcolor \
and self.fgcolor == other.fgcolor \ and self.fgcolor == other.fgcolor \
and self.strikethrough == other.strikethrough and self.strikethrough == other.strikethrough \
and self.underline == other.underline
def __ne__(self, other): def __ne__(self, other):
return not self.__eq__(other) return not self.__eq__(other)
@@ -251,6 +253,8 @@ class MobiMLizer(object):
color=unicode(istate.fgcolor)) color=unicode(istate.fgcolor))
if istate.strikethrough: if istate.strikethrough:
inline = etree.SubElement(inline, XHTML('s')) inline = etree.SubElement(inline, XHTML('s'))
if istate.underline:
inline = etree.SubElement(inline, XHTML('u'))
bstate.inline = inline bstate.inline = inline
bstate.istate = istate bstate.istate = istate
inline = bstate.inline inline = bstate.inline
@@ -330,6 +334,7 @@ class MobiMLizer(object):
istate.bgcolor = style['background-color'] istate.bgcolor = style['background-color']
istate.fgcolor = style['color'] istate.fgcolor = style['color']
istate.strikethrough = style['text-decoration'] == 'line-through' istate.strikethrough = style['text-decoration'] == 'line-through'
istate.underline = style['text-decoration'] == 'underline'
if 'monospace' in style['font-family']: if 'monospace' in style['font-family']:
istate.family = 'monospace' istate.family = 'monospace'
elif 'sans-serif' in style['font-family']: elif 'sans-serif' in style['font-family']: