Merge from trunk

Charles Haley 2010-06-16 20:57:58 +01:00
commit 468dcea634
4 changed files with 188 additions and 101 deletions

Binary file not shown (image, 429 B).


@@ -0,0 +1,78 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
akter.co.rs
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe

class Akter(BasicNewsRecipe):
    title                 = 'AKTER'
    __author__            = 'Darko Miletic'
    description           = 'AKTER - nedeljni politicki magazin savremene Srbije'
    publisher             = 'Akter Media Group d.o.o.'
    category              = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics'
    oldest_article        = 8
    max_articles_per_feed = 100
    no_stylesheets        = False
    use_embedded_content  = False
    encoding              = 'utf-8'
    masthead_url          = 'http://www.akter.co.rs/templates/gk_thenews2/images/style2/logo.png'
    language              = 'sr'
    publication_type      = 'magazine'
    remove_empty_feeds    = True
    PREFIX                = 'http://www.akter.co.rs'
    extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
                                @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                                .article_description,body,.lokacija{font-family: Arial,Helvetica,sans1,sans-serif}
                                .color-2{display:block; margin-bottom: 10px; padding: 5px 10px;
                                border-left: 1px solid #D00000; color: #D00000}
                                img{margin-bottom: 0.8em} """

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }
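
    # Replace the Serbian capital letter Đ (U+0110) with Ð (U+00D0) in article text;
    # presumably a workaround for reader fonts that render the latter more reliably.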
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    feeds = [
              (u'Politika'    , u'http://www.akter.co.rs/index.php/politikaprint.html' )
             ,(u'Ekonomija'   , u'http://www.akter.co.rs/index.php/ekonomijaprint.html')
             ,(u'Life&Style'  , u'http://www.akter.co.rs/index.php/lsprint.html'       )
             ,(u'Sport'       , u'http://www.akter.co.rs/index.php/sportprint.html'    )
            ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)
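
    # The site appears to be Joomla-based: appending these query parameters
    # requests the printer-friendly rendering of an article.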
    def print_version(self, url):
        return url + '?tmpl=component&print=1&page='
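
    # Build the index by scraping each section's print listing page (the feed
    # URLs above) rather than an RSS feed.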
    def parse_index(self):
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll(attrs={'class':['sectiontableentry1','sectiontableentry2']}):
                link  = item.find('a')
                url   = self.PREFIX + link['href']
                title = self.tag_to_string(link)
                articles.append({
                                  'title'       : title
                                 ,'date'        : ''
                                 ,'url'         : url
                                 ,'description' : ''
                                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds
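
For reference, a minimal sketch of the data shape parse_index() must hand back to calibre — a list of (feed title, article list) pairs, each article a dict like those built above; the sample values here are made up:

# Shape of a parse_index() result, with illustrative (not real) data.
sample = [
    (u'Politika', [
        {'title': u'Example article', 'date': '',
         'url': 'http://www.akter.co.rs/example.html', 'description': ''},
    ]),
]
for feedtitle, articles in sample:
    # Every article entry carries at least a title and a url; this recipe
    # leaves date and description empty.
    assert all(a['title'] and a['url'] for a in articles)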


@@ -3,126 +3,130 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
import string
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
# http://online.wsj.com/page/us_in_todays_paper.html
class WallStreetJournal(BasicNewsRecipe):

    title = 'The Wall Street Journal (US)'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'News and current affairs'
    needs_subscription = True
    language = 'en'

    max_articles_per_feed = 1000
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True

    extra_css = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
                h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
                .subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
                .insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
                .targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
                .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
                .tagline {color:#333333; font-size:xx-small}
                .dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
                h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
                .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
                h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
                .paperLocation{color:#666666; font-size:xx-small}'''

    remove_tags_before = dict(name='h1')
    remove_tags = [
        dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
            "articleTabs_tab_interactive", "articleTabs_tab_video",
            "articleTabs_tab_map", "articleTabs_tab_slideshow"]),
        {'class':['footer_columns', 'network', 'insetCol3wide', 'interactive',
            'video', 'slideshow', 'map', 'insettip', 'insetClose', 'more_in',
            'insetContent', 'articleTools_bottom', 'aTools', 'tooltip',
            'adSummary', 'nav-inline']},
        dict(rel='shortcut icon'),
    ]
    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
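
    # Credentials are posted to the subscription login form; a successful login
    # is detected by the 'Welcome,' greeting in the returned page.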
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://commerce.wsj.com/auth/login')
            br.select_form(nr=0)
            br['user'] = self.username
            br['password'] = self.password
            res = br.submit()
            raw = res.read()
            if 'Welcome,' not in raw:
                raise ValueError('Failed to log in to wsj.com, check your '
                        'username and password')
        return br
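
    # Flatten table markup to divs and drop the per-article thumbnail blocks,
    # presumably for cleaner reflow on e-book readers.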
    def postprocess_html(self, soup, first):
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'

        for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2",
                "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5",
                "articleThumbnail_6", "articleThumbnail_7"])):
            tag.extract()

        return soup
    def wsj_get_index(self):
        return self.index_to_soup('http://online.wsj.com/page/us_in_todays_paper.html')
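
    # The index is the "In Today's Paper" page: headline anchors carry the
    # mjLinkItem class, and each anchor's enclosing container supplies the
    # section name, summary paragraph and any related-article links.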
    def parse_index(self):
        soup = self.wsj_get_index()

        date = soup.find('span', attrs={'class':'date-date'})
        if date is not None:
            self.timefmt = ' [%s]'%self.tag_to_string(date)

        left_column = soup.find(
            text=lambda t: 'begin ITP Left Column' in str(t))

        sections = {}
        sec_order = []
        for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
            container = a.findParent(['li', 'div'])
            if container.name == 'div':
                section = 'Page One'
            else:
                section = ''
                sec = container.find('a', href=lambda x: x and '/search?' in x)
                if sec is not None:
                    section = self.tag_to_string(sec).strip()
                if not section:
                    h = container.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                    section = self.tag_to_string(h)
                section = string.capitalize(section).replace('U.s.', 'U.S.')
            if section not in sections:
                sections[section] = []
                sec_order.append(section)

            # Append the section tag to the title only when one is present,
            # to avoid an empty '[]' suffix.
            title = self.tag_to_string(a).strip()
            meta = a.find(attrs={'class':'meta_sectionName'})
            if meta is not None:
                meta.extract()
                title += ' [%s]'%self.tag_to_string(meta)
            url = 'http://online.wsj.com'+a['href']
            desc = ''
            p = container.find('p')
            if p is not None:
                desc = self.tag_to_string(p)

            sections[section].append({'title':title, 'url':url,
                'description':desc, 'date':''})

            self.log('Found article:', title)

            a.extract()
            for a in container.findAll('a', href=lambda x: x and '/article/' in x):
                url = a['href']
                if not url.startswith('http:'):
                    url = 'http://online.wsj.com'+url
                title = self.tag_to_string(a).strip()
                if not title or title.startswith('['):
                    continue
                sections[section].append({'title':self.tag_to_string(a),
                    'url':url, 'description':'', 'date':''})
                self.log('\tFound related:', title)

        feeds = [(sec, sections[sec]) for sec in sec_order]
        return feeds

    def cleanup(self):
        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')


@@ -61,6 +61,7 @@ class FormatState(object)
        self.italic = False
        self.bold = False
        self.strikethrough = False
        self.underline = False
        self.preserve = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
@@ -79,7 +80,8 @@ class FormatState(object)
               and self.family == other.family \
               and self.bgcolor == other.bgcolor \
               and self.fgcolor == other.fgcolor \
               and self.strikethrough == other.strikethrough \
               and self.underline == other.underline

    def __ne__(self, other):
        return not self.__eq__(other)
@@ -251,6 +253,8 @@ class MobiMLizer(object)
                        color=unicode(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, XHTML('s'))
            if istate.underline:
                inline = etree.SubElement(inline, XHTML('u'))
            bstate.inline = inline
            bstate.istate = istate
        inline = bstate.inline
@@ -330,6 +334,7 @@ class MobiMLizer(object)
        istate.bgcolor = style['background-color']
        istate.fgcolor = style['color']
        istate.strikethrough = style['text-decoration'] == 'line-through'
        istate.underline = style['text-decoration'] == 'underline'
        if 'monospace' in style['font-family']:
            istate.family = 'monospace'
        elif 'sans-serif' in style['font-family']:
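
Taken together, the four hunks thread an underline flag through MobiMLizer the same way the existing strikethrough flag flows: captured from the computed text-decoration style, compared in FormatState equality, and emitted as an inline wrapper element. A minimal standalone sketch of just that last mapping step — the XHTML() helper is reimplemented here and wrap_decoration is an illustrative name, not calibre's real pipeline:

from lxml import etree

XHTML_NS = 'http://www.w3.org/1999/xhtml'

def XHTML(name):
    # Expand a tag name into the XHTML namespace, as calibre's XHTML() helper does.
    return '{%s}%s' % (XHTML_NS, name)

def wrap_decoration(parent, text_decoration):
    # Mirrors the hunks above: 'line-through' nests an <s>, 'underline' nests a <u>;
    # the innermost element is returned so text lands inside the wrappers.
    inline = parent
    if text_decoration == 'line-through':
        inline = etree.SubElement(inline, XHTML('s'))
    if text_decoration == 'underline':
        inline = etree.SubElement(inline, XHTML('u'))
    return inline

para = etree.Element(XHTML('p'))
wrap_decoration(para, 'underline').text = 'underlined in the MOBI output'
print(etree.tostring(para))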