__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic '
'''
nspm.rs
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString, Tag


def new_tag(soup, name, attrs=()):
    """Create a detached tag called *name* that belongs to *soup*.

    *attrs* is an iterable of ``(key, value)`` pairs. When *soup* exposes a
    ``new_tag`` factory it is used, with the pairs converted to a dict;
    otherwise the ``Tag`` constructor is called directly with the pairs (or
    ``None`` when there are none).
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


def _reset_attrs(tag):
    """Remove every attribute from *tag*.

    The replacement container has the same type as the existing one, so the
    tag stays usable whether the parser stores attributes as a list of pairs
    or as a dict (item assignment such as ``tag['class'] = ...`` fails on a
    list when a dict is expected).
    """
    tag.attrs = type(tag.attrs)()


class Nspm(BasicNewsRecipe):
    """Recipe for 'Nova srpska politicka misao' (nspm.rs).

    Articles come from the RSS feeds listed in ``feeds``; ``preprocess_html``
    reduces each downloaded page to title, author, creation date and text.
    """

    # Recipe metadata.
    title = 'Nova srpska politicka misao'
    __author__ = 'Darko Miletic'
    description = 'Casopis za politicku teoriju i drustvena istrazivanja'
    publisher = 'NSPM'
    category = 'news, politics, Serbia'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    # Page requested once by get_browser() before any article is fetched.
    INDEX = 'http://www.nspm.rs/?alphabet=l'
    encoding = 'utf-8'
    language = 'sr'
    remove_empty_feeds = True
    publication_type = 'magazine'
    masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
    # Adjacent literals keep the stylesheet text exactly as it was while
    # showing one rule per line.
    extra_css = (
        ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}'
        ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}'
        ' body{font-family: "Times New Roman", serif1, serif}'
        ' .article_description{font-family: Arial, sans1, sans-serif}'
        ' img{margin-top:0.5em; margin-bottom: 0.7em; display: block}'
        ' .author{color: #990000; font-weight: bold}'
        ' .author,.createdate{font-size: 0.9em} '
    )

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'pretty_print': True,
    }

    # Rewrites U+0110 (LATIN CAPITAL LETTER D WITH STROKE) as U+00D0
    # (LATIN CAPITAL LETTER ETH).
    # NOTE(review): presumably a glyph-coverage workaround for the target
    # reader fonts -- confirm.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    remove_tags = [dict(name=['link', 'script', 'meta', 'base', 'img'])]
    remove_attributes = [
        'width', 'height', 'lang', 'xmlns:fb', 'xmlns:og',
        'vspace', 'hspace', 'type', 'start', 'size',
    ]

    def get_browser(self):
        """Return the base-class browser after it has opened ``INDEX`` once.

        NOTE(review): the preliminary request presumably establishes session
        state selected by the ``alphabet`` query parameter -- confirm.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        return br

    feeds = [
        (u'Rubrike', u'http://www.nspm.rs/rubrike/feed/rss.html'),
        (u'Debate', u'http://www.nspm.rs/debate/feed/rss.html'),
        (u'Reci i misli', u'http://www.nspm.rs/reci-i-misli/feed/rss.html'),
        (u'Samo smeh srbina spasava', u'http://www.nspm.rs/samo-smeh-srbina-spasava/feed/rss.html'),
        (u'Polemike', u'http://www.nspm.rs/polemike/feed/rss.html'),
        (u'Prikazi', u'http://www.nspm.rs/prikazi/feed/rss.html'),
        (u'Prenosimo', u'http://www.nspm.rs/prenosimo/feed/rss.html'),
        (u'Hronika', u'http://www.nspm.rs/tabela/hronika/feed/rss.html'),
    ]

    def preprocess_html(self, soup):
        """Simplify a downloaded article page and return the same *soup*.

        Three steps, in order:

        1. If the page has a ``<body>`` containing a ``td.createdate`` cell,
           the body is rebuilt from title, author, date and article text
           (see ``_rebuild_article_body``).
        2. Every ``<a>`` is replaced by its text, or turned into an
           attribute-less ``<div>`` when it wraps an image.
        3. Every ``<img>`` without an ``alt`` attribute gets ``alt="image"``.
        """
        body = soup.body
        if body is not None:
            self._rebuild_article_body(soup, body)
        self._flatten_links(soup)
        for image in soup.findAll('img', alt=False):
            image['alt'] = 'image'
        return soup

    def _rebuild_article_body(self, soup, body):
        """Replace the children of *body* with the cleaned article parts.

        Does nothing when no ``td.createdate`` cell is found, so a page with
        an unexpected layout is left intact. Parts that are missing from the
        page (title link, author span) are skipped instead of being appended.
        """
        crdate = body.find('td', attrs={'class': 'createdate'})
        if crdate is None:
            return

        clean_title = None
        title_link = body.find('a', attrs={'class': 'contentpagetitle'})
        if title_link is not None:
            clean_title = new_tag(soup, 'h1', [('class', 'contentpagetitle')])
            clean_title.append(NavigableString(self.tag_to_string(title_link)))

        author = body.find('span', attrs={'class': 'author'})
        if author is not None:
            author.extract()
            author.name = 'div'

        clean_date = new_tag(soup, 'div', [('class', 'createdate')])
        clean_date.append(NavigableString(self.tag_to_string(crdate)))

        # The article text sits in the first cell of the next sibling that
        # has the same tag name as the date cell's parent.
        date_row = crdate.parent
        holder = date_row.nextSibling
        while holder is not None and not (
            isinstance(holder, Tag) and holder.name == date_row.name
        ):
            holder = holder.nextSibling

        text_cell = holder.td if holder is not None else None
        if text_cell is not None:
            art_text = text_cell
            art_text.name = 'div'
            _reset_attrs(art_text)
            art_text['class'] = 'text'
            art_text.extract()
        else:
            # No text cell found: fall back to an empty placeholder.
            art_text = new_tag(soup, 'div', [('class', 'text')])

        body.contents = []
        for node in (clean_title, author, clean_date, art_text):
            if node is not None:
                body.append(node)

    def _flatten_links(self, soup):
        """Strip hyperlinks: keep their text, or keep wrapped images in a div."""
        for link in soup.findAll('a'):
            text = link.string
            if text is not None:
                link.replaceWith(text)
            elif link.find('img') is not None:
                link.name = 'div'
                _reset_attrs(link)
            else:
                link.replaceWith(self.tag_to_string(link))