New recipes for index.hu and pcworld.hu by Ezmegaz

This commit is contained in:
Kovid Goyal 2009-05-12 12:16:14 -07:00
parent 04d8e251c5
commit 5ff51fc3f9
11 changed files with 144 additions and 90 deletions

View File

@ -20,7 +20,7 @@ DEPENDENCIES = [
('BeautifulSoup', '3.0.5', 'beautifulsoup', 'python-beautifulsoup', 'python-BeautifulSoup'), ('BeautifulSoup', '3.0.5', 'beautifulsoup', 'python-beautifulsoup', 'python-BeautifulSoup'),
('dnspython', '1.6.0', 'dnspython', 'dnspython', 'dnspython', 'dnspython'), ('dnspython', '1.6.0', 'dnspython', 'dnspython', 'dnspython', 'dnspython'),
('poppler', '0.10.5', 'poppler', 'poppler', 'poppler', 'poppler'), ('poppler', '0.10.5', 'poppler', 'poppler', 'poppler', 'poppler'),
('pdftk', '1.12', 'pdftk', 'pdftk', 'pdftk', 'pdftk'), ('podofo', '0.7', 'podofo', 'podofo', 'podofo', 'podofo'),
] ]

View File

@ -49,7 +49,7 @@
</p> </p>
<p> <p>
${app} is available in the software repositories of the following ${app} is available in the software repositories of the following
linux distributions: supported linux distributions:
<table id="install_info"> <table id="install_info">
<col width="150" /><col width="*" /> <col width="150" /><col width="*" />
<tr> <tr>

View File

@ -42,7 +42,7 @@ recipe_modules = ['recipe_' + r for r in (
'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna', 'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms', 'seattle_times', 'scott_hanselman', 'coding_horror', 'twitchfilms',
'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews', 'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
'straitstimes', 'straitstimes', 'index_hu', 'pcworld_hu',
)] )]
import re, imp, inspect, time, os import re, imp, inspect, time, os

View File

@ -16,12 +16,14 @@ class Blic(BasicNewsRecipe):
description = 'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja' description = 'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
publisher = 'RINGIER d.o.o.' publisher = 'RINGIER d.o.o.'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
delay = 1
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
language = _('Serbian') language = _('Serbian')
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} ' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} '
html2lrf_options = [ html2lrf_options = [
@ -45,26 +47,14 @@ class Blic(BasicNewsRecipe):
start_url, question, rest_url = url.partition('?') start_url, question, rest_url = url.partition('?')
return u'http://www.blic.rs/_print.php?' + rest_url return u'http://www.blic.rs/_print.php?' + rest_url
def cleanup_image_tags(self,soup):
    """Strip layout attributes from every <img> and wrap each one in a <div>.

    For each image found in ``soup`` the 'height', 'width', 'border' and
    'align' attributes are deleted when present.  The image is then
    replaced, at the same position in its parent, by a new <div> that
    contains the image followed by a <br>.  Returns ``soup``.
    """
    # NOTE(review): nesting reconstructed from a flattened listing -- confirm
    # that the wrapping happens once per image and the return sits after the loop.
    layout_attrs = ['height','width','border','align']
    for img in soup.findAll('img'):
        for attr_name in layout_attrs:
            if img.has_key(attr_name):
                del img[attr_name]
        parent = img.parent
        # Look up the slot before detaching so the wrapper lands in the same place.
        slot = parent.contents.index(img)
        img.extract()
        wrapper = Tag(soup,'div')
        parent.insert(slot,wrapper)
        wrapper.append(img)
        wrapper.append(Tag(soup,'br'))
    return soup
def preprocess_html(self, soup): def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>' mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mtag) soup.head.insert(0,mlang)
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
return self.cleanup_image_tags(soup) return self.adeify_images(soup)
def get_article_url(self, article):
    """Return the entry's link with the old '.co.yu' host rewritten to '.rs'.

    ``article`` is a mapping-like feed entry whose 'link' value is read
    with ``get``.  Returns None when the entry carries no link, rather
    than failing with AttributeError on ``None.replace``.
    """
    raw = article.get('link', None)
    if raw is None:
        # Entry without a link: there is nothing to rewrite.
        return None
    return raw.replace('.co.yu','.rs')

View File

@ -0,0 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Index(BasicNewsRecipe):
    """News recipe for index.hu, assembled from the site's RSS feeds."""

    # Recipe metadata.
    title = u'INDEX.HU'
    __author__ = 'Ezmegaz'
    language = _('Hungarian')

    # Download limits applied to every feed below.
    oldest_article = 3
    max_articles_per_feed = 50

    # (section title, feed URL) pairs; titles use escaped Hungarian letters.
    feeds = [
        (u'ALL', u'http://index.hu/24ora/rss/'),
        (u'BELF\xd6LD', u'http://index.hu/belfold/rss/default/'),
        (u'K\xdcLF\xd6LD', u'http://index.hu/kulfold/rss/default/'),
        (u'BULV\xc1R', u'http://index.hu/bulvar/rss/default/'),
        (u'GAZDAS\xc1G', u'http://index.hu/gazdasag/rss/default/'),
        (u'TECH', u'http://index.hu/tech/rss/main/'),
        (u'KULT\xdaRA', u'http://index.hu/kultur/rss/main/'),
        (u'TUDOM\xc1NY', u'http://index.hu/tudomany/rss/main/'),
        (u'V\xc9LEM\xc9NY', u'http://index.hu/velemeny/rss/default/'),
    ]

View File

@ -8,12 +8,13 @@ nin.co.rs
import re, urllib import re, urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Nin(BasicNewsRecipe): class Nin(BasicNewsRecipe):
title = 'NIN online' title = 'NIN online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Nedeljne informativne novine' description = 'Nedeljne informativne novine'
publisher = 'NIN' publisher = 'NIN D.O.O.'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
no_stylesheets = True no_stylesheets = True
oldest_article = 15 oldest_article = 15
@ -28,9 +29,9 @@ class Nin(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
use_embedded_content = False use_embedded_content = False
language = _('Serbian') language = _('Serbian')
lang = 'sr-RS' lang = 'sr-Latn-RS'
direction = 'ltr' direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} .artTitle{font-size: x-large; font-weight: bold} .columnhead{font-size: small; font-weight: bold}'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
@ -70,9 +71,10 @@ class Nin(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mtag = '<meta http-equiv="Content-Language" content="' + self.lang + '"/>' mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mtag += '\n<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '"/>' mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mtag) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
return soup return soup

View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class Index(BasicNewsRecipe):
    """News recipe for pcworld.hu, assembled from the site's RSS feeds."""
    # NOTE(review): the class is called Index although the title is PCWORLD.HU;
    # the name is left untouched because it is this block's public identifier.

    # Recipe metadata.
    title = u'PCWORLD.HU'
    __author__ = 'Ezmegaz'
    language = _('Hungarian')

    # Download limits applied to every feed below.
    oldest_article = 3
    max_articles_per_feed = 50

    # (section title, feed URL) pairs; titles use escaped Hungarian letters.
    feeds = [
        (u'H\xedrek', u'http://pcworld.hu/rss/rss.xml'),
        (u'Hardver h\xedrek', u'http://www.pcworld.hu/rss/rss_hardverhirek.xml'),
        (u'Szoftver h\xedrek', u'http://www.pcworld.hu/rss/rss_szoftverhirek.xml'),
        (u'Hardver cikkek', u'http://www.pcworld.hu/rss/rss_hardvercikkek.xml'),
        (u'Szoftver cikkek', u'http://www.pcworld.hu/rss/rss_szoftvercikkek.xml'),
        (u'Mobil h\xedrek', u'http://www.pcworld.hu/rss/rss_mobil.xml'),
        (u'\xdczleti h\xedrek', u'http://www.pcworld.hu/rss/rss_uzlet.xml'),
        (u'Let\xf6lt\xe9sek', u'http://www.pcworld.hu/rss/rss_letoltes.xml'),
        (u'PC World TV', u'http://tv.pcworld.hu/rss/rss_hun_pcw.xml'),
        (u'Tudta-e...?', u'http://pcworld.hu/rss/rss_tudtae.xml'),
    ]

View File

@ -10,6 +10,7 @@ pobjeda.co.me
import re import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Pobjeda(BasicNewsRecipe): class Pobjeda(BasicNewsRecipe):
title = 'Pobjeda Online' title = 'Pobjeda Online'
@ -22,12 +23,13 @@ class Pobjeda(BasicNewsRecipe):
encoding = 'utf8' encoding = 'utf8'
remove_javascript = True remove_javascript = True
use_embedded_content = False use_embedded_content = False
language = _('Serbian')
lang = 'sr-Latn-Me'
INDEX = u'http://www.pobjeda.co.me' INDEX = u'http://www.pobjeda.co.me'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: serif1, serif}' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10'
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
@ -59,11 +61,13 @@ class Pobjeda(BasicNewsRecipe):
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['xml:lang'] = 'sr-Latn-ME' soup.html['xml:lang'] = self.lang
soup.html['lang'] = 'sr-Latn-ME' soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>' mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mtag) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
return soup soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None

View File

@ -1,39 +1,48 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
sptimes.ru sptimes.ru
''' '''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class PetersburgTimes(BasicNewsRecipe): class PetersburgTimes(BasicNewsRecipe):
title = u'The St. Petersburg Times' title = 'The St. Petersburg Times'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Russia' description = 'News from Russia'
oldest_article = 7 publisher = 'sptimes.ru'
category = 'news, politics, Russia'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
remove_javascript = True
encoding = 'cp1251'
use_embedded_content = False use_embedded_content = False
language = _('English') language = _('English')
INDEX = 'http://www.sptimes.ru'
def parse_index(self): html2lrf_options = [
articles = [] '--comment', description
soup = self.index_to_soup(self.INDEX) , '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
remove_tags = [dict(name=['object','link','embed'])]
feeds = [(u'Headlines', u'http://sptimes.ru/headlines.php' )]
def preprocess_html(self, soup):
    """Pass the parsed page through ``self.adeify_images`` and return the result."""
    adeified = self.adeify_images(soup)
    return adeified
def get_article_url(self, article):
    """Use the feed entry's 'guid' value as the article URL (None if absent)."""
    return article.get('guid', None)
def print_version(self, url):
    """Build the print-view URL from the text after the last '/' in ``url``."""
    # rpartition yields (head, separator, tail); only the tail is needed here.
    story_id = url.rpartition('/')[2]
    return u'http://www.sptimes.ru/index.php?action_id=100&story_id=' + story_id
for item in soup.findAll('a', attrs={'class':'story_link_o'}):
if item.has_key('href'):
url = self.INDEX + item['href'].replace('action_id=2','action_id=100')
title = self.tag_to_string(item)
c_date = strftime('%A, %d %B, %Y')
description = ''
articles.append({
'title':title,
'date':c_date,
'url':url,
'description':description
})
return [(soup.head.title.string, articles)]

View File

@ -9,6 +9,7 @@ vijesti.me
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Vijesti(BasicNewsRecipe): class Vijesti(BasicNewsRecipe):
title = 'Vijesti' title = 'Vijesti'
@ -16,8 +17,8 @@ class Vijesti(BasicNewsRecipe):
description = 'News from Montenegro' description = 'News from Montenegro'
publisher = 'Daily Press Vijesti' publisher = 'Daily Press Vijesti'
category = 'news, politics, Montenegro' category = 'news, politics, Montenegro'
oldest_article = 1 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 150
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
encoding = 'cp1250' encoding = 'cp1250'
@ -25,7 +26,8 @@ class Vijesti(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
use_embedded_content = False use_embedded_content = False
language = _('Serbian') language = _('Serbian')
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' lang ='sr-Latn-Me'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
@ -44,12 +46,15 @@ class Vijesti(BasicNewsRecipe):
feeds = [(u'Sve vijesti', u'http://www.vijesti.me/rss.php' )] feeds = [(u'Sve vijesti', u'http://www.vijesti.me/rss.php' )]
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['xml:lang'] = 'sr-Latn-ME' soup.html['xml:lang'] = self.lang
soup.html['lang'] = 'sr-Latn-ME' soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>' mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mtag) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
for item in soup.findAll('img'): soup.head.insert(0,mlang)
if item.has_key('align'): soup.head.insert(1,mcharset)
del item['align'] return self.adeify_images(soup)
item.insert(0,'<br /><br />')
return soup def get_article_url(self, article):
raw = article.get('link', None)
return raw.replace('.cg.yu','.me')

View File

@ -9,6 +9,7 @@ vreme.com
import re import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Vreme(BasicNewsRecipe): class Vreme(BasicNewsRecipe):
title = 'Vreme' title = 'Vreme'
@ -27,7 +28,7 @@ class Vreme(BasicNewsRecipe):
language = _('Serbian') language = _('Serbian')
lang = 'sr-Latn-RS' lang = 'sr-Latn-RS'
direction = 'ltr' direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: serif1, serif}' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .heading1{font-size: x-large; font-weight: bold} .heading2{font-size: large; font-weight: bold} .toc-heading{font-size: small}'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
@ -89,9 +90,10 @@ class Vreme(BasicNewsRecipe):
del item['size'] del item['size']
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mtag = '<meta http-equiv="Content-Language" content="' + self.lang + '"/>' mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mtag += '\n<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '"/>' mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mtag) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup return soup
def get_cover_url(self): def get_cover_url(self):