From fd2888af180c20a3656736ef990e3d94599802b2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 16 Aug 2009 15:56:04 -0600 Subject: [PATCH] Update all Serbian and Croatian recipes to work with calibre 0.6 --- .../web/feeds/recipes/recipe_24sata.py | 25 +++---- .../web/feeds/recipes/recipe_24sata_rs.py | 66 +++++++++---------- src/calibre/web/feeds/recipes/recipe_b92.py | 38 ++++++----- src/calibre/web/feeds/recipes/recipe_blic.py | 33 ++++++---- src/calibre/web/feeds/recipes/recipe_borba.py | 42 ++++++------ src/calibre/web/feeds/recipes/recipe_danas.py | 38 +++++++---- .../web/feeds/recipes/recipe_dnevni_avaz.py | 35 ++++++---- .../web/feeds/recipes/recipe_dnevnik_cro.py | 46 ++++++++----- .../web/feeds/recipes/recipe_e_novine.py | 28 ++++---- .../web/feeds/recipes/recipe_glas_srpske.py | 20 +++--- src/calibre/web/feeds/recipes/recipe_hrt.py | 14 ++-- .../web/feeds/recipes/recipe_jutarnji.py | 57 +++++++++------- .../web/feeds/recipes/recipe_nacional_cro.py | 29 ++++---- src/calibre/web/feeds/recipes/recipe_nin.py | 32 +++++---- .../web/feeds/recipes/recipe_novosti.py | 43 +++++++----- src/calibre/web/feeds/recipes/recipe_nspm.py | 57 +++++++--------- .../web/feeds/recipes/recipe_pescanik.py | 46 +++++++------ .../web/feeds/recipes/recipe_pobjeda.py | 20 +++--- .../web/feeds/recipes/recipe_politika.py | 29 ++++---- .../web/feeds/recipes/recipe_pressonline.py | 32 ++++----- src/calibre/web/feeds/recipes/recipe_rts.py | 14 ++-- .../web/feeds/recipes/recipe_spiegel_int.py | 24 ++++--- .../web/feeds/recipes/recipe_tanjug.py | 26 ++++---- .../web/feeds/recipes/recipe_twitchfilms.py | 19 +++--- .../web/feeds/recipes/recipe_vecernji_list.py | 38 ++++++----- .../web/feeds/recipes/recipe_vijesti.py | 19 +++--- src/calibre/web/feeds/recipes/recipe_vreme.py | 35 ++++++---- 27 files changed, 501 insertions(+), 404 deletions(-) diff --git a/src/calibre/web/feeds/recipes/recipe_24sata.py b/src/calibre/web/feeds/recipes/recipe_24sata.py index 004e7ff5e6..25fe556cd4 100644 --- 
a/src/calibre/web/feeds/recipes/recipe_24sata.py +++ b/src/calibre/web/feeds/recipes/recipe_24sata.py @@ -9,6 +9,7 @@ __copyright__ = '2009, Darko Miletic ' import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Cro24Sata(BasicNewsRecipe): title = '24 Sata - Hr' @@ -22,18 +23,18 @@ class Cro24Sata(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' use_embedded_content = False - remove_javascript = True language = _('Croatian') + lang = 'hr-HR' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -45,9 +46,11 @@ class Cro24Sata(BasicNewsRecipe): feeds = [(u'Najnovije Vijesti', u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')] def preprocess_html(self, soup): - soup.html['lang'] = 'hr-HR' - mtag = '\n' - soup.head.insert(0,mtag) + soup.html['lang'] = self.lang + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) for item in soup.findAll(style=True): del item['style'] return soup diff --git a/src/calibre/web/feeds/recipes/recipe_24sata_rs.py b/src/calibre/web/feeds/recipes/recipe_24sata_rs.py index 88860bf493..9ddee0be32 100644 --- a/src/calibre/web/feeds/recipes/recipe_24sata_rs.py +++ 
b/src/calibre/web/feeds/recipes/recipe_24sata_rs.py @@ -17,53 +17,51 @@ class Ser24Sata(BasicNewsRecipe): description = '24 sata portal vesti iz Srbije' publisher = 'Ringier d.o.o.' category = 'news, politics, entertainment, Serbia' - oldest_article = 1 + oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True encoding = 'utf-8' use_embedded_content = False - remove_javascript = True language = _('Serbian') - + lang = 'sr-Latn-RS' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')] - def cleanup_image_tags(self,soup): - for item in soup.findAll('img'): - for attrib in ['height','width','border','align']: - if item.has_key(attrib): - del item[attrib] - oldParent = item.parent - myIndex = oldParent.contents.index(item) - item.extract() - divtag = Tag(soup,'div') - brtag = Tag(soup,'br') - oldParent.insert(myIndex,divtag) - divtag.append(item) - divtag.append(brtag) - return soup - def preprocess_html(self, soup): - soup.html['xml:lang'] = 'sr-Latn-RS' - soup.html['lang'] = 'sr-Latn-RS' - mtag = '\n' - soup.head.insert(0,mtag) - return self.cleanup_image_tags(soup) + soup.html['xml:lang'] = self.lang + soup.html['lang'] = self.lang + + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + 
,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] + + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + return self.adeify_images(soup) def print_version(self, url): - article, sep, rest = url.partition('#') - article_base, sep2, article_id = article.partition('id=') - return 'http://www.24sata.co.rs/_print.php?id=' + article_id + article = url.partition('#')[0] + article_id = article.partition('id=')[2] + return 'http://www.24sata.rs/_print.php?id=' + article_id diff --git a/src/calibre/web/feeds/recipes/recipe_b92.py b/src/calibre/web/feeds/recipes/recipe_b92.py index 6f466dfaa8..98e1967bd2 100644 --- a/src/calibre/web/feeds/recipes/recipe_b92.py +++ b/src/calibre/web/feeds/recipes/recipe_b92.py @@ -14,23 +14,21 @@ class B92(BasicNewsRecipe): description = 'Dnevne vesti iz Srbije i sveta' publisher = 'B92' category = 'news, politics, Serbia' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - remove_javascript = True encoding = 'cp1250' language = _('Serbian') + lang = 'sr-Latn-RS' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em}"' + conversion_options = 
{ + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -39,6 +37,7 @@ class B92(BasicNewsRecipe): remove_tags = [ dict(name='ul', attrs={'class':'comment-nav'}) ,dict(name=['embed','link','base'] ) + ,dict(name='div', attrs={'class':'udokum'} ) ] feeds = [ @@ -51,14 +50,19 @@ class B92(BasicNewsRecipe): def preprocess_html(self, soup): del soup.body['onload'] - mtag = '' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(align=True): - del item['align'] for item in soup.findAll('font'): - item.name='p' + item.name='div' if item.has_key('size'): del item['size'] + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] return soup diff --git a/src/calibre/web/feeds/recipes/recipe_blic.py b/src/calibre/web/feeds/recipes/recipe_blic.py index e212e73218..5a2c290c60 100644 --- a/src/calibre/web/feeds/recipes/recipe_blic.py +++ b/src/calibre/web/feeds/recipes/recipe_blic.py @@ -26,15 +26,13 @@ class Blic(BasicNewsRecipe): lang = 'sr-Latn-RS' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} ' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + 
'"\nlinearize_tables=True\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} "' - + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + } + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] keep_only_tags = [dict(name='div', attrs={'class':'single_news'})] @@ -44,14 +42,21 @@ class Blic(BasicNewsRecipe): remove_tags = [dict(name=['object','link'])] def print_version(self, url): - start_url, question, rest_url = url.partition('?') + rest_url = url.partition('?')[2] return u'http://www.blic.rs/_print.php?' + rest_url def preprocess_html(self, soup): - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - soup.head.insert(0,mlang) - for item in soup.findAll(style=True): - del item['style'] + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] return self.adeify_images(soup) def get_article_url(self, article): diff --git a/src/calibre/web/feeds/recipes/recipe_borba.py b/src/calibre/web/feeds/recipes/recipe_borba.py index a7d8d9f0a4..827c94cacd 100644 --- a/src/calibre/web/feeds/recipes/recipe_borba.py +++ b/src/calibre/web/feeds/recipes/recipe_borba.py @@ -17,24 +17,23 @@ class Borba(BasicNewsRecipe): publisher = 'IP Novine Borba' category = 'news, politics, Serbia' language = _('Serbian') - oldest_article = 1 + lang = _('sr-Latn-RS') + oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True - encoding = 'utf8' - remove_javascript = True + encoding = 'utf-8' use_embedded_content = False cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg' INDEX = u'http://www.borba.rs/' - extra_css = 
'@font-face {font-family: "serif0";src:url(res:///Data/FONT/serif0.ttf)} @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif0, serif1, serif} .article_description{font-family: serif0, serif1, serif}' + extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} ' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -60,14 +59,17 @@ class Borba(BasicNewsRecipe): ] def preprocess_html(self, soup): - soup.html['xml:lang'] = 'sr-Latn-ME' - soup.html['lang'] = 'sr-Latn-ME' - mtag = '' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(font=True): - del item['font'] + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] return soup def parse_index(self): diff --git a/src/calibre/web/feeds/recipes/recipe_danas.py b/src/calibre/web/feeds/recipes/recipe_danas.py index 63a7b45738..8882b9db9d 100644 --- a/src/calibre/web/feeds/recipes/recipe_danas.py +++ 
b/src/calibre/web/feeds/recipes/recipe_danas.py @@ -7,9 +7,10 @@ danas.rs ''' import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Danas(BasicNewsRecipe): - title = u'Danas' + title = 'Danas' __author__ = 'Darko Miletic' description = 'Vesti' publisher = 'Danas d.o.o.' @@ -17,19 +18,19 @@ class Danas(BasicNewsRecipe): oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = False - remove_javascript = True use_embedded_content = False language = _('Serbian') + lang = 'sr-Latn-RS' + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -44,8 +45,17 @@ class Danas(BasicNewsRecipe): feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')] def preprocess_html(self, soup): - mtag = '' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' 
+ for attrib in attribs: + if item.has_key(attrib): + del item[attrib] return soup \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_dnevni_avaz.py b/src/calibre/web/feeds/recipes/recipe_dnevni_avaz.py index 5a55db9404..e4c47c3f2e 100644 --- a/src/calibre/web/feeds/recipes/recipe_dnevni_avaz.py +++ b/src/calibre/web/feeds/recipes/recipe_dnevni_avaz.py @@ -9,6 +9,7 @@ dnevniavaz.ba import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class DnevniAvaz(BasicNewsRecipe): title = 'Dnevni Avaz' @@ -25,17 +26,18 @@ class DnevniAvaz(BasicNewsRecipe): cover_url = 'http://www.dnevniavaz.ba/img/logo.gif' lang = 'bs-BA' language = _('Bosnian') + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' - + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] keep_only_tags = [dict(name='div', attrs={'id':['fullarticle-title','fullarticle-leading','fullarticle-date','fullarticle-text','articleauthor']})] @@ -47,9 +49,20 @@ class DnevniAvaz(BasicNewsRecipe): ,(u'Najpopularnije', u'http://www.dnevniavaz.ba/rss/popularno') ] + def replace_tagname(self,soup,tagname,tagid,newtagname): + headtag = soup.find(tagname,attrs={'id':tagid}) + if headtag: + headtag.name = newtagname + return + def preprocess_html(self, soup): soup.html['xml:lang'] = 
self.lang soup.html['lang'] = self.lang - mtag = '\n' - soup.head.insert(0,mtag) - return soup + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + self.replace_tagname(soup,'div','fullarticle-title' ,'h1') + self.replace_tagname(soup,'div','fullarticle-leading','h3') + self.replace_tagname(soup,'div','fullarticle-date' ,'h5') + return self.adeify_images(soup) diff --git a/src/calibre/web/feeds/recipes/recipe_dnevnik_cro.py b/src/calibre/web/feeds/recipes/recipe_dnevnik_cro.py index ada5ea22d5..b2c3013513 100644 --- a/src/calibre/web/feeds/recipes/recipe_dnevnik_cro.py +++ b/src/calibre/web/feeds/recipes/recipe_dnevnik_cro.py @@ -9,6 +9,7 @@ dnevnik.hr import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class DnevnikCro(BasicNewsRecipe): title = 'Dnevnik - Hr' @@ -22,19 +23,18 @@ class DnevnikCro(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' use_embedded_content = False - remove_javascript = True language = _('Croatian') - + lang = 'hr-HR' + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -51,10 +51,24 @@ class DnevnikCro(BasicNewsRecipe): feeds = 
[(u'Vijesti', u'http://rss.dnevnik.hr/index.rss')] def preprocess_html(self, soup): - soup.html['lang'] = 'hr-HR' - mtag = '\n' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - return soup + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] + + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + return self.adeify_images(soup) diff --git a/src/calibre/web/feeds/recipes/recipe_e_novine.py b/src/calibre/web/feeds/recipes/recipe_e_novine.py index 83654fe4c8..166f0087d1 100644 --- a/src/calibre/web/feeds/recipes/recipe_e_novine.py +++ b/src/calibre/web/feeds/recipes/recipe_e_novine.py @@ -9,6 +9,7 @@ e-novine.com import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class E_novine(BasicNewsRecipe): title = 'E-Novine' @@ -16,23 +17,22 @@ class E_novine(BasicNewsRecipe): description = 'News from Serbia' publisher = 'E-novine' category = 'news, politics, Balcans' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True encoding = 'cp1250' - cover_url = 'http://www.e-novine.com/slike/slike_3/r1/g2008/m03/y3165525326702598.jpg' - remove_javascript = True use_embedded_content = False language = _('Serbian') + lang = 'sr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: 
"sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -43,10 +43,10 @@ class E_novine(BasicNewsRecipe): feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml' )] def preprocess_html(self, soup): - soup.html['xml:lang'] = 'sr-Latn-ME' - soup.html['lang'] = 'sr-Latn-ME' - mtag = '' - soup.head.insert(0,mtag) + soup.html['xml:lang'] = self.lang + soup.html['lang'] = self.lang + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) for item in soup.findAll(style=True): del item['style'] ftag = soup.find('div', attrs={'id':'css_47_0_2844H'}) diff --git a/src/calibre/web/feeds/recipes/recipe_glas_srpske.py b/src/calibre/web/feeds/recipes/recipe_glas_srpske.py index c6329387ce..8e5624792d 100644 --- a/src/calibre/web/feeds/recipes/recipe_glas_srpske.py +++ b/src/calibre/web/feeds/recipes/recipe_glas_srpske.py @@ -9,6 +9,7 @@ glassrpske.com import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class GlasSrpske(BasicNewsRecipe): title = 'Glas Srpske' @@ -21,7 +22,6 @@ class GlasSrpske(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' use_embedded_content = False - remove_javascript = True cover_url = 
'http://www.glassrpske.com/var/slike/glassrpske-logo.png' lang = 'sr-BA' language = _('Serbian') @@ -29,13 +29,13 @@ class GlasSrpske(BasicNewsRecipe): extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -64,8 +64,8 @@ class GlasSrpske(BasicNewsRecipe): def preprocess_html(self, soup): soup.html['xml:lang'] = self.lang soup.html['lang'] = self.lang - mtag = '\n' - soup.head.insert(0,mtag) + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) return soup def parse_index(self): diff --git a/src/calibre/web/feeds/recipes/recipe_hrt.py b/src/calibre/web/feeds/recipes/recipe_hrt.py index d07b214e02..9e8e60b945 100644 --- a/src/calibre/web/feeds/recipes/recipe_hrt.py +++ b/src/calibre/web/feeds/recipes/recipe_hrt.py @@ -24,13 +24,13 @@ class HRT(BasicNewsRecipe): lang = 'hr-HR' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; 
margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] diff --git a/src/calibre/web/feeds/recipes/recipe_jutarnji.py b/src/calibre/web/feeds/recipes/recipe_jutarnji.py index e8826bc4e1..89164e801e 100644 --- a/src/calibre/web/feeds/recipes/recipe_jutarnji.py +++ b/src/calibre/web/feeds/recipes/recipe_jutarnji.py @@ -8,32 +8,32 @@ jutarnji.hr import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Jutarnji(BasicNewsRecipe): - title = u'Jutarnji' - __author__ = u'Darko Miletic' - description = u'Hrvatski portal' + title = 'Jutarnji' + __author__ = 'Darko Miletic' + description = 'Hrvatski portal' publisher = 'Jutarnji.hr' category = 'news, politics, Croatia' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 - simultaneous_downloads = 2 delay = 1 language = _('Croatian') no_stylesheets = True use_embedded_content = False - remove_javascript = True encoding = 'cp1250' - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' + lang = 'hr-HR' + direction = 'ltr' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} .vijestnaslov{font-size: x-large; font-weight: bold}' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , 
'--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -59,11 +59,24 @@ class Jutarnji(BasicNewsRecipe): return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + rrest def preprocess_html(self, soup): - mtag = '\n' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(width=True): - del item['width'] - return soup + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] + + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + return self.adeify_images(soup) \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_nacional_cro.py b/src/calibre/web/feeds/recipes/recipe_nacional_cro.py index bbb0112ceb..70156b8766 100644 --- a/src/calibre/web/feeds/recipes/recipe_nacional_cro.py +++ b/src/calibre/web/feeds/recipes/recipe_nacional_cro.py @@ -9,6 +9,7 @@ nacional.hr import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class NacionalCro(BasicNewsRecipe): title = 'Nacional - Hr' @@ -22,19 +23,20 @@ class NacionalCro(BasicNewsRecipe): 
no_stylesheets = True encoding = 'utf-8' use_embedded_content = False - remove_javascript = True language = _('Croatian') + lang = 'hr-HR' + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] remove_tags = [dict(name=['object','link','embed'])] @@ -42,9 +44,12 @@ class NacionalCro(BasicNewsRecipe): feeds = [(u'Najnovije Vijesti', u'http://www.nacional.hr/rss')] def preprocess_html(self, soup): - soup.html['lang'] = 'hr-HR' - mtag = '\n' - soup.head.insert(0,mtag) + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) for item in soup.findAll(style=True): del item['style'] return soup diff --git a/src/calibre/web/feeds/recipes/recipe_nin.py b/src/calibre/web/feeds/recipes/recipe_nin.py index 4de53a1049..f7492cffce 100644 --- a/src/calibre/web/feeds/recipes/recipe_nin.py +++ b/src/calibre/web/feeds/recipes/recipe_nin.py @@ -26,21 +26,19 @@ class Nin(BasicNewsRecipe): INDEX = PREFIX + '/?change_lang=ls' LOGIN = PREFIX + '/?logout=true' FEED = PREFIX + '/misc/rss.php?feed=RSS2.0' - remove_javascript = True use_embedded_content = False language = _('Serbian') lang = 'sr-Latn-RS' direction = 'ltr' extra_css = '@font-face 
{font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} .artTitle{font-size: x-large; font-weight: bold} .columnhead{font-size: small; font-weight: bold}' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -74,12 +72,20 @@ class Nin(BasicNewsRecipe): mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) soup.head.insert(0,mlang) - soup.head.insert(1,mcharset) - for item in soup.findAll(style=True): - del item['style'] + soup.head.insert(1,mcharset) + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] return soup def get_article_url(self, article): raw = article.get('link', None) return raw.replace('.co.yu','.co.rs') - \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_novosti.py b/src/calibre/web/feeds/recipes/recipe_novosti.py index 0190307542..165a04329a 100644 --- a/src/calibre/web/feeds/recipes/recipe_novosti.py +++ b/src/calibre/web/feeds/recipes/recipe_novosti.py @@ -8,30 
+8,30 @@ novosti.rs import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Novosti(BasicNewsRecipe): - title = u'Vecernje Novosti' - __author__ = u'Darko Miletic' - description = u'Vesti' + title = 'Vecernje Novosti' + __author__ = 'Darko Miletic' + description = 'Vesti' publisher = 'Kompanija Novosti' category = 'news, politics, Serbia' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - encoding = 'utf8' - remove_javascript = True + encoding = 'utf-8' language = _('Serbian') + lang = 'sr-Latn-RS' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -41,8 +41,17 @@ class Novosti(BasicNewsRecipe): feeds = [(u'Vesti', u'http://www.novosti.rs/php/vesti/rss.php')] def preprocess_html(self, soup): - mtag = '' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in 
soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] return soup diff --git a/src/calibre/web/feeds/recipes/recipe_nspm.py b/src/calibre/web/feeds/recipes/recipe_nspm.py index 360fb35c35..f90e971c37 100644 --- a/src/calibre/web/feeds/recipes/recipe_nspm.py +++ b/src/calibre/web/feeds/recipes/recipe_nspm.py @@ -21,19 +21,18 @@ class Nspm(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False INDEX = 'http://www.nspm.rs/?alphabet=l' - encoding = 'utf8' - remove_javascript = True + encoding = 'utf-8' language = _('Serbian') + lang = 'sr-Latn-RS' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] remove_tags = [ @@ -51,28 +50,18 @@ class Nspm(BasicNewsRecipe): def print_version(self, url): return url.replace('.html','/stampa.html') - def cleanup_image_tags(self,soup): - for item in soup.findAll('img'): - for attrib in ['height','width','border','align']: - if item.has_key(attrib): - del item[attrib] - oldParent = item.parent - myIndex = oldParent.contents.index(item) - item.extract() - divtag = Tag(soup,'div') - brtag = Tag(soup,'br') - oldParent.insert(myIndex,divtag) - divtag.append(item) - 
divtag.append(brtag) - return soup - def preprocess_html(self, soup): - lng = 'sr-Latn-RS' - soup.html['xml:lang'] = lng - soup.html['lang'] = lng - ftag = soup.find('meta',attrs={'http-equiv':'Content-Language'}) - if ftag: - ftag['content'] = lng - for item in soup.findAll(style=True): - del item['style'] - return self.cleanup_image_tags(soup) + soup.html['xml:lang'] = self.lang + soup.html['lang'] = self.lang + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] + return self.adeify_images(soup) diff --git a/src/calibre/web/feeds/recipes/recipe_pescanik.py b/src/calibre/web/feeds/recipes/recipe_pescanik.py index 278ed38183..adc4d67a8e 100644 --- a/src/calibre/web/feeds/recipes/recipe_pescanik.py +++ b/src/calibre/web/feeds/recipes/recipe_pescanik.py @@ -8,6 +8,7 @@ pescanik.net import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Pescanik(BasicNewsRecipe): title = 'Pescanik' @@ -19,20 +20,18 @@ class Pescanik(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - remove_javascript = True - encoding = 'utf8' - cover_url = "http://pescanik.net/templates/ja_teline/images/logo.png" + encoding = 'utf-8' language = _('Serbian') - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' + lang = 'sr-Latn-RS' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: 
"sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} .contentheading{font-size: x-large; font-weight: bold} .small{font-size: small} .createdate{font-size: x-small; font-weight: bold}' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -40,18 +39,27 @@ class Pescanik(BasicNewsRecipe): remove_tags = [ dict(name='td' , attrs={'class':'buttonheading'}) ,dict(name='span', attrs={'class':'article_seperator'}) - ,dict(name=['object','link','img','h4','ul']) + ,dict(name=['object','link','h4','ul']) ] - feeds = [(u'Pescanik Online', u'http://pescanik.net/index.php?option=com_rd_rss&id=12')] + feeds = [(u'Pescanik Online', u'http://www.pescanik.net/index.php?option=com_rd_rss&id=12')] def print_version(self, url): nurl = url.replace('/index.php','/index2.php') return nurl + '&pop=1&page=0' def preprocess_html(self, soup): - mtag = '' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - return soup + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] + return self.adeify_images(soup) diff 
--git a/src/calibre/web/feeds/recipes/recipe_pobjeda.py b/src/calibre/web/feeds/recipes/recipe_pobjeda.py index 6078e6ba0a..6ecc048d2c 100644 --- a/src/calibre/web/feeds/recipes/recipe_pobjeda.py +++ b/src/calibre/web/feeds/recipes/recipe_pobjeda.py @@ -19,22 +19,20 @@ class Pobjeda(BasicNewsRecipe): publisher = 'Pobjeda a.d.' category = 'news, politics, Montenegro' no_stylesheets = True - remove_javascript = True - encoding = 'utf8' - remove_javascript = True + encoding = 'utf-8' use_embedded_content = False - language = _('Serbian') + language = _('Montenegrin') lang = 'sr-Latn-Me' INDEX = u'http://www.pobjeda.co.me' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] diff --git a/src/calibre/web/feeds/recipes/recipe_politika.py b/src/calibre/web/feeds/recipes/recipe_politika.py index 93c8f43b36..2015271ca4 100644 --- a/src/calibre/web/feeds/recipes/recipe_politika.py +++ b/src/calibre/web/feeds/recipes/recipe_politika.py @@ -1,15 +1,16 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' politika.rs ''' import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Politika(BasicNewsRecipe): - title = u'Politika Online' + title = 
'Politika Online' __author__ = 'Darko Miletic' description = 'Najstariji dnevni list na Balkanu' publisher = 'Politika novine i Magazini d.o.o' @@ -21,16 +22,18 @@ class Politika(BasicNewsRecipe): remove_javascript = True encoding = 'utf8' language = _('Serbian') + lang = 'sr-Latn-RS' + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -55,11 +58,13 @@ class Politika(BasicNewsRecipe): ] def preprocess_html(self, soup): - mtag = '' - soup.head.insert(0,mtag) + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) for item in soup.findAll(style=True): del item['style'] ftag = soup.find('div',attrs={'class':'content_center_border'}) if ftag.has_key('align'): del ftag['align'] - return soup + return self.adeify_images(soup) diff --git a/src/calibre/web/feeds/recipes/recipe_pressonline.py b/src/calibre/web/feeds/recipes/recipe_pressonline.py index 71f69b9169..184e80a24f 100644 --- a/src/calibre/web/feeds/recipes/recipe_pressonline.py +++ b/src/calibre/web/feeds/recipes/recipe_pressonline.py @@ -9,6 +9,7 @@ pressonline.rs import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class 
PressOnline(BasicNewsRecipe): title = 'Press Online' @@ -19,20 +20,21 @@ class PressOnline(BasicNewsRecipe): oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True - encoding = 'utf8' + encoding = 'utf-8' use_embedded_content = True - cover_url = 'http://www.pressonline.rs/img/logo.gif' language = _('Serbian') + lang = 'sr-Latn-RS' + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -57,10 +59,8 @@ class PressOnline(BasicNewsRecipe): ] def preprocess_html(self, soup): - soup.html['xml:lang'] = 'sr-Latn-RS' - soup.html['lang'] = 'sr-Latn-RS' - mtag = '\n' - soup.head.insert(0,mtag) - for img in soup.findAll('img', align=True): - del img['align'] - return soup \ No newline at end of file + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) + return self.adeify_images(soup) \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_rts.py b/src/calibre/web/feeds/recipes/recipe_rts.py index 57ee346d62..ebde793723 100644 --- a/src/calibre/web/feeds/recipes/recipe_rts.py +++ b/src/calibre/web/feeds/recipes/recipe_rts.py @@ -24,13 +24,13 @@ class RTS(BasicNewsRecipe): lang = 'sr-Latn-RS' extra_css = 
'@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] diff --git a/src/calibre/web/feeds/recipes/recipe_spiegel_int.py b/src/calibre/web/feeds/recipes/recipe_spiegel_int.py index 3326e2b5ca..0040c76f98 100644 --- a/src/calibre/web/feeds/recipes/recipe_spiegel_int.py +++ b/src/calibre/web/feeds/recipes/recipe_spiegel_int.py @@ -1,7 +1,7 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' spiegel.de ''' @@ -9,21 +9,25 @@ spiegel.de from calibre.web.feeds.news import BasicNewsRecipe class Spiegel_int(BasicNewsRecipe): - title = u'Spiegel Online International' + title = 'Spiegel Online International' __author__ = 'Darko Miletic' description = "News and POV from Europe's largest newsmagazine" oldest_article = 7 max_articles_per_feed = 100 - language = _('English') + language = _('English') no_stylesheets = True use_embedded_content = False - cover_url = 'http://www.spiegel.de/static/sys/v8/headlines/spiegelonline.gif' - html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, politics, Germany' - , '--publisher', 'SPIEGEL ONLINE GmbH' - ] + publisher = 'SPIEGEL ONLINE GmbH' + category = 'news, politics, Germany' + lang = 'en' + + conversion_options = { + 'comments' : 
description + ,'tags' : category + ,'language' : lang + ,'publisher' : publisher + ,'pretty_print': True + } remove_tags_after = dict(name='div', attrs={'id':'spArticleBody'}) diff --git a/src/calibre/web/feeds/recipes/recipe_tanjug.py b/src/calibre/web/feeds/recipes/recipe_tanjug.py index 9a8acfaca7..c7dddb8bc1 100644 --- a/src/calibre/web/feeds/recipes/recipe_tanjug.py +++ b/src/calibre/web/feeds/recipes/recipe_tanjug.py @@ -7,6 +7,7 @@ tanjug.rs ''' import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Tanjug(BasicNewsRecipe): title = 'Tanjug' @@ -14,21 +15,22 @@ class Tanjug(BasicNewsRecipe): description = 'Novinska agencija TANJUG - Dnevne vesti iz Srbije i sveta' publisher = 'Tanjug' category = 'news, politics, Serbia' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = True encoding = 'utf-8' lang = 'sr-Latn-RS' language = _('Serbian') + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em}"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -37,7 +39,7 @@ class Tanjug(BasicNewsRecipe): def preprocess_html(self, soup): soup.html['xml:lang'] = self.lang soup.html['lang' ] = self.lang - soup.html['dir' ] = "ltr" - mtag = '' - soup.head.insert(0,mtag) - return soup + soup.html['dir' ] = self.direction + mlang = 
Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + soup.head.insert(0,mlang) + return self.adeify_images(soup) diff --git a/src/calibre/web/feeds/recipes/recipe_twitchfilms.py b/src/calibre/web/feeds/recipes/recipe_twitchfilms.py index dc0bcac88b..395920c4da 100644 --- a/src/calibre/web/feeds/recipes/recipe_twitchfilms.py +++ b/src/calibre/web/feeds/recipes/recipe_twitchfilms.py @@ -20,14 +20,15 @@ class Twitchfilm(BasicNewsRecipe): publisher = 'Twitch' category = 'twitch, twitchfilm, movie news, movie reviews, cult cinema, independent cinema, anime, foreign cinema, geek talk' language = _('English') + lang = 'en-US' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } remove_tags = [dict(name='div', attrs={'class':'feedflare'})] @@ -36,6 +37,6 @@ class Twitchfilm(BasicNewsRecipe): def preprocess_html(self, soup): mtag = Tag(soup,'meta',[('http-equiv','Content-Type'),('context','text/html; charset=utf-8')]) soup.head.insert(0,mtag) - soup.html['lang'] = 'en-US' - return soup + soup.html['lang'] = self.lang + return self.adeify_images(soup) diff --git a/src/calibre/web/feeds/recipes/recipe_vecernji_list.py b/src/calibre/web/feeds/recipes/recipe_vecernji_list.py index b34c49d9fa..0c76739b1e 100644 --- a/src/calibre/web/feeds/recipes/recipe_vecernji_list.py +++ b/src/calibre/web/feeds/recipes/recipe_vecernji_list.py @@ -9,6 +9,7 @@ www.vecernji.hr import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class VecernjiList(BasicNewsRecipe): title = 'Vecernji List' @@ -18,23 +19,23 @@ class VecernjiList(BasicNewsRecipe): category = 'news, politics, Croatia' 
oldest_article = 2 max_articles_per_feed = 100 - delay = 4 + delay = 1 no_stylesheets = True encoding = 'utf-8' use_embedded_content = False - remove_javascript = True language = _('Croatian') + lang = 'hr-HR' + direction = 'ltr' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -46,13 +47,16 @@ class VecernjiList(BasicNewsRecipe): feeds = [(u'Vijesti', u'http://www.vecernji.hr/rss/')] def preprocess_html(self, soup): - soup.html['lang'] = 'hr-HR' - mtag = '\n' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - return soup + soup.html['lang'] = self.lang + soup.html['dir' ] = self.direction + + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) + mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) + soup.head.insert(0,mlang) + soup.head.insert(1,mcharset) + return self.adeify_images(soup) def print_version(self, url): - return url.replace('/index.do','/print.do') + artid = url.rpartition('-')[2] + return 'http://www.vecernji.hr/index.php?cmd=show_clanak&action=print_popup&clanak_id='+artid \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_vijesti.py b/src/calibre/web/feeds/recipes/recipe_vijesti.py index 9ef32e636c..fe7d5b0442 100644 --- a/src/calibre/web/feeds/recipes/recipe_vijesti.py +++ 
b/src/calibre/web/feeds/recipes/recipe_vijesti.py @@ -20,22 +20,19 @@ class Vijesti(BasicNewsRecipe): oldest_article = 2 max_articles_per_feed = 150 no_stylesheets = True - remove_javascript = True encoding = 'cp1250' - cover_url = 'http://www.vijesti.me/img/logo.gif' - remove_javascript = True use_embedded_content = False - language = _('Serbian') + language = _('Montenegrin') lang ='sr-Latn-Me' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] diff --git a/src/calibre/web/feeds/recipes/recipe_vreme.py b/src/calibre/web/feeds/recipes/recipe_vreme.py index 1f42658d4f..02addb3f23 100644 --- a/src/calibre/web/feeds/recipes/recipe_vreme.py +++ b/src/calibre/web/feeds/recipes/recipe_vreme.py @@ -22,22 +22,20 @@ class Vreme(BasicNewsRecipe): needs_subscription = True INDEX = 'http://www.vreme.com' LOGIN = 'http://www.vreme.com/account/login.php?url=%2F' - remove_javascript = True use_embedded_content = False encoding = 'utf-8' language = _('Serbian') lang = 'sr-Latn-RS' direction = 'ltr' - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .heading1{font-family: sans1, sans-serif; font-size: x-large; 
font-weight: bold} .heading2{font-family: sans1, sans-serif; font-size: large; font-weight: bold} .toc-heading{font-family: sans1, sans-serif; font-size: small} .column-heading2{font-family: sans1, sans-serif; font-size: large} .column-heading1{font-family: sans1, sans-serif; font-size: x-large} .column-normal{font-family: sans1, sans-serif; font-size: medium} .large{font-family: sans1, sans-serif; font-size: large} ' + extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .heading1{font-family: sans1, sans-serif; font-size: x-large; font-weight: bold} .heading2{font-family: sans1, sans-serif; font-size: large; font-weight: bold} .toc-heading{font-family: sans1, sans-serif; font-size: small} .column-heading2{font-family: sans1, sans-serif; font-size: large} .column-heading1{font-family: sans1, sans-serif; font-size: x-large} .column-normal{font-family: sans1, sans-serif; font-size: medium} .large{font-family: sans1, sans-serif; font-size: large} ' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + , 'pretty_print' : True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -84,12 +82,21 @@ class Vreme(BasicNewsRecipe): del soup.body['text' ] del soup.body['bgcolor'] del soup.body['onload' ] - for item in soup.findAll(face=True): - del item['face'] - for item in soup.findAll(size=True): - del item['size'] 
soup.html['lang'] = self.lang soup.html['dir' ] = self.direction + + attribs = [ 'style','font','valign' + ,'colspan','width','height' + ,'rowspan','summary','align' + ,'cellspacing','cellpadding' + ,'frames','rules','border' + ] + for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): + item.name = 'div' + for attrib in attribs: + if item.has_key(attrib): + del item[attrib] + mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) soup.head.insert(0,mlang)