diff --git a/Changelog.yaml b/Changelog.yaml
index a7fc86c98e..75bea147cb 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,68 @@
 # new recipes:
 #   - title:
 
+- version: 0.8.41
+  date: 2012-02-24
+
+  new features:
+    - title: "Driver for Sony Experia Play 4G"
+      tickets: [938831]
+
+    - title: "News download system: Allow use of __future__ in recipes, and do not change line numbers of code in the recipe when compiling it"
+
+    - title: "Use the My Documents folder as the default location for the Calibre Library folder on first start in windows"
+      tickets: [934840]
+
+    - title: "Add a tweak to Preferences->Tweaks to control the order in which categories appear in the Tag Browser"
+
+    - title: "Tag Browser: Add an entry to the right click menu to quickly delete tags"
+      tickets: [934509]
+
+    - title: "Amazon metadata download: Try to scrape series information from the amazon details page. Note that currently very few books have series info available. Often the page for hardcover will have series, but the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it ends up using."
+
+    - title: "Content server: Add favicon to OPDS feeds."
+      tickets: [934731]
+
+  bug fixes:
+    - title: "RTF Input: Fix some WMF images embedded in RTF files being distorted on conversion."
+      tickets: [934167]
+
+    - title: "Fix long standing bug preventing calibre from working on east asian windows installs when the user name in windows has non-ascii characters"
+      tickets: [937389]
+
+    - title: "Get Books: Fix Baen Webscription and O'Reilly stores. Fix price detection for Google Books"
+
+    - title: "MOBI Output: When the same anchor is present more than once in the input document, use the first occurrence rather than the last one."
+      tickets: [934031]
+
+    - title: "Use the 'default cover font' tweak when generating default masthead images as well"
+      tickets: [939256]
+
+    - title: "Fix content server does not correctly display custom field of type 'rating'"
+      tickets: [938303]
+
+    - title: "Fix welcome wizard does not save send-from email info unless send-to field is filled"
+      tickets: [937087]
+
+    - title: "When reading metadata from odt files, use initial-creator in preference to creator for setting the author field"
+      tickets: [934564]
+
+    - title: "Fix conversion erroring out when the input document has very long and thin images"
+      tickets: [935234]
+
+  improved recipes:
+    - The Sun
+    - Various Polish news sources
+    - Mediapart
+
+  new recipes:
+    - title: La pausa caffe
+      author: faber1971
+
+    - title: Various Polish news sources
+      author: fenuks
+
+
 - version: 0.8.40
   date: 2012-02-17
 
diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe
index 3c93d3644f..e121ba4d42 100644
--- a/recipes/archeowiesci.recipe
+++ b/recipes/archeowiesci.recipe
@@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe):
     language = 'pl'
     cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
     oldest_article = 7
+    needs_subscription='optional'
     max_articles_per_feed = 100
     auto_cleanup = True
     remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})]
@@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe):
         feeds = BasicNewsRecipe.parse_feeds(self)
         for feed in feeds:
             for article in feed.articles[:]:
-                if 'subskrypcja' in article.title:
+                if self.username is None and 'subskrypcja' in article.title:
                     feed.articles.remove(article)
         return feeds
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open('http://archeowiesci.pl/wp-login.php')
+            br.select_form(name='loginform')
+            br['log'] = self.username
+            br['pwd'] = self.password
+            br.submit()
+        return br
\ No newline at end of file
diff --git a/recipes/astronomia_pl.recipe b/recipes/astronomia_pl.recipe
index a142520ec5..89a0e4c889 100644
--- a/recipes/astronomia_pl.recipe
+++ b/recipes/astronomia_pl.recipe
@@ -1,15 +1,18 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
 class Astronomia_pl(BasicNewsRecipe):
     title = u'Astronomia.pl'
     __author__ = 'fenuks'
     description = 'Astronomia - polish astronomy site'
+    masthead_url = 'http://www.astronomia.pl/grafika/logo.gif'
     cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
     category = 'astronomy, science'
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    #no_stylesheets=True
+    extra_css='#h2 {font-size: 18px;}'
+    no_stylesheets=True
+    preprocess_regexps = [(re.compile(ur'Przeczytaj także:.*?', re.DOTALL), lambda match: '') ]
     remove_tags_before=dict(name='div', attrs={'id':'a1'})
     keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})]
     feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')]
diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe
index d5b4997aa7..cc74cc9128 100644
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe):
     title = u'Benchmark.pl'
     __author__ = 'fenuks'
     description = u'benchmark.pl -IT site'
+    masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
     cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
     category = 'IT'
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets=True
-    preprocess_regexps = [(re.compile(ur'\bWięcej o .*', re.DOTALL|re.IGNORECASE), lambda match: '')]
+    preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/biolog_pl.recipe b/recipes/biolog_pl.recipe index af9ad77e44..b10bf0d925 100644 --- a/recipes/biolog_pl.recipe +++ b/recipes/biolog_pl.recipe @@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe): description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.' category = 'biology' language = 'pl' + masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png' cover_url='http://www.biolog.pl/naukowy,portal,biolog.png' no_stylesheets = True #keeps_only_tags=[dict(id='main')] remove_tags_before=dict(id='main') remove_tags_after=dict(name='a', attrs={'name':'komentarze'}) - remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})] + remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})] feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')] diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe index b4cf6b326c..ff46774dc9 100644 --- a/recipes/cd_action.recipe +++ b/recipes/cd_action.recipe @@ -1,16 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe - class CD_Action(BasicNewsRecipe): title = u'CD-Action' __author__ = 'fenuks' - description = 'cdaction.pl - polish magazine about games site' + description = 'cdaction.pl - polish games magazine site' category = 'games' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG' keep_only_tags= dict(id='news_content') remove_tags_after= dict(name='div', attrs={'class':'tresc'}) feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')] + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.cdaction.pl/magazyn/') + self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe index 591155ff85..673a9f940b 100644 --- a/recipes/cgm_pl.recipe +++ b/recipes/cgm_pl.recipe @@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe): oldest_article = 7 __author__ = 
'fenuks' description = u'Codzienna Gazeta Muzyczna' + masthead_url='http://www.cgm.pl/img/header/logo.gif' cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg' category = 'music' language = 'pl' @@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe): def preprocess_html(self, soup): + gallery=soup.find('div', attrs={'class':'galleryFlash'}) + if gallery: + img=gallery.div + gallery.img.extract() + if img: + img=img['style'] + img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')] + gallery.contents[1].name='img' + gallery.contents[1]['src']=img for item in soup.findAll(style=True): del item['style'] ad=soup.findAll('a') for r in ad: - if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']: + if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']: r.extract() - gallery=soup.find('div', attrs={'class':'galleryFlash'}) - if gallery: - img=gallery.find('embed') - if img: - img=img['src'][35:] - img='http://www.cgm.pl/_vault/_gallery/_photo/'+img - param=gallery.findAll(name='param') - for i in param: - i.extract() - gallery.contents[1].name='img' - gallery.contents[1]['src']=img return soup \ No newline at end of file diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 6f41b95763..50b626fcbf 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + requires_version = (0, 8, 39) + + def preprocess_raw_html(self, raw, url): + try: + from html5lib import parse + root = parse(raw, namespaceHTMLElements=False, + treebuilder='lxml').getroot() + from lxml import etree + for tag in root.xpath( + '//script|//style|//noscript|//meta|//link|//object'): + tag.getparent().remove(tag) + for elem in list(root.iterdescendants(tag=etree.Comment)): + elem.getparent().remove(elem) + ans = etree.tostring(root, encoding=unicode) + ans = re.sub('.*', lambda match : ''), - (r'
.*?
', lambda m: ''), - (r'Full HTML version of this story which may include photos, graphics, and related links.*', - lambda match : ''), - ]] extra_css = ''' h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large} .sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;} diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe new file mode 100644 index 0000000000..7c5138196d --- /dev/null +++ b/recipes/ciekawostki_historyczne.recipe @@ -0,0 +1,48 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Ciekawostki_Historyczne(BasicNewsRecipe): + title = u'Ciekawostki Historyczne' + oldest_article = 7 + __author__ = 'fenuks' + description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.' + category = 'history' + language = 'pl' + masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:

.*?', re.DOTALL), lambda match: '')] + no_stylesheets=True + remove_empty_feeds=True + keep_only_tags=[dict(name='div', attrs={'class':'post'})] + remove_tags=[dict(id='singlepostinfo')] + feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='h7') + if tag: + if tag.br: + pass + elif tag.nextSibling.name=='p': + tag=tag.nextSibling + nexturl = tag.findAll('a') + for nextpage in nexturl: + tag.extract() + nextpage= nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(name='div', attrs={'class':'post'}) + for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}): + r.extract() + for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}): + r.extract() + for r in pagetext.findAll('h1'): + r.extract() + pagetext.find('h6').nextSibling.extract() + pagetext.find('h7').nextSibling.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup + + \ No newline at end of file diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 90b7d63c56..2ec457e4de 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe): description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne' category = 'IT' language = 'pl' + masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 - keep_only_tags=[dict(name='div', attrs={'id':'s'})] + keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})] remove_tags_after=dict(name='div', attrs={'class':'rMobi'}) remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 72f9c966bd..a27a9b0877 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): __licence__ ='GPL v3' category = 'IT' language = 'pl' + masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' encoding = 'utf-8' @@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'
Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...
'), lambda match: '') ] - remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] - keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})] + keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] + remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})] + #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe index b5453659ef..6da7e0240d 100644 --- a/recipes/dziennik_pl.recipe +++ b/recipes/dziennik_pl.recipe @@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe): description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.' category = 'newspaper' language = 'pl' - cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg' + masthead_url= 'http://5.s.dziennik.pl/images/logos.png' + cover_url= 'http://5.s.dziennik.pl/images/logos.png' no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 remove_javascript=True remove_empty_feeds=True - preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')] + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' + preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('

>>> CZYTAJ TAKŻE: ".*?"

'), lambda m: '')] keep_only_tags=[dict(id='article')] - remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})] + remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})] feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'), (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'), (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'), @@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe): (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'), (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')] + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + def append_page(self, soup, appendtag): tag=soup.find('a', attrs={'class':'page_next'}) if tag: @@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) return soup + diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 1c72e5704e..0671deec6c 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - extra_css = '.hdrBig {font-size:22px;}' + remove_empty_feeds=True + extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe new file mode 100644 index 0000000000..f3384263d6 --- /dev/null +++ b/recipes/gameplay_pl.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Gameplay_pl(BasicNewsRecipe): + title = u'Gameplay.pl' + oldest_article = 7 + __author__ = 'fenuks' + description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.' 
+ category = 'games, movies, books, music' + language = 'pl' + masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png' + cover_url= 'http://gameplay.pl/img/gpy_top_logo.png' + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})] + remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})] + feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')] + + def image_url_processor(self, baseurl, url): + if 'http' not in url: + return 'http://gameplay.pl'+ url[2:] + else: + return url diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 0959ff80a3..489caf231f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta Wyborcza' __author__ = 'fenuks' - cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' language = 'pl' description ='news from gazeta.pl' category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX='http://wyborcza.pl' remove_empty_feeds= True oldest_article = 3 @@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe): return url else: return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + + def get_cover_url(self): + soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') + cover=soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) + self.cover_url='http://wyborcza.pl' + soup.img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index d9c461dc63..e188e4988c 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe): language = 'pl' oldest_article = 13 INDEX= 'http://www.gry-online.pl/' - cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png' + masthead_url='http://www.gry-online.pl/im/gry-online-logo.png' + cover_url='http://www.gry-online.pl/im/gry-online-logo.png' max_articles_per_feed = 100 no_stylesheets= True - extra_css = 'p.wn1{font-size:22px;}' - remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})] - keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})] - #remove_tags= [dict(name='div', attrs={'class':['news_plat']})] + keep_only_tags=[dict(name='div', attrs={'class':'gc660'})] + remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] def append_page(self, soup, appendtag): - nexturl = soup.find('a', attrs={'class':'num_str_nex'}) - if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None: - appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n') - if nexturl is not None: - if 'strona' in nexturl.div.string: - nexturl= self.INDEX + nexturl['href'] - soup2 = self.index_to_soup(nexturl) - pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}) - for tag in pagetext: - pos = len(appendtag.contents) - appendtag.insert(pos, tag) - self.append_page(soup2, appendtag) + tag = 
appendtag.find('div', attrs={'class':'n5p'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[1:]: + try: + soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) + except: + soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + pagetext = soup2.find(attrs={'class':'gc660'}) + for r in pagetext.findAll(name='header'): + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + r.extract() def preprocess_html(self, soup): diff --git a/recipes/icons/ciekawostki_historyczne.png b/recipes/icons/ciekawostki_historyczne.png new file mode 100644 index 0000000000..fa0e2c0591 Binary files /dev/null and b/recipes/icons/ciekawostki_historyczne.png differ diff --git a/recipes/icons/gameplay_pl.png b/recipes/icons/gameplay_pl.png new file mode 100644 index 0000000000..1b7081f393 Binary files /dev/null and b/recipes/icons/gameplay_pl.png differ diff --git a/recipes/icons/in4_pl.png b/recipes/icons/in4_pl.png new file mode 100644 index 0000000000..b3351629f0 Binary files /dev/null and b/recipes/icons/in4_pl.png differ diff --git a/recipes/icons/informacje_usa.png b/recipes/icons/informacje_usa.png new file mode 100644 index 0000000000..4c30e3bcbc Binary files /dev/null and b/recipes/icons/informacje_usa.png differ diff --git a/recipes/icons/kresy_pl.png b/recipes/icons/kresy_pl.png new file mode 100644 index 0000000000..db8ef4efec Binary files /dev/null and b/recipes/icons/kresy_pl.png differ diff --git a/recipes/icons/oclab_pl.png b/recipes/icons/oclab_pl.png new file mode 100644 index 0000000000..45ecd2533e Binary files /dev/null and b/recipes/icons/oclab_pl.png differ diff --git a/recipes/icons/overclock_pl.png b/recipes/icons/overclock_pl.png new file mode 100644 index 0000000000..38c0b13bfe Binary files /dev/null and b/recipes/icons/overclock_pl.png differ diff --git a/recipes/icons/palmtop_pl.png b/recipes/icons/palmtop_pl.png new file mode 100644 index 0000000000..d711a41682 Binary files /dev/null and b/recipes/icons/palmtop_pl.png differ diff --git a/recipes/icons/pc_arena.png b/recipes/icons/pc_arena.png new file mode 100644 index 0000000000..10be204b36 Binary files /dev/null and b/recipes/icons/pc_arena.png differ diff --git a/recipes/icons/pc_centre_pl.png b/recipes/icons/pc_centre_pl.png new file mode 100644 index 0000000000..e2fbf1eefb Binary files /dev/null and b/recipes/icons/pc_centre_pl.png differ diff --git a/recipes/icons/pc_foster.png b/recipes/icons/pc_foster.png new file mode 100644 index 0000000000..433970bcc1 Binary files /dev/null and b/recipes/icons/pc_foster.png differ diff --git a/recipes/icons/polska_times.png b/recipes/icons/polska_times.png new file mode 100644 index 0000000000..f233f45518 Binary files /dev/null and b/recipes/icons/polska_times.png differ diff --git a/recipes/icons/pure_pc.png b/recipes/icons/pure_pc.png new file mode 100644 index 0000000000..e5e102eee7 Binary files /dev/null and b/recipes/icons/pure_pc.png differ diff --git a/recipes/icons/tanuki.png b/recipes/icons/tanuki.png new file mode 100644 index 0000000000..fe46d7e8dc Binary files /dev/null and b/recipes/icons/tanuki.png differ diff --git a/recipes/icons/tvn24.png b/recipes/icons/tvn24.png new file mode 100644 index 0000000000..864a6624ac Binary files /dev/null and b/recipes/icons/tvn24.png differ diff --git a/recipes/icons/webhosting_pl.png b/recipes/icons/webhosting_pl.png new file mode 100644 index 
0000000000..0e11a3065e Binary files /dev/null and b/recipes/icons/webhosting_pl.png differ diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe new file mode 100644 index 0000000000..16ad622b46 --- /dev/null +++ b/recipes/in4_pl.recipe @@ -0,0 +1,44 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class in4(BasicNewsRecipe): + title = u'IN4.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Serwis Informacyjny - Aktualnosci, recenzje' + category = 'IT' + language = 'pl' + #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:.*?

', re.DOTALL), lambda match: '')] + keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})] + remove_tags_after= dict(attrs={'class':'tags'}) + remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})] + feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')] diff --git a/recipes/kresy_pl.recipe b/recipes/kresy_pl.recipe new file mode 100644 index 0000000000..3dfc2c057c --- /dev/null +++ b/recipes/kresy_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Kresy(BasicNewsRecipe): + title = u'Kresy' + __author__ = 'fenuks' + description = u'portal społeczności kresowej' + language = 'pl' + masthead_url= 'http://www.kresy.pl/public/img/logo.png' + cover_url= 'http://www.kresy.pl/public/img/logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + keep_only_tags= [dict(id='artykul')] + remove_tags= [dict(attrs={'class':['twitter-share-button', 'likefbborder', 'tagi']})] + feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')] diff --git a/recipes/la_pausa_caffe.recipe b/recipes/la_pausa_caffe.recipe new file mode 100644 index 0000000000..1a87d33dcf --- /dev/null +++ b/recipes/la_pausa_caffe.recipe @@ -0,0 +1,17 @@ +__version__ = 'v1.0' +__date__ = '13, February 2011' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1329125921(BasicNewsRecipe): + title = u'La pausa caff\xe8' + __author__ = 'faber1971' + description = 'An Italian satirical blog' + language = 'it' + + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + no_stylesheets = True + feeds = [(u'La pausa caff\xe8', u'http://feeds.feedburner.com/LapausaCaffe')] + diff --git a/recipes/marketing_magazine.recipe b/recipes/marketing_magazine.recipe index 55b6ea2584..0c14939cd8 100644 --- a/recipes/marketing_magazine.recipe +++ b/recipes/marketing_magazine.recipe @@ -1,4 +1,5 @@ __license__ = 'GPL v3' + from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1327062445(BasicNewsRecipe): @@ -7,10 +8,13 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe): max_articles_per_feed = 100 auto_cleanup = True remove_javascript = True + no_stylesheets = True + remove_tags = [ + dict(name='ul', attrs={'id':'ads0'}) + ] masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg' - feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] __author__ = 'faber1971' - description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)' + description = 'Collection of Italian marketing websites - v1.03 (20, February 2012)' language = 'it' - + feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), 
(u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index 4540879f72..a5bc4e96f9 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -1,16 +1,17 @@ __license__ = 'GPL v3' -__copyright__ = '2009, Mathieu Godlewski ; 2010, Louis Gesbert ' +__copyright__ = '2009, Mathieu Godlewski ; 2010, 2011, Louis Gesbert ' ''' Mediapart ''' -from calibre.ebooks.BeautifulSoup import Tag +import re +from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe class Mediapart(BasicNewsRecipe): title = 'Mediapart' - __author__ = 'Mathieu Godlewski' - description = 'Global news in french from online newspapers' + __author__ = 'Mathieu Godlewski, Louis Gesbert' + description = 'Global news in french from news site Mediapart' oldest_article = 7 language = 'fr' needs_subscription = True @@ -18,52 +19,30 @@ class Mediapart(BasicNewsRecipe): max_articles_per_feed = 50 no_stylesheets = True - cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg' + cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg' feeds = [ ('Les articles', 'http://www.mediapart.fr/articles/feed'), ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# -# preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in -# [ -# (r'', lambda match : '

'+match.group(1)+'

'), -# (r'[^>]+]*>([^<]*)[^<]*', -# lambda match : ''+match.group(1)+''), -# (r'\'', lambda match: '’'), -# ] -# ] -# -# remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}), -# dict(name='div', attrs={'class':'print-links'}), -# dict(name='img', attrs={'src':'entete_article.png'}), -# dict(name='br') ] -# -# def print_version(self, url): -# raw = self.browser.open(url).read() -# soup = BeautifulSoup(raw.decode('utf8', 'replace')) -# div = soup.find('div', {'id':re.compile('node-\d+')}) -# if div is None: -# return None -# article_id = string.replace(div['id'], 'node-', '') -# if article_id is None: -# return None -# return 'http://www.mediapart.fr/print/'+article_id +# -- print-version -# -- Non-print version [dict(name='div', attrs={'class':'advert'})] - - keep_only_tags = [ - dict(name='h1', attrs={'class':'title'}), - dict(name='div', attrs={'class':'page_papier_detail'}), + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in + [ + (r'', lambda match : '

'+match.group(1)+'

'), + (r'\'', lambda match: '’') ] + ] - def preprocess_html(self,soup): - for title in soup.findAll('div', {'class':'titre'}): - tag = Tag(soup, 'h3') - title.replaceWith(tag) - tag.insert(0,title) - return soup + remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] + + def print_version(self, url): + raw = self.browser.open(url).read() + soup = BeautifulSoup(raw.decode('utf8', 'replace')) + link = soup.find('a', {'title':'Imprimer'}) + if link is None: + return None + return link['href'] # -- Handle login @@ -76,4 +55,3 @@ class Mediapart(BasicNewsRecipe): br['pass'] = self.password br.submit() return br - diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe index e4769d58bc..2ae6bc391e 100644 --- a/recipes/naczytniki.recipe +++ b/recipes/naczytniki.recipe @@ -1,8 +1,9 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class naczytniki(BasicNewsRecipe): title = u'naczytniki.pl' __author__ = 'fenuks' + masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' language = 'pl' description ='everything about e-readers' @@ -10,6 +11,7 @@ class naczytniki(BasicNewsRecipe): no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'


Zobacz także:

.*?', re.DOTALL), lambda match: '') ] remove_tags_after= dict(name='div', attrs={'class':'sociable'}) keep_only_tags=[dict(name='div', attrs={'class':'post'})] remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})] diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index 74534f3346..ec556da5fa 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -1,21 +1,33 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe +import re + class Nowa_Fantastyka(BasicNewsRecipe): title = u'Nowa Fantastyka' oldest_article = 7 __author__ = 'fenuks' + __modified_by__ = 'zaslav' language = 'pl' encoding='latin2' description ='site for fantasy readers' category='fantasy' + masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg' + #extra_css='.tytul {font-size: 20px;}' #not working max_articles_per_feed = 100 INDEX='http://www.fantastyka.pl/' no_stylesheets=True needs_subscription = 'optional' - remove_tags_before=dict(attrs={'class':'belka1-tlo-md'}) + remove_tags_before=dict(attrs={'class':'naglowek2'}) #remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'}) - remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'}) - remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})] + remove_tags_after=dict(name='form', attrs={'name':'form1'}) + remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), dict(name='form')] + preprocess_regexps = [ + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + def find_articles(self, url): articles = [] @@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe): return feeds + def get_cover_url(self): - soup = self.index_to_soup('http://www.fantastyka.pl/1.html') - cover=soup.find(name='img', attrs={'class':'okladka'}) - self.cover_url=self.INDEX+ cover['src'] + soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka') + self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href'] return getattr(self, 'cover_url', self.cover_url) def get_browser(self): @@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe): br['pass'] = self.password br.submit() return br + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(font=True): + del item['font'] + for item in soup.findAll(align=True): + del item['align'] + for item in soup.findAll(name='tr'): + item.name='div' + title=soup.find(attrs={'class':'tytul'}) + if title: + title['style']='font-size: 20px; font-weight: bold;' + self.log.warn(soup) + return soup diff --git a/recipes/oclab_pl.recipe b/recipes/oclab_pl.recipe new file mode 100644 index 0000000000..b0df89ba72 --- /dev/null +++ b/recipes/oclab_pl.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OCLab(BasicNewsRecipe): + title = u'OCLab.pl' + oldest_article = 7 + max_articles_per_feed 
= 100 + __author__ = 'fenuks' + description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.' + category = 'IT' + language = 'pl' + cover_url= 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118' + no_stylesheets = True + keep_only_tags=[dict(id='main')] + remove_tags_after= dict(attrs={'class':'single-postmetadata'}) + remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})] + feeds = [(u'Wpisy', u'http://oclab.pl/feed/')] + + + def append_page(self, soup, appendtag): + tag=soup.find(attrs={'class':'contentjumpddl'}) + if tag: + nexturl=tag.findAll('option') + for nextpage in nexturl[1:-1]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(attrs={'class':'single-entry'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}): + r.extract() + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe new file mode 100644 index 0000000000..d7f4c8093d --- /dev/null +++ b/recipes/overclock_pl.recipe @@ -0,0 +1,37 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class Overclock_pl(BasicNewsRecipe): + title = u'Overclock.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).' + category = 'IT' + language = 'pl' + masthead_url='http://www.overclock.pl/gfx/logo_m.png' + cover_url='http://www.overclock.pl/gfx/logo_m.png' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'Komentarze do aktualności:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Nawigacja

', re.DOTALL), lambda match: '') ] + keep_only_tags=[dict(name='div', attrs={'class':'news'}), dict(id='articleContent')] + remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})] + feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')] + + + def append_page(self, soup, appendtag): + tag=soup.find(id='navigation') + if tag: + nexturl=tag.findAll('option') + tag.extract() + for nextpage in nexturl[2:]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(id='content') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(attrs={'alt':'Pierwsza'}) + if rem: + rem.parent.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe new file mode 100644 index 0000000000..ace772e7e7 --- /dev/null +++ b/recipes/palmtop_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class palmtop_pl(BasicNewsRecipe): + title = u'Palmtop.pl' + __author__ = 'fenuks' + description = 'wortal technologii mobilnych' + category = 'mobile' + language = 'pl' + cover_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + masthead_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + + feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')] diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe new file mode 100644 index 0000000000..faefeb25c0 --- /dev/null +++ b/recipes/pc_arena.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Arena(BasicNewsRecipe): + title = u'PCArena' + oldest_article = 18300 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.' + category = 'IT' + language = 'pl' + masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif' + cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif' + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})] + remove_tags=[dict(attrs={'class':'pages'})] + feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pagNum'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[1:]: + nextpage= 'http://pcarena.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(attrs={'class':'artBody'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe new file mode 100644 index 0000000000..68a17888ce --- /dev/null +++ b/recipes/pc_centre_pl.recipe @@ -0,0 +1,41 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Centre(BasicNewsRecipe): + title = u'PC Centre' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. 
a także opisy produktów związanych z komputerami.' + category = 'IT' + language = 'pl' + masthead_url= 'http://pccentre.pl/views/images/logo.gif' + cover_url= 'http://pccentre.pl/views/images/logo.gif' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')] + feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n§ion=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n§ion=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n§ion=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n§ion=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n§ion=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n§ion=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n§ion=9')] + + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pages'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[:-1]: + nextpage= 'http://pccentre.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(id='content') + rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']}) + for r in rem: + r.extract() + rem=pagetext.findAll(id='comments') + for r in rem: + r.extract() + rem=pagetext.findAll('h1') + for r in rem: + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_foster.recipe b/recipes/pc_foster.recipe new file mode 100644 index 0000000000..ab8c2b66b1 --- /dev/null +++ b/recipes/pc_foster.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Foster(BasicNewsRecipe): + title = u'PC Foster' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.' 
+ category = 'IT' + language = 'pl' + masthead_url='http://pcfoster.pl/public/images/logo.png' + cover_url= 'http://pcfoster.pl/public/images/logo.png' + no_stylesheets= True + remove_empty_feeds= True + keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})] + remove_tags=[dict(name='p', attrs={'class':'right'})] + feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'alt':u'Następna strona'}) + if nexturl: + appendtag.find(attrs={'class':'pager more_top'}).extract() + while nexturl: + nexturl='http://pcfoster.pl' + nexturl.parent['href'] + soup2 = self.index_to_soup(nexturl) + nexturl=soup2.find(attrs={'alt':u'Następna strona'}) + pagetext = soup2.find(attrs={'class':'content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'review_content double'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/polska_times.recipe b/recipes/polska_times.recipe new file mode 100644 index 0000000000..4126576fe2 --- /dev/null +++ b/recipes/polska_times.recipe @@ -0,0 +1,81 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Polska_times(BasicNewsRecipe): + title = u'Polska Times' + __author__ = 'fenuks' + description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.' + category = 'newspaper' + language = 'pl' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17' + oldest_article = 7 + max_articles_per_feed = 100 + remove_emty_feeds= True + no_stylesheets = True + preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TEŻ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ WIĘCEJ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TAKŻE:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: ''), (re.compile(ur'Nasze serwisy:.*', re.DOTALL), lambda match: '') ] + keep_only_tags= [dict(id=['tytul-artykulu', 'kontent'])] + remove_tags_after= dict(id='material-tagi') + remove_tags=[dict(attrs={'id':'reklama_srodtekst_0'}), dict(attrs={'id':'material-tagi'}), dict(name='div', attrs={'class':'zakladki'}), dict(attrs={'title':u'CZYTAJ TAKŻE'}), dict(attrs={'id':'podobne'}), dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'})] + feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'), (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'), (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'), (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'), (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'), (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'), (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')] + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def append_page(self, soup, appendtag): + nexturl=soup.find(id='nastepna_strona') + while nexturl: + soup2= self.index_to_soup(nexturl['href']) + nexturl=soup2.find(id='nastepna_strona') + pagetext = soup2.find(id='tresc') + for dictionary in self.remove_tags: + v=pagetext.findAll(attrs=dictionary['attrs']) + for delete in v: + delete.extract() + for b in pagetext.findAll(name='b'): + if b.string: + if u'CZYTAJ TEŻ' in b.string or u'Czytaj także' in b.string or u'Czytaj też' in b.string or u'Zobacz także' in b.string: + b.extract() + for center in pagetext.findAll(name='center'): + if center.h4: + if center.h4.a: + center.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def image_article(self, soup, appendtag): + nexturl=soup.find('a', attrs={'class':'nastepna'}) + urls=[] + while nexturl: + if nexturl not in urls: + urls.append(nexturl) + else: + break + soup2= self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href']) + nexturl=soup2.find('a', attrs={'class':'nastepna'}) + if nexturl in urls: + break; + pagetext = soup2.find(id='galeria-material') + pos = len(appendtag.contents) + appendtag.insert(pos, '
') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for rem in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}): + rem.extract() + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def preprocess_html(self, soup): + if soup.find('a', attrs={'class':'nastepna'}): + self.image_article(soup, soup.body) + elif soup.find(id='nastepna_strona'): + self.append_page(soup, soup.body) + return soup + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe new file mode 100644 index 0000000000..7a6c43bb7e --- /dev/null +++ b/recipes/pure_pc.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PurePC(BasicNewsRecipe): + title = u'PurePC' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.' + category = 'IT' + language = 'pl' + masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags_after= dict(attrs={'class':'fivestar-widget'}) + remove_tags= [dict(id='navigator'), dict(attrs={'class':['box-tools', 'fivestar-widget', 'PageMenuList']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'pager-next'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href']) + nexturl=soup2.find(attrs={'class':'pager-next'}) + pagetext = soup2.find(attrs={'class':'article'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index d06e32d9af..f4c1efa9b8 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -1,14 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Tablety_pl(BasicNewsRecipe): title = u'Tablety.pl' __author__ = 'fenuks' description = u'tablety.pl - latest tablet news' + masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'}) remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})] diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe new file mode 100644 index 0000000000..666cb8aa77 --- /dev/null +++ b/recipes/tanuki.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class tanuki(BasicNewsRecipe): + title = u'Tanuki' + oldest_article = 7 + __author__ = 'fenuks' + category = 'anime, manga' + language = 'pl' + max_articles_per_feed = 100 + encoding='utf-8' + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}' + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'', re.DOTALL), lambda match: '')] + remove_empty_feeds= True + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['animename', 'storyname', 'nextarrow','sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={'summary':'Technikalia'}), dict(attrs={'class':['chaptername','copycat']}), dict(id='rightcolumn'), dict(attrs={'class':['headn_tt', 'subtable']})] + remove_tags=[dict(name='div', attrs={'class':'screen'}), dict(id='randomtoplist'), dict(attrs={'class':'note'})] + feeds = [(u'Anime', u'http://anime.tanuki.pl/rss_anime.xml'), (u'Manga', u'http://manga.tanuki.pl/rss_manga.xml'), (u'Tomiki', u'http://manga.tanuki.pl/rss_mangabooks.xml'), (u'Artyku\u0142y', u'http://czytelnia.tanuki.pl/rss_czytelnia_artykuly.xml'), (u'Opowiadania', u'http://czytelnia.tanuki.pl/rss_czytelnia.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'nextarrow'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://czytelnia.tanuki.pl'+ nexturl['href']) + nexturl=soup2.find(attrs={'class':'nextarrow'}) + pagetext = soup2.find(attrs={'class':['chaptername', 'copycat']}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'copycat'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'nextarrow'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index 5699ec106c..80b37f329a 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,49 +1,57 @@ import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag +from calibre.web.feeds.recipes import BasicNewsRecipe -class AdvancedUserRecipe1268409464(BasicNewsRecipe): - title = u'The Sun' - __author__ = 'Chaz Ralph' - description = 'News from The Sun' +class AdvancedUserRecipe1325006965(BasicNewsRecipe): + + title = u'The Sun UK' + cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' + + description = 'A Recipe for The Sun tabloid UK - uses feed43' + __author__ = 'Dave Asbury' + # last updated 20/2/12 + language = 'en_GB' oldest_article = 1 - max_articles_per_feed = 100 - language = 'en' + max_articles_per_feed = 15 + remove_empty_feeds = True no_stylesheets = True - extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' - encoding= 'iso-8859-1' - remove_javascript = True + + masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif' + encoding = 'cp1251' + + encoding = 'cp1252' + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + extra_css = ''' + body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} + ''' + + preprocess_regexps = [ + (re.compile(r'