diff --git a/Changelog.yaml b/Changelog.yaml index a7fc86c98e..75bea147cb 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,68 @@ # new recipes: # - title: +- version: 0.8.41 + date: 2012-02-24 + + new features: + - title: "Driver for Sony Experia Play 4G" + tickets: [938831] + + - title: "News download system: Allow use of __future__ in recipes, and do not change line numbers of code in the recipe when compiling it" + + - title: "Use the My Documents folder as the default location for the Calibre Library folder on first start in windows" + tickets: [934840] + + - title: "Add a tweak to Preferences->Tweaks to control the order in which categories appear in the Tag Browser" + + - title: "Tag Browser: Add an entry to the right click menu to quickly delete tags" + tickets: [934509] + + - title: "Amazon metadata download: Try to scrape series information from the amazon details page. Note that currently very few books have series info available. Often the page for hardcover will have series, but the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it ends up using." + + - title: "Content server: Add favicon to OPDS feeds." + tickets: [934731] + + bug fixes: + - title: "RTF Input: Fix some WMF images embedded in RTF files being distorted on conversion." + tickets: [934167] + + - title: "Fix long standing bug preventing calibre from working on east asian windows installs when the user name in windows has non-ascii characters" + tickets: [937389] + + - title: "Get Books: Fix Baen Webscription and O'Reilly stores. Fix price detection for Google Books" + + - title: "MOBI Output: When the same anchor is present more than once in the input document, use the first occurrence rather than the last one." + tickets: [934031] + + - title: "Use the 'default cover font' tweak when generating default masthead images as well" + tickets: [939256] + + - title: "Fix content server does not correctly display custom field of type 'rating'" + tickets: [938303] + + - title: "Fix welcome wizard does not save send-from email info unless send-to field is filled" + tickets: [937087] + + - title: "When reading metadata from odt files, use initial-creator in preference to creator for setting the author field" + tickets: [934564] + + - title: "Fix conversion erroring out when the input document has very long and thin images" + tickets: [935234] + + improved recipes: + - The Sun + - Various Polish news sources + - Mediapart + + new recipes: + - title: La pausa caffe + author: faber1971 + + - title: Various Polish news sources + author: fenuks + + - version: 0.8.40 date: 2012-02-17 diff --git a/imgsrc/calibreSymbols.spd b/imgsrc/calibreSymbols.spd new file mode 100644 index 0000000000..1ef6f532c5 --- /dev/null +++ b/imgsrc/calibreSymbols.spd @@ -0,0 +1,152 @@ +SplineFontDB: 3.0 +FontName: calibreSymbols +FullName: calibre Symbols +FamilyName: calibre Symbols +Weight: Medium +Copyright: Created by Kovid Goyal with FontForge 2.0 (http://fontforge.sf.net) +UComments: "2012-2-27: Created." +Version: 001.000 +ItalicAngle: 0 +UnderlinePosition: -100 +UnderlineWidth: 50 +Ascent: 800 +Descent: 200 +LayerCount: 2 +Layer: 0 0 "Back" 1 +Layer: 1 0 "Fore" 0 +NeedsXUIDChange: 1 +XUID: [1021 913 325894820 11538708] +FSType: 0 +OS2Version: 0 +OS2_WeightWidthSlopeOnly: 0 +OS2_UseTypoMetrics: 1 +CreationTime: 1330331997 +ModificationTime: 1330487767 +OS2TypoAscent: 0 +OS2TypoAOffset: 1 +OS2TypoDescent: 0 +OS2TypoDOffset: 1 +OS2TypoLinegap: 90 +OS2WinAscent: 0 +OS2WinAOffset: 1 +OS2WinDescent: 0 +OS2WinDOffset: 1 +HheadAscent: 0 +HheadAOffset: 1 +HheadDescent: 0 +HheadDOffset: 1 +MarkAttachClasses: 1 +DEI: 91125 +Encoding: UnicodeFull +UnicodeInterp: none +NameList: Adobe Glyph List +DisplaySize: -24 +AntiAlias: 1 +FitToEm: 1 +WidthSeparation: 150 +WinInfo: 9600 75 22 +BeginPrivate: 0 +EndPrivate +BeginChars: 1114112 3 + +StartChar: uni2605 +Encoding: 9733 9733 0 +Width: 979 +VWidth: -26 +Flags: W +LayerCount: 2 +Fore +SplineSet +551.923 352.862 m 1 + 749.497 369.592 l 2 + 804.954 374.123 833.379 376.389 834.765 376.389 c 0 + 852.095 376.389 860.761 368.896 860.761 353.907 c 0 + 860.761 347.981 859.028 343.363 855.562 340.052 c 0 + 852.095 336.74 825.578 319.225 776.012 287.506 c 2 + 609.635 180.323 l 1 + 716.22 -88.417 l 2 + 717.606 -91.2051 718.301 -95.3877 718.301 -100.965 c 0 + 718.301 -106.193 716.394 -110.725 712.58 -114.558 c 0 + 708.769 -118.393 704.608 -120.31 700.104 -120.31 c 0 + 695.943 -120.31 691.61 -118.828 687.103 -115.866 c 0 + 682.598 -112.902 658.162 -92.251 613.795 -53.9082 c 2 + 466.134 74.71 l 1 + 320.554 -51.8184 l 2 + 274.802 -91.5547 249.758 -112.902 245.426 -115.866 c 0 + 241.092 -118.828 236.846 -120.31 232.688 -120.31 c 0 + 227.835 -120.31 223.415 -118.306 219.429 -114.297 c 0 + 215.442 -110.289 213.449 -105.844 213.449 -100.965 c 0 + 213.449 -97.8281 223.329 -71.3379 243.087 -21.4932 c 2 + 322.115 180.323 l 1 + 152.618 289.598 l 2 + 104.783 320.271 79.2217 337.176 75.9297 340.313 c 0 + 72.6357 343.45 70.9893 347.981 70.9893 353.907 c 0 + 70.9893 369.243 79.8291 376.912 97.5059 376.912 c 0 + 98.8926 376.912 123.155 374.82 170.296 370.638 c 2 + 379.825 352.862 l 1 + 427.14 555.201 l 2 + 439.271 607.834 446.811 636.764 449.757 641.992 c 0 + 452.702 647.221 458.162 649.834 466.134 649.834 c 0 + 474.454 649.834 480 646.96 482.772 641.208 c 0 + 485.545 635.457 493.518 604.173 506.689 547.357 c 2 + 551.923 352.862 l 1 +EndSplineSet +Validated: 524289 +EndChar + +StartChar: zero +Encoding: 48 48 1 +Width: 1303 +VWidth: 2048 +Flags: W +HStem: -43.3789 76.7998<582.097 721.09> 623.341 76.7998<582.097 721.091> +VStem: 403.82 97.4395<148.044 508.66> 802.221 96.959<148.044 508.659> +LayerCount: 2 +Fore +SplineSet +651.5 623.341 m 0 + 601.58 623.341 564.061 598.78 538.939 549.66 c 0 + 513.82 500.541 501.26 426.7 501.26 328.141 c 0 + 501.26 229.9 513.82 156.221 538.939 107.101 c 0 + 564.061 57.9805 601.58 33.4209 651.5 33.4209 c 0 + 701.74 33.4209 739.42 57.9805 764.54 107.101 c 0 + 789.66 156.221 802.221 229.9 802.221 328.141 c 0 + 802.221 426.7 789.66 500.541 764.54 549.66 c 0 + 739.42 598.78 701.74 623.341 651.5 623.341 c 0 +651.5 700.141 m 0 + 731.82 700.141 793.18 668.38 835.58 604.859 c 0 + 877.979 541.341 899.18 449.101 899.18 328.141 c 0 + 899.18 207.5 877.979 115.421 835.58 51.9004 c 0 + 793.18 -11.6201 731.819 -43.3789 651.5 -43.3789 c 0 + 571.18 -43.3789 509.82 -11.6201 467.42 51.9004 c 0 + 425.021 115.421 403.82 207.5 403.82 328.141 c 0 + 403.82 449.101 425.021 541.341 467.42 604.859 c 0 + 509.82 668.38 571.18 700.141 651.5 700.141 c 0 +EndSplineSet +Validated: 1 +EndChar + +StartChar: period +Encoding: 46 46 2 +Width: 516 +VWidth: 2048 +Flags: W +HStem: 53.4004 166.199<203.263 309.297> +VStem: 174.6 163.801<82.9501 190.955> +LayerCount: 2 +Fore +SplineSet +338.4 142.8 m 0 + 338.4 119.2 330.5 98.4004 314.7 80.4004 c 0 + 298.9 62.4004 277 53.4004 249 53.4004 c 0 + 225.4 53.4004 207.1 61.2002 194.1 76.7998 c 0 + 181.1 92.4004 174.6 111 174.6 132.6 c 0 + 174.6 155.8 182.6 176.1 198.6 193.5 c 0 + 214.6 210.9 236.8 219.6 265.2 219.6 c 0 + 288.8 219.6 306.9 212.2 319.5 197.4 c 0 + 332.1 182.6 338.4 164.4 338.4 142.8 c 0 +EndSplineSet +Validated: 1 +EndChar +EndChars +EndSplineFont diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe index 3c93d3644f..e121ba4d42 100644 --- a/recipes/archeowiesci.recipe +++ b/recipes/archeowiesci.recipe @@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe): language = 'pl' cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' oldest_article = 7 + needs_subscription='optional' max_articles_per_feed = 100 auto_cleanup = True remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})] @@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe): feeds = BasicNewsRecipe.parse_feeds(self) for feed in feeds: for article in feed.articles[:]: - if 'subskrypcja' in article.title: + if self.username is None and 'subskrypcja' in article.title: feed.articles.remove(article) return feeds + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://archeowiesci.pl/wp-login.php') + br.select_form(name='loginform') + br['log'] = self.username + br['pwd'] = self.password + br.submit() + return br \ No newline at end of file diff --git a/recipes/astronomia_pl.recipe b/recipes/astronomia_pl.recipe index a142520ec5..89a0e4c889 100644 --- a/recipes/astronomia_pl.recipe +++ b/recipes/astronomia_pl.recipe @@ -1,15 +1,18 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Astronomia_pl(BasicNewsRecipe): title = u'Astronomia.pl' __author__ = 'fenuks' description = 'Astronomia - polish astronomy site' + masthead_url = 'http://www.astronomia.pl/grafika/logo.gif' cover_url = 'http://www.astronomia.pl/grafika/logo.gif' category = 'astronomy, science' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - #no_stylesheets=True + extra_css='#h2 {font-size: 18px;}' + no_stylesheets=True + preprocess_regexps = [(re.compile(ur'Przeczytaj także:.*?', re.DOTALL), lambda match: '') ] remove_tags_before=dict(name='div', attrs={'id':'a1'}) keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})] feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')] diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index d5b4997aa7..cc74cc9128 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' description = u'benchmark.pl -IT site' + masthead_url = 'http://www.benchmark.pl/i/logo-footer.png' cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets=True - preprocess_regexps = [(re.compile(ur'\bWięcej o .*', re.DOTALL|re.IGNORECASE), lambda match: '')] + preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/biolog_pl.recipe b/recipes/biolog_pl.recipe index af9ad77e44..b10bf0d925 100644 --- a/recipes/biolog_pl.recipe +++ b/recipes/biolog_pl.recipe @@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe): description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.' category = 'biology' language = 'pl' + masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png' cover_url='http://www.biolog.pl/naukowy,portal,biolog.png' no_stylesheets = True #keeps_only_tags=[dict(id='main')] remove_tags_before=dict(id='main') remove_tags_after=dict(name='a', attrs={'name':'komentarze'}) - remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})] + remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})] feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')] diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe index b4cf6b326c..ff46774dc9 100644 --- a/recipes/cd_action.recipe +++ b/recipes/cd_action.recipe @@ -1,16 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe - class CD_Action(BasicNewsRecipe): title = u'CD-Action' __author__ = 'fenuks' - description = 'cdaction.pl - polish magazine about games site' + description = 'cdaction.pl - polish games magazine site' category = 'games' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG' keep_only_tags= dict(id='news_content') remove_tags_after= dict(name='div', attrs={'class':'tresc'}) feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')] + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.cdaction.pl/magazyn/') + self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe index 591155ff85..673a9f940b 100644 --- a/recipes/cgm_pl.recipe +++ b/recipes/cgm_pl.recipe @@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'Codzienna Gazeta Muzyczna' + masthead_url='http://www.cgm.pl/img/header/logo.gif' cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg' category = 'music' language = 'pl' @@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe): def preprocess_html(self, soup): + gallery=soup.find('div', attrs={'class':'galleryFlash'}) + if gallery: + img=gallery.div + gallery.img.extract() + if img: + img=img['style'] + img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')] + gallery.contents[1].name='img' + gallery.contents[1]['src']=img for item in soup.findAll(style=True): del item['style'] ad=soup.findAll('a') for r in ad: - if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']: + if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']: r.extract() - gallery=soup.find('div', attrs={'class':'galleryFlash'}) - if gallery: - img=gallery.find('embed') - if img: - img=img['src'][35:] - img='http://www.cgm.pl/_vault/_gallery/_photo/'+img - param=gallery.findAll(name='param') - for i in param: - i.extract() - gallery.contents[1].name='img' - gallery.contents[1]['src']=img return soup \ No newline at end of file diff --git a/recipes/chicago_tribune.recipe b/recipes/chicago_tribune.recipe index a5ec8f0743..684993e251 100644 --- a/recipes/chicago_tribune.recipe +++ b/recipes/chicago_tribune.recipe @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import urllib, re from calibre.web.feeds.news import BasicNewsRecipe class ChicagoTribune(BasicNewsRecipe): @@ -77,10 +78,17 @@ class ChicagoTribune(BasicNewsRecipe): def get_article_url(self, article): - url = article.get('feedburner_origlink', article.get('guid', article.get('link'))) - if url.endswith('?track=rss'): - url = url.partition('?')[0] - return url + ans = None + try: + s = article.summary + ans = urllib.unquote( + re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) + except: + pass + if ans is None: + ans = article.get('feedburner_origlink', article.get('guid', article.get('link'))) + if ans is not None: + return ans.replace('?track=rss', '') def skip_ad_pages(self, soup): text = soup.find(text='click here to continue to article') diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 6f41b95763..50b626fcbf 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + requires_version = (0, 8, 39) + + def preprocess_raw_html(self, raw, url): + try: + from html5lib import parse + root = parse(raw, namespaceHTMLElements=False, + treebuilder='lxml').getroot() + from lxml import etree + for tag in root.xpath( + '//script|//style|//noscript|//meta|//link|//object'): + tag.getparent().remove(tag) + for elem in list(root.iterdescendants(tag=etree.Comment)): + elem.getparent().remove(elem) + ans = etree.tostring(root, encoding=unicode) + ans = re.sub('.*', lambda match : ''), - (r'
.*?
', lambda m: ''), - (r'Full HTML version of this story which may include photos, graphics, and related links.*', - lambda match : ''), - ]] extra_css = ''' h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large} .sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;} diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe new file mode 100644 index 0000000000..7c5138196d --- /dev/null +++ b/recipes/ciekawostki_historyczne.recipe @@ -0,0 +1,48 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Ciekawostki_Historyczne(BasicNewsRecipe): + title = u'Ciekawostki Historyczne' + oldest_article = 7 + __author__ = 'fenuks' + description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.' + category = 'history' + language = 'pl' + masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:

.*?', re.DOTALL), lambda match: '')] + no_stylesheets=True + remove_empty_feeds=True + keep_only_tags=[dict(name='div', attrs={'class':'post'})] + remove_tags=[dict(id='singlepostinfo')] + feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='h7') + if tag: + if tag.br: + pass + elif tag.nextSibling.name=='p': + tag=tag.nextSibling + nexturl = tag.findAll('a') + for nextpage in nexturl: + tag.extract() + nextpage= nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(name='div', attrs={'class':'post'}) + for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}): + r.extract() + for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}): + r.extract() + for r in pagetext.findAll('h1'): + r.extract() + pagetext.find('h6').nextSibling.extract() + pagetext.find('h7').nextSibling.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup + + \ No newline at end of file diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 90b7d63c56..2ec457e4de 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe): description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne' category = 'IT' language = 'pl' + masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 - keep_only_tags=[dict(name='div', attrs={'id':'s'})] + keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})] remove_tags_after=dict(name='div', attrs={'class':'rMobi'}) remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 72f9c966bd..a27a9b0877 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): __licence__ ='GPL v3' category = 'IT' language = 'pl' + masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' encoding = 'utf-8' @@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'
Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...
'), lambda match: '') ] - remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] - keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})] + keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] + remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})] + #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe index b5453659ef..6da7e0240d 100644 --- a/recipes/dziennik_pl.recipe +++ b/recipes/dziennik_pl.recipe @@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe): description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.' category = 'newspaper' language = 'pl' - cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg' + masthead_url= 'http://5.s.dziennik.pl/images/logos.png' + cover_url= 'http://5.s.dziennik.pl/images/logos.png' no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 remove_javascript=True remove_empty_feeds=True - preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')] + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' + preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('

>>> CZYTAJ TAKŻE: ".*?"

'), lambda m: '')] keep_only_tags=[dict(id='article')] - remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})] + remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})] feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'), (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'), (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'), @@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe): (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'), (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')] + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + def append_page(self, soup, appendtag): tag=soup.find('a', attrs={'class':'page_next'}) if tag: @@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) return soup + diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 1c72e5704e..0671deec6c 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - extra_css = '.hdrBig {font-size:22px;}' + remove_empty_feeds=True + extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe new file mode 100644 index 0000000000..f3384263d6 --- /dev/null +++ b/recipes/gameplay_pl.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Gameplay_pl(BasicNewsRecipe): + title = u'Gameplay.pl' + oldest_article = 7 + __author__ = 'fenuks' + description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.' + category = 'games, movies, books, music' + language = 'pl' + masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png' + cover_url= 'http://gameplay.pl/img/gpy_top_logo.png' + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})] + remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})] + feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')] + + def image_url_processor(self, baseurl, url): + if 'http' not in url: + return 'http://gameplay.pl'+ url[2:] + else: + return url diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 0959ff80a3..489caf231f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta Wyborcza' __author__ = 'fenuks' - cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' language = 'pl' description ='news from gazeta.pl' category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX='http://wyborcza.pl' remove_empty_feeds= True oldest_article = 3 @@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe): return url else: return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + + def get_cover_url(self): + soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') + cover=soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) + self.cover_url='http://wyborcza.pl' + soup.img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index d9c461dc63..e188e4988c 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe): language = 'pl' oldest_article = 13 INDEX= 'http://www.gry-online.pl/' - cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png' + masthead_url='http://www.gry-online.pl/im/gry-online-logo.png' + cover_url='http://www.gry-online.pl/im/gry-online-logo.png' max_articles_per_feed = 100 no_stylesheets= True - extra_css = 'p.wn1{font-size:22px;}' - remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})] - keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})] - #remove_tags= [dict(name='div', attrs={'class':['news_plat']})] + keep_only_tags=[dict(name='div', attrs={'class':'gc660'})] + remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] def append_page(self, soup, appendtag): - nexturl = soup.find('a', attrs={'class':'num_str_nex'}) - if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None: - appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n') - if nexturl is not None: - if 'strona' in nexturl.div.string: - nexturl= self.INDEX + nexturl['href'] - soup2 = self.index_to_soup(nexturl) - pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}) - for tag in pagetext: - pos = len(appendtag.contents) - appendtag.insert(pos, tag) - self.append_page(soup2, appendtag) + tag = appendtag.find('div', attrs={'class':'n5p'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[1:]: + try: + soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) + except: + soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + pagetext = soup2.find(attrs={'class':'gc660'}) + for r in pagetext.findAll(name='header'): + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + r.extract() def preprocess_html(self, soup): diff --git a/recipes/icons/ciekawostki_historyczne.png b/recipes/icons/ciekawostki_historyczne.png new file mode 100644 index 0000000000..fa0e2c0591 Binary files /dev/null and b/recipes/icons/ciekawostki_historyczne.png differ diff --git a/recipes/icons/gameplay_pl.png b/recipes/icons/gameplay_pl.png new file mode 100644 index 0000000000..1b7081f393 Binary files /dev/null and b/recipes/icons/gameplay_pl.png differ diff --git a/recipes/icons/in4_pl.png b/recipes/icons/in4_pl.png new file mode 100644 index 0000000000..b3351629f0 Binary files /dev/null and b/recipes/icons/in4_pl.png differ diff --git a/recipes/icons/informacje_usa.png b/recipes/icons/informacje_usa.png new file mode 100644 index 0000000000..4c30e3bcbc Binary files /dev/null and b/recipes/icons/informacje_usa.png differ diff --git a/recipes/icons/kresy_pl.png b/recipes/icons/kresy_pl.png new file mode 100644 index 0000000000..db8ef4efec Binary files /dev/null and b/recipes/icons/kresy_pl.png differ diff --git a/recipes/icons/mediapart.png b/recipes/icons/mediapart.png new file mode 100644 index 0000000000..ab489d3db7 Binary files /dev/null and b/recipes/icons/mediapart.png differ diff --git a/recipes/icons/oclab_pl.png b/recipes/icons/oclab_pl.png new file mode 100644 index 0000000000..45ecd2533e Binary files /dev/null and b/recipes/icons/oclab_pl.png differ diff --git a/recipes/icons/overclock_pl.png b/recipes/icons/overclock_pl.png new file mode 100644 index 0000000000..38c0b13bfe Binary files /dev/null and b/recipes/icons/overclock_pl.png differ diff --git a/recipes/icons/palmtop_pl.png b/recipes/icons/palmtop_pl.png new file mode 100644 index 0000000000..d711a41682 Binary files /dev/null and b/recipes/icons/palmtop_pl.png differ diff --git a/recipes/icons/pc_arena.png b/recipes/icons/pc_arena.png new file mode 100644 index 0000000000..10be204b36 Binary files /dev/null and b/recipes/icons/pc_arena.png differ diff --git a/recipes/icons/pc_centre_pl.png b/recipes/icons/pc_centre_pl.png new file mode 100644 index 0000000000..e2fbf1eefb Binary files /dev/null and b/recipes/icons/pc_centre_pl.png differ diff --git a/recipes/icons/pc_foster.png b/recipes/icons/pc_foster.png new file mode 100644 index 0000000000..433970bcc1 Binary files /dev/null and b/recipes/icons/pc_foster.png differ diff --git a/recipes/icons/polska_times.png b/recipes/icons/polska_times.png new file mode 100644 index 0000000000..f233f45518 Binary files /dev/null and b/recipes/icons/polska_times.png differ diff --git a/recipes/icons/pure_pc.png b/recipes/icons/pure_pc.png new file mode 100644 index 0000000000..e5e102eee7 Binary files /dev/null and b/recipes/icons/pure_pc.png differ diff --git a/recipes/icons/racjonalista_pl.png b/recipes/icons/racjonalista_pl.png new file mode 100644 index 0000000000..8f4d3c6c81 Binary files /dev/null and b/recipes/icons/racjonalista_pl.png differ diff --git a/recipes/icons/rue89.png b/recipes/icons/rue89.png new file mode 100644 index 0000000000..55c52bc488 Binary files /dev/null and b/recipes/icons/rue89.png differ diff --git a/recipes/icons/tanuki.png b/recipes/icons/tanuki.png new file mode 100644 index 0000000000..fe46d7e8dc Binary files /dev/null and b/recipes/icons/tanuki.png differ diff --git a/recipes/icons/tvn24.png b/recipes/icons/tvn24.png new file mode 100644 index 0000000000..864a6624ac Binary files /dev/null and b/recipes/icons/tvn24.png differ diff --git a/recipes/icons/webhosting_pl.png b/recipes/icons/webhosting_pl.png new file mode 100644 index 0000000000..0e11a3065e Binary files /dev/null and b/recipes/icons/webhosting_pl.png differ diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe new file mode 100644 index 0000000000..16ad622b46 --- /dev/null +++ b/recipes/in4_pl.recipe @@ -0,0 +1,44 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class in4(BasicNewsRecipe): + title = u'IN4.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Serwis Informacyjny - Aktualnosci, recenzje' + category = 'IT' + language = 'pl' + #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:.*?

', re.DOTALL), lambda match: '')] + keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})] + remove_tags_after= dict(attrs={'class':'tags'}) + remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})] + feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')] diff --git a/recipes/instapaper.recipe b/recipes/instapaper.recipe index d182e556a2..40992e4d75 100644 --- a/recipes/instapaper.recipe +++ b/recipes/instapaper.recipe @@ -1,8 +1,9 @@ +#v2 2011-07-25 from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1299694372(BasicNewsRecipe): title = u'Instapaper' - __author__ = 'Darko Miletic' + __author__ = 'Darko Miletic, Stanislav Khromov' publisher = 'Instapaper.com' category = 'info, custom, Instapaper' oldest_article = 365 @@ -15,6 +16,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): ,dict(name='div', attrs={'id':'text_controls'}) ,dict(name='div', attrs={'id':'editing_controls'}) ,dict(name='div', attrs={'class':'bar bottom'}) + ,dict(name='div', attrs={'id':'controlbar_container'}) + ,dict(name='div', attrs={'id':'footer'}) ] use_embedded_content = False needs_subscription = True diff --git a/recipes/japaa.recipe b/recipes/japaa.recipe new file mode 100644 index 0000000000..93fd3e0cce --- /dev/null +++ b/recipes/japaa.recipe @@ -0,0 +1,99 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1330393641(BasicNewsRecipe): + title = u'JAAPA' + __author__ = 'adoucette' + language = 'en' + oldest_article = 30 + max_articles_per_feed = 100 + auto_cleanup = True + + def get_cover_url(self): + cover_url = None + soup = self.index_to_soup('http://www.jaapa.com') + cover_item = soup.find('img', src=re.compile(r'\w*?cover\w{1,22}\.jpg')) + if cover_item: + cover_url = cover_item['src'] + return cover_url + + feeds = [ + (u'CME Articles', + u'http://feeds.feedburner.com/jaapacmearticles'), + (u'A Day in the Life', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=490'), + (u'Ask A Librarian', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=847'), + (u'Case of the Month', + u'http://feeds.feedburner.com/jaapacaseofthemonth'), + (u'Clinical Watch', + u'http://feeds.feedburner.com/jaapaclinicalwatch'), + (u'Commentary', + u'http://feeds.feedburner.com/jaapacommentary'), + (u'Critically Appraised Topic', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=699'), + (u'Dermatology Digest', + u'http://feeds.feedburner.com/jaapadermatologydigest'), + (u'Diagnostic Imaging Review', + u'http://feeds.feedburner.com/jaapadiagnosticimagingreview'), + (u'Editorial', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=759'), + (u'From the Academy', + u'http://feeds.feedburner.com/jaapafromtheacademy'), + (u'Genomics in PA Practice', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=760'), + (u'Humane Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=758'), + (u'Inside the AAPA Policy Manual', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1546'), + (u'Interpreting ECGs', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1624'), + (u'Letters', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=808'), + (u'PA Quandaries', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=496'), + (u'Pharmacology Consult', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1614'), + (u'POEMs', u'http://feeds.feedburner.com/jaapapoems'), + (u'Quick Recertification', + u'http://feeds.feedburner.com/jaapaquickrecertificationseries'), + (u'Sounding Board', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=698'), + (u'The Surgical Patient', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=499'), + (u'Topics in Infectious Diseases', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2495'), + (u"What's New", u'http://feeds.feedburner.com/jaapawhatsnew'), + (u'When the Patient Asks', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=501'), + (u"Women's Health", + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2176'), + (u'AAPA Special Article', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1453'), + (u'Case Reports', + u'http://feeds.feedburner.com/jaapacasereports'), + (u'Review Articles', + u'http://feeds.feedburner.com/jaapareviewarticles'), + (u'Surgical Reviews', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=505'), + (u'Brief Report', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2353'), + (u'Research Corner', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=498'), + (u'Research Reports', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1024'), + (u'The Art of Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1289'), + (u'Clinical Practice Guidelines', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2102'), + (u'Complementary and Alternative Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2123'), + (u'Drug Information', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2089'), + (u'Evidence-Based Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1288'), + (u'Patient Information', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2122')] + + def print_version(self, url): + return url.replace('/article/', '/printarticle/') diff --git a/recipes/kresy_pl.recipe b/recipes/kresy_pl.recipe new file mode 100644 index 0000000000..3dfc2c057c --- /dev/null +++ b/recipes/kresy_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Kresy(BasicNewsRecipe): + title = u'Kresy' + __author__ = 'fenuks' + description = u'portal społeczności kresowej' + language = 'pl' + masthead_url= 'http://www.kresy.pl/public/img/logo.png' + cover_url= 'http://www.kresy.pl/public/img/logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + keep_only_tags= [dict(id='artykul')] + remove_tags= [dict(attrs={'class':['twitter-share-button', 'likefbborder', 'tagi']})] + feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')] diff --git a/recipes/la_jornada.recipe b/recipes/la_jornada.recipe index 71c526a0a0..74565ab179 100644 --- a/recipes/la_jornada.recipe +++ b/recipes/la_jornada.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic , Rogelio Domínguez ' +__copyright__ = '2010-2012, Darko Miletic , Rogelio Domínguez ' ''' www.jornada.unam.mx ''' @@ -86,6 +86,6 @@ class LaJornada_mx(BasicNewsRecipe): return soup def get_article_url(self, article): - rurl = article.get('link', None) + rurl = article.get('guid', None) return rurl.rpartition('&partner=')[0] diff --git a/recipes/la_pausa_caffe.recipe b/recipes/la_pausa_caffe.recipe new file mode 100644 index 0000000000..1a87d33dcf --- /dev/null +++ b/recipes/la_pausa_caffe.recipe @@ -0,0 +1,17 @@ +__version__ = 'v1.0' +__date__ = '13, February 2011' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1329125921(BasicNewsRecipe): + title = u'La pausa caff\xe8' + __author__ = 'faber1971' + description = 'An Italian satirical blog' + language = 'it' + + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + no_stylesheets = True + feeds = [(u'La pausa caff\xe8', u'http://feeds.feedburner.com/LapausaCaffe')] + diff --git a/recipes/marketing_magazine.recipe b/recipes/marketing_magazine.recipe index 55b6ea2584..0c14939cd8 100644 --- a/recipes/marketing_magazine.recipe +++ b/recipes/marketing_magazine.recipe @@ -1,4 +1,5 @@ __license__ = 'GPL v3' + from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1327062445(BasicNewsRecipe): @@ -7,10 +8,13 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe): max_articles_per_feed = 100 auto_cleanup = True remove_javascript = True + no_stylesheets = True + remove_tags = [ + dict(name='ul', attrs={'id':'ads0'}) + ] masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg' - feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] __author__ = 'faber1971' - description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)' + description = 'Collection of Italian marketing websites - v1.03 (20, February 2012)' language = 'it' - + feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index 4540879f72..f84fb5bc7e 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -1,69 +1,45 @@ __license__ = 'GPL v3' -__copyright__ = '2009, Mathieu Godlewski ; 2010, Louis Gesbert ' +__copyright__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' ''' Mediapart ''' -from calibre.ebooks.BeautifulSoup import Tag +__author__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' + +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.web.feeds.news import BasicNewsRecipe class Mediapart(BasicNewsRecipe): title = 'Mediapart' - __author__ = 'Mathieu Godlewski' - description = 'Global news in french from online newspapers' + __author__ = 'Mathieu Godlewski, Louis Gesbert' + description = 'Global news in french from news site Mediapart' oldest_article = 7 language = 'fr' needs_subscription = True - max_articles_per_feed = 50 + + use_embedded_content = False no_stylesheets = True - cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg' + cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg' feeds = [ ('Les articles', 'http://www.mediapart.fr/articles/feed'), ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# -# preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in -# [ -# (r'', lambda match : '

'+match.group(1)+'

'), -# (r'[^>]+]*>([^<]*)[^<]*', -# lambda match : ''+match.group(1)+''), -# (r'\'', lambda match: '’'), -# ] -# ] -# -# remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}), -# dict(name='div', attrs={'class':'print-links'}), -# dict(name='img', attrs={'src':'entete_article.png'}), -# dict(name='br') ] -# -# def print_version(self, url): -# raw = self.browser.open(url).read() -# soup = BeautifulSoup(raw.decode('utf8', 'replace')) -# div = soup.find('div', {'id':re.compile('node-\d+')}) -# if div is None: -# return None -# article_id = string.replace(div['id'], 'node-', '') -# if article_id is None: -# return None -# return 'http://www.mediapart.fr/print/'+article_id +# -- print-version -# -- Non-print version [dict(name='div', attrs={'class':'advert'})] + conversion_options = { 'smarten_punctuation' : True } - keep_only_tags = [ - dict(name='h1', attrs={'class':'title'}), - dict(name='div', attrs={'class':'page_papier_detail'}), - ] + remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] - def preprocess_html(self,soup): - for title in soup.findAll('div', {'class':'titre'}): - tag = Tag(soup, 'h3') - title.replaceWith(tag) - tag.insert(0,title) - return soup + def print_version(self, url): + raw = self.browser.open(url).read() + soup = BeautifulSoup(raw.decode('utf8', 'replace')) + link = soup.find('a', {'title':'Imprimer'}) + if link is None: + return None + return link['href'] # -- Handle login @@ -77,3 +53,10 @@ class Mediapart(BasicNewsRecipe): br.submit() return br + def preprocess_html(self, soup): + for title in soup.findAll('p', {'class':'titre_page'}): + title.name = 'h3' + for legend in soup.findAll('span', {'class':'legend'}): + legend.insert(0, Tag(soup, 'br', [])) + legend.name = 'small' + return soup diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe index e4769d58bc..2ae6bc391e 100644 --- a/recipes/naczytniki.recipe +++ b/recipes/naczytniki.recipe @@ -1,8 +1,9 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class naczytniki(BasicNewsRecipe): title = u'naczytniki.pl' __author__ = 'fenuks' + masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' language = 'pl' description ='everything about e-readers' @@ -10,6 +11,7 @@ class naczytniki(BasicNewsRecipe): no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'


Zobacz także:

.*?', re.DOTALL), lambda match: '') ] remove_tags_after= dict(name='div', attrs={'class':'sociable'}) keep_only_tags=[dict(name='div', attrs={'class':'post'})] remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})] diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index 74534f3346..ec556da5fa 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -1,21 +1,33 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe +import re + class Nowa_Fantastyka(BasicNewsRecipe): title = u'Nowa Fantastyka' oldest_article = 7 __author__ = 'fenuks' + __modified_by__ = 'zaslav' language = 'pl' encoding='latin2' description ='site for fantasy readers' category='fantasy' + masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg' + #extra_css='.tytul {font-size: 20px;}' #not working max_articles_per_feed = 100 INDEX='http://www.fantastyka.pl/' no_stylesheets=True needs_subscription = 'optional' - remove_tags_before=dict(attrs={'class':'belka1-tlo-md'}) + remove_tags_before=dict(attrs={'class':'naglowek2'}) #remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'}) - remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'}) - remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})] + remove_tags_after=dict(name='form', attrs={'name':'form1'}) + remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), dict(name='form')] + preprocess_regexps = [ + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + def find_articles(self, url): articles = [] @@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe): return feeds + def get_cover_url(self): - soup = self.index_to_soup('http://www.fantastyka.pl/1.html') - cover=soup.find(name='img', attrs={'class':'okladka'}) - self.cover_url=self.INDEX+ cover['src'] + soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka') + self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href'] return getattr(self, 'cover_url', self.cover_url) def get_browser(self): @@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe): br['pass'] = self.password br.submit() return br + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(font=True): + del item['font'] + for item in soup.findAll(align=True): + del item['align'] + for item in soup.findAll(name='tr'): + item.name='div' + title=soup.find(attrs={'class':'tytul'}) + if title: + title['style']='font-size: 20px; font-weight: bold;' + self.log.warn(soup) + return soup diff --git a/recipes/oclab_pl.recipe b/recipes/oclab_pl.recipe new file mode 100644 index 0000000000..b0df89ba72 --- /dev/null +++ b/recipes/oclab_pl.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OCLab(BasicNewsRecipe): + title = u'OCLab.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.' + category = 'IT' + language = 'pl' + cover_url= 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118' + no_stylesheets = True + keep_only_tags=[dict(id='main')] + remove_tags_after= dict(attrs={'class':'single-postmetadata'}) + remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})] + feeds = [(u'Wpisy', u'http://oclab.pl/feed/')] + + + def append_page(self, soup, appendtag): + tag=soup.find(attrs={'class':'contentjumpddl'}) + if tag: + nexturl=tag.findAll('option') + for nextpage in nexturl[1:-1]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(attrs={'class':'single-entry'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}): + r.extract() + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe new file mode 100644 index 0000000000..d7f4c8093d --- /dev/null +++ b/recipes/overclock_pl.recipe @@ -0,0 +1,37 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class Overclock_pl(BasicNewsRecipe): + title = u'Overclock.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).' + category = 'IT' + language = 'pl' + masthead_url='http://www.overclock.pl/gfx/logo_m.png' + cover_url='http://www.overclock.pl/gfx/logo_m.png' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'Komentarze do aktualności:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Nawigacja

', re.DOTALL), lambda match: '') ] + keep_only_tags=[dict(name='div', attrs={'class':'news'}), dict(id='articleContent')] + remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})] + feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')] + + + def append_page(self, soup, appendtag): + tag=soup.find(id='navigation') + if tag: + nexturl=tag.findAll('option') + tag.extract() + for nextpage in nexturl[2:]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(id='content') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(attrs={'alt':'Pierwsza'}) + if rem: + rem.parent.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe new file mode 100644 index 0000000000..ace772e7e7 --- /dev/null +++ b/recipes/palmtop_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class palmtop_pl(BasicNewsRecipe): + title = u'Palmtop.pl' + __author__ = 'fenuks' + description = 'wortal technologii mobilnych' + category = 'mobile' + language = 'pl' + cover_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + masthead_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + + feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')] diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe new file mode 100644 index 0000000000..faefeb25c0 --- /dev/null +++ b/recipes/pc_arena.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Arena(BasicNewsRecipe): + title = u'PCArena' + oldest_article = 18300 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.' + category = 'IT' + language = 'pl' + masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif' + cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif' + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})] + remove_tags=[dict(attrs={'class':'pages'})] + feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pagNum'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[1:]: + nextpage= 'http://pcarena.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(attrs={'class':'artBody'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe new file mode 100644 index 0000000000..68a17888ce --- /dev/null +++ b/recipes/pc_centre_pl.recipe @@ -0,0 +1,41 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Centre(BasicNewsRecipe): + title = u'PC Centre' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.' + category = 'IT' + language = 'pl' + masthead_url= 'http://pccentre.pl/views/images/logo.gif' + cover_url= 'http://pccentre.pl/views/images/logo.gif' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')] + feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n§ion=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n§ion=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n§ion=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n§ion=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n§ion=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n§ion=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n§ion=9')] + + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pages'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[:-1]: + nextpage= 'http://pccentre.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(id='content') + rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']}) + for r in rem: + r.extract() + rem=pagetext.findAll(id='comments') + for r in rem: + r.extract() + rem=pagetext.findAll('h1') + for r in rem: + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_foster.recipe b/recipes/pc_foster.recipe new file mode 100644 index 0000000000..ab8c2b66b1 --- /dev/null +++ b/recipes/pc_foster.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Foster(BasicNewsRecipe): + title = u'PC Foster' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.' + category = 'IT' + language = 'pl' + masthead_url='http://pcfoster.pl/public/images/logo.png' + cover_url= 'http://pcfoster.pl/public/images/logo.png' + no_stylesheets= True + remove_empty_feeds= True + keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})] + remove_tags=[dict(name='p', attrs={'class':'right'})] + feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'alt':u'Następna strona'}) + if nexturl: + appendtag.find(attrs={'class':'pager more_top'}).extract() + while nexturl: + nexturl='http://pcfoster.pl' + nexturl.parent['href'] + soup2 = self.index_to_soup(nexturl) + nexturl=soup2.find(attrs={'alt':u'Następna strona'}) + pagetext = soup2.find(attrs={'class':'content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'review_content double'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/polska_times.recipe b/recipes/polska_times.recipe new file mode 100644 index 0000000000..4126576fe2 --- /dev/null +++ b/recipes/polska_times.recipe @@ -0,0 +1,81 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Polska_times(BasicNewsRecipe): + title = u'Polska Times' + __author__ = 'fenuks' + description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.' + category = 'newspaper' + language = 'pl' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17' + oldest_article = 7 + max_articles_per_feed = 100 + remove_emty_feeds= True + no_stylesheets = True + preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TEŻ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ WIĘCEJ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TAKŻE:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: ''), (re.compile(ur'Nasze serwisy:.*', re.DOTALL), lambda match: '') ] + keep_only_tags= [dict(id=['tytul-artykulu', 'kontent'])] + remove_tags_after= dict(id='material-tagi') + remove_tags=[dict(attrs={'id':'reklama_srodtekst_0'}), dict(attrs={'id':'material-tagi'}), dict(name='div', attrs={'class':'zakladki'}), dict(attrs={'title':u'CZYTAJ TAKŻE'}), dict(attrs={'id':'podobne'}), dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'})] + feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'), (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'), (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'), (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'), (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'), (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'), (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')] + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def append_page(self, soup, appendtag): + nexturl=soup.find(id='nastepna_strona') + while nexturl: + soup2= self.index_to_soup(nexturl['href']) + nexturl=soup2.find(id='nastepna_strona') + pagetext = soup2.find(id='tresc') + for dictionary in self.remove_tags: + v=pagetext.findAll(attrs=dictionary['attrs']) + for delete in v: + delete.extract() + for b in pagetext.findAll(name='b'): + if b.string: + if u'CZYTAJ TEŻ' in b.string or u'Czytaj także' in b.string or u'Czytaj też' in b.string or u'Zobacz także' in b.string: + b.extract() + for center in pagetext.findAll(name='center'): + if center.h4: + if center.h4.a: + center.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def image_article(self, soup, appendtag): + nexturl=soup.find('a', attrs={'class':'nastepna'}) + urls=[] + while nexturl: + if nexturl not in urls: + urls.append(nexturl) + else: + break + soup2= self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href']) + nexturl=soup2.find('a', attrs={'class':'nastepna'}) + if nexturl in urls: + break; + pagetext = soup2.find(id='galeria-material') + pos = len(appendtag.contents) + appendtag.insert(pos, '
') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for rem in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}): + rem.extract() + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def preprocess_html(self, soup): + if soup.find('a', attrs={'class':'nastepna'}): + self.image_article(soup, soup.body) + elif soup.find(id='nastepna_strona'): + self.append_page(soup, soup.body) + return soup + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe new file mode 100644 index 0000000000..7a6c43bb7e --- /dev/null +++ b/recipes/pure_pc.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PurePC(BasicNewsRecipe): + title = u'PurePC' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.' + category = 'IT' + language = 'pl' + masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags_after= dict(attrs={'class':'fivestar-widget'}) + remove_tags= [dict(id='navigator'), dict(attrs={'class':['box-tools', 'fivestar-widget', 'PageMenuList']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'pager-next'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href']) + nexturl=soup2.find(attrs={'class':'pager-next'}) + pagetext = soup2.find(attrs={'class':'article'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/racjonalista_pl.recipe b/recipes/racjonalista_pl.recipe new file mode 100644 index 0000000000..d803f22a7b --- /dev/null +++ b/recipes/racjonalista_pl.recipe @@ -0,0 +1,54 @@ +__copyright__ = '2012, Micha\u0142 ' +''' +Racjonalista.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class Racjonalista(BasicNewsRecipe): + __author__ = u'Micha\u0142 ' + publisher = u'Fundacja Wolnej My\u015bli' + title = u'Racjonalista.pl' + description = u'Racjonalista.pl' + category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + oldest_article = 7 + max_articles_per_feed = 20 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + simultaneous_downloads = 2 + timeout = 30 + cover_url = 'http://www.racjonalista.pl/img/uimg/rac.gif' + + feeds = [(u'Racjonalista.pl', u'http://www.racjonalista.pl/rss.php')] + + match_regexps = [r'kk\.php'] + + def print_version(self, url): + return url.replace('/s,', '/t,') + + extra_css = 'h2 {font: serif large} .cytat {text-align: right}' + + remove_attributes = ['target', 'width', 'height'] + + preprocess_regexps = [ + (re.compile(i[0], re.DOTALL), i[1]) for i in + [ (r']*> 

', lambda match: ''), + (r' ', lambda match: ' '), + (r']+>', lambda match: ''), + (r']+>', lambda match: ''), + (r'', lambda match: ''), + (r'[^<]+)', lambda match: '' + match.group('a') + ''), + (r'
(?P[^<]+)
', lambda match: '

' + match.group('t') + '

'), + (r'
', lambda match: ''), + (r'
', lambda match: ''), + (r'
', lambda match: ''), + (r']+>(?P

[^<]+)', lambda match: '' + match.group('p') + ''), + (r']+>(?P[^<]+)', lambda match: match.group('a')), + (r'Orygin[^<]+', lambda match: ''), + (r'Poka[^<]+', lambda match: '')] + ] + diff --git a/recipes/rue89.recipe b/recipes/rue89.recipe index 51cf8f6b98..bd3ef7ea4c 100644 --- a/recipes/rue89.recipe +++ b/recipes/rue89.recipe @@ -1,13 +1,11 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Louis Gesbert ' +__copyright__ = '2010-2012, Louis Gesbert ' ''' Rue89 ''' -__author__ = '2010, Louis Gesbert ' +__author__ = '2010-2012, Louis Gesbert ' -import re -from calibre.ebooks.BeautifulSoup import Tag from calibre.web.feeds.news import BasicNewsRecipe class Rue89(BasicNewsRecipe): @@ -17,37 +15,45 @@ class Rue89(BasicNewsRecipe): title = u'Rue89' language = 'fr' oldest_article = 7 - max_articles_per_feed = 50 + max_articles_per_feed = 12 - feeds = [(u'La Une', u'http://www.rue89.com/homepage/feed')] + use_embedded_content = False + + # From http://www.rue89.com/les-flux-rss-de-rue89 + feeds = [ + (u'La Une', u'http://www.rue89.com/feed'), + (u'Rue69', u'http://www.rue89.com/rue69/feed'), + (u'Eco', u'http://www.rue89.com/rue89-eco/feed'), + (u'Planète', u'http://www.rue89.com/rue89-planete/feed'), + (u'Sport', u'http://www.rue89.com/rue89-sport/feed'), + (u'Culture', u'http://www.rue89.com/culture/feed'), + (u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'), + (u'Media', u'http://www.rue89.com/medias/feed'), + (u'Monde', u'http://www.rue89.com/monde/feed'), + (u'Politique', u'http://www.rue89.com/politique/feed'), + (u'Societe', u'http://www.rue89.com/societe/feed'), + ] + + # Follow redirection from feedsportal.com + def get_article_url(self,article): + return self.browser.open_novisit(article.link).geturl() + + def print_version(self, url): + return url + '?imprimer=1' no_stylesheets = True - preprocess_regexps = [ - (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL), - lambda match : '<'+match.group(1)+'h3>'), - (re.compile(r'

', re.IGNORECASE|re.DOTALL), - lambda match : '

'+match.group(1)+'

'), - (re.compile(r']+src="[^"]*/numeros/(\d+)[^0-9.">]*.gif"[^>]*/>', re.IGNORECASE|re.DOTALL), - lambda match : ''+match.group(1)+''), - (re.compile(r'\''), lambda match: '’'), - ] + conversion_options = { 'smarten_punctuation' : True } - def preprocess_html(self,soup): - body = Tag(soup, 'body') - title = soup.find('h1', {'class':'title'}) - content = soup.find('div', {'class':'content'}) - soup.body.replaceWith(body) - body.insert(0, title) - body.insert(1, content) - return soup + keep_only_tags = [ + dict(name='div', attrs={'id':'article'}), + ] - remove_tags = [ #dict(name='div', attrs={'class':'print-source_url'}), - #dict(name='div', attrs={'class':'print-links'}), - #dict(name='img', attrs={'class':'print-logo'}), - dict(name='div', attrs={'class':'content_top'}), - dict(name='div', attrs={'id':'sidebar-left'}), ] + remove_tags_after = [ + dict(name='div', attrs={'id':'plus_loin'}), + ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# def print_version(self, url): -# return re.sub('^.*-([0-9]+)$', 'http://www.rue89.com/print/\\1',url) + remove_tags = [ + dict(name='div', attrs={'id':'article_tools'}), + dict(name='div', attrs={'id':'plus_loin'}), + ] diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index d06e32d9af..f4c1efa9b8 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -1,14 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Tablety_pl(BasicNewsRecipe): title = u'Tablety.pl' __author__ = 'fenuks' description = u'tablety.pl - latest tablet news' + masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'}) remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})] diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe new file mode 100644 index 0000000000..666cb8aa77 --- /dev/null +++ b/recipes/tanuki.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class tanuki(BasicNewsRecipe): + title = u'Tanuki' + oldest_article = 7 + __author__ = 'fenuks' + category = 'anime, manga' + language = 'pl' + max_articles_per_feed = 100 + encoding='utf-8' + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}' + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'', re.DOTALL), lambda match: '')] + remove_empty_feeds= True + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['animename', 'storyname', 'nextarrow','sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={'summary':'Technikalia'}), dict(attrs={'class':['chaptername','copycat']}), dict(id='rightcolumn'), dict(attrs={'class':['headn_tt', 'subtable']})] + remove_tags=[dict(name='div', attrs={'class':'screen'}), dict(id='randomtoplist'), dict(attrs={'class':'note'})] + feeds = [(u'Anime', u'http://anime.tanuki.pl/rss_anime.xml'), (u'Manga', u'http://manga.tanuki.pl/rss_manga.xml'), (u'Tomiki', u'http://manga.tanuki.pl/rss_mangabooks.xml'), (u'Artyku\u0142y', u'http://czytelnia.tanuki.pl/rss_czytelnia_artykuly.xml'), (u'Opowiadania', u'http://czytelnia.tanuki.pl/rss_czytelnia.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'nextarrow'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://czytelnia.tanuki.pl'+ nexturl['href']) + nexturl=soup2.find(attrs={'class':'nextarrow'}) + pagetext = soup2.find(attrs={'class':['chaptername', 'copycat']}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'copycat'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'nextarrow'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index 5699ec106c..80b37f329a 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,49 +1,57 @@ import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag +from calibre.web.feeds.recipes import BasicNewsRecipe -class AdvancedUserRecipe1268409464(BasicNewsRecipe): - title = u'The Sun' - __author__ = 'Chaz Ralph' - description = 'News from The Sun' +class AdvancedUserRecipe1325006965(BasicNewsRecipe): + + title = u'The Sun UK' + cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' + + description = 'A Recipe for The Sun tabloid UK - uses feed43' + __author__ = 'Dave Asbury' + # last updated 20/2/12 + language = 'en_GB' oldest_article = 1 - max_articles_per_feed = 100 - language = 'en' + max_articles_per_feed = 15 + remove_empty_feeds = True no_stylesheets = True - extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' - encoding= 'iso-8859-1' - remove_javascript = True + + masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif' + encoding = 'cp1251' + + encoding = 'cp1252' + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + extra_css = ''' + body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} + ''' + + preprocess_regexps = [ + (re.compile(r'