diff --git a/Changelog.yaml b/Changelog.yaml index 43eb775233..17f3ebcf97 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,51 @@ # new recipes: # - title: +- version: 0.8.47 + date: 2012-04-13 + + new features: + - title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec." + tickets: [976056] + + - title: "Support for viewing and converting the Haodoo PDB ebook format" + tickets: [976478] + + - title: "Device driver for Laser EB720" + + bug fixes: + - title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled" + tickets: [976336] + + - title: 'Fix "Tags" field in advanced search does not obey regex setting' + tickets: [980221] + + - title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single tag, instead of rendering the page" + + - title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device" + + - title: "Amazon metadata download: Handle books whose titles start with a bracket." + tickets: [976365] + + - title: "Get Books: Fix downloading of purchased books from Baen" + tickets: [975929] + + + improved recipes: + - Forbes + - Caros Amigos + - Trouw + - Sun UK + - Metro + - Daily Mirror + + new recipes: + - title: "Melbourne Herald Sun" + author: Ray Hartley + + - title: "Editoriali and Zerocalcare" + author: faber1971 + - version: 0.8.46 date: 2012-04-06 diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 65f4e3e52d..bb311606ac 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe): no_stylesheets = True oldest_article = 20 max_articles_per_feed = 100 + index='http://www.adventure-zone.info/fusion/' use_embedded_content=False preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: '')] remove_tags_before= dict(name='td', attrs={'class':'main-bg'}) @@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe): skip_tag = skip_tag.findAll(name='a') for r in skip_tag: if r.strong: - word=r.strong.string - if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) \ No newline at end of file + word=r.strong.string.lower() + if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + + def preprocess_html(self, soup): + footer=soup.find(attrs={'class':'news-footer middle-border'}) + if footer and len(footer('a'))>=2: + footer('a')[1].extract() + for item in soup.findAll(style=True): + del item['style'] + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup + + \ No newline at end of file diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index cc74cc9128..00eea1be68 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe): self.image_article(soup, soup.body) else: self.append_page(soup, soup.body) + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.INDEX + a['href'] return soup diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe index dc919a76f8..12134bc9a4 100644 --- a/recipes/calgary_herald.recipe +++ b/recipes/calgary_herald.recipe @@ -1,220 +1,35 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' - -''' -www.canada.com -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup - - -class CanWestPaper(BasicNewsRecipe): - - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = u'News from Victoria, BC' -## fp_tag = 'CAN_TC' - - # un-comment the following four lines for the Vancouver Province -## title = u'Vancouver Province' -## url_prefix = 'http://www.theprovince.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' - - # un-comment the following four lines for the Vancouver Sun -## title = u'Vancouver Sun' -## url_prefix = 'http://www.vancouversun.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VS' - - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald - title = u'Calgary Herald' - url_prefix = 'http://www.calgaryherald.com' - description = u'News from Calgary, AB' - fp_tag = 'CAN_CH' - - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' - - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen -## title = u'Ottawa Citizen' -## url_prefix = 'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' -## fp_tag = 'CAN_OC' - - # un-comment the following four lines for the Montreal Gazette -## title = u'Montreal Gazette' -## url_prefix = 'http://www.montrealgazette.com' -## description = u'News from Montreal, QC' -## fp_tag = 'CAN_MG' - - - language = 'en_CA' - __author__ = 'Nick Redding' - no_stylesheets = True - timefmt = ' [%b %d]' - extra_css = ''' - .timestamp { font-size:xx-small; display: block; } - #storyheader { font-size: medium; } - #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } - .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] - remove_tags = [{'class':'comments'}, - dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), - dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), - dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), - dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), - dict(name='div', attrs={'class':'rule_grey_solid'}), - dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] - - def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None - cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() - daysback=1 - try: - br.open(cover) - except: - while daysback<7: - cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() - try: - br.open(cover) - except: - daysback = daysback+1 - continue - break - if daysback==7: - self.log("\nCover unavailable") - cover = None - return cover - - def fixChars(self,string): - # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) - # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) - # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) - # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) - # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) - # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) - fixed = re.sub("’","’",fixed) - return fixed - - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&","&", massaged) - return self.fixChars(massaged) - else: - return description - - def populate_article_metadata(self, article, soup, first): - if first: - picdiv = soup.find('body').find('img') - if picdiv is not None: - self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) - xtitle = article.text_summary.strip() - if len(xtitle) == 0: - desc = soup.find('meta',attrs={'property':'og:description'}) - if desc is not None: - article.summary = article.text_summary = desc['content'] - - def strip_anchors(self,soup): - paras = soup.findAll(True) - for para in paras: - aTags = para.findAll('a') - for a in aTags: - if a.img is None: - a.replaceWith(a.renderContents().decode('cp1252','replace')) - return soup - - def preprocess_html(self, soup): - return self.strip_anchors(soup) - - - - def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') - - articles = {} - key = 'News' - ans = ['News'] - - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) - - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - return ans +from calibre.web.feeds.news import BasicNewsRecipe + +class CalgaryHerald(BasicNewsRecipe): + title = u'Calgary Herald' + oldest_article = 3 + max_articles_per_feed = 100 + + feeds = [ + (u'News', u'http://rss.canada.com/get/?F233'), + (u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'), + (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'), + (u'Politics', u'http://rss.canada.com/get/?F7551'), + (u'National', u'http://rss.canada.com/get/?F7552'), + (u'World', u'http://rss.canada.com/get/?F7553'), + ] + __author__ = 'rty' + pubisher = 'Calgary Herald' + description = 'Calgary, Alberta, Canada' + category = 'News, Calgary, Alberta, Canada' + + + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'en_CA' + encoding = 'utf-8' + conversion_options = {'linearize_tables':True} + ##masthead_url = 'http://www.calgaryherald.com/index.html' + keep_only_tags = [ + dict(name='div', attrs={'id':'storyheader'}), + dict(name='div', attrs={'id':'storycontent'}) + + ] + remove_tags_after = {'class':"story_tool_hr"} + diff --git a/recipes/camera_di_commercio_di_bari.recipe b/recipes/camera_di_commercio_di_bari.recipe new file mode 100644 index 0000000000..c80a825883 --- /dev/null +++ b/recipes/camera_di_commercio_di_bari.recipe @@ -0,0 +1,17 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1331729727(BasicNewsRecipe): + title = u'Camera di Commercio di Bari' + oldest_article = 7 + __author__ = 'faber1971' + description = 'News from the Chamber of Commerce of Bari' + language = 'it' + max_articles_per_feed = 100 + auto_cleanup = True + masthead_url = 'http://www.ba.camcom.it/grafica/layout-bordo/logo_camcom_bari.png' + feeds = [(u'Camera di Commercio di Bari', u'http://feed43.com/4715147488845101.xml')] + +__license__ = 'GPL v3' +__copyright__ = '2012, faber1971' +__version__ = 'v1.00' +__date__ = '17, April 2012' diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe index ff46774dc9..4e19fbc6c1 100644 --- a/recipes/cd_action.recipe +++ b/recipes/cd_action.recipe @@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe): description = 'cdaction.pl - polish games magazine site' category = 'games' language = 'pl' + index='http://www.cdaction.pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True @@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('http://www.cdaction.pl/magazyn/') self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href'] - return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file + return getattr(self, 'cover_url', self.cover_url) + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup \ No newline at end of file diff --git a/recipes/countryfile.recipe b/recipes/countryfile.recipe index 7a41b5b905..0502129791 100644 --- a/recipes/countryfile.recipe +++ b/recipes/countryfile.recipe @@ -1,11 +1,12 @@ +from calibre import browser from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'Countryfile.com' - cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg' + #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg' __author__ = 'Dave Asbury' description = 'The official website of Countryfile Magazine' - # last updated 29/1/12 + # last updated 15/4/12 language = 'en_GB' oldest_article = 30 max_articles_per_feed = 25 @@ -13,7 +14,23 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): no_stylesheets = True auto_cleanup = True #articles_are_obfuscated = True + def get_cover_url(self): + soup = self.index_to_soup('http://www.countryfile.com/') + cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'}) + #print '******** ',cov,' ***' + cov2 = str(cov) + cov2=cov2[124:-90] + #print '******** ',cov2,' ***' + # try to get cover - if can't get known cover + br = browser() + br.set_handle_redirect(False) + try: + br.open_novisit(cov2) + cover_url = cov2 + except: + cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg' + return cover_url remove_tags = [ # dict(attrs={'class' : ['player']}), diff --git a/recipes/daily_mirror.recipe b/recipes/daily_mirror.recipe index d6794b1d97..8bac57951c 100644 --- a/recipes/daily_mirror.recipe +++ b/recipes/daily_mirror.recipe @@ -1,20 +1,21 @@ + from calibre.web.feeds.news import BasicNewsRecipe import re +import mechanize class AdvancedUserRecipe1306061239(BasicNewsRecipe): title = u'The Daily Mirror' description = 'News as provide by The Daily Mirror -UK' __author__ = 'Dave Asbury' - # last updated 11/2/12 + # last updated 7/4/12 language = 'en_GB' - - cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg' + #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg' masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif' oldest_article = 1 - max_articles_per_feed = 5 + max_articles_per_feed = 10 remove_empty_feeds = True remove_javascript = True no_stylesheets = True @@ -75,3 +76,28 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): img { display:block} ''' + def get_cover_url(self): + soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html') + # look for the block containing the mirror button and url + cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'}) + cov2 = str(cov) + cov2='http://www.politicshome.com'+cov2[9:-142] + #cov2 now contains url of the page containing pic + soup = self.index_to_soup(cov2) + cov = soup.find(attrs={'id' : 'large'}) + cov2 = str(cov) + cov2=cov2[27:-18] + #cov2 now is pic url, now go back to original function + br = mechanize.Browser() + br.set_handle_redirect(False) + try: + br.open_novisit(cov2) + cover_url = cov2 + except: + cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg' + + #cover_url = cov2 + #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' + return cover_url + + diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index a27a9b0877..0614cf98ee 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' encoding = 'utf-8' + index='http://www.dobreprogramy.pl/' no_stylesheets = True language = 'pl' extra_css = '.title {font-size:22px;}' @@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe): #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] + + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup \ No newline at end of file diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe index d80161e71a..4c583e4815 100644 --- a/recipes/dzieje_pl.recipe +++ b/recipes/dzieje_pl.recipe @@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe): cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png' category = 'history' language = 'pl' + index='http://dzieje.pl' oldest_article = 8 max_articles_per_feed = 100 remove_javascript=True @@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe): remove_tags_after= dict(id='dogory') remove_tags=[dict(id='dogory')] feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')] + + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup \ No newline at end of file diff --git a/recipes/eioba.recipe b/recipes/eioba.recipe index 14256c5811..1df79d64bd 100644 --- a/recipes/eioba.recipe +++ b/recipes/eioba.recipe @@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe): (u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'), (u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml') ] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe index 75271c510a..2fbf9ff514 100644 --- a/recipes/emuzica_pl.recipe +++ b/recipes/emuzica_pl.recipe @@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe): description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce' category = 'music' language = 'pl' + index='http://www.emuzyka.pl' cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg' no_stylesheets = True oldest_article = 7 @@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe): keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})] remove_tags=[dict(name='span', attrs={'id':'date'})] feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')] + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup \ No newline at end of file diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe index 0e2d5c1ebe..07f2b4b64e 100644 --- a/recipes/fhm_uk.recipe +++ b/recipes/fhm_uk.recipe @@ -7,7 +7,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg' masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif' __author__ = 'Dave Asbury' - # last updated 17/3/12 + # last updated 14/4/12 language = 'en_GB' oldest_article = 28 max_articles_per_feed = 12 @@ -28,7 +28,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): #] feeds = [ - (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'), + (u'From the Homepage',u'http://feed43.com/0032328550253453.xml'), + #http://feed43.com/8053226782885416.xml'), (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'), (u'Upgrade',u'http://feed43.com/0877305847443234.xml'), #(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'), diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 877d4472bc..2a6e00d501 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -7,6 +7,7 @@ class Filmweb_pl(BasicNewsRecipe): cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png' category = 'movies' language = 'pl' + index='http://www.filmweb.pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True @@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe): self.log.warn(skip_tag) return self.index_to_soup(skip_tag['href'], raw=True) + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup \ No newline at end of file diff --git a/recipes/forbes.recipe b/recipes/forbes.recipe index a633d0f543..fe72fda536 100644 --- a/recipes/forbes.recipe +++ b/recipes/forbes.recipe @@ -1,39 +1,49 @@ -from calibre.ebooks.BeautifulSoup import BeautifulSoup +import re from calibre.web.feeds.news import BasicNewsRecipe class Forbes(BasicNewsRecipe): title = u'Forbes' description = 'Business and Financial News' - __author__ = 'Darko Miletic' + __author__ = 'Kovid Goyal' oldest_article = 30 - max_articles_per_feed = 100 + max_articles_per_feed = 20 language = 'en' + encoding = 'utf-8' + recursions = 1 no_stylesheets = True - html2lrf_options = ['--base-font-size', '10'] cover_url = u'http://www.forbes.com/media/current_covers/forbes_120_160.gif' - - feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'), - (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'), - (u'Most Emailed', u'http://www.forbes.com/feeds/mostemailed.xml'), - (u'Faces', u'http://www.forbes.com/facesscan/index.xml'), - (u'Technology', u'http://www.forbes.com/technology/index.xml'), - (u'Personal Tech', u'http://www.forbes.com/personaltech/index.xml'), - (u'Wireless', u'http://www.forbes.com/wireless/index.xml'), - (u'Business', u'http://www.forbes.com/business/index.xml'), - (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'), - (u'Sports', u'http://www.forbes.com/forbeslife/sports/index.xml'), - (u'Vehicles', u'http://www.forbes.com/forbeslife/vehicles/index.xml'), - (u'Leadership', u'http://www.forbes.com/leadership/index.xml'), - (u'Careers', u'http://www.forbes.com/leadership/careers/index.xml'), - (u'Compensation', u'http://www.forbes.com/leadership/compensation/index.xml'), - (u'Managing', u'http://www.forbes.com/leadership/managing/index.xml')] - def print_version(self, url): - raw = self.browser.open(url).read() - soup = BeautifulSoup(raw.decode('latin1', 'replace')) - print_link = soup.find('a', {'onclick':"s_linkTrackVars='prop18';s_linkType='o';s_linkName='Print';if(typeof(globalPageName)!='undefined')s_prop18=globalPageName;s_lnk=s_co(this);s_gs(s_account);"}) - if print_link is None: - return '' - return 'http://www.forbes.com' + print_link['href'] \ No newline at end of file + feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'), + (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'), + (u'Technology', u'http://www.forbes.com/technology/index.xml'), + (u'Business', u'http://www.forbes.com/business/index.xml'), + (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'), + (u'Leadership', u'http://www.forbes.com/leadership/index.xml'),] + + keep_only_tags = \ + {'class':lambda x: x and (set(x.split()) & {'body', 'pagination', + 'articleHead', 'article_head'})} + remove_tags_before = {'name':'h1'} + remove_tags = [ + {'class':['comment_bug', 'engagement_block', + 'video_promo_block', 'article_actions']}, + {'id':'comments'} + ] + + def is_link_wanted(self, url, tag): + ans = re.match(r'http://.*/[2-9]/', url) is not None + if ans: + self.log('Following multipage link: %s'%url) + return ans + + def postprocess_html(self, soup, first_fetch): + for pag in soup.findAll(True, 'pagination'): + pag.extract() + if not first_fetch: + h1 = soup.find('h1') + if h1 is not None: + h1.extract() + return soup + diff --git a/recipes/fotoblogia_pl.recipe b/recipes/fotoblogia_pl.recipe new file mode 100644 index 0000000000..99df46419a --- /dev/null +++ b/recipes/fotoblogia_pl.recipe @@ -0,0 +1,16 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Fotoblogia_pl(BasicNewsRecipe): + title = u'Fotoblogia.pl' + __author__ = 'fenuks' + category = 'photography' + language = 'pl' + masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg' + cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})] + remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})] + feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')] diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe index f3384263d6..7b0ccb4f55 100644 --- a/recipes/gameplay_pl.recipe +++ b/recipes/gameplay_pl.recipe @@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe): description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.' category = 'games, movies, books, music' language = 'pl' + index='http://gameplay.pl' masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png' cover_url= 'http://gameplay.pl/img/gpy_top_logo.png' max_articles_per_feed = 100 + remove_javascript= True no_stylesheets= True keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})] - remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})] + remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})] feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')] def image_url_processor(self, baseurl, url): if 'http' not in url: return 'http://gameplay.pl'+ url[2:] else: - return url + return url + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and '../' in a['href']: + a['href']=self.index + a['href'][2:] + return soup \ No newline at end of file diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index 042902b5fc..36d3ef4da2 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe): language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + remove_empty_feeds=True no_stylesheets=True remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})] keep_only_tags=dict(name='div', attrs={'class':'widetext'}) @@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe): self.log.warn('odnosnik') self.log.warn(link['href']) return self.index_to_soup(link['href'], raw=True) + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + if '/gry/' in a['href']: + a['href']='http://www.gry.gildia.pl' + a['href'] + elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower(): + a['href']='http://www.literatura.gildia.pl' + a['href'] + elif u'komiks' in soup.title.string.lower(): + a['href']='http://www.literatura.gildia.pl' + a['href'] + else: + a['href']='http://www.gildia.pl' + a['href'] + return soup diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe index 07927796c0..1f8147ba3d 100644 --- a/recipes/gram_pl.recipe +++ b/recipes/gram_pl.recipe @@ -7,6 +7,7 @@ class Gram_pl(BasicNewsRecipe): category = 'games' language = 'pl' oldest_article = 8 + index='http://www.gram.pl' max_articles_per_feed = 100 no_stylesheets= True extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' @@ -52,4 +53,7 @@ class Gram_pl(BasicNewsRecipe): tag=soup.findAll(name='div', attrs={'class':'picbox'}) for t in tag: t['style']='float: left;' + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] return soup \ No newline at end of file diff --git a/recipes/heise.recipe b/recipes/heise.recipe index 56d5516656..ba93ea96ce 100644 --- a/recipes/heise.recipe +++ b/recipes/heise.recipe @@ -59,6 +59,7 @@ class heiseDe(BasicNewsRecipe): dict(name='span', attrs={'class':'rsaquo'}), dict(name='div', attrs={'class':'news_logo'}), dict(name='div', attrs={'class':'bcadv ISI_IGNORE'}), + dict(name='div', attrs={'class':'navi_top_container'}), dict(name='p', attrs={'class':'news_option'}), dict(name='p', attrs={'class':'news_navi'}), dict(name='div', attrs={'class':'news_foren'})] @@ -69,3 +70,5 @@ class heiseDe(BasicNewsRecipe): + + diff --git a/recipes/historia_news.recipe b/recipes/historia_news.recipe new file mode 100644 index 0000000000..4eca8ade91 --- /dev/null +++ b/recipes/historia_news.recipe @@ -0,0 +1,20 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class historia_news(BasicNewsRecipe): + title = u'historia-news' + __author__ = 'fenuks' + description = u'Historia-news to portal dla ludzi kochających historię. Najnowsze newsy z historii bliższej i dalszej, archeologii, paleontologii oraz ciekawostki i podcasty z historii kultury, sportu, motoryzacji i inne.' + masthead_url = 'http://historia-news.pl/templates/hajak4/images/header.jpg' + cover_url= 'http://www.historia-news.pl/templates/hajak4/images/header.jpg' + category = 'history' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_tags=[dict(name='form'), dict(name='img', attrs={'alt':'Print'}), dict(attrs={'class':['commbutt', 'cpr']}), dict(id=['plusone', 'facebook'])] + feeds = [(u'Wiadomo\u015bci', u'http://historia-news.pl/wiadomoci.feed?type=rss'), (u'Artyku\u0142y', u'http://historia-news.pl/artykuy.feed?type=rss')] + + + def print_version(self, url): + return url + '?tmpl=component&print=1&layout=default&page=' diff --git a/recipes/icons/fotoblogia_pl.png b/recipes/icons/fotoblogia_pl.png new file mode 100644 index 0000000000..0204a04e62 Binary files /dev/null and b/recipes/icons/fotoblogia_pl.png differ diff --git a/recipes/icons/historia_news.png b/recipes/icons/historia_news.png new file mode 100644 index 0000000000..79b1b52859 Binary files /dev/null and b/recipes/icons/historia_news.png differ diff --git a/recipes/icons/swiat_obrazu.png b/recipes/icons/swiat_obrazu.png new file mode 100644 index 0000000000..a61662a864 Binary files /dev/null and b/recipes/icons/swiat_obrazu.png differ diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe index 16ad622b46..e385522714 100644 --- a/recipes/in4_pl.recipe +++ b/recipes/in4_pl.recipe @@ -8,6 +8,7 @@ class in4(BasicNewsRecipe): description = u'Serwis Informacyjny - Aktualnosci, recenzje' category = 'IT' language = 'pl' + index='http://www.in4.pl/' #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg' no_stylesheets = True remove_empty_feeds = True @@ -39,6 +40,7 @@ class in4(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] return soup - - diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe index 0e035e0980..e021fa0c17 100644 --- a/recipes/infra_pl.recipe +++ b/recipes/infra_pl.recipe @@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe): description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.' cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg' category = 'UFO' + index='http://infra.org.pl' language = 'pl' max_articles_per_feed = 100 no_stylesheers=True @@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe): remove_tags_after=dict(attrs={'class':'pagenav'}) remove_tags=[dict(attrs={'class':'pagenav'})] feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup \ No newline at end of file diff --git a/recipes/jakarta_globe.recipe b/recipes/jakarta_globe.recipe new file mode 100644 index 0000000000..1414ac6e5b --- /dev/null +++ b/recipes/jakarta_globe.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class JakartaGlobe(BasicNewsRecipe): + title = u'Jakarta Globe' + oldest_article = 3 + max_articles_per_feed = 100 + + feeds = [ + (u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'), + (u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'), + (u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'), + (u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'), + (u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'), + (u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'), + ] + __author__ = 'rty' + pubisher = 'JakartaGlobe.com' + description = 'JakartaGlobe, Indonesia, Newspaper' + category = 'News, Indonesia' + + + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'en_ID' + encoding = 'utf-8' + conversion_options = {'linearize_tables':True} + masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg' + keep_only_tags = [ + dict(name='div', attrs={'class':'story'}), + dict(name='span', attrs={'class':'headline'}), + dict(name='div', attrs={'class':'story'}), + dict(name='p', attrs={'id':'bodytext'}) + ] diff --git a/recipes/konflikty_zbrojne.recipe b/recipes/konflikty_zbrojne.recipe index 7921e98f48..e8b28b49bf 100644 --- a/recipes/konflikty_zbrojne.recipe +++ b/recipes/konflikty_zbrojne.recipe @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup class Konflikty(BasicNewsRecipe): title = u'Konflikty Zbrojne' @@ -10,6 +11,23 @@ class Konflikty(BasicNewsRecipe): category='military, history' oldest_article = 7 max_articles_per_feed = 100 - auto_cleanup = True + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')] - feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml')] + feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), + (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), + (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'), + (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'), + (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), + (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'), + (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for image in soup.findAll(name='a', attrs={'class':'image'}): + if image.img and image.img.has_key('alt'): + image.name='div' + pos = len(image.contents) + image.insert(pos, BeautifulSoup('

'+image.img['alt']+'

')) + return soup diff --git a/recipes/liberatorio_politico.recipe b/recipes/liberatorio_politico.recipe new file mode 100644 index 0000000000..bbffcd89b1 --- /dev/null +++ b/recipes/liberatorio_politico.recipe @@ -0,0 +1,12 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1334649829(BasicNewsRecipe): + title = u'Liberatorio Politico' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + masthead_url = 'http://liberatorio.altervista.org/wp-content/uploads/2012/01/Testata-LIBERATORIO-Altervista1.jpg' + feeds = [(u'Liberatorio Politico', u'http://liberatorio.altervista.org/feed/')] + __author__ = 'faber1971' + description = 'Inquiry journalism - a blog on Molfetta, Land of Bari, Apulia and Italy - v1.00 (07, April 2012)' + language = 'it' diff --git a/recipes/limes.recipe b/recipes/limes.recipe new file mode 100644 index 0000000000..2290b7099e --- /dev/null +++ b/recipes/limes.recipe @@ -0,0 +1,50 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2012, faber1971' +__version__ = 'v1.00' +__date__ = '16, April 2012' +__description__ = 'Geopolitical Italian magazine' + + +from calibre.web.feeds.news import BasicNewsRecipe + +class Limes(BasicNewsRecipe): + description = 'Italian weekly magazine' + __author__ = 'faber1971' + + cover_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif' + title = 'Limes' + category = 'Geopolitical news' + + language = 'it' +# encoding = 'cp1252' + timefmt = '[%a, %d %b, %Y]' + + oldest_article = 16 + max_articles_per_feed = 100 + use_embedded_content = False + recursion = 10 + + remove_javascript = True + no_stylesheets = True + masthead_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif' + + feeds = [ + (u'Limes', u'http://temi.repubblica.it/limes/feed/') + ] + + + + keep_only_tags = [ + dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}), + dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}), + dict(name='div', attrs={'id':['content-second-right','content2']}) + ] + + remove_tags = [ + dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}), + dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}), + dict(name='ul',attrs={'id':'user-utility'}), + dict(name=['script','noscript','iframe']) + ] + diff --git a/recipes/metro_news_nl.recipe b/recipes/metro_news_nl.recipe index ac3e23869b..d95f9bdfd7 100644 --- a/recipes/metro_news_nl.recipe +++ b/recipes/metro_news_nl.recipe @@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe import re from calibre.utils.magick import Image from BeautifulSoup import BeautifulSoup -try: - from calibre_plugins.drMerry.debug import debuglogger as mlog - print 'drMerry debuglogger found, debug options can be used' - from calibre_plugins.drMerry.stats import statslogger as mstat - print 'drMerry stats tracker found, stat can be tracked' - mlog.setLoglevel(1) #-1 == no log; 0 for normal output - mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0 - KEEPSTATS = mstat.keepmystats() - SHOWDEBUG0 = mlog.showdebuglevel(0) - SHOWDEBUG1 = mlog.showdebuglevel(1) - SHOWDEBUG2 = mlog.showdebuglevel(2) -except: - #print 'drMerry debuglogger not found, skipping debug options' - SHOWDEBUG0 = False - SHOWDEBUG1 = False - SHOWDEBUG2 = False - KEEPSTATS = False - -#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2)) ''' Version 1.2, updated cover image to match the changed website. added info date on title @@ -43,6 +24,9 @@ except: extended timeout from 2 to 10 changed oldest article from 10 to 1.2 changed max articles from 15 to 25 + Version 1.9.1 18-04-2012 + removed some debug settings + updated code to match new metro-layout ''' class AdvancedUserRecipe1306097511(BasicNewsRecipe): @@ -70,34 +54,40 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): 'author_sort' : 'Metro Nederland & calibre & DrMerry', 'publisher' : 'DrMerry/Metro Nederland' } - extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\ - #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\ - .article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\ - h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\ - .article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\ - div.column-1-2 {display: inline;padding-right: 7px;}\ - p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \ - p.article-image-caption .credits {font-style: italic;font-size: 10px;}\ - div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\ - div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\ - img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}' + extra_css = 'body {padding:5px 0; background-color:#fff;font-size: 1em}\ + #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {margin-bottom: 10px}\ + #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name, p.article-image-caption .credits {font-size:0.5em}\ + .article-box-fact.module-title, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear:both}\ + .article-box-fact.module-title {padding: 8px 0}\ + h1.title {color: #000;font-size: 1.4em}\ + .article-box-fact.module-title, h2.subtitle {font-size: 1.2em}\ + h1.title, h2.subtitle, .article-body p{padding-bottom:10px}\ + h1.title, p.article-image-caption {font-weight: 300}\ + div.column-1-3{margin-left: 19px;padding-right: 9px}\ + div.column-1-2 {display: inline;padding-right: 7px}\ + p.article-image-caption {font-size: 0.6em;margin-top: 5px}\ + p.article-image-caption, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {color: #616262}\ + p.article-image-caption .credits {font-style: italic}\ + div.article-image-caption {width: 246px;margin: 5px}\ + div.article-image-caption-2column {width: 373px}\ + div.article-image-caption-2column, div.article-image-caption-3column {margin-bottom: 5px}\ + img {border:0}\ + img, div.column-3 {padding:2px}\ + hr.merryhr {width:30%; border-width:0; margin-left:5px; background-color: #24763b}\ + div.column-3 {background-color:#eee; width:50%; margin:2px; float:right}\ + div.column-3 module-title {border: 1px solid #aaa}\ + div.article-box-fact div.subtitle, .article-box-fact.module-title, h2.subtitle {font-weight:bold}\ + div.article-box-fact div.subtitle, hr.merryhr, .article-box-fact.module-title {color: #24763b}' + preprocess_regexps = [ (re.compile(r']+top-line[^>]+>', re.DOTALL|re.IGNORECASE), lambda match: '
'), - (re.compile(r'(]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE), + (re.compile(r']+(metronieuws\.nl/[^>]+/templates/[^>]+jpe?g|metronieuws\.nl/internal\-roxen\-unit\.gif)[^>]+>', re.DOTALL|re.IGNORECASE), lambda match: ''), ] def preprocess_html(self, soup): - if SHOWDEBUG0 == True: - mlog.setdefaults() - mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)]) - if KEEPSTATS == True: - mlog.addDebug('Stats will be calculated') - else: - mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel]) - mlog.showDebug() myProcess = MerryProcess() myProcess.removeUnwantedTags(soup) return soup @@ -105,18 +95,6 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): def postprocess_html(self, soup, first): myProcess = MerryProcess() myProcess.optimizeLayout(soup) - if SHOWDEBUG0 == True: - if KEEPSTATS == True: - statinfo = 'generated stats:' - statinfo += str(mstat.stats(mstat.statslist)) - print statinfo - statinfo = 'generated stats (for removed tags):' - statinfo += str(mstat.stats(mstat.removedtagslist)) - print statinfo - #show all Debug info we forgot to report - #Using print to be sure that this text will not be added at the end of the log. - print '\n!!!!!unreported messages:\n(should be empty)\n' - mlog.showDebug() return soup feeds = [ @@ -142,44 +120,24 @@ class MerryPreProcess(): return soup def optimizePicture(self,soup): - if SHOWDEBUG0 == True: - mlog.addDebug('start image optimize') for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): iurl = tag['src'] img = Image() img.open(iurl) img.trim(0) img.save(iurl) - if SHOWDEBUG0 == True: - mlog.addDebug('Images optimized') - mlog.showDebug() return soup class MerryExtract(): def safeRemovePart(self, killingSoup, soupIsArray): if killingSoup and not killingSoup == None: - if SHOWDEBUG2 == True: - mlog.addTextAndTag(['items to remove'],[killingSoup]) try: if soupIsArray == True: for killer in killingSoup: killer.extract() else: killingSoup.extract() - if SHOWDEBUG1 == True: - mlog.addDebug('tag extracted') - mlog.showDebug() - if KEEPSTATS == True: - try: - mstat.addstat(mstat.removedtagslist,str(killingSoup.name)) - except: - mstat.addstat(mstat.removedtagslist,'unknown') except: - if SHOWDEBUG1 == True: - mlog.addDebug('tag extraction failed') - mlog.showDebug() - if KEEPSTATS == True: - mstat.addstat(mstat.removedtagslist,'exception') return False else: return False @@ -230,60 +188,26 @@ class MerryProcess(BeautifulSoup): def optimizeLayout(self,soup): self.myPrepare.optimizePicture(soup) - if SHOWDEBUG0 == True: - mlog.addDebug('End of Optimize Layout') - mlog.showDebug() return soup def insertFacts(self, soup): allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')}) - if SHOWDEBUG0 == True: - mlog.addTextAndTag(['allfacts'],[allfacts]) - mlog.showDebug() if allfacts and not allfacts == None: allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent - if SHOWDEBUG0 == True: - mlog.addTextAndTag(['allfactsparent'],[allfactsparent]) - mlog.showDebug() for part in allfactsparent: if not part in allfacts: - if SHOWDEBUG0 == True: - mlog.addTextAndTag(['FOUND A non-fact'],[part]) - mlog.showDebug() self.myKiller.safeRemovePart(part, True) - if SHOWDEBUG1 == True: - mlog.addTextAndTag(['New All Facts'],[allfacts]) - mlog.showDebug() articlefacts = soup.find('div', {'class':'article-box-fact column'}) - errorOccured=False if (articlefacts and not articlefacts==None): try: contenttag = soup.find('div', {'class':'article-body'}) - if SHOWDEBUG0 == True: - mlog.addTextAndTag(['curcontag'],[contenttag]) - mlog.showDebug() foundrighttag = False if contenttag and not contenttag == None: foundrighttag = True - if SHOWDEBUG0 == True: - if errorOccured == False: - mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag]) - else: - mlog.addDebug('Could not find right parent tag. Error Occured') - mlog.showDebug() if foundrighttag == True: contenttag.insert(0, allfactsparent) - if SHOWDEBUG2 == True: - mlog.addTextAndTag(['added parent'],[soup.prettify()]) - mlog.showDebug() except: - errorOccured=True - mlog.addTrace() - else: - errorOccured=True - if SHOWDEBUG0 == True and errorOccured == True: - mlog.addTextAndTag(['no articlefacts'],[articlefacts]) - mlog.showDebug() + pass return soup def previousNextSibRemover(self, soup, previous=True, soupIsArray=False): @@ -300,71 +224,38 @@ class MerryProcess(BeautifulSoup): sibs = findsibsof.nextSiblingGenerator() for sib in sibs: self.myKiller.safeRemovePart(sib, True) - else: - if SHOWDEBUG1 == True: - mlog.addDebug('Not any sib found') return def removeUnwantedTags(self,soup): - if SHOWDEBUG1 == True: - mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))]) - mlog.showDebug() self.removeTagsByName(soup) - if SHOWDEBUG1 == True: - mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup))) - mlog.showDebug() self.insertFacts(soup) self.removeFirstAndLastPart(soup) - if SHOWDEBUG1 == True: - mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup))) - mlog.showDebug() self.removeUnwantedParts(soup) - if SHOWDEBUG1 == True: - mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup))) - mlog.showDebug() self.removeEmptyTags(soup) - if SHOWDEBUG1 == True: - mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup))) - mlog.showDebug() self.myReplacer.replaceATag(soup) return soup def removeUnwantedParts(self, soup): - if SHOWDEBUG1 == True: - mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup))) - mlog.showDebug() self.removeUnwantedTagsByID(soup) - if SHOWDEBUG1 == True: - mlog.addDebug('Len of Soup before Class: %s' % len(str(soup))) - mlog.showDebug() self.removeUnwantedTagsByClass(soup) - if SHOWDEBUG1 == True: - mlog.addDebug('Len of Soup before Style: %s' % len(str(soup))) - mlog.showDebug() self.removeUnwantedTagsByStyle(soup) return soup def removeUnwantedTagsByStyle(self,soup): - self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")})) - if SHOWDEBUG0 == True: - mlog.addDebug('end remove by style') + self.removeArrayOfTags(soup.findAll(attrs={'style':re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")})) + self.removeArrayOfTags(soup.findAll(attrs={'title':'volledig scherm'})) return soup def removeArrayOfTags(self,souparray): return self.myKiller.safeRemovePart(souparray, True) def removeUnwantedTagsByClass(self,soup): - if SHOWDEBUG0 == True: - mlog.addDebug('start remove by class') - self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')})) + self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|share-tools-top|share-tools-bottom|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15|footer-[a-zA-Z0-9]+)$')})) return soup def removeUnwantedTagsByID(self,soup): - defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer'] + defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer','gallery-1'] for removeid in defaultids: - if SHOWDEBUG1 == True: - mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup)))) - mlog.showDebug() self.removeArrayOfTags(soup.findAll(id=removeid)) return soup @@ -380,33 +271,12 @@ class MerryProcess(BeautifulSoup): return soup def removeEmptyTags(self,soup,run=0): - if SHOWDEBUG0 == True: - mlog.addDebug('starting removeEmptyTags') - if SHOWDEBUG1 == True: - run += 1 - mlog.addDebug(run) - if SHOWDEBUG2 == True: - mlog.addDebug(str(soup.prettify())) - mlog.showDebug() emptymatches = re.compile('^( |\s|\n|\r|\t)*$') emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing) if emptytags and not (emptytags == None or emptytags == []): - if SHOWDEBUG1 == True: - mlog.addDebug('tags found') - mlog.addDebug(str(emptytags)) self.removeArrayOfTags(emptytags) #recursive in case removing empty tag creates new empty tag self.removeEmptyTags(soup, run=run) - else: - if SHOWDEBUG1 == True: - mlog.addDebug('no empty tags found') - mlog.showDebug() - if SHOWDEBUG0 == True: - if SHOWDEBUG2 == True: - mlog.addDebug('new soup:') - mlog.addDebug(str(soup.prettify())) - mlog.addDebug('RemoveEmptyTags Completed') - mlog.showDebug() return soup def removeFirstAndLastPart(self,soup): diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe index 8dc7008a68..c30f81c019 100644 --- a/recipes/metro_uk.recipe +++ b/recipes/metro_uk.recipe @@ -1,52 +1,30 @@ -import re from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1306097511(BasicNewsRecipe): title = u'Metro UK' description = 'News as provide by The Metro -UK' - + #timefmt = '' __author__ = 'Dave Asbury' - #last update 3/12/11 cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg' - no_stylesheets = True + #no_stylesheets = True oldest_article = 1 - max_articles_per_feed = 20 + max_articles_per_feed = 10 remove_empty_feeds = True remove_javascript = True + auto_cleanup = True - #preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')] - preprocess_regexps = [ - (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match: '

')] - preprocess_regexps = [ - (re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')] language = 'en_GB' - - masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif' - - keep_only_tags = [ - dict(name='h1'),dict(name='h2', attrs={'class':'h2'}), - dict(attrs={'class':['img-cnt figure']}), - dict(attrs={'class':['art-img']}), - dict(name='div', attrs={'class':'art-lft'}), - dict(name='p') + ] remove_tags = [ - dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}), - dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap', - 'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}), - dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']}) - ,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'}) + ] + + feeds = [ (u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')] - extra_css = ''' - body {font: sans-serif medium;}' - h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;} - h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; } - span{ font-size:9.5px; font-weight:bold;font-style:italic} - p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} - - ''' + body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} + ''' diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe index a2f759e878..07fc0da666 100644 --- a/recipes/national_geographic_pl.recipe +++ b/recipes/national_geographic_pl.recipe @@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class recipeMagic(BasicNewsRecipe): title = 'National Geographic PL' __author__ = 'Marcin Urban 2011' + __modified_by__ = 'fenuks' description = 'legenda wśród magazynów z historią sięgającą 120 lat' - cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg' + #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True @@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe): ] remove_attributes = ['width','height'] + feeds=[] - feeds = [ - ('National Geographic PL', 'http://www.national-geographic.pl/rss/'), - ] + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + tag=soup.find(attrs={'class':'arl'}) + art=tag.ul.findAll('li') + for i in art: + title=i.a['title'] + url=i.a['href'] + #date=soup.find(id='footer').ul.li.string[41:-1] + desc=i.div.p.string + articles.append({'title' : title, + 'url' : url, + 'date' : '', + 'description' : desc + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/'))) + feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/'))) + + return feeds def print_version(self, url): - return url.replace('artykuly0Cpokaz', 'drukuj-artykul') + if 'artykuly' in url: + return url.replace('artykuly/pokaz', 'drukuj-artykul') + elif 'aktualnosci' in url: + return url.replace('aktualnosci/pokaz', 'drukuj-artykul') + else: + return url + + def get_cover_url(self): + soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/') + tag=soup.find(attrs={'class':'txt jus'}) + self.cover_url=tag.img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index ec556da5fa..0371cb1f58 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe): title=soup.find(attrs={'class':'tytul'}) if title: title['style']='font-size: 20px; font-weight: bold;' - self.log.warn(soup) + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.INDEX + a['href'] return soup diff --git a/recipes/orlando_sentinel.recipe b/recipes/orlando_sentinel.recipe index 7a59f6f6ba..b327bc2b74 100644 --- a/recipes/orlando_sentinel.recipe +++ b/recipes/orlando_sentinel.recipe @@ -1,3 +1,4 @@ +import urllib, re from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1279258912(BasicNewsRecipe): @@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe): encoding = 'utf-8' conversion_options = {'linearize_tables':True} masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif' - keep_only_tags = [ - dict(name='div', attrs={'class':'story'}) - ] - remove_tags = [ - dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}), - ] - remove_tags_after = [ - dict(name='p', attrs={'class':'copyright'}), - ] + + auto_cleanup = True + + def get_article_url(self, article): + ans = None + try: + s = article.summary + ans = urllib.unquote( + re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) + except: + pass + if ans is None: + link = article.get('feedburner_origlink', None) + if link and link.split('/')[-1]=="story01.htm": + link=link.split('/')[-2] + encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&', + '0D': '?', '0E': '-', '0N': '.com', '0L': 'http:', + '0S':'//'} + for k, v in encoding.iteritems(): + link = link.replace(k, v) + ans = link + elif link: + ans = link + if ans is not None: + return ans.replace('?track=rss', '') + + diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe index 952db30c3e..56bb601f70 100644 --- a/recipes/pc_arena.recipe +++ b/recipes/pc_arena.recipe @@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe): description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.' category = 'IT' language = 'pl' + index='http://pcarena.pl' masthead_url='http://pcarena.pl/pcarena/img/logo.png' cover_url= 'http://pcarena.pl/pcarena/img/logo.png' no_stylesheets = True @@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe): if 'http' not in url: return 'http://pcarena.pl' + url else: - return url \ No newline at end of file + return url + + def preprocess_html(self, soup): + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + a['href']=self.index + a['href'] + return soup \ No newline at end of file diff --git a/recipes/readitlater.recipe b/recipes/readitlater.recipe index 38f7ec1a9a..92c9aaf9d6 100644 --- a/recipes/readitlater.recipe +++ b/recipes/readitlater.recipe @@ -1,5 +1,5 @@ """ -readitlaterlist.com +Pocket Calibre Recipe v1.0 """ __license__ = 'GPL v3' __copyright__ = ''' @@ -12,22 +12,23 @@ from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -class Readitlater(BasicNewsRecipe): - title = 'ReadItLater' +class Pocket(BasicNewsRecipe): + title = 'Pocket' __author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan' - description = '''Personalized news feeds. Go to readitlaterlist.com to setup \ - up your news. This version displays pages of articles from \ + description = '''Personalized news feeds. Go to getpocket.com to setup up \ + your news. This version displays pages of articles from \ oldest to newest, with max & minimum counts, and marks articles \ read after downloading.''' - publisher = 'readitlaterlist.com' + publisher = 'getpocket.com' category = 'news, custom' oldest_article = 7 max_articles_per_feed = 50 - minimum_articles = 1 + minimum_articles = 10 + mark_as_read_after_dl = True no_stylesheets = True use_embedded_content = False needs_subscription = True - INDEX = u'http://readitlaterlist.com' + INDEX = u'http://getpocket.com' LOGIN = INDEX + u'/l' readList = [] @@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe): br = self.get_browser() for link in markList: url = self.INDEX + link + print 'Marking read: ', url response = br.open(url) - response + print response.info() def cleanup(self): - self.mark_as_read(self.readList) + if self.mark_as_read_after_dl: + self.mark_as_read(self.readList) + else: + pass + def default_cover(self, cover_file): + ''' + Create a generic cover for recipes that don't have a cover + This override adds time to the cover + ''' + try: + from calibre.ebooks import calibre_cover + title = self.title if isinstance(self.title, unicode) else \ + self.title.decode('utf-8', 'replace') + date = strftime(self.timefmt) + time = strftime('[%I:%M %p]') + img_data = calibre_cover(title, date, time) + cover_file.write(img_data) + cover_file.flush() + except: + self.log.exception('Failed to generate default cover') + return False + return True diff --git a/recipes/swiat_obrazu.recipe b/recipes/swiat_obrazu.recipe new file mode 100644 index 0000000000..68740fa4dd --- /dev/null +++ b/recipes/swiat_obrazu.recipe @@ -0,0 +1,25 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Swiat_Obrazu(BasicNewsRecipe): + title = u'Swiat Obrazu' + __author__ = 'fenuks' + description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.' + category = 'photography' + masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg' + cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_javascript= True + use_embedded_content = False + feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')] + + def print_version(self, url): + return url + ',drukuj' + + def image_url_processor(self, baseurl, url): + if 'http://' not in url or 'https://' not in url: + return 'http://www.swiatobrazu.pl' + url[5:] + else: + return url diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe index 666cb8aa77..a615763307 100644 --- a/recipes/tanuki.recipe +++ b/recipes/tanuki.recipe @@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) + for a in soup('a'): + if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + if 'tanuki-anime' in soup.title.string.lower(): + a['href']='http://anime.tanuki.pl' + a['href'] + elif 'tanuki-manga' in soup.title.string.lower(): + a['href']='http://manga.tanuki.pl' + a['href'] + elif 'tanuki-czytelnia' in soup.title.string.lower(): + a['href']='http://czytelnia.tanuki.pl' + a['href'] return soup \ No newline at end of file diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index 9285c0b2c2..db74e003a0 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,9 +1,8 @@ -import re +import re, mechanize from calibre.web.feeds.recipes import BasicNewsRecipe class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'The Sun UK' - cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' description = 'A Recipe for The Sun tabloid UK' __author__ = 'Dave Asbury' @@ -24,37 +23,69 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): no_stylesheets = True extra_css = ''' - body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} - ''' + body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} + ''' preprocess_regexps = [ - (re.compile(r'