diff --git a/Changelog.yaml b/Changelog.yaml
index c986b51486..f71bdd5907 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,50 @@
 # new recipes:
 #   - title:
 
+- version: 0.8.62
+  date: 2012-07-27
+
+  new features:
+    - title: "Book details panel: Allow right clicking on a format to delete it."
+
+    - title: "When errors occur in lots of background jobs, add an option to the error message to temporarily suppress subsequent error messages."
+      tickets: [886904]
+
+    - title: "E-book viewer full screen mode: Allow clicking in the left and right page margins to turn pages."
+      tickets: [1024819]
+
+    - title: "Drivers for various Android devices"
+      tickets: [1028690,1027431]
+
+    - title: "Advanced search dialog: When starting on the title/author/etc. tab, restore the previously used search kind as well."
+      tickets: [1029745]
+
+    - title: "When presenting the calibre must be restarted warning after installing a new plugin, add a restart now button so that the user can conveniently restart calibre. Currently only works when going via Preferences->Plugins->Get new plugins"
+
+  bug fixes:
+    - title: "Fix main window layout state being saved incorrectly if calibre is killed without a proper shutdown"
+
+    - title: "Fix boolean and date searching in non-English calibre installs."
+
+    - title: "Conversion: Ignore invalid chapter detection and level n ToC expressions instead of erroring out"
+
+  improved recipes:
+    - Psychology Today
+    - The Smithsonian
+    - The New Republic
+    - Various updated Polish news sources
+    - The Sun
+    - San Francisco Bay Guardian
+    - AnandTech
+    - Smashing Magazine
+
+  new recipes:
+    - title: Linux Journal and Conowego.pl
+      author: fenuks
+
+    - title: A List Apart and .net magazine
+      author: Marc Busque
+
 - version: 0.8.61
   date: 2012-07-20
 
diff --git a/manual/develop.rst b/manual/develop.rst
old mode 100755
new mode 100644
index 8cc36d6b58..3a9488ccf5
--- a/manual/develop.rst
+++ b/manual/develop.rst
@@ -151,25 +151,20 @@ calibre is the directory that contains the src and resources sub-directories. En
 The next step is to create a bash script that will set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory when running calibre in debug mode.
 
-Create a plain text file:
+Create a plain text file::
 
     #!/bin/sh
    export CALIBRE_DEVELOP_FROM="/Users/kovid/work/calibre/src"
    calibre-debug -g
 
-Save this file as ``/usr/bin/calibre-develop``, then set its permissions so that it can be run:
+Save this file as ``/usr/bin/calibre-develop``, then set its permissions so that it can be executed::
 
    chmod +x /usr/bin/calibre-develop
 
-Once you have done this, type
+Once you have done this, type::
 
    calibre-develop
 
-You should see some diagnostic information in the Terminal window as calibre starts up, and you should see an asterisk after the version number in the GUI window, indicating that you are running from source.
-
-That's it! You are now ready to start hacking on the |app| code. For example, open the file :file:`src/calibre/__init__.py`
-in your favorite editor and add the line::
-
-    print ("Hello, world!")
-
-near the top of the file. Now run the command :command:`calibredb`. The very first line of output should be ``Hello, world!``.
+You should see some diagnostic information in the Terminal window as calibre
+starts up, and you should see an asterisk after the version number in the GUI
+window, indicating that you are running from source.
 Linux development environment
 ------------------------------
diff --git a/recipes/anandtech.recipe b/recipes/anandtech.recipe
index aa10084070..ff08c828ac 100644
--- a/recipes/anandtech.recipe
+++ b/recipes/anandtech.recipe
@@ -21,8 +21,12 @@ class anan(BasicNewsRecipe):
     remove_javascript = True
     encoding = 'utf-8'
-    remove_tags=[dict(name='a', attrs={'style':'width:110px; margin-top:0px;text-align:center;'}),
-        dict(name='a', attrs={'style':'width:110px; margin-top:0px; margin-right:20px;text-align:center;'})]
+    remove_tags=[
+        dict(name='a', attrs={'style':'width:110px; margin-top:0px;text-align:center;'}),
+        dict(name='a', attrs={'style':'width:110px; margin-top:0px; margin-right:20px;text-align:center;'}),
+        {'attrs':{'class':['article_links', 'header', 'body_right']}},
+        {'id':['crumbs']},
+        ]
 
     feeds = [ ('Anandtech', 'http://www.anandtech.com/rss/')]
diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe
index 00eea1be68..9544abdfcf 100644
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -1,6 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
-class Benchmark_pl(BasicNewsRecipe):
+class BenchmarkPl(BasicNewsRecipe):
     title = u'Benchmark.pl'
     __author__ = 'fenuks'
     description = u'benchmark.pl -IT site'
@@ -14,7 +14,7 @@ class Benchmark_pl(BasicNewsRecipe):
     preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe new file mode 100755 index 0000000000..8b4288ddcd --- /dev/null +++ b/recipes/conowego_pl.recipe @@ -0,0 +1,38 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup +class CoNowegoPl(BasicNewsRecipe): + title = u'conowego.pl' + __author__ = 'fenuks' + description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !' + cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png' + category = 'IT, news' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(name='div', attrs={'class':'news_list single_view'})] + remove_tags = [dict(name='div', attrs={'class':['ni_bottom', 'ni_rank', 'ni_date']})] + feeds = [(u'Aktualno\u015bci', u'http://www.conowego.pl/rss/aktualnosci-5/?type=100'), (u'Gaming', u'http://www.conowego.pl/rss/gaming-6/?type=100'), (u'Porady', u'http://www.conowego.pl/rss/porady-3/?type=100'), (u'Testy', u'http://www.conowego.pl/rss/testy-2/?type=100')] + + def preprocess_html(self, soup): + for i in soup.findAll('img'): + i.parent.insert(0, BeautifulSoup('
')) + i.insert(len(i), BeautifulSoup('
')) + self.append_page(soup, soup.body) + return soup + + + def append_page(self, soup, appendtag): + tag = appendtag.find('div', attrs={'class':'pages'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[:-1]: + soup2 = self.index_to_soup('http://www.conowego.pl/' + nexturl['href']) + pagetext = soup2.find(attrs={'class':'ni_content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}): + r.extract() diff --git a/recipes/dot_net.recipe b/recipes/dot_net.recipe new file mode 100644 index 0000000000..50db71e9be --- /dev/null +++ b/recipes/dot_net.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class NetMagazineRecipe (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'.net magazine' + description = u'net is the world’s best-selling magazine for web designers and developers, featuring tutorials from leading agencies, interviews with the web’s biggest names, and agenda-setting features on the hottest issues affecting the internet today.' + language = 'en' + tags = 'web development, software' + oldest_article = 7 + remove_empty_feeds = True + no_stylesheets = True + cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png' + keep_only_tags = [ + dict(name='article', attrs={'class': re.compile('^node.*$', re.IGNORECASE)}) + ] + remove_tags = [ + dict(name='span', attrs={'class': 'comment-count'}), + dict(name='div', attrs={'class': 'item-list share-links'}), + dict(name='footer'), + ] + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'style'] + extra_css = 'img {max-width: 100%; display: block; margin: auto;} .captioned-image div {text-align: center; font-style: italic;}' + + feeds = [ + (u'.net', u'http://feeds.feedburner.com/net/topstories'), + ] diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 2a6e00d501..ba34c9ff63 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -1,6 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe - -class Filmweb_pl(BasicNewsRecipe): +import re +from calibre.ebooks.BeautifulSoup import BeautifulSoup +class FilmWebPl(BasicNewsRecipe): title = u'FilmWeb' __author__ = 'fenuks' description = 'FilmWeb - biggest polish movie site' @@ -12,8 +13,9 @@ class Filmweb_pl(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets= True remove_empty_feeds=True + preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' - remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] + remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), (u'News / Filmy w produkcji', 
'http://www.filmweb.pl/feed/news/category/filminproduction'), @@ -31,18 +33,22 @@ class Filmweb_pl(BasicNewsRecipe): (u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'), (u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), - (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')] + (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest') + ] - def skip_ad_pages(self, soup): + def skip_ad_pages(self, soup): skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'}) if skip_tag is not None: - self.log.warn('skip_tag') - self.log.warn(skip_tag) return self.index_to_soup(skip_tag['href'], raw=True) - + def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + for i in soup.findAll('a', attrs={'class':'fn'}): + i.insert(len(i), BeautifulSoup('
')) + for i in soup.findAll('sup'): + if not i.string or i.string.startswith('(kliknij'): + i.extract() + return soup diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index e188e4988c..fce9674081 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -1,6 +1,6 @@ from calibre.web.feeds.recipes import BasicNewsRecipe -class Gry_online_pl(BasicNewsRecipe): +class GryOnlinePl(BasicNewsRecipe): title = u'Gry-Online.pl' __author__ = 'fenuks' description = 'Gry-Online.pl - computer games' @@ -21,17 +21,18 @@ class Gry_online_pl(BasicNewsRecipe): tag = appendtag.find('div', attrs={'class':'n5p'}) if tag: nexturls=tag.findAll('a') - for nexturl in nexturls[1:]: - try: - soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) - except: - soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + url_part = soup.find('link', attrs={'rel':'canonical'})['href'] + url_part = url_part[25:].rpartition('?')[0] + for nexturl in nexturls[1:-1]: + soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href']) pagetext = soup2.find(attrs={'class':'gc660'}) for r in pagetext.findAll(name='header'): r.extract() + for r in pagetext.findAll(attrs={'itemprop':'description'}): + r.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}): r.extract() diff --git a/recipes/icons/conowego_pl.png b/recipes/icons/conowego_pl.png new file mode 100644 index 0000000000..3bc8f2c672 Binary files /dev/null and b/recipes/icons/conowego_pl.png differ diff --git a/recipes/icons/linux_journal.png b/recipes/icons/linux_journal.png new file mode 100644 index 0000000000..ed0092bd1d Binary files /dev/null and b/recipes/icons/linux_journal.png differ diff --git a/recipes/linux_journal.recipe b/recipes/linux_journal.recipe new file mode 100755 index 0000000000..99b1a570dc --- /dev/null +++ b/recipes/linux_journal.recipe @@ -0,0 +1,36 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class LinuxJournal(BasicNewsRecipe): + title = u'Linux Journal' + __author__ = 'fenuks' + description = u'The monthly magazine of the Linux community, promoting the use of Linux worldwide.' 
+ cover_url = 'http://www.linuxjournal.com/files/linuxjournal.com/ufiles/logo-lj.jpg' + category = 'IT, Linux' + language = 'en' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + keep_only_tags=[dict(id='content-inner')] + remove_tags_after= dict(attrs={'class':'user-signature clear-block'}) + remove_tags=[dict(attrs={'class':['user-signature clear-block', 'breadcrumb', 'terms terms-inline']})] + feeds = [(u'Front Page', u'http://feeds.feedburner.com/linuxjournalcom'), (u'News', u'http://feeds.feedburner.com/LinuxJournal-BreakingNews'), (u'Blogs', u'http://www.linuxjournal.com/blog/feed'), (u'Audio/Video', u'http://www.linuxjournal.com/taxonomy/term/28/0/feed'), (u'Community', u'http://www.linuxjournal.com/taxonomy/term/18/0/feed'), (u'Education', u'http://www.linuxjournal.com/taxonomy/term/25/0/feed'), (u'Embedded', u'http://www.linuxjournal.com/taxonomy/term/27/0/feed'), (u'Hardware', u'http://www.linuxjournal.com/taxonomy/term/23/0/feed'), (u'HOWTOs', u'http://www.linuxjournal.com/taxonomy/term/19/0/feed'), (u'International', u'http://www.linuxjournal.com/taxonomy/term/30/0/feed'), (u'Security', u'http://www.linuxjournal.com/taxonomy/term/31/0/feed'), (u'Software', u'http://www.linuxjournal.com/taxonomy/term/17/0/feed'), (u'Sysadmin', u'http://www.linuxjournal.com/taxonomy/term/21/0/feed'), (u'Webmaster', u'http://www.linuxjournal.com/taxonomy/term/24/0/feed')] + + def append_page(self, soup, appendtag): + next = appendtag.find('li', attrs={'class':'pager-next'}) + while next: + nexturl = next.a['href'] + appendtag.find('div', attrs={'class':'links'}).extract() + soup2 = self.index_to_soup('http://www.linuxjournal.com'+ nexturl) + pagetext = soup2.find(attrs={'class':'node-inner'}).find(attrs={'class':'content'}) + next = appendtag.find('li', attrs={'class':'pager-next'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = appendtag.find('div', attrs={'class':'links'}) + if tag: + tag.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/list_apart.recipe b/recipes/list_apart.recipe new file mode 100644 index 0000000000..35cbaad958 --- /dev/null +++ b/recipes/list_apart.recipe @@ -0,0 +1,33 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from calibre.web.feeds.news import BasicNewsRecipe + +class AListApart (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'A List Apart' + description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.' 
+ language = 'en' + tags = 'web development, software' + oldest_article = 120 + remove_empty_feeds = True + no_stylesheets = True + encoding = 'utf8' + cover_url = u'http://alistapart.com/pix/alalogo.gif' + keep_only_tags = [ + dict(name='div', attrs={'id': 'content'}) + ] + remove_tags = [ + dict(name='ul', attrs={'id': 'metastuff'}), + dict(name='div', attrs={'class': 'discuss'}), + dict(name='div', attrs={'class': 'discuss'}), + dict(name='div', attrs={'id': 'learnmore'}), + ] + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] + extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}' + + feeds = [ + (u'A List Apart', u'http://www.alistapart.com/site/rss'), + ] diff --git a/recipes/natemat_pl.recipe b/recipes/natemat_pl.recipe index faa1b341a0..d6db93dad7 100644 --- a/recipes/natemat_pl.recipe +++ b/recipes/natemat_pl.recipe @@ -1,3 +1,4 @@ +import re from calibre.web.feeds.news import BasicNewsRecipe class NaTemat(BasicNewsRecipe): @@ -8,8 +9,9 @@ class NaTemat(BasicNewsRecipe): description = u'informacje, komentarze, opinie' category = 'news' language = 'pl' + preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj więcej\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?', re.IGNORECASE), lambda m: '')] cover_url= 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' no_stylesheets = True keep_only_tags= [dict(id='main')] - remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related']})] + remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related', 'user-header', 'links']}), dict(name='img', attrs={'class':'indent'})] feeds = [(u'Artyku\u0142y', u'http://natemat.pl/rss/wszystkie')] diff --git a/recipes/psych.recipe b/recipes/psych.recipe index 3fc940b4a2..a21acefe30 100644 --- a/recipes/psych.recipe +++ b/recipes/psych.recipe @@ -1,44 +1,79 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ptempfile import PersistentTemporaryFile -from calibre.web.feeds.news import BasicNewsRecipe -class AdvancedUserRecipe1275708473(BasicNewsRecipe): - title = u'Psychology Today' - _author__ = 'rty' - publisher = u'www.psychologytoday.com' - category = u'Psychology' - max_articles_per_feed = 100 - remove_javascript = True - use_embedded_content = False - no_stylesheets = True +class PsychologyToday(BasicNewsRecipe): + + title = 'Psychology Today' + __author__ = 'Rick Shang' + + description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.' 
language = 'en' - temp_files = [] - articles_are_obfuscated = True - remove_tags = [ - dict(name='div', attrs={'class':['print-source_url','field-items','print-footer']}), - dict(name='span', attrs={'class':'print-footnote'}), - ] - remove_tags_before = dict(name='h1', attrs={'class':'print-title'}) - remove_tags_after = dict(name='div', attrs={'class':['field-items','print-footer']}) + category = 'news' + encoding = 'UTF-8' + keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})] + no_javascript = True + no_stylesheets = True - feeds = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')] - def get_article_url(self, article): - return article.get('link', None) + def parse_index(self): + articles = [] + soup = self.index_to_soup('http://www.psychologytoday.com/magazine') + + + #Go to the main body + div = soup.find('div',attrs={'id':'content-content'}) + #Find cover & date + cover_item = div.find('div', attrs={'class':'collections-header-image'}) + cover = cover_item.find('img',src=True) + self.cover_url = cover['src'] + date = self.tag_to_string(cover['title']) + self.timefmt = u' [%s]'%date + + articles = [] + for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip()) + title = title + u' (%s)'%author + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip() + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + description = post.find('div', attrs={'class':'collection-node-description'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(description.nextSibling).strip()) + desc = self.tag_to_string(description).strip() + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + title = title + u' (%s)'%author + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + for post in div.findAll('li', attrs={'class':['collection-item-list-odd','collection-item-list-even']}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip()) + title = title + u' (%s)'%author + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + desc = self.tag_to_string(post.find('div', 
attrs={'class':'collection-node-description'})).strip() + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + return [('Current Issue', articles)] - def get_obfuscated_article(self, url): - br = self.get_browser() - br.open(url) - response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0) - html = response.read() - self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write(html) - self.temp_files[-1].close() - return self.temp_files[-1].name - def get_cover_url(self): - index = 'http://www.psychologytoday.com/magazine/' - soup = self.index_to_soup(index) - for image in soup.findAll('img',{ "class" : "imagefield imagefield-field_magazine_cover" }): - return image['src'] + '.jpg' - return None diff --git a/recipes/sfbg.recipe b/recipes/sfbg.recipe index 0735e760c6..5c77c96f74 100644 --- a/recipes/sfbg.recipe +++ b/recipes/sfbg.recipe @@ -1,25 +1,35 @@ from calibre.web.feeds.news import BasicNewsRecipe class SanFranciscoBayGuardian(BasicNewsRecipe): - title = u'San Francisco Bay Guardian' - language = 'en' - __author__ = 'Krittika Goyal' + title = u'San Francisco Bay Guardian' + language = 'en' + __author__ = 'Krittika Goyal' oldest_article = 31 #days max_articles_per_feed = 25 + #encoding = 'latin1' no_stylesheets = True + #remove_tags_before = dict(name='div', attrs={'id':'story_header'}) + #remove_tags_after = dict(name='div', attrs={'id':'shirttail'}) remove_tags = [ - dict(name='iframe'), + dict(name='iframe'), + #dict(name='div', attrs={'class':'related-articles'}), + #dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}), + #dict(name='ul', attrs={'class':'article-tools'}), + #dict(name='ul', attrs={'id':'story_tabs'}), ] feeds = [ ('sfbg', 'http://www.sfbg.com/rss.xml'), - ('politics', 'http://www.sfbg.com/politics/rss.xml'), - ('blogs', 'http://www.sfbg.com/blog/rss.xml'), - ('pixel_vision', 'http://www.sfbg.com/pixel_vision/rss.xml'), - ('bruce', 'http://www.sfbg.com/bruce/rss.xml'), ] - + #def preprocess_html(self, soup): + #story = soup.find(name='div', attrs={'id':'story_body'}) + #td = heading.findParent(name='td') + #td.extract() + #soup = BeautifulSoup('t') + #body = soup.find(name='body') + #body.insert(0, story) + #return soup diff --git a/recipes/smashing.recipe b/recipes/smashing.recipe index 04436a05ef..bc24166275 100644 --- a/recipes/smashing.recipe +++ b/recipes/smashing.recipe @@ -1,50 +1,24 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' -''' -www.smashingmagazine.com -''' - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from calibre.web.feeds.news import BasicNewsRecipe -class SmashingMagazine(BasicNewsRecipe): - title = 'Smashing Magazine' - __author__ = 'Darko Miletic' - description = 'We smash you with the information that will make your life easier, really' - oldest_article = 20 - language = 'en' - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - publisher = 'Smashing Magazine' - category = 'news, web, IT, css, javascript, html' - encoding = 'utf-8' +class SmashingMagazine (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0.1' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'Smashing Magazine' + description = u'Founded in September 2006, Smashing Magazine delivers useful and innovative information to Web designers and 
developers. Our aim is to inform our readers about the latest trends and techniques in Web development. We try to persuade you not with the quantity but with the quality of the information we present. Smashing Magazine is and always has been independent.' + language = 'en' + tags = 'web development, software' + oldest_article = 7 + remove_empty_feeds = True + no_stylesheets = True + encoding = 'utf8' + cover_url = u'http://media.smashingmagazine.com/themes/smashingv4/images/logo.png' + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'style'] + extra_css = u'body div table:first-child {display: none;} img {max-width: 100%; display: block; margin: auto;}' - conversion_options = { - 'comments' : description - ,'tags' : category - ,'publisher' : publisher - } - - keep_only_tags = [dict(name='div', attrs={'id':'leftcolumn'})] - remove_tags_after = dict(name='ul',attrs={'class':'social'}) - remove_tags = [ - dict(name=['link','object']) - ,dict(name='h1',attrs={'class':'logo'}) - ,dict(name='div',attrs={'id':'booklogosec'}) - ,dict(attrs={'src':'http://media2.smashingmagazine.com/wp-content/uploads/images/the-smashing-book/smbook6.gif'}) - ] - - feeds = [(u'Articles', u'http://rss1.smashingmagazine.com/feed/')] - - def preprocess_html(self, soup): - for iter in soup.findAll('div',attrs={'class':'leftframe'}): - it = iter.find('h1') - if it == None: - iter.extract() - for item in soup.findAll('img'): - oldParent = item.parent - if oldParent.name == 'a': - oldParent.name = 'div' - return soup + feeds = [ + (u'Smashing Magazine', u'http://rss1.smashingmagazine.com/feed/'), + ] diff --git a/recipes/smith.recipe b/recipes/smith.recipe index 8bf60a227a..3d6a95c494 100644 --- a/recipes/smith.recipe +++ b/recipes/smith.recipe @@ -1,61 +1,67 @@ import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.web.feeds.recipes import BasicNewsRecipe +from collections import OrderedDict -class SmithsonianMagazine(BasicNewsRecipe): - title = u'Smithsonian Magazine' - language = 'en' - __author__ = 'Krittika Goyal and TerminalVeracity' - oldest_article = 31#days - max_articles_per_feed = 50 - use_embedded_content = False - recursions = 1 - cover_url = 'http://sphotos.xx.fbcdn.net/hphotos-snc7/431147_10150602715983253_764313347_n.jpg' - match_regexps = ['&page=[2-9]$'] - preprocess_regexps = [ - (re.compile(r'for more of Smithsonian\'s coverage on history, science and nature.', re.DOTALL), lambda m: '') - ] - extra_css = """ - h1{font-size: large; margin: .2em 0} - h2{font-size: medium; margin: .2em 0} - h3{font-size: medium; margin: .2em 0} - #byLine{margin: .2em 0} - .articleImageCaptionwide{font-style: italic} - .wp-caption-text{font-style: italic} - img{display: block} - """ +class Smithsonian(BasicNewsRecipe): + title = 'Smithsonian Magazine' + __author__ = 'Rick Shang' - remove_stylesheets = True - remove_tags_after = dict(name='div', attrs={'class':['post','articlePaginationWrapper']}) - remove_tags = [ - dict(name='iframe'), - dict(name='div', attrs={'class':['article_sidebar_border','viewMorePhotos','addtoany_share_save_container','meta','social','OUTBRAIN','related-articles-inpage']}), - dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large','comment_section','article-related']}), - dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}), - dict(name='h4', attrs={'id':'related-topics'}), - 
dict(name='table'), - dict(name='a', attrs={'href':['/subArticleBottomWeb','/subArticleTopWeb','/subArticleTopMag','/subArticleBottomMag']}), - dict(name='a', attrs={'name':'comments_shaded'}), - ] + description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.' + language = 'en' + category = 'news' + encoding = 'UTF-8' + keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})] + remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})] + no_javascript = True + no_stylesheets = True + def parse_index(self): + #Go to the issue + soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/') + div = soup0.find('div',attrs={'id':'archives'}) + issue = div.find('ul',attrs={'class':'clear-both'}) + current_issue_url = issue.find('a', href=True)['href'] + soup = self.index_to_soup(current_issue_url) - feeds = [ -('History and Archeology', - 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'), -('People and Places', - 'http://feeds.feedburner.com/smithsonianmag/people-places'), -('Science and Nature', - 'http://feeds.feedburner.com/smithsonianmag/science-nature'), -('Arts and Culture', - 'http://feeds.feedburner.com/smithsonianmag/arts-culture'), -('Travel', - 'http://feeds.feedburner.com/smithsonianmag/travel'), -] + #Go to the main body + div = soup.find ('div', attrs={'id':'content-inset'}) + + #Find date + date = re.sub('.*\:\W*', "", self.tag_to_string(div.find('h2')).strip()) + self.timefmt = u' [%s]'%date + + #Find cover + self.cover_url = div.find('img',src=True)['src'] + + feeds = OrderedDict() + section_title = '' + subsection_title = '' + for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}): + articles = [] + prefix = '' + h3=post.find('h3') + if h3 is not None: + section_title = self.tag_to_string(h3) + else: + subsection=post.find('p',attrs={'class':'article-cat'}) + link=post.find('a',href=True) + url=link['href']+'?c=y&story=fullstory' + if subsection is not None: + subsection_title = self.tag_to_string(subsection) + prefix = (subsection_title+': ') + description=self.tag_to_string(post('p', limit=2)[1]).strip() + else: + description=self.tag_to_string(post.find('p')).strip() + desc=re.sub('\sBy\s.*', '', description, re.DOTALL) + author=re.sub('.*By\s', '', description, re.DOTALL) + title=prefix + self.tag_to_string(link).strip()+ u' (%s)'%author + articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) + + if articles: + if section_title not in feeds: + feeds[section_title] = [] + feeds[section_title] += articles + ans = [(key, val) for key, val in feeds.iteritems()] + return ans - def preprocess_html(self, soup): - story = soup.find(name='div', attrs={'id':'article-body'}) - soup = BeautifulSoup('t') - body = soup.find(name='body') - body.insert(0, story) - return soup diff --git a/recipes/the_new_republic.recipe b/recipes/the_new_republic.recipe index 59ccef3607..649a8c46f3 100644 --- a/recipes/the_new_republic.recipe +++ b/recipes/the_new_republic.recipe @@ -1,45 +1,64 @@ -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.recipes 
import BasicNewsRecipe +from collections import OrderedDict -class The_New_Republic(BasicNewsRecipe): - title = 'The New Republic' - __author__ = 'cix3' +class TNR(BasicNewsRecipe): + + title = 'The New Republic' + __author__ = 'Rick Shang' + + description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.' language = 'en' - description = 'Intelligent, stimulating and rigorous examination of American politics, foreign policy and culture' - timefmt = ' [%b %d, %Y]' - - oldest_article = 7 - max_articles_per_feed = 100 + category = 'news' + encoding = 'UTF-8' + remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})] + no_javascript = True no_stylesheets = True - remove_tags = [ - dict(name='div', attrs={'class':['print-logo', 'print-site_name', 'img-left', 'print-source_url']}), - dict(name='hr', attrs={'class':'print-hr'}), dict(name='img') - ] - feeds = [ - ('Politics', 'http://www.tnr.com/rss/articles/Politics'), - ('Books and Arts', 'http://www.tnr.com/rss/articles/Books-and-Arts'), - ('Economy', 'http://www.tnr.com/rss/articles/Economy'), - ('Environment and Energy', 'http://www.tnr.com/rss/articles/Environment-%2526-Energy'), - ('Health Care', 'http://www.tnr.com/rss/articles/Health-Care'), - ('Metro Policy', 'http://www.tnr.com/rss/articles/Metro-Policy'), - ('World', 'http://www.tnr.com/rss/articles/World'), - ('Film', 'http://www.tnr.com/rss/articles/Film'), - ('Books', 'http://www.tnr.com/rss/articles/books'), - ('The Book', 'http://www.tnr.com/rss/book'), - ('Jonathan Chait', 'http://www.tnr.com/rss/blogs/Jonathan-Chait'), - ('The Plank', 'http://www.tnr.com/rss/blogs/The-Plank'), - ('The Treatment', 'http://www.tnr.com/rss/blogs/The-Treatment'), - ('The Spine', 'http://www.tnr.com/rss/blogs/The-Spine'), - ('The Vine', 'http://www.tnr.com/rss/blogs/The-Vine'), - ('The Avenue', 'http://www.tnr.com/rss/blogs/The-Avenue'), - ('William Galston', 'http://www.tnr.com/rss/blogs/William-Galston'), - ('Simon Johnson', 'http://www.tnr.com/rss/blogs/Simon-Johnson'), - ('Ed Kilgore', 'http://www.tnr.com/rss/blogs/Ed-Kilgore'), - ('Damon Linker', 'http://www.tnr.com/rss/blogs/Damon-Linker'), - ('John McWhorter', 'http://www.tnr.com/rss/blogs/John-McWhorter') - ] + def parse_index(self): - def print_version(self, url): - return url.replace('http://www.tnr.com/', 'http://www.tnr.com/print/') + #Go to the issue + soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues') + issue = soup0.find('div',attrs={'id':'current_issue'}) + #Find date + date = self.tag_to_string(issue.find('div',attrs={'class':'date'})).strip() + self.timefmt = u' [%s]'%date + + #Go to the main body + current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href'] + soup = self.index_to_soup(current_issue_url) + div = soup.find ('div', attrs={'class':'article_detail_body'}) + + + + #Find cover + self.cover_url = div.find('img',src=True)['src'] + + feeds = OrderedDict() + section_title = '' + subsection_title = '' + for post in div.findAll('p'): + articles = [] + em=post.find('em') + b=post.find('b') + a=post.find('a',href=True) + if em is not None: + section_title = self.tag_to_string(em).strip() + subsection_title = '' + elif b is not None: + subsection_title=self.tag_to_string(b).strip() + elif a is not None: + prefix = (subsection_title+': ') if 
subsection_title else '' + url=re.sub('www.tnr.com','www.tnr.com/print', a['href']) + author=re.sub('.*by\s', '', self.tag_to_string(post), re.DOTALL) + title=prefix + self.tag_to_string(a).strip()+ u' (%s)'%author + articles.append({'title':title, 'url':url, 'description':'', 'date':''}) + + if articles: + if section_title not in feeds: + feeds[section_title] = [] + feeds[section_title] += articles + ans = [(key, val) for key, val in feeds.iteritems()] + return ans diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index ae7c599328..d93ac2c49b 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,4 +1,4 @@ -import re, random +import random from calibre import browser from calibre.web.feeds.recipes import BasicNewsRecipe @@ -8,45 +8,43 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'The Sun UK' description = 'Articles from The Sun tabloid UK' __author__ = 'Dave Asbury' - # last updated 15/7/12 + # last updated 25/7/12 language = 'en_GB' oldest_article = 1 - max_articles_per_feed = 15 + max_articles_per_feed = 12 remove_empty_feeds = True no_stylesheets = True masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif' encoding = 'UTF-8' - - remove_empty_feeds = True remove_javascript = True no_stylesheets = True + + + #preprocess_regexps = [ + # (re.compile(r'