diff --git a/Changelog.yaml b/Changelog.yaml index c986b51486..f71bdd5907 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,50 @@ # new recipes: # - title: +- version: 0.8.62 + date: 2012-07-27 + + new features: + - title: "Book details panel: Allow right clicking on a format to delete it." + + - title: "When errors occur in lots of background jobs, add an option to the error message to temporarily suppress subsequent error messages." + tickets: [886904] + + - title: "E-book viewer full screen mode: Allow clicking in the left and right page margins to turn pages." + tickets: [1024819] + + - title: "Drivers for various Android devices" + tickets: [1028690,1027431] + + - title: "Advanced search dialog: When starting on the title/author/etc. tab, restore the previously used search kind as well." + tickets: [1029745] + + - title: "When presenting the calibre must be restarted warning after installing a new plugin, add a restart now button so that the user can conveniently restart calibre. Currently only works when going via Preferences->Plugins->Get new plugins" + + bug fixes: + - title: "Fix main window layout state being saved incorrectly if calibre is killed without a proper shutdown" + + - title: "Fix boolean and date searching in non english calibre installs." 
+ + - title: "Conversion: Ignore invalid chapter detection and level n ToC expressions instead of erroring out" + + improved recipes: + - Psychology Today + - The Smithsonian + - The New Republic + - Various updated Polish news sources + - The Sun + - San Francisco Bay Guardian + - AnandTech + - Smashing Magazine + + new recipes: + - title: Linux Journal and Conowego.pl + author: fenuks + + - title: A list apart and .net magazine + author: Marc Busque + - version: 0.8.61 date: 2012-07-20 diff --git a/manual/conversion.rst b/manual/conversion.rst index 5eaca5a469..a4ecd902cc 100644 --- a/manual/conversion.rst +++ b/manual/conversion.rst @@ -710,3 +710,31 @@ EPUB from the ZIP file are:: Note that because this file explores the potential of EPUB, most of the advanced formatting is not going to work on readers less capable than |app|'s built-in EPUB viewer. + +Convert ODT documents +~~~~~~~~~~~~~~~~~~~~~ + +|app| can directly convert ODT (OpenDocument Text) files. You should use styles to format your document and minimize the use of direct formatting. +When inserting images into your document you need to anchor them to the paragraph, images anchored to a page will all end up in the front of the conversion. + +To enable automatic detection of chapters, you need to mark them with the built-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6' ('Heading 1' equates to the HTML tag
', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
remove_tags_after=dict(name='div', attrs={'class':'body'})
- remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
+ remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
INDEX= 'http://www.benchmark.pl'
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe
new file mode 100755
index 0000000000..8b4288ddcd
--- /dev/null
+++ b/recipes/conowego_pl.recipe
@@ -0,0 +1,38 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+class CoNowegoPl(BasicNewsRecipe):
+ title = u'conowego.pl'
+ __author__ = 'fenuks'
+ description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !'
+ cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png'
+ category = 'IT, news'
+ language = 'pl'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_empty_feeds = True
+ use_embedded_content = False
+ keep_only_tags = [dict(name='div', attrs={'class':'news_list single_view'})]
+ remove_tags = [dict(name='div', attrs={'class':['ni_bottom', 'ni_rank', 'ni_date']})]
+ feeds = [(u'Aktualno\u015bci', u'http://www.conowego.pl/rss/aktualnosci-5/?type=100'), (u'Gaming', u'http://www.conowego.pl/rss/gaming-6/?type=100'), (u'Porady', u'http://www.conowego.pl/rss/porady-3/?type=100'), (u'Testy', u'http://www.conowego.pl/rss/testy-2/?type=100')]
+
+ def preprocess_html(self, soup):
+ for i in soup.findAll('img'):
+ i.parent.insert(0, BeautifulSoup('
'))
+ i.insert(len(i), BeautifulSoup('
'))
+ self.append_page(soup, soup.body)
+ return soup
+
+
+ def append_page(self, soup, appendtag):
+ tag = appendtag.find('div', attrs={'class':'pages'})
+ if tag:
+ nexturls=tag.findAll('a')
+ for nexturl in nexturls[:-1]:
+ soup2 = self.index_to_soup('http://www.conowego.pl/' + nexturl['href'])
+ pagetext = soup2.find(attrs={'class':'ni_content'})
+ pos = len(appendtag.contents)
+ appendtag.insert(pos, pagetext)
+
+ for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}):
+ r.extract()
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index 2a6e00d501..ba34c9ff63 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -1,6 +1,7 @@
from calibre.web.feeds.news import BasicNewsRecipe
-
-class Filmweb_pl(BasicNewsRecipe):
+import re
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+class FilmWebPl(BasicNewsRecipe):
title = u'FilmWeb'
__author__ = 'fenuks'
description = 'FilmWeb - biggest polish movie site'
@@ -12,8 +13,9 @@ class Filmweb_pl(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets= True
remove_empty_feeds=True
+ preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
- remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
+ remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
@@ -31,18 +33,22 @@ class Filmweb_pl(BasicNewsRecipe):
(u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'),
(u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'),
(u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'),
- (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')]
+ (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')
+ ]
- def skip_ad_pages(self, soup):
+ def skip_ad_pages(self, soup):
skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'})
if skip_tag is not None:
- self.log.warn('skip_tag')
- self.log.warn(skip_tag)
return self.index_to_soup(skip_tag['href'], raw=True)
-
+
def preprocess_html(self, soup):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
- return soup
\ No newline at end of file
+ for i in soup.findAll('a', attrs={'class':'fn'}):
+ i.insert(len(i), BeautifulSoup('
'))
+ for i in soup.findAll('sup'):
+ if not i.string or i.string.startswith('(kliknij'):
+ i.extract()
+ return soup
diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe
index e188e4988c..fce9674081 100644
--- a/recipes/gry_online_pl.recipe
+++ b/recipes/gry_online_pl.recipe
@@ -1,6 +1,6 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
-class Gry_online_pl(BasicNewsRecipe):
+class GryOnlinePl(BasicNewsRecipe):
title = u'Gry-Online.pl'
__author__ = 'fenuks'
description = 'Gry-Online.pl - computer games'
@@ -21,17 +21,18 @@ class Gry_online_pl(BasicNewsRecipe):
tag = appendtag.find('div', attrs={'class':'n5p'})
if tag:
nexturls=tag.findAll('a')
- for nexturl in nexturls[1:]:
- try:
- soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href'])
- except:
- soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href'])
+ url_part = soup.find('link', attrs={'rel':'canonical'})['href']
+ url_part = url_part[25:].rpartition('?')[0]
+ for nexturl in nexturls[1:-1]:
+ soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href'])
pagetext = soup2.find(attrs={'class':'gc660'})
for r in pagetext.findAll(name='header'):
r.extract()
+ for r in pagetext.findAll(attrs={'itemprop':'description'}):
+ r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
- for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}):
+ for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
r.extract()
diff --git a/recipes/icons/conowego_pl.png b/recipes/icons/conowego_pl.png
new file mode 100644
index 0000000000..3bc8f2c672
Binary files /dev/null and b/recipes/icons/conowego_pl.png differ
diff --git a/recipes/icons/linux_journal.png b/recipes/icons/linux_journal.png
new file mode 100644
index 0000000000..ed0092bd1d
Binary files /dev/null and b/recipes/icons/linux_journal.png differ
diff --git a/recipes/linux_journal.recipe b/recipes/linux_journal.recipe
new file mode 100755
index 0000000000..99b1a570dc
--- /dev/null
+++ b/recipes/linux_journal.recipe
@@ -0,0 +1,36 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LinuxJournal(BasicNewsRecipe):
+ title = u'Linux Journal'
+ __author__ = 'fenuks'
+ description = u'The monthly magazine of the Linux community, promoting the use of Linux worldwide.'
+ cover_url = 'http://www.linuxjournal.com/files/linuxjournal.com/ufiles/logo-lj.jpg'
+ category = 'IT, Linux'
+ language = 'en'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ remove_empty_feeds = True
+ keep_only_tags=[dict(id='content-inner')]
+ remove_tags_after= dict(attrs={'class':'user-signature clear-block'})
+ remove_tags=[dict(attrs={'class':['user-signature clear-block', 'breadcrumb', 'terms terms-inline']})]
+ feeds = [(u'Front Page', u'http://feeds.feedburner.com/linuxjournalcom'), (u'News', u'http://feeds.feedburner.com/LinuxJournal-BreakingNews'), (u'Blogs', u'http://www.linuxjournal.com/blog/feed'), (u'Audio/Video', u'http://www.linuxjournal.com/taxonomy/term/28/0/feed'), (u'Community', u'http://www.linuxjournal.com/taxonomy/term/18/0/feed'), (u'Education', u'http://www.linuxjournal.com/taxonomy/term/25/0/feed'), (u'Embedded', u'http://www.linuxjournal.com/taxonomy/term/27/0/feed'), (u'Hardware', u'http://www.linuxjournal.com/taxonomy/term/23/0/feed'), (u'HOWTOs', u'http://www.linuxjournal.com/taxonomy/term/19/0/feed'), (u'International', u'http://www.linuxjournal.com/taxonomy/term/30/0/feed'), (u'Security', u'http://www.linuxjournal.com/taxonomy/term/31/0/feed'), (u'Software', u'http://www.linuxjournal.com/taxonomy/term/17/0/feed'), (u'Sysadmin', u'http://www.linuxjournal.com/taxonomy/term/21/0/feed'), (u'Webmaster', u'http://www.linuxjournal.com/taxonomy/term/24/0/feed')]
+
+ def append_page(self, soup, appendtag):
+ next = appendtag.find('li', attrs={'class':'pager-next'})
+ while next:
+ nexturl = next.a['href']
+ appendtag.find('div', attrs={'class':'links'}).extract()
+ soup2 = self.index_to_soup('http://www.linuxjournal.com'+ nexturl)
+ pagetext = soup2.find(attrs={'class':'node-inner'}).find(attrs={'class':'content'})
+ next = appendtag.find('li', attrs={'class':'pager-next'})
+ pos = len(appendtag.contents)
+ appendtag.insert(pos, pagetext)
+ tag = appendtag.find('div', attrs={'class':'links'})
+ if tag:
+ tag.extract()
+
+ def preprocess_html(self, soup):
+ self.append_page(soup, soup.body)
+ return soup
\ No newline at end of file
diff --git a/recipes/natemat_pl.recipe b/recipes/natemat_pl.recipe
index faa1b341a0..d6db93dad7 100644
--- a/recipes/natemat_pl.recipe
+++ b/recipes/natemat_pl.recipe
@@ -1,3 +1,4 @@
+import re
from calibre.web.feeds.news import BasicNewsRecipe
class NaTemat(BasicNewsRecipe):
@@ -8,8 +9,9 @@ class NaTemat(BasicNewsRecipe):
description = u'informacje, komentarze, opinie'
category = 'news'
language = 'pl'
+ preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj więcej\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?', re.IGNORECASE), lambda m: '')]
cover_url= 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png'
no_stylesheets = True
keep_only_tags= [dict(id='main')]
- remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related']})]
+ remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related', 'user-header', 'links']}), dict(name='img', attrs={'class':'indent'})]
feeds = [(u'Artyku\u0142y', u'http://natemat.pl/rss/wszystkie')]
diff --git a/recipes/psych.recipe b/recipes/psych.recipe
index 3fc940b4a2..a21acefe30 100644
--- a/recipes/psych.recipe
+++ b/recipes/psych.recipe
@@ -1,44 +1,79 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.web.feeds.news import BasicNewsRecipe
-class AdvancedUserRecipe1275708473(BasicNewsRecipe):
- title = u'Psychology Today'
- _author__ = 'rty'
- publisher = u'www.psychologytoday.com'
- category = u'Psychology'
- max_articles_per_feed = 100
- remove_javascript = True
- use_embedded_content = False
- no_stylesheets = True
+class PsychologyToday(BasicNewsRecipe):
+
+ title = 'Psychology Today'
+ __author__ = 'Rick Shang'
+
+ description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.'
language = 'en'
- temp_files = []
- articles_are_obfuscated = True
- remove_tags = [
- dict(name='div', attrs={'class':['print-source_url','field-items','print-footer']}),
- dict(name='span', attrs={'class':'print-footnote'}),
- ]
- remove_tags_before = dict(name='h1', attrs={'class':'print-title'})
- remove_tags_after = dict(name='div', attrs={'class':['field-items','print-footer']})
+ category = 'news'
+ encoding = 'UTF-8'
+ keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})]
+ no_javascript = True
+ no_stylesheets = True
- feeds = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')]
- def get_article_url(self, article):
- return article.get('link', None)
+ def parse_index(self):
+ articles = []
+ soup = self.index_to_soup('http://www.psychologytoday.com/magazine')
+
+
+ #Go to the main body
+ div = soup.find('div',attrs={'id':'content-content'})
+ #Find cover & date
+ cover_item = div.find('div', attrs={'class':'collections-header-image'})
+ cover = cover_item.find('img',src=True)
+ self.cover_url = cover['src']
+ date = self.tag_to_string(cover['title'])
+ self.timefmt = u' [%s]'%date
+
+ articles = []
+ for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}):
+ title = self.tag_to_string(post.find('h2'))
+ author_item=post.find('div', attrs={'class':'collection-node-byline'})
+ author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
+ title = title + u' (%s)'%author
+ article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
+ print_page=article_page.find('li', attrs={'class':'print_html first'})
+ url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
+ desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
+ self.log('Found article:', title)
+ self.log('\t', url)
+ self.log('\t', desc)
+ articles.append({'title':title, 'url':url, 'date':'','description':desc})
+
+ for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}):
+ title = self.tag_to_string(post.find('h2'))
+ author_item=post.find('div', attrs={'class':'collection-node-byline'})
+ article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
+ print_page=article_page.find('li', attrs={'class':'print_html first'})
+ description = post.find('div', attrs={'class':'collection-node-description'})
+ author = re.sub(r'.*by\s',"",self.tag_to_string(description.nextSibling).strip())
+ desc = self.tag_to_string(description).strip()
+ url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
+ title = title + u' (%s)'%author
+ self.log('Found article:', title)
+ self.log('\t', url)
+ self.log('\t', desc)
+ articles.append({'title':title, 'url':url, 'date':'','description':desc})
+
+ for post in div.findAll('li', attrs={'class':['collection-item-list-odd','collection-item-list-even']}):
+ title = self.tag_to_string(post.find('h2'))
+ author_item=post.find('div', attrs={'class':'collection-node-byline'})
+ author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
+ title = title + u' (%s)'%author
+ article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
+ print_page=article_page.find('li', attrs={'class':'print_html first'})
+ url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
+ desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
+ self.log('Found article:', title)
+ self.log('\t', url)
+ self.log('\t', desc)
+ articles.append({'title':title, 'url':url, 'date':'','description':desc})
+
+ return [('Current Issue', articles)]
- def get_obfuscated_article(self, url):
- br = self.get_browser()
- br.open(url)
- response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0)
- html = response.read()
- self.temp_files.append(PersistentTemporaryFile('_fa.html'))
- self.temp_files[-1].write(html)
- self.temp_files[-1].close()
- return self.temp_files[-1].name
- def get_cover_url(self):
- index = 'http://www.psychologytoday.com/magazine/'
- soup = self.index_to_soup(index)
- for image in soup.findAll('img',{ "class" : "imagefield imagefield-field_magazine_cover" }):
- return image['src'] + '.jpg'
- return None
diff --git a/recipes/smith.recipe b/recipes/smith.recipe
index 8bf60a227a..3d6a95c494 100644
--- a/recipes/smith.recipe
+++ b/recipes/smith.recipe
@@ -1,61 +1,67 @@
import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from collections import OrderedDict
-class SmithsonianMagazine(BasicNewsRecipe):
- title = u'Smithsonian Magazine'
- language = 'en'
- __author__ = 'Krittika Goyal and TerminalVeracity'
- oldest_article = 31#days
- max_articles_per_feed = 50
- use_embedded_content = False
- recursions = 1
- cover_url = 'http://sphotos.xx.fbcdn.net/hphotos-snc7/431147_10150602715983253_764313347_n.jpg'
- match_regexps = ['&page=[2-9]$']
- preprocess_regexps = [
- (re.compile(r'for more of Smithsonian\'s coverage on history, science and nature.', re.DOTALL), lambda m: '')
- ]
- extra_css = """
- h1{font-size: large; margin: .2em 0}
- h2{font-size: medium; margin: .2em 0}
- h3{font-size: medium; margin: .2em 0}
- #byLine{margin: .2em 0}
- .articleImageCaptionwide{font-style: italic}
- .wp-caption-text{font-style: italic}
- img{display: block}
- """
+class Smithsonian(BasicNewsRecipe):
+ title = 'Smithsonian Magazine'
+ __author__ = 'Rick Shang'
- remove_stylesheets = True
- remove_tags_after = dict(name='div', attrs={'class':['post','articlePaginationWrapper']})
- remove_tags = [
- dict(name='iframe'),
- dict(name='div', attrs={'class':['article_sidebar_border','viewMorePhotos','addtoany_share_save_container','meta','social','OUTBRAIN','related-articles-inpage']}),
- dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large','comment_section','article-related']}),
- dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
- dict(name='h4', attrs={'id':'related-topics'}),
- dict(name='table'),
- dict(name='a', attrs={'href':['/subArticleBottomWeb','/subArticleTopWeb','/subArticleTopMag','/subArticleBottomMag']}),
- dict(name='a', attrs={'name':'comments_shaded'}),
- ]
+ description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'
+ language = 'en'
+ category = 'news'
+ encoding = 'UTF-8'
+ keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})]
+ remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})]
+ no_javascript = True
+ no_stylesheets = True
+ def parse_index(self):
+ #Go to the issue
+ soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
+ div = soup0.find('div',attrs={'id':'archives'})
+ issue = div.find('ul',attrs={'class':'clear-both'})
+ current_issue_url = issue.find('a', href=True)['href']
+ soup = self.index_to_soup(current_issue_url)
- feeds = [
-('History and Archeology',
- 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
-('People and Places',
- 'http://feeds.feedburner.com/smithsonianmag/people-places'),
-('Science and Nature',
- 'http://feeds.feedburner.com/smithsonianmag/science-nature'),
-('Arts and Culture',
- 'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
-('Travel',
- 'http://feeds.feedburner.com/smithsonianmag/travel'),
-]
+ #Go to the main body
+ div = soup.find ('div', attrs={'id':'content-inset'})
+
+ #Find date
+ date = re.sub('.*\:\W*', "", self.tag_to_string(div.find('h2')).strip())
+ self.timefmt = u' [%s]'%date
+
+ #Find cover
+ self.cover_url = div.find('img',src=True)['src']
+
+ feeds = OrderedDict()
+ section_title = ''
+ subsection_title = ''
+ for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}):
+ articles = []
+ prefix = ''
+ h3=post.find('h3')
+ if h3 is not None:
+ section_title = self.tag_to_string(h3)
+ else:
+ subsection=post.find('p',attrs={'class':'article-cat'})
+ link=post.find('a',href=True)
+ url=link['href']+'?c=y&story=fullstory'
+ if subsection is not None:
+ subsection_title = self.tag_to_string(subsection)
+ prefix = (subsection_title+': ')
+ description=self.tag_to_string(post('p', limit=2)[1]).strip()
+ else:
+ description=self.tag_to_string(post.find('p')).strip()
+ desc=re.sub('\sBy\s.*', '', description, re.DOTALL)
+ author=re.sub('.*By\s', '', description, re.DOTALL)
+ title=prefix + self.tag_to_string(link).strip()+ u' (%s)'%author
+ articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
+
+ if articles:
+ if section_title not in feeds:
+ feeds[section_title] = []
+ feeds[section_title] += articles
+ ans = [(key, val) for key, val in feeds.iteritems()]
+ return ans
- def preprocess_html(self, soup):
- story = soup.find(name='div', attrs={'id':'article-body'})
- soup = BeautifulSoup('