#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.arcamax.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


class Arcamax(BasicNewsRecipe):
    """Recipe that collects comic strips from www.arcamax.com.

    Each enabled strip in ``parse_index`` becomes one feed.  For every strip
    the recipe starts at the strip's landing page and follows the page's
    "Previous" link, collecting ``num_comics_to_get`` pages per strip.
    """

    title = 'Arcamax'
    __author__ = 'Starson17'
    __version__ = '1.04'
    __date__ = '18 April 2011'
    # NOTE(review): the counts quoted in this description ("25 comics - 20
    # general, 5 editorial") do not match the strips enabled in parse_index
    # below; the text is left untouched because it is user-facing metadata.
    description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
    category = 'news, comics'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'

    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
    # Number of pages fetched per strip: make_links() starts at the strip URL
    # and follows the "Previous" link until this many pages were collected.
    num_comics_to_get = 7
    # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS

    conversion_options = {
        'linearize_tables': True,
        'comment': description,
        'tags': category,
        'language': language,
    }

    keep_only_tags = [
        dict(name='div', attrs={'class': ['comics-header']}),
        dict(name='b', attrs={'class': ['current']}),
        dict(name='article', attrs={'class': ['comic']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': ['comicfull']}),
        dict(name='div', attrs={'class': ['calendar']}),
        dict(name='nav', attrs={'class': ['calendar-nav']}),
    ]

    def parse_index(self):
        """Build the feed list: one ``(title, articles)`` pair per enabled strip.

        Strips for which ``make_links`` returns no articles are left out.
        """
        feeds = []
        for title, url in [
            ######## COMICS - GENERAL ########
            #(u"9 Chickweed Lane", u"http://www.arcamax.com/ninechickweedlane"),
            #(u"Agnes", u"http://www.arcamax.com/agnes"),
            #(u"Andy Capp", u"http://www.arcamax.com/andycapp"),
            (u"BC", u"http://www.arcamax.com/bc"),
            #(u"Baby Blues", u"http://www.arcamax.com/babyblues"),
            #(u"Beetle Bailey", u"http://www.arcamax.com/beetlebailey"),
            (u"Blondie", u"http://www.arcamax.com/blondie"),
            #(u"Boondocks", u"http://www.arcamax.com/boondocks"),
            #(u"Cathy", u"http://www.arcamax.com/cathy"),
            #(u"Daddys Home", u"http://www.arcamax.com/daddyshome"),
            (u"Dilbert", u"http://www.arcamax.com/dilbert"),
            #(u"Dinette Set", u"http://www.arcamax.com/thedinetteset"),
            (u"Dog Eat Doug", u"http://www.arcamax.com/dogeatdoug"),
            (u"Doonesbury", u"http://www.arcamax.com/doonesbury"),
            #(u"Dustin", u"http://www.arcamax.com/dustin"),
            (u"Family Circus", u"http://www.arcamax.com/familycircus"),
            (u"Garfield", u"http://www.arcamax.com/garfield"),
            #(u"Get Fuzzy", u"http://www.arcamax.com/getfuzzy"),
            #(u"Girls and Sports", u"http://www.arcamax.com/girlsandsports"),
            #(u"Hagar the Horrible", u"http://www.arcamax.com/hagarthehorrible"),
            #(u"Heathcliff", u"http://www.arcamax.com/heathcliff"),
            #(u"Jerry King Cartoons", u"http://www.arcamax.com/humorcartoon"),
            #(u"Luann", u"http://www.arcamax.com/luann"),
            #(u"Momma", u"http://www.arcamax.com/momma"),
            #(u"Mother Goose and Grimm", u"http://www.arcamax.com/mothergooseandgrimm"),
            (u"Mutts", u"http://www.arcamax.com/mutts"),
            #(u"Non Sequitur", u"http://www.arcamax.com/nonsequitur"),
            #(u"Pearls Before Swine", u"http://www.arcamax.com/pearlsbeforeswine"),
            #(u"Pickles", u"http://www.arcamax.com/pickles"),
            #(u"Red and Rover", u"http://www.arcamax.com/redandrover"),
            #(u"Rubes", u"http://www.arcamax.com/rubes"),
            #(u"Rugrats", u"http://www.arcamax.com/rugrats"),
            (u"Speed Bump", u"http://www.arcamax.com/speedbump"),
            (u"Wizard of Id", u"http://www.arcamax.com/wizardofid"),
            (u"Zits", u"http://www.arcamax.com/zits"),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect up to ``num_comics_to_get`` article entries for one strip.

        Starting at ``url``, each page contributes one article dict (keys
        ``title``, ``url``, ``description``, ``date``) and the walk then
        continues at the page's "Previous" link.  The walk ends early, keeping
        what was collected so far, when a page has no comics header or no
        "Previous" link, instead of raising and aborting the whole recipe.

        Returns the articles in reverse order of collection, so the page
        reached last through the "Previous" links comes first.
        """
        current_articles = []
        for _ in range(self.num_comics_to_get):
            page_soup = self.index_to_soup(url)
            if not page_soup:
                break

            header = page_soup.find(name='div', attrs={'class': 'comics-header'})
            heading = getattr(header, 'h1', None)
            if heading is None or not heading.contents:
                # Not a page with the expected comics header: stop here
                # rather than fail on a missing attribute.
                break
            title = self.tag_to_string(heading.contents[0])

            # An earlier revision looked the link up with
            #   page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
            # The lookup below expects the grandparent of the 'Previous' match
            # to carry the href; any missing piece means there is no earlier
            # page to follow.
            try:
                prev_href = page_soup.find('span', text='Previous').parent.parent['href']
            except (AttributeError, KeyError, TypeError):
                prev_href = None

            date = self.tag_to_string(page_soup.find(name='b', attrs={'class': ['current']}))
            current_articles.append({'title': title, 'url': url, 'description': '', 'date': date})

            if prev_href is None:
                # The current page is still kept; only the walk ends.
                break
            url = 'http://www.arcamax.com' + prev_href

        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        """Give every image its own ``<p>`` wrapper and return ``soup``.

        * An image whose parent is an ``<a>``: the link is replaced by a new
          ``<p>`` containing just the image.
        * An image whose parent is a ``<p>`` with non-empty text: the
          paragraph is replaced by a ``<div>`` holding a new ``<p>`` with the
          image, followed by the original paragraph.
        """
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                new_tag = Tag(soup, 'p')
                new_tag.insert(0, img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if self.tag_to_string(parent_tag) != '':
                    new_div = Tag(soup, 'div')
                    new_tag = Tag(soup, 'p')
                    # Move the image out first, then swap the paragraph for
                    # the div and re-attach both pieces inside it.
                    new_tag.insert(0, img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0, new_tag)
                    new_div.insert(1, parent_tag)
        return soup

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        img {max-width:100%; min-width:100%;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
        '''