From 9b8bf25a41b931c989b556414d21ed019e4e9626 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 12 Apr 2015 10:56:52 +0530 Subject: [PATCH] Update Arcamax --- recipes/arcamax.recipe | 160 ++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 83 deletions(-) diff --git a/recipes/arcamax.recipe b/recipes/arcamax.recipe index 7c84e08114..c3b5821185 100644 --- a/recipes/arcamax.recipe +++ b/recipes/arcamax.recipe @@ -5,14 +5,14 @@ __copyright__ = 'Copyright 2010 Starson17' ''' www.arcamax.com ''' + +import os from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag +from calibre.ptempfile import PersistentTemporaryDirectory class Arcamax(BasicNewsRecipe): title = 'Arcamax' - __author__ = 'Starson17' - __version__ = '1.04' - __date__ = '18 April 2011' + __author__ = 'Kovid Goyal' description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.' category = 'news, comics' language = 'en' @@ -21,7 +21,7 @@ class Arcamax(BasicNewsRecipe): remove_javascript = True cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg' - ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ######## + # ###### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ######## num_comics_to_get = 7 # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS @@ -31,98 +31,92 @@ class Arcamax(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [dict(name='article', attrs={'class':['comic']}), - ] - - #remove_tags = [dict(name='div', attrs={'id':['comicfull' ]}), - #dict(name='div', attrs={'class':['calendar' ]}), - #dict(name='nav', attrs={'class':['calendar-nav' ]}), - #] + keep_only_tags = [ + dict(name='header', attrs={'class':'fn-content-header bluelabel'}), + dict(name='figure', attrs={'class':['comic']}), + ] def parse_index(self): feeds = [] + self.panel_tdir = PersistentTemporaryDirectory('arcamax') + self.panel_counter = 0 for title, url in [ - ######## COMICS - GENERAL ######## - #(u"9 Chickweed Lane", #u"http://www.arcamax.com/thefunnies/ninechickweedlane"), - #(u"Agnes", u"http://www.arcamax.com/thefunnies/agnes"), - #(u"Andy Capp", #u"http://www.arcamax.com/thefunnies/andycapp"), - (u"BC", u"http://www.arcamax.com/thefunnies/bc"), - #(u"Baby Blues", #u"http://www.arcamax.com/thefunnies/babyblues"), - #(u"Beetle Bailey", #u"http://www.arcamax.com/thefunnies/beetlebailey"), - (u"Blondie", u"http://www.arcamax.com/thefunnies/blondie"), - #u"Boondocks", u"http://www.arcamax.com/thefunnies/boondocks"), - #(u"Cathy", u"http://www.arcamax.com/thefunnies/cathy"), - #(u"Daddys Home", #u"http://www.arcamax.com/thefunnies/daddyshome"), - (u"Dilbert", u"http://www.arcamax.com/thefunnies/dilbert"), - #(u"Dinette Set", #u"http://www.arcamax.com/thefunnies/thedinetteset"), - (u"Dog Eat Doug", u"http://www.arcamax.com/thefunnies/dogeatdoug"), - (u"Doonesbury", u"http://www.arcamax.com/thefunnies/doonesbury"), - #(u"Dustin", u"http://www.arcamax.com/thefunnies/dustin"), - (u"Family Circus", u"http://www.arcamax.com/thefunnies/familycircus"), - (u"Garfield", u"http://www.arcamax.com/thefunnies/garfield"), - #(u"Get Fuzzy", #u"http://www.arcamax.com/thefunnies/getfuzzy"), - #(u"Girls and Sports", #u"http://www.arcamax.com/thefunnies/girlsandsports"), - #(u"Hagar the Horrible", #u"http://www.arcamax.com/thefunnies/hagarthehorrible"), - #(u"Heathcliff", #u"http://www.arcamax.com/thefunnies/heathcliff"), - #(u"Jerry King Cartoons", #u"http://www.arcamax.com/thefunnies/humorcartoon"), - #(u"Luann", u"http://www.arcamax.com/thefunnies/luann"), - #(u"Momma", u"http://www.arcamax.com/thefunnies/momma"), - #(u"Mother Goose and Grimm", #u"http://www.arcamax.com/thefunnies/mothergooseandgrimm"), - (u"Mutts", u"http://www.arcamax.com/thefunnies/mutts"), - #(u"Non Sequitur", #u"http://www.arcamax.com/thefunnies/nonsequitur"), - #(u"Pearls Before Swine", #u"http://www.arcamax.com/thefunnies/pearlsbeforeswine"), - #(u"Pickles", u"http://www.arcamax.com/thefunnies/pickles"), - #(u"Red and Rover", #u"http://www.arcamax.com/thefunnies/redandrover"), - #(u"Rubes", u"http://www.arcamax.com/thefunnies/rubes"), - #(u"Rugrats", u"http://www.arcamax.com/thefunnies/rugrats"), - (u"Speed Bump", u"http://www.arcamax.com/thefunnies/speedbump"), - (u"Wizard of Id", u"http://www.arcamax.com/thefunnies/wizardofid"), - (u"Zits", u"http://www.arcamax.com/thefunnies/zits"), - ]: - articles = self.make_links(url) + # ####### COMICS - GENERAL ######## + # (u"9 Chickweed Lane", #u"http://www.arcamax.com/thefunnies/ninechickweedlane"), + # (u"Agnes", u"http://www.arcamax.com/thefunnies/agnes"), + # (u"Andy Capp", #u"http://www.arcamax.com/thefunnies/andycapp"), + (u"BC", u"http://www.arcamax.com/thefunnies/bc"), + # (u"Baby Blues", #u"http://www.arcamax.com/thefunnies/babyblues"), + # (u"Beetle Bailey", #u"http://www.arcamax.com/thefunnies/beetlebailey"), + (u"Blondie", u"http://www.arcamax.com/thefunnies/blondie"), + # u"Boondocks", u"http://www.arcamax.com/thefunnies/boondocks"), + # (u"Cathy", u"http://www.arcamax.com/thefunnies/cathy"), + # (u"Daddys Home", #u"http://www.arcamax.com/thefunnies/daddyshome"), + (u"Dilbert", u"http://www.arcamax.com/thefunnies/dilbert"), + # (u"Dinette Set", #u"http://www.arcamax.com/thefunnies/thedinetteset"), + (u"Dog Eat Doug", u"http://www.arcamax.com/thefunnies/dogeatdoug"), + (u"Doonesbury", u"http://www.arcamax.com/thefunnies/doonesbury"), + # (u"Dustin", u"http://www.arcamax.com/thefunnies/dustin"), + (u"Family Circus", u"http://www.arcamax.com/thefunnies/familycircus"), + (u"Garfield", u"http://www.arcamax.com/thefunnies/garfield"), + # (u"Get Fuzzy", #u"http://www.arcamax.com/thefunnies/getfuzzy"), + # (u"Girls and Sports", #u"http://www.arcamax.com/thefunnies/girlsandsports"), + # (u"Hagar the Horrible", #u"http://www.arcamax.com/thefunnies/hagarthehorrible"), + # (u"Heathcliff", #u"http://www.arcamax.com/thefunnies/heathcliff"), + # (u"Jerry King Cartoons", #u"http://www.arcamax.com/thefunnies/humorcartoon"), + # (u"Luann", u"http://www.arcamax.com/thefunnies/luann"), + # (u"Momma", u"http://www.arcamax.com/thefunnies/momma"), + # (u"Mother Goose and Grimm", #u"http://www.arcamax.com/thefunnies/mothergooseandgrimm"), + (u"Mutts", u"http://www.arcamax.com/thefunnies/mutts"), + # (u"Non Sequitur", #u"http://www.arcamax.com/thefunnies/nonsequitur"), + # (u"Pearls Before Swine", #u"http://www.arcamax.com/thefunnies/pearlsbeforeswine"), + # (u"Pickles", u"http://www.arcamax.com/thefunnies/pickles"), + # (u"Red and Rover", #u"http://www.arcamax.com/thefunnies/redandrover"), + # (u"Rubes", u"http://www.arcamax.com/thefunnies/rubes"), + # (u"Rugrats", u"http://www.arcamax.com/thefunnies/rugrats"), + (u"Speed Bump", u"http://www.arcamax.com/thefunnies/speedbump"), + (u"Wizard of Id", u"http://www.arcamax.com/thefunnies/wizardofid"), + (u"Zits", u"http://www.arcamax.com/thefunnies/zits"), + ]: + self.log('Finding strips for:', title) + articles = self.make_links(url, title) if articles: feeds.append((title, articles)) + if self.test and len(feeds) >= self.test[0]: + break return feeds - def make_links(self, url): - title = 'Temp' + def make_links(self, url, title): current_articles = [] - pages = range(1, self.num_comics_to_get+1) - for page in pages: - page_soup = self.index_to_soup(url) - if page_soup: - title = self.tag_to_string(page_soup.find(name='div', attrs={'class':'columnheader'}).h1.contents[0]) - page_url = url - # orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href'] - prev_page_url = 'http://www.arcamax.com' + page_soup.find(name='a', attrs={'class':['prev']})['href'] - date = self.tag_to_string(page_soup.find(name='span', attrs={'class':['cur']})) - current_articles.append({'title': title, 'url': page_url, 'description':'', 'date': date}) + num = self.num_comics_to_get + while num > 0: + num -= 1 + raw = self.index_to_soup(url, raw=True) + self.panel_counter += 1 + path = os.path.join(self.panel_tdir, '%d.html' % self.panel_counter) + with open(path, 'wb') as f: + f.write(raw) + soup = self.index_to_soup(raw) + a = soup.find(name='a', attrs={'class':['prev']}) + prev_page_url = 'http://www.arcamax.com' + a['href'] + title = self.tag_to_string(soup.find('title')).partition('|')[0].strip() + if 'for' not in title.split(): + title = title + ' for today' + date = self.tag_to_string(soup.find(name='span', attrs={'class':['cur']})) + self.log('\tFound:', title, 'at:', url) + current_articles.append({'title': title, 'url':'file://' + path , 'description':'', 'date': date}) + if self.test and len(current_articles) >= self.test[1]: + break url = prev_page_url current_articles.reverse() return current_articles def preprocess_html(self, soup): - for img_tag in soup.findAll('img'): - parent_tag = img_tag.parent - if parent_tag.name == 'a': - new_tag = Tag(soup,'p') - new_tag.insert(0,img_tag) - parent_tag.replaceWith(new_tag) - elif parent_tag.name == 'p': - if not self.tag_to_string(parent_tag) == '': - new_div = Tag(soup,'div') - new_tag = Tag(soup,'p') - new_tag.insert(0,img_tag) - parent_tag.replaceWith(new_div) - new_div.insert(0,new_tag) - new_div.insert(1,parent_tag) + for img in soup.findAll('img', src=True): + if img['src'].startswith('/'): + img['src'] = 'http://arcamax.com' + img['src'] return soup extra_css = ''' - h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} - h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} - img {max-width:100%; min-width:100%;} - p{font-family:Arial,Helvetica,sans-serif;font-size:small;} - body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' - + img {max-width:100%; min-width:100%;} + '''