mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-31 02:27:01 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			130 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			130 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env  python
 | |
| 
 | |
| __license__   = 'GPL v3'
 | |
| __copyright__ = 'Copyright 2010 Starson17'
 | |
| '''
 | |
| www.arcamax.com
 | |
| '''
 | |
| from calibre.web.feeds.news import BasicNewsRecipe
 | |
| from calibre.ebooks.BeautifulSoup import Tag
 | |
| 
 | |
class Arcamax(BasicNewsRecipe):
    '''
    Recipe that collects comic strips from www.arcamax.com.

    Each enabled comic becomes one feed. For every comic the recipe starts
    at the comic's landing page and follows the "Previous" navigation
    marker backwards, collecting up to ``num_comics_to_get`` pages.
    '''

    title                = 'Arcamax'
    __author__           = 'Starson17'
    __version__          = '1.04'
    __date__             = '18 April 2011'
    description          = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
    category             = 'news, comics'
    language             = 'en'
    use_embedded_content = False
    no_stylesheets       = True
    remove_javascript    = True
    cover_url            = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'

    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
    # Number of pages (strips) fetched per comic, newest first while crawling.
    num_comics_to_get = 7
    # The list of comic strips to fetch is in parse_index() below.

    conversion_options = {
        'linearize_tables': True,
        'comment': description,
        'tags': category,
        'language': language,
    }

    keep_only_tags = [
        dict(name='div', attrs={'class': ['comics-header']}),
        dict(name='b', attrs={'class': ['current']}),
        dict(name='article', attrs={'class': ['comic']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': ['comicfull']}),
        dict(name='div', attrs={'class': ['calendar']}),
        dict(name='nav', attrs={'class': ['calendar-nav']}),
    ]

    def parse_index(self):
        '''
        Build the feed list, one feed per enabled comic.

        Returns a list of ``(title, articles)`` tuples where ``articles`` is
        the list produced by ``make_links``. Comics for which no article
        could be collected are left out.
        '''
        # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '#' FROM IN FRONT OF DESIRED STRIPS
        comics = [
            ######## COMICS - GENERAL ########
            #(u"9 Chickweed Lane", u"http://www.arcamax.com/ninechickweedlane"),
            #(u"Agnes", u"http://www.arcamax.com/agnes"),
            #(u"Andy Capp", u"http://www.arcamax.com/andycapp"),
            (u"BC", u"http://www.arcamax.com/bc"),
            #(u"Baby Blues", u"http://www.arcamax.com/babyblues"),
            #(u"Beetle Bailey", u"http://www.arcamax.com/beetlebailey"),
            (u"Blondie", u"http://www.arcamax.com/blondie"),
            #(u"Boondocks", u"http://www.arcamax.com/boondocks"),
            #(u"Cathy", u"http://www.arcamax.com/cathy"),
            #(u"Daddys Home", u"http://www.arcamax.com/daddyshome"),
            (u"Dilbert", u"http://www.arcamax.com/dilbert"),
            #(u"Dinette Set", u"http://www.arcamax.com/thedinetteset"),
            (u"Dog Eat Doug", u"http://www.arcamax.com/dogeatdoug"),
            (u"Doonesbury", u"http://www.arcamax.com/doonesbury"),
            #(u"Dustin", u"http://www.arcamax.com/dustin"),
            (u"Family Circus", u"http://www.arcamax.com/familycircus"),
            (u"Garfield", u"http://www.arcamax.com/garfield"),
            #(u"Get Fuzzy", u"http://www.arcamax.com/getfuzzy"),
            #(u"Girls and Sports", u"http://www.arcamax.com/girlsandsports"),
            #(u"Hagar the Horrible", u"http://www.arcamax.com/hagarthehorrible"),
            #(u"Heathcliff", u"http://www.arcamax.com/heathcliff"),
            #(u"Jerry King Cartoons", u"http://www.arcamax.com/humorcartoon"),
            #(u"Luann", u"http://www.arcamax.com/luann"),
            #(u"Momma", u"http://www.arcamax.com/momma"),
            #(u"Mother Goose and Grimm", u"http://www.arcamax.com/mothergooseandgrimm"),
            (u"Mutts", u"http://www.arcamax.com/mutts"),
            #(u"Non Sequitur", u"http://www.arcamax.com/nonsequitur"),
            #(u"Pearls Before Swine", u"http://www.arcamax.com/pearlsbeforeswine"),
            #(u"Pickles", u"http://www.arcamax.com/pickles"),
            #(u"Red and Rover", u"http://www.arcamax.com/redandrover"),
            #(u"Rubes", u"http://www.arcamax.com/rubes"),
            #(u"Rugrats", u"http://www.arcamax.com/rugrats"),
            (u"Speed Bump", u"http://www.arcamax.com/speedbump"),
            (u"Wizard of Id", u"http://www.arcamax.com/wizardofid"),
            (u"Zits", u"http://www.arcamax.com/zits"),
        ]

        feeds = []
        for title, url in comics:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        '''
        Collect up to ``num_comics_to_get`` article entries for one comic.

        Starting at ``url``, each page is fetched with ``index_to_soup`` and
        the "Previous" marker on it is followed to reach the next (older)
        page. Crawling stops early when a page cannot be parsed or when no
        previous-page link can be found, instead of raising or re-adding a
        stale entry.

        Returns a list of dicts with the keys ``title``, ``url``,
        ``description`` and ``date``, in the reverse of the order in which
        the pages were visited.
        '''
        title = 'Temp'  # fallback used when a page has no readable header
        current_articles = []

        for _ in range(self.num_comics_to_get):
            page_soup = self.index_to_soup(url)
            if not page_soup:
                # Nothing usable for this page: there is no title, date or
                # previous link to read, so stop with what was gathered.
                break

            header = page_soup.find(name='div', attrs={'class': 'comics-header'})
            heading = header.h1 if header is not None else None
            if heading is not None and heading.contents:
                title = self.tag_to_string(heading.contents[0])

            date_tag = page_soup.find(name='b', attrs={'class': ['current']})
            date = self.tag_to_string(date_tag) if date_tag is not None else ''

            current_articles.append(
                {'title': title, 'url': url, 'description': '', 'date': date})

            # orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
            # The href is read from the grandparent of the 'Previous' match
            # (presumably the enclosing link element - confirm against the
            # live page markup).
            prev_href = None
            marker = page_soup.find('span', text='Previous')
            if marker is not None and marker.parent is not None:
                link = marker.parent.parent
                if link is not None:
                    prev_href = link.get('href')
            if not prev_href:
                # No older page to visit (or the markup changed).
                break
            url = 'http://www.arcamax.com' + prev_href

        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        '''
        Re-wrap images so each one sits in a paragraph of its own.

        * An image whose parent is an ``<a>``: the ``<a>`` is replaced by a
          new ``<p>`` containing only the image.
        * An image whose parent is a ``<p>`` that also has text: the
          paragraph is replaced by a ``<div>`` holding a new ``<p>`` with
          the image, followed by the original paragraph.

        Returns the same ``soup`` object, modified in place.
        '''
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                new_tag = Tag(soup, 'p')
                # Inserting the image into the new tag moves it out of the link.
                new_tag.insert(0, img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if self.tag_to_string(parent_tag) != '':
                    new_div = Tag(soup, 'div')
                    new_tag = Tag(soup, 'p')
                    new_tag.insert(0, img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0, new_tag)
                    new_div.insert(1, parent_tag)
        return soup

    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    img {max-width:100%; min-width:100%;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
		'''