GoComics by Starson17. Fixes #6500 (New Recipe: GoComics)

This commit is contained in:
Kovid Goyal 2010-08-14 08:17:43 -06:00
parent 07a63b4362
commit 8f42c11aa3

View File

@ -0,0 +1,348 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.gocomics.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
import mechanize
class GoComics(BasicNewsRecipe):
title = 'GoComics'
__author__ = 'Starson17'
__version__ = '1.02'
__date__ = '14 August 2010'
description = u'200+ Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
category = 'news, comics'
language = 'en'
use_embedded_content= False
no_stylesheets = True
remove_javascript = True
cover_url = 'http://paulbuckley14059.files.wordpress.com/2008/06/calvin-and-hobbes.jpg'
####### USER PREFERENCES - COMICS, IMAGE SIZE AND NUMBER OF COMICS TO RETRIEVE ########
# num_comics_to_get - I've tried up to 99 on Calvin&Hobbes
num_comics_to_get = 7
# comic_size 300 is small, 600 is medium, 900 is large, 1500 is extra-large
comic_size = 900
# CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
# Please do not overload their servers by selecting all comics and 1000 strips from each!
conversion_options = {'linearize_tables' : True
, 'comment' : description
, 'tags' : category
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':['feature','banner']}),
]
remove_tags = [dict(name='a', attrs={'class':['beginning','prev','cal','next','newest']}),
dict(name='div', attrs={'class':['tag-wrapper']}),
dict(name='ul', attrs={'class':['share-nav','feature-nav']}),
]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
cookies = mechanize.CookieJar()
br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
br.addheaders = [('Referer','http://www.gocomics.com/')]
return br
def parse_index(self):
feeds = []
for title, url in [
######## COMICS - GENERAL ########
(u"2 Cows and a Chicken", u"http://www.gocomics.com/2cowsandachicken"),
# (u"9 to 5", u"http://www.gocomics.com/9to5"),
# (u"The Academia Waltz", u"http://www.gocomics.com/academiawaltz"),
# (u"Adam@Home", u"http://www.gocomics.com/adamathome"),
# (u"Agnes", u"http://www.gocomics.com/agnes"),
# (u"Andy Capp", u"http://www.gocomics.com/andycapp"),
# (u"Animal Crackers", u"http://www.gocomics.com/animalcrackers"),
# (u"Annie", u"http://www.gocomics.com/annie"),
(u"The Argyle Sweater", u"http://www.gocomics.com/theargylesweater"),
# (u"Ask Shagg", u"http://www.gocomics.com/askshagg"),
(u"B.C.", u"http://www.gocomics.com/bc"),
# (u"Back in the Day", u"http://www.gocomics.com/backintheday"),
# (u"Bad Reporter", u"http://www.gocomics.com/badreporter"),
# (u"Baldo", u"http://www.gocomics.com/baldo"),
# (u"Ballard Street", u"http://www.gocomics.com/ballardstreet"),
# (u"Barkeater Lake", u"http://www.gocomics.com/barkeaterlake"),
# (u"The Barn", u"http://www.gocomics.com/thebarn"),
# (u"Basic Instructions", u"http://www.gocomics.com/basicinstructions"),
# (u"Bewley", u"http://www.gocomics.com/bewley"),
# (u"Big Top", u"http://www.gocomics.com/bigtop"),
# (u"Biographic", u"http://www.gocomics.com/biographic"),
(u"Birdbrains", u"http://www.gocomics.com/birdbrains"),
# (u"Bleeker: The Rechargeable Dog", u"http://www.gocomics.com/bleeker"),
# (u"Bliss", u"http://www.gocomics.com/bliss"),
(u"Bloom County", u"http://www.gocomics.com/bloomcounty"),
# (u"Bo Nanas", u"http://www.gocomics.com/bonanas"),
# (u"Bob the Squirrel", u"http://www.gocomics.com/bobthesquirrel"),
# (u"The Boiling Point", u"http://www.gocomics.com/theboilingpoint"),
# (u"Boomerangs", u"http://www.gocomics.com/boomerangs"),
# (u"The Boondocks", u"http://www.gocomics.com/boondocks"),
# (u"Bottomliners", u"http://www.gocomics.com/bottomliners"),
# (u"Bound and Gagged", u"http://www.gocomics.com/boundandgagged"),
# (u"Brainwaves", u"http://www.gocomics.com/brainwaves"),
# (u"Brenda Starr", u"http://www.gocomics.com/brendastarr"),
# (u"Brewster Rockit", u"http://www.gocomics.com/brewsterrockit"),
# (u"Broom Hilda", u"http://www.gocomics.com/broomhilda"),
(u"Calvin and Hobbes", u"http://www.gocomics.com/calvinandhobbes"),
# (u"Candorville", u"http://www.gocomics.com/candorville"),
# (u"Cathy", u"http://www.gocomics.com/cathy"),
# (u"C'est la Vie", u"http://www.gocomics.com/cestlavie"),
# (u"Chuckle Bros", u"http://www.gocomics.com/chucklebros"),
# (u"Citizen Dog", u"http://www.gocomics.com/citizendog"),
# (u"The City", u"http://www.gocomics.com/thecity"),
# (u"Cleats", u"http://www.gocomics.com/cleats"),
# (u"Close to Home", u"http://www.gocomics.com/closetohome"),
# (u"Compu-toon", u"http://www.gocomics.com/compu-toon"),
# (u"Cornered", u"http://www.gocomics.com/cornered"),
(u"Cul de Sac", u"http://www.gocomics.com/culdesac"),
# (u"Daddy's Home", u"http://www.gocomics.com/daddyshome"),
# (u"Deep Cover", u"http://www.gocomics.com/deepcover"),
# (u"Dick Tracy", u"http://www.gocomics.com/dicktracy"),
# (u"The Dinette Set", u"http://www.gocomics.com/dinetteset"),
# (u"Dog Eat Doug", u"http://www.gocomics.com/dogeatdoug"),
# (u"Domestic Abuse", u"http://www.gocomics.com/domesticabuse"),
# (u"Doodles", u"http://www.gocomics.com/doodles"),
(u"Doonesbury", u"http://www.gocomics.com/doonesbury"),
# (u"The Doozies", u"http://www.gocomics.com/thedoozies"),
# (u"The Duplex", u"http://www.gocomics.com/duplex"),
# (u"Eek!", u"http://www.gocomics.com/eek"),
# (u"The Elderberries", u"http://www.gocomics.com/theelderberries"),
# (u"Flight Deck", u"http://www.gocomics.com/flightdeck"),
# (u"Flo and Friends", u"http://www.gocomics.com/floandfriends"),
# (u"The Flying McCoys", u"http://www.gocomics.com/theflyingmccoys"),
(u"For Better or For Worse", u"http://www.gocomics.com/forbetterorforworse"),
# (u"For Heaven's Sake", u"http://www.gocomics.com/forheavenssake"),
# (u"Fort Knox", u"http://www.gocomics.com/fortknox"),
# (u"FoxTrot", u"http://www.gocomics.com/foxtrot"),
(u"FoxTrot Classics", u"http://www.gocomics.com/foxtrotclassics"),
# (u"Frank & Ernest", u"http://www.gocomics.com/frankandernest"),
# (u"Fred Basset", u"http://www.gocomics.com/fredbasset"),
# (u"Free Range", u"http://www.gocomics.com/freerange"),
# (u"Frog Applause", u"http://www.gocomics.com/frogapplause"),
# (u"The Fusco Brothers", u"http://www.gocomics.com/thefuscobrothers"),
(u"Garfield", u"http://www.gocomics.com/garfield"),
# (u"Garfield Minus Garfield", u"http://www.gocomics.com/garfieldminusgarfield"),
# (u"Gasoline Alley", u"http://www.gocomics.com/gasolinealley"),
# (u"Gil Thorp", u"http://www.gocomics.com/gilthorp"),
# (u"Ginger Meggs", u"http://www.gocomics.com/gingermeggs"),
# (u"Girls & Sports", u"http://www.gocomics.com/girlsandsports"),
# (u"Haiku Ewe", u"http://www.gocomics.com/haikuewe"),
# (u"Heart of the City", u"http://www.gocomics.com/heartofthecity"),
# (u"Heathcliff", u"http://www.gocomics.com/heathcliff"),
# (u"Herb and Jamaal", u"http://www.gocomics.com/herbandjamaal"),
# (u"Home and Away", u"http://www.gocomics.com/homeandaway"),
# (u"Housebroken", u"http://www.gocomics.com/housebroken"),
# (u"Hubert and Abby", u"http://www.gocomics.com/hubertandabby"),
# (u"Imagine This", u"http://www.gocomics.com/imaginethis"),
# (u"In the Bleachers", u"http://www.gocomics.com/inthebleachers"),
# (u"In the Sticks", u"http://www.gocomics.com/inthesticks"),
# (u"Ink Pen", u"http://www.gocomics.com/inkpen"),
# (u"It's All About You", u"http://www.gocomics.com/itsallaboutyou"),
# (u"Joe Vanilla", u"http://www.gocomics.com/joevanilla"),
# (u"La Cucaracha", u"http://www.gocomics.com/lacucaracha"),
# (u"Last Kiss", u"http://www.gocomics.com/lastkiss"),
# (u"Legend of Bill", u"http://www.gocomics.com/legendofbill"),
# (u"Liberty Meadows", u"http://www.gocomics.com/libertymeadows"),
(u"Lio", u"http://www.gocomics.com/lio"),
# (u"Little Dog Lost", u"http://www.gocomics.com/littledoglost"),
# (u"Little Otto", u"http://www.gocomics.com/littleotto"),
# (u"Loose Parts", u"http://www.gocomics.com/looseparts"),
# (u"Love Is...", u"http://www.gocomics.com/loveis"),
# (u"Maintaining", u"http://www.gocomics.com/maintaining"),
# (u"The Meaning of Lila", u"http://www.gocomics.com/meaningoflila"),
# (u"Middle-Aged White Guy", u"http://www.gocomics.com/middleagedwhiteguy"),
# (u"The Middletons", u"http://www.gocomics.com/themiddletons"),
# (u"Momma", u"http://www.gocomics.com/momma"),
# (u"Mutt & Jeff", u"http://www.gocomics.com/muttandjeff"),
# (u"Mythtickle", u"http://www.gocomics.com/mythtickle"),
# (u"Nest Heads", u"http://www.gocomics.com/nestheads"),
# (u"NEUROTICA", u"http://www.gocomics.com/neurotica"),
(u"New Adventures of Queen Victoria", u"http://www.gocomics.com/thenewadventuresofqueenvictoria"),
(u"Non Sequitur", u"http://www.gocomics.com/nonsequitur"),
# (u"The Norm", u"http://www.gocomics.com/thenorm"),
# (u"On A Claire Day", u"http://www.gocomics.com/onaclaireday"),
# (u"One Big Happy", u"http://www.gocomics.com/onebighappy"),
# (u"The Other Coast", u"http://www.gocomics.com/theothercoast"),
# (u"Out of the Gene Pool Re-Runs", u"http://www.gocomics.com/outofthegenepool"),
# (u"Overboard", u"http://www.gocomics.com/overboard"),
# (u"Pibgorn", u"http://www.gocomics.com/pibgorn"),
# (u"Pibgorn Sketches", u"http://www.gocomics.com/pibgornsketches"),
(u"Pickles", u"http://www.gocomics.com/pickles"),
# (u"Pinkerton", u"http://www.gocomics.com/pinkerton"),
# (u"Pluggers", u"http://www.gocomics.com/pluggers"),
(u"Pooch Cafe", u"http://www.gocomics.com/poochcafe"),
# (u"PreTeena", u"http://www.gocomics.com/preteena"),
# (u"The Quigmans", u"http://www.gocomics.com/thequigmans"),
# (u"Rabbits Against Magic", u"http://www.gocomics.com/rabbitsagainstmagic"),
(u"Real Life Adventures", u"http://www.gocomics.com/reallifeadventures"),
# (u"Red and Rover", u"http://www.gocomics.com/redandrover"),
# (u"Red Meat", u"http://www.gocomics.com/redmeat"),
# (u"Reynolds Unwrapped", u"http://www.gocomics.com/reynoldsunwrapped"),
# (u"Ronaldinho Gaucho", u"http://www.gocomics.com/ronaldinhogaucho"),
# (u"Rubes", u"http://www.gocomics.com/rubes"),
# (u"Scary Gary", u"http://www.gocomics.com/scarygary"),
(u"Shoe", u"http://www.gocomics.com/shoe"),
# (u"Shoecabbage", u"http://www.gocomics.com/shoecabbage"),
# (u"Skin Horse", u"http://www.gocomics.com/skinhorse"),
# (u"Slowpoke", u"http://www.gocomics.com/slowpoke"),
# (u"Speed Bump", u"http://www.gocomics.com/speedbump"),
# (u"State of the Union", u"http://www.gocomics.com/stateoftheunion"),
(u"Stone Soup", u"http://www.gocomics.com/stonesoup"),
# (u"Strange Brew", u"http://www.gocomics.com/strangebrew"),
# (u"Sylvia", u"http://www.gocomics.com/sylvia"),
# (u"Tank McNamara", u"http://www.gocomics.com/tankmcnamara"),
# (u"Tiny Sepuku", u"http://www.gocomics.com/tinysepuku"),
# (u"TOBY", u"http://www.gocomics.com/toby"),
# (u"Tom the Dancing Bug", u"http://www.gocomics.com/tomthedancingbug"),
# (u"Too Much Coffee Man", u"http://www.gocomics.com/toomuchcoffeeman"),
# (u"W.T. Duck", u"http://www.gocomics.com/wtduck"),
# (u"Watch Your Head", u"http://www.gocomics.com/watchyourhead"),
# (u"Wee Pals", u"http://www.gocomics.com/weepals"),
# (u"Winnie the Pooh", u"http://www.gocomics.com/winniethepooh"),
(u"Wizard of Id", u"http://www.gocomics.com/wizardofid"),
# (u"Working It Out", u"http://www.gocomics.com/workingitout"),
# (u"Yenny", u"http://www.gocomics.com/yenny"),
# (u"Zack Hill", u"http://www.gocomics.com/zackhill"),
(u"Ziggy", u"http://www.gocomics.com/ziggy"),
######## COMICS - EDITORIAL ########
("Lalo Alcaraz","http://www.gocomics.com/laloalcaraz"),
("Nick Anderson","http://www.gocomics.com/nickanderson"),
("Chuck Asay","http://www.gocomics.com/chuckasay"),
("Tony Auth","http://www.gocomics.com/tonyauth"),
("Donna Barstow","http://www.gocomics.com/donnabarstow"),
# ("Bruce Beattie","http://www.gocomics.com/brucebeattie"),
# ("Clay Bennett","http://www.gocomics.com/claybennett"),
# ("Lisa Benson","http://www.gocomics.com/lisabenson"),
# ("Steve Benson","http://www.gocomics.com/stevebenson"),
# ("Chip Bok","http://www.gocomics.com/chipbok"),
# ("Steve Breen","http://www.gocomics.com/stevebreen"),
# ("Chris Britt","http://www.gocomics.com/chrisbritt"),
# ("Stuart Carlson","http://www.gocomics.com/stuartcarlson"),
# ("Ken Catalino","http://www.gocomics.com/kencatalino"),
# ("Paul Conrad","http://www.gocomics.com/paulconrad"),
# ("Jeff Danziger","http://www.gocomics.com/jeffdanziger"),
# ("Matt Davies","http://www.gocomics.com/mattdavies"),
# ("John Deering","http://www.gocomics.com/johndeering"),
# ("Bob Gorrell","http://www.gocomics.com/bobgorrell"),
# ("Walt Handelsman","http://www.gocomics.com/walthandelsman"),
# ("Clay Jones","http://www.gocomics.com/clayjones"),
# ("Kevin Kallaugher","http://www.gocomics.com/kevinkallaugher"),
# ("Steve Kelley","http://www.gocomics.com/stevekelley"),
# ("Dick Locher","http://www.gocomics.com/dicklocher"),
# ("Chan Lowe","http://www.gocomics.com/chanlowe"),
# ("Mike Luckovich","http://www.gocomics.com/mikeluckovich"),
# ("Gary Markstein","http://www.gocomics.com/garymarkstein"),
# ("Glenn McCoy","http://www.gocomics.com/glennmccoy"),
# ("Jim Morin","http://www.gocomics.com/jimmorin"),
# ("Jack Ohman","http://www.gocomics.com/jackohman"),
# ("Pat Oliphant","http://www.gocomics.com/patoliphant"),
# ("Joel Pett","http://www.gocomics.com/joelpett"),
# ("Ted Rall","http://www.gocomics.com/tedrall"),
# ("Michael Ramirez","http://www.gocomics.com/michaelramirez"),
# ("Marshall Ramsey","http://www.gocomics.com/marshallramsey"),
# ("Steve Sack","http://www.gocomics.com/stevesack"),
# ("Ben Sargent","http://www.gocomics.com/bensargent"),
# ("Drew Sheneman","http://www.gocomics.com/drewsheneman"),
# ("John Sherffius","http://www.gocomics.com/johnsherffius"),
# ("Small World","http://www.gocomics.com/smallworld"),
# ("Scott Stantis","http://www.gocomics.com/scottstantis"),
# ("Wayne Stayskal","http://www.gocomics.com/waynestayskal"),
# ("Dana Summers","http://www.gocomics.com/danasummers"),
# ("Paul Szep","http://www.gocomics.com/paulszep"),
# ("Mike Thompson","http://www.gocomics.com/mikethompson"),
# ("Tom Toles","http://www.gocomics.com/tomtoles"),
# ("Gary Varvel","http://www.gocomics.com/garyvarvel"),
# ("ViewsAfrica","http://www.gocomics.com/viewsafrica"),
# ("ViewsAmerica","http://www.gocomics.com/viewsamerica"),
# ("ViewsAsia","http://www.gocomics.com/viewsasia"),
# ("ViewsBusiness","http://www.gocomics.com/viewsbusiness"),
# ("ViewsEurope","http://www.gocomics.com/viewseurope"),
# ("ViewsLatinAmerica","http://www.gocomics.com/viewslatinamerica"),
# ("ViewsMidEast","http://www.gocomics.com/viewsmideast"),
# ("Views of the World","http://www.gocomics.com/viewsoftheworld"),
# ("Kerry Waghorn","http://www.gocomics.com/facesinthenews"),
# ("Dan Wasserman","http://www.gocomics.com/danwasserman"),
# ("Signe Wilkinson","http://www.gocomics.com/signewilkinson"),
# ("Wit of the World","http://www.gocomics.com/witoftheworld"),
# ("Don Wright","http://www.gocomics.com/donwright"),
]:
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
return feeds
def make_links(self, url):
title = 'Temp'
current_articles = []
pages = range(1, self.num_comics_to_get+1)
for page in pages:
page_soup = self.index_to_soup(url)
if page_soup:
try:
strip_title = page_soup.h1.a.string
except:
strip_title = 'Error - no page_soup.h1.a.string'
try:
date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string
except:
date_title = 'Error - no page_soup.h1.li.string'
title = strip_title + ' - ' + date_title
for i in range(2):
try:
strip_url_date = page_soup.h1.a['href']
break #success - this is normal exit
except:
continue #try to get strip_url_date again
continue # give up on this strip date
for i in range(2):
try:
prev_strip_url_date = page_soup.find('a', attrs={'class': 'prev'})['href']
break #success - this is normal exit
except:
continue #try to get prev_strip_url_date again
continue # give up on this prev strip date
if strip_url_date:
page_url = 'http://www.gocomics.com' + strip_url_date
else:
continue
if prev_strip_url_date:
prev_page_url = 'http://www.gocomics.com' + prev_strip_url_date
else:
continue
current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
url = prev_page_url
current_articles.reverse()
return current_articles
def preprocess_html(self, soup):
if soup.title:
title_string = soup.title.string.strip()
_cd = title_string.split(',',1)[1]
comic_date = ' '.join(_cd.split(' ', 4)[0:-1])
if soup.h1.span:
artist = soup.h1.span.string
soup.h1.span.string.replaceWith(comic_date + artist)
feature_item = soup.find('p',attrs={'class':'feature_item'})
if feature_item.a:
a_tag = feature_item.a
a_href = a_tag["href"]
img_tag = a_tag.img
img_tag["src"] = a_href
img_tag["width"] = self.comic_size
img_tag["height"] = None
return self.adeify_images(soup)
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {max-width:100%; min-width:100%;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''