Update Arcamax

2025-07-09 03:04:10 -04:00 · 2015-04-12 10:56:52 +05:30 · 2015-04-12 10:56:52 +05:30 · 9b8bf25a41
commit 9b8bf25a41
parent d49d227e4a
1 changed files with 77 additions and 83 deletions
--- a/recipes/arcamax.recipe
+++ b/recipes/arcamax.recipe
@ -5,14 +5,14 @@ __copyright__ = 'Copyright 2010 Starson17'
 '''
 www.arcamax.com
 '''
+
+import os
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
+from calibre.ptempfile import PersistentTemporaryDirectory

 class Arcamax(BasicNewsRecipe):
    title               = 'Arcamax'
-    __author__          = 'Starson17'
-    __version__         = '1.04'
-    __date__            = '18 April 2011'
+    __author__          = 'Kovid Goyal'
    description         = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
    category            = 'news, comics'
    language            = 'en'
@ -21,7 +21,7 @@ class Arcamax(BasicNewsRecipe):
    remove_javascript   = True
    cover_url           = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'

-    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
+    # ###### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
    num_comics_to_get = 7
    # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS

@ -31,98 +31,92 @@ class Arcamax(BasicNewsRecipe):
                        , 'language'          : language
                        }

-    keep_only_tags     = [dict(name='article', attrs={'class':['comic']}),
-                                        ]
-
-    #remove_tags = [dict(name='div', attrs={'id':['comicfull' ]}),
-                               #dict(name='div', attrs={'class':['calendar' ]}),
-                               #dict(name='nav', attrs={'class':['calendar-nav' ]}),
-                               #]
+    keep_only_tags     = [
+        dict(name='header', attrs={'class':'fn-content-header bluelabel'}),
+        dict(name='figure', attrs={'class':['comic']}),
+    ]

    def parse_index(self):
        feeds = []
+        self.panel_tdir = PersistentTemporaryDirectory('arcamax')
+        self.panel_counter = 0
        for title, url in [
-                            ######## COMICS - GENERAL ########
-                            #(u"9 Chickweed Lane", #u"http://www.arcamax.com/thefunnies/ninechickweedlane"),
-                            #(u"Agnes", u"http://www.arcamax.com/thefunnies/agnes"),
-                            #(u"Andy Capp", #u"http://www.arcamax.com/thefunnies/andycapp"),
-                            (u"BC", u"http://www.arcamax.com/thefunnies/bc"),
-                            #(u"Baby Blues", #u"http://www.arcamax.com/thefunnies/babyblues"),
-                            #(u"Beetle Bailey", #u"http://www.arcamax.com/thefunnies/beetlebailey"),
-                            (u"Blondie", u"http://www.arcamax.com/thefunnies/blondie"),
-                            #u"Boondocks", u"http://www.arcamax.com/thefunnies/boondocks"),
-                            #(u"Cathy", u"http://www.arcamax.com/thefunnies/cathy"),
-                            #(u"Daddys Home", #u"http://www.arcamax.com/thefunnies/daddyshome"),
-                            (u"Dilbert", u"http://www.arcamax.com/thefunnies/dilbert"),
-                            #(u"Dinette Set", #u"http://www.arcamax.com/thefunnies/thedinetteset"),
-                            (u"Dog Eat Doug", u"http://www.arcamax.com/thefunnies/dogeatdoug"),
-                            (u"Doonesbury", u"http://www.arcamax.com/thefunnies/doonesbury"),
-                            #(u"Dustin", u"http://www.arcamax.com/thefunnies/dustin"),
-                            (u"Family Circus", u"http://www.arcamax.com/thefunnies/familycircus"),
-                            (u"Garfield", u"http://www.arcamax.com/thefunnies/garfield"),
-                            #(u"Get Fuzzy", #u"http://www.arcamax.com/thefunnies/getfuzzy"),
-                            #(u"Girls and Sports", #u"http://www.arcamax.com/thefunnies/girlsandsports"),
-                            #(u"Hagar the Horrible", #u"http://www.arcamax.com/thefunnies/hagarthehorrible"),
-                            #(u"Heathcliff", #u"http://www.arcamax.com/thefunnies/heathcliff"),
-                            #(u"Jerry King Cartoons", #u"http://www.arcamax.com/thefunnies/humorcartoon"),
-                            #(u"Luann", u"http://www.arcamax.com/thefunnies/luann"),
-                            #(u"Momma", u"http://www.arcamax.com/thefunnies/momma"),
-                            #(u"Mother Goose and Grimm", #u"http://www.arcamax.com/thefunnies/mothergooseandgrimm"),
-                            (u"Mutts", u"http://www.arcamax.com/thefunnies/mutts"),
-                            #(u"Non Sequitur", #u"http://www.arcamax.com/thefunnies/nonsequitur"),
-                            #(u"Pearls Before Swine", #u"http://www.arcamax.com/thefunnies/pearlsbeforeswine"),
-                            #(u"Pickles", u"http://www.arcamax.com/thefunnies/pickles"),
-                            #(u"Red and Rover", #u"http://www.arcamax.com/thefunnies/redandrover"),
-                            #(u"Rubes", u"http://www.arcamax.com/thefunnies/rubes"),
-                            #(u"Rugrats", u"http://www.arcamax.com/thefunnies/rugrats"),
-                            (u"Speed Bump", u"http://www.arcamax.com/thefunnies/speedbump"),
-                            (u"Wizard of Id", u"http://www.arcamax.com/thefunnies/wizardofid"),
-                            (u"Zits", u"http://www.arcamax.com/thefunnies/zits"),
-                             ]:
-            articles = self.make_links(url)
+                # ####### COMICS - GENERAL ########
+                # (u"9 Chickweed Lane", u"http://www.arcamax.com/thefunnies/ninechickweedlane"),
+                # (u"Agnes", u"http://www.arcamax.com/thefunnies/agnes"),
+                # (u"Andy Capp", u"http://www.arcamax.com/thefunnies/andycapp"),
+                (u"BC", u"http://www.arcamax.com/thefunnies/bc"),
+                # (u"Baby Blues", u"http://www.arcamax.com/thefunnies/babyblues"),
+                # (u"Beetle Bailey", u"http://www.arcamax.com/thefunnies/beetlebailey"),
+                (u"Blondie", u"http://www.arcamax.com/thefunnies/blondie"),
+                # (u"Boondocks", u"http://www.arcamax.com/thefunnies/boondocks"),
+                # (u"Cathy", u"http://www.arcamax.com/thefunnies/cathy"),
+                # (u"Daddys Home", u"http://www.arcamax.com/thefunnies/daddyshome"),
+                (u"Dilbert", u"http://www.arcamax.com/thefunnies/dilbert"),
+                # (u"Dinette Set", u"http://www.arcamax.com/thefunnies/thedinetteset"),
+                (u"Dog Eat Doug", u"http://www.arcamax.com/thefunnies/dogeatdoug"),
+                (u"Doonesbury", u"http://www.arcamax.com/thefunnies/doonesbury"),
+                # (u"Dustin", u"http://www.arcamax.com/thefunnies/dustin"),
+                (u"Family Circus", u"http://www.arcamax.com/thefunnies/familycircus"),
+                (u"Garfield", u"http://www.arcamax.com/thefunnies/garfield"),
+                # (u"Get Fuzzy", u"http://www.arcamax.com/thefunnies/getfuzzy"),
+                # (u"Girls and Sports", u"http://www.arcamax.com/thefunnies/girlsandsports"),
+                # (u"Hagar the Horrible", u"http://www.arcamax.com/thefunnies/hagarthehorrible"),
+                # (u"Heathcliff", u"http://www.arcamax.com/thefunnies/heathcliff"),
+                # (u"Jerry King Cartoons", u"http://www.arcamax.com/thefunnies/humorcartoon"),
+                # (u"Luann", u"http://www.arcamax.com/thefunnies/luann"),
+                # (u"Momma", u"http://www.arcamax.com/thefunnies/momma"),
+                # (u"Mother Goose and Grimm", u"http://www.arcamax.com/thefunnies/mothergooseandgrimm"),
+                (u"Mutts", u"http://www.arcamax.com/thefunnies/mutts"),
+                # (u"Non Sequitur", u"http://www.arcamax.com/thefunnies/nonsequitur"),
+                # (u"Pearls Before Swine", u"http://www.arcamax.com/thefunnies/pearlsbeforeswine"),
+                # (u"Pickles", u"http://www.arcamax.com/thefunnies/pickles"),
+                # (u"Red and Rover", u"http://www.arcamax.com/thefunnies/redandrover"),
+                # (u"Rubes", u"http://www.arcamax.com/thefunnies/rubes"),
+                # (u"Rugrats", u"http://www.arcamax.com/thefunnies/rugrats"),
+                (u"Speed Bump", u"http://www.arcamax.com/thefunnies/speedbump"),
+                (u"Wizard of Id", u"http://www.arcamax.com/thefunnies/wizardofid"),
+                (u"Zits", u"http://www.arcamax.com/thefunnies/zits"),
+        ]:
+            self.log('Finding strips for:', title)
+            articles = self.make_links(url, title)
            if articles:
                feeds.append((title, articles))
+            if self.test and len(feeds) >= self.test[0]:
+                break
        return feeds

-    def make_links(self, url):
-        title = 'Temp'
+    def make_links(self, url, title):
        current_articles = []
-        pages = range(1, self.num_comics_to_get+1)
-        for page in pages:
-            page_soup = self.index_to_soup(url)
-            if page_soup:
-                title = self.tag_to_string(page_soup.find(name='div', attrs={'class':'columnheader'}).h1.contents[0])
-                page_url = url
-                # orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
-                prev_page_url = 'http://www.arcamax.com' + page_soup.find(name='a', attrs={'class':['prev']})['href']
-                date = self.tag_to_string(page_soup.find(name='span', attrs={'class':['cur']}))
-            current_articles.append({'title': title, 'url': page_url, 'description':'', 'date': date})
+        num = self.num_comics_to_get
+        while num > 0:
+            num -= 1
+            raw = self.index_to_soup(url, raw=True)
+            self.panel_counter += 1
+            path = os.path.join(self.panel_tdir, '%d.html' % self.panel_counter)
+            with open(path, 'wb') as f:
+                f.write(raw)
+            soup = self.index_to_soup(raw)
+            a = soup.find(name='a', attrs={'class':['prev']})
+            prev_page_url = 'http://www.arcamax.com' + a['href']
+            title = self.tag_to_string(soup.find('title')).partition('|')[0].strip()
+            if 'for' not in title.split():
+                title = title + ' for today'
+            date = self.tag_to_string(soup.find(name='span', attrs={'class':['cur']}))
+            self.log('\tFound:', title, 'at:', url)
+            current_articles.append({'title': title, 'url':'file://' + path , 'description':'', 'date': date})
+            if self.test and len(current_articles) >= self.test[1]:
+                break
            url = prev_page_url
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
-        for img_tag in soup.findAll('img'):
-            parent_tag = img_tag.parent
-            if parent_tag.name == 'a':
-                new_tag = Tag(soup,'p')
-                new_tag.insert(0,img_tag)
-                parent_tag.replaceWith(new_tag)
-            elif parent_tag.name == 'p':
-                if not self.tag_to_string(parent_tag) == '':
-                    new_div = Tag(soup,'div')
-                    new_tag = Tag(soup,'p')
-                    new_tag.insert(0,img_tag)
-                    parent_tag.replaceWith(new_div)
-                    new_div.insert(0,new_tag)
-                    new_div.insert(1,parent_tag)
+        for img in soup.findAll('img', src=True):
+            if img['src'].startswith('/'):
+                img['src'] = 'http://arcamax.com' + img['src']
        return soup

    extra_css = '''
-                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-                    img {max-width:100%; min-width:100%;}
-                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-        '''
-
+        img {max-width:100%; min-width:100%;}
+    '''