Update Arcamax

2025-07-09 03:04:10 -04:00 · 2015-04-12 10:56:52 +05:30 · 2015-04-12 10:56:52 +05:30 · 9b8bf25a41
commit 9b8bf25a41
parent d49d227e4a
1 changed files with 77 additions and 83 deletions
--- a/recipes/arcamax.recipe
+++ b/recipes/arcamax.recipe
@ -5,14 +5,14 @@ __copyright__ = 'Copyright 2010 Starson17'
 '''
 www.arcamax.com
 '''
+
+import os
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
+from calibre.ptempfile import PersistentTemporaryDirectory

 class Arcamax(BasicNewsRecipe):
    title               = 'Arcamax'
-    __author__          = 'Starson17'
-    __version__         = '1.04'
-    __date__            = '18 April 2011'
+    __author__          = 'Kovid Goyal'
    description         = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
    category            = 'news, comics'
    language            = 'en'
@ -31,16 +31,15 @@ class Arcamax(BasicNewsRecipe):
                        , 'language'          : language
                        }

-    keep_only_tags     = [dict(name='article', attrs={'class':['comic']}),
+    keep_only_tags     = [
+        dict(name='header', attrs={'class':'fn-content-header bluelabel'}),
+        dict(name='figure', attrs={'class':['comic']}),
    ]

-    #remove_tags = [dict(name='div', attrs={'id':['comicfull' ]}),
-                               #dict(name='div', attrs={'class':['calendar' ]}),
-                               #dict(name='nav', attrs={'class':['calendar-nav' ]}),
-                               #]
-
    def parse_index(self):
        feeds = []
+        self.panel_tdir = PersistentTemporaryDirectory('arcamax')
+        self.panel_counter = 0
        for title, url in [
                # ####### COMICS - GENERAL ########
                # (u"9 Chickweed Lane", #u"http://www.arcamax.com/thefunnies/ninechickweedlane"),
@ -79,50 +78,45 @@ class Arcamax(BasicNewsRecipe):
                (u"Wizard of Id", u"http://www.arcamax.com/thefunnies/wizardofid"),
                (u"Zits", u"http://www.arcamax.com/thefunnies/zits"),
        ]:
-            articles = self.make_links(url)
+            self.log('Finding strips for:', title)
+            articles = self.make_links(url, title)
            if articles:
                feeds.append((title, articles))
+            if self.test and len(feeds) >= self.test[0]:
+                break
        return feeds

-    def make_links(self, url):
-        title = 'Temp'
+    def make_links(self, url, title):
        current_articles = []
-        pages = range(1, self.num_comics_to_get+1)
-        for page in pages:
-            page_soup = self.index_to_soup(url)
-            if page_soup:
-                title = self.tag_to_string(page_soup.find(name='div', attrs={'class':'columnheader'}).h1.contents[0])
-                page_url = url
-                # orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
-                prev_page_url = 'http://www.arcamax.com' + page_soup.find(name='a', attrs={'class':['prev']})['href']
-                date = self.tag_to_string(page_soup.find(name='span', attrs={'class':['cur']}))
-            current_articles.append({'title': title, 'url': page_url, 'description':'', 'date': date})
+        num = self.num_comics_to_get
+        while num > 0:
+            num -= 1
+            raw = self.index_to_soup(url, raw=True)
+            self.panel_counter += 1
+            path = os.path.join(self.panel_tdir, '%d.html' % self.panel_counter)
+            with open(path, 'wb') as f:
+                f.write(raw)
+            soup = self.index_to_soup(raw)
+            a = soup.find(name='a', attrs={'class':['prev']})
+            prev_page_url = 'http://www.arcamax.com' + a['href']
+            title = self.tag_to_string(soup.find('title')).partition('|')[0].strip()
+            if 'for' not in title.split():
+                title = title + ' for today'
+            date = self.tag_to_string(soup.find(name='span', attrs={'class':['cur']}))
+            self.log('\tFound:', title, 'at:', url)
+            current_articles.append({'title': title, 'url':'file://' + path , 'description':'', 'date': date})
+            if self.test and len(current_articles) >= self.test[1]:
+                break
            url = prev_page_url
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
-        for img_tag in soup.findAll('img'):
-            parent_tag = img_tag.parent
-            if parent_tag.name == 'a':
-                new_tag = Tag(soup,'p')
-                new_tag.insert(0,img_tag)
-                parent_tag.replaceWith(new_tag)
-            elif parent_tag.name == 'p':
-                if not self.tag_to_string(parent_tag) == '':
-                    new_div = Tag(soup,'div')
-                    new_tag = Tag(soup,'p')
-                    new_tag.insert(0,img_tag)
-                    parent_tag.replaceWith(new_div)
-                    new_div.insert(0,new_tag)
-                    new_div.insert(1,parent_tag)
+        for img in soup.findAll('img', src=True):
+            if img['src'].startswith('/'):
+                img['src'] = 'http://arcamax.com' + img['src']
        return soup

    extra_css = '''
-                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        img {max-width:100%; min-width:100%;}
-                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''
-