Update Caravan Magazine

2025-08-11 09:13:57 -04:00 · 2019-02-05 11:27:09 +05:30 · 2019-02-05 11:27:09 +05:30 · 0c815cd06d
commit 0c815cd06d
parent 848934643e
1 changed files with 37 additions and 114 deletions
--- a/recipes/caravan_magazine.recipe
+++ b/recipes/caravan_magazine.recipe
@ -1,10 +1,11 @@
 # coding: utf-8
 import html5lib
 import re
 from lxml import etree
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.utils.cleantext import clean_xml_chars
+
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+
 def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 class CaravanMagazine(BasicNewsRecipe):
@ -18,125 +19,47 @@ class CaravanMagazine(BasicNewsRecipe):
    no_stylesheets = True
    keep_only_tags = [
-        dict(attrs={'class': ['post-title']}),
+            classes('post-title short-desc author-details cover'),
-        dict(attrs={'class': ['post-subhheading',
+            dict(itemprop='articleBody'),
                              'authorndate', 'rg-thumbs', 'entry-content']}),
    ]
    remove_tags = [
        dict(name='meta'),
        dict(attrs={'class': ['share-with']}),
    ]
    def preprocess_raw_html(self, raw_html, url):
        root = html5lib.parse(
            clean_xml_chars(raw_html), treebuilder='lxml',
            namespaceHTMLElements=False)
        for s in root.xpath('//script'):
            s.getparent().remove(s)
        return etree.tostring(root, encoding=unicode)
    def preprocess_html(self, soup):
        # Handle the image carousel
        carousel = soup.find('div', {'class': 'rg-thumbs'})
        if carousel is not None:
            # create a new container to collect all images
            all_images = Tag(soup, 'div')
            # all_images['class'] = 'rg-thumbs'
            for index, img in enumerate(carousel.findAll('img')):
                # create a new div to contain image and caption
                div = Tag(soup, 'div')
                div['style'] = 'text-align:left;font-size:70%;margin-bottom: 0.4em;'
                ns = NavigableString(img['data-caption'])
                img['src'] = img['data-large']
                del img['data-large']
                del img['data-caption']
                del img['data-credit']
                img.extract()
                div.insert(0, img)
                div.insert(1, Tag(soup, 'br'))
                div.insert(3, ns)
                div.insert(3, Tag(soup, 'br'))
                all_images.insert(index, div)
            # extracted all images, replace carousel with extracted images
            carousel.replaceWith(all_images)
        return soup
    # To parse artice toc
    def parse_index(self):
-
+        base_url = 'https://www.caravanmagazine.in/'
-        base_url = 'http://www.caravanmagazine.in'
+        soup = self.index_to_soup('{0}magazine'.format(base_url))
        raw = self.index_to_soup('{0}/current-issue'.format(base_url),
                                 raw=True)
        raw = raw.decode('utf-8')
        raw = self.preprocess_raw_html(raw, None)
        soup = self.index_to_soup(raw)
        # find current issue cover
        try:
            cover_img = soup.find('div', {'class': 'issue-image'}).find('img')
            # a = soup.find('a', rel=lambda x:x and '[field_c_issues_image]' in x)
            # if a is not None:
            self.cover_url = cover_img['src']
        except:
            pass
        # ci = soup.find(attrs={'class': 'current-issue-block'})
        ci = soup.findAll(attrs={'class': re.compile('archive-story.*')})
        current_section = 'Section'
        current_articles = []
        feeds = []
-
+        sections = soup.find(attrs={'class': lambda x: x and 'current-magazine-issue' in x.split()}).find(
-        # define some reusable constants
+                attrs={'class': lambda x: x and 'sections' in x.split()})
-        heading_class = 'subject-heading'
+        for section in sections.findAll(attrs={'class': lambda x: x and 'section' in x.split()}):
-        content_class = 'subject-content'
+            a = section.find('a')
-        stories_re = re.compile('({0}|{1}).*'.format(heading_class,
+            section_title = self.tag_to_string(a)
-                                                     content_class))
+            self.log('\nSection:', section_title)
-
+            articles = []
-        for story in ci:
+            for article in section.findAll('article'):
-            for ele in story.findAll(attrs={'class': stories_re}):
+                details = article.find(attrs={'class': lambda x: x and 'details' in x.split()})
-                if ele['class'].startswith(heading_class):
+                pre = details.find(attrs={'class': lambda x: x and 'pre-heading' in x.split()})
-                    # heading section
+                if pre is not None:
-                    if current_articles:
+                    pre.extract()
-                        self.log('Adding {0} articles to {1}'.format(
+                a = details.find('a')
-                            len(current_articles), current_section))
+                url = base_url + a['href']
-                        feeds.append((current_section, current_articles))
+                title = self.tag_to_string(a)
-                    current_section = self.tag_to_string(ele)
+                desc = self.tag_to_string(details.find('div'))
-                    current_articles = []
+                self.log('\t', title, url)
-                    self.log('Section:', current_section)
+                articles.append({'title': title, 'description': desc, 'url': url})
-                    pass
+            if articles:
-                else:
+                feeds.append((section_title, articles))
                    # content Section
                    for art in ele.findAll('article',
                                           attrs={'id': re.compile('post-.*')}):
                        title = art.find('h1')
                        if title is not None:
                            a = title.find('a', href=True)
                            if a is not None:
                                href = a['href']
                                # convert relative url to absolute url
                                if href.startswith('/'):
                                    href = '{0}{1}'.format(base_url, href)
                                article = {
                                    'title': self.tag_to_string(title),
                                    'url': href
                                }
                                title.extract()
                                desc = self.tag_to_string(art).strip()
                                if desc:
                                    article['description'] = desc
                                current_articles.append(article)
                                self.log('\t' + article['title'])
                                self.log('\t\t' + article['url'])
        # append any remaining articles that were probably from last section,
        # we ran out of heading_class to push them
        if current_articles:
            self.log('Adding {0} articles to {1}'.format(
                len(current_articles), current_section))
            feeds.append((current_section, current_articles))
        return feeds
    def preprocess_html(self, soup):
        for div in soup.findAll(itemprop='image'):
            for img in div.findAll('img'):
                img['src'] = div['content']
        return soup